]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 1999-2003 IBM Corp. All rights reserved. | |
4 | ********************************************************************** | |
5 | * Date Name Description | |
6 | * 12/1/99 rgillam Complete port from Java. | |
7 | * 01/13/2000 helena Added UErrorCode to ctors. | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #ifndef DBBI_H | |
12 | #define DBBI_H | |
13 | ||
14 | #include "unicode/rbbi.h" | |
15 | ||
16 | #if !UCONFIG_NO_BREAK_ITERATION | |
17 | ||
18 | U_NAMESPACE_BEGIN | |
19 | ||
20 | /* Forward declaration: DictionaryBasedBreakIterator holds a pointer to this type and names it as a friend. */ | |
21 | class DictionaryBasedBreakIteratorTables; | |
22 | ||
23 | /** | |
24 | * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary | |
25 | * to further subdivide ranges of text beyond what is possible using just the | |
26 | * state-table-based algorithm. This is necessary, for example, to handle | |
27 | * word and line breaking in Thai, which doesn't use spaces between words. The | |
28 | * state-table-based algorithm used by RuleBasedBreakIterator is used to divide | |
29 | * up text as far as possible, and then contiguous ranges of letters are | |
30 | * repeatedly compared against a list of known words (i.e., the dictionary) | |
31 | * to divide them up into words. | |
32 | * | |
33 | * <p>Applications do not normally need to include this header.</p> | |
34 | * | |
35 | * <p>This class will probably be deprecated in a future release of ICU, and replaced | |
36 | * with a more flexible and capable dictionary based break iterator. This change | |
37 | * should be invisible to applications, because creation and use of instances of | |
38 | * DictionaryBasedBreakIterator is through the factories and abstract | |
39 | * API on class BreakIterator, which will remain stable.</p> | |
40 | * | |
41 | * <p>This class is not intended to be subclassed.</p> | |
42 | * | |
43 | * | |
44 | * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator, | |
45 | * but adds one more special substitution name: <dictionary>. This substitution | |
46 | * name is used to identify characters in words in the dictionary. The idea is that | |
47 | * if the iterator passes over a chunk of text that includes two or more characters | |
48 | * in a row that are included in <dictionary>, it goes back through that range and | |
49 | * derives additional break positions (if possible) using the dictionary. | |
50 | * | |
51 | * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary | |
52 | * file. It follows a prescribed search path to locate the dictionary (right now, | |
53 | * it looks for it in /com/ibm/text/resources in each directory in the classpath, | |
54 | * and won't find it in JAR files, but this location is likely to change). The | |
55 | * dictionary file is in a serialized binary format. We have a very primitive (and | |
56 | * slow) BuildDictionaryFile utility for creating dictionary files, but aren't | |
57 | * currently making it public. Contact us for help. | |
58 | * <p> | |
59 | * <b> NOTE </b> The DictionaryBasedBreakIterator class is still under development. The | |
60 | * APIs are not in stable condition yet. | |
61 | */ | |
62 | class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator { | |
63 | ||
64 | private: | |
65 | ||
66 | /** |
67 | * When a range of characters is divided up using the dictionary, the break | |
68 | * positions that are discovered are stored here, preventing us from having | |
69 | * to use either the dictionary or the state table again until the iterator | |
70 | * leaves this range of text. | |
71 | */ | |
72 | int32_t* cachedBreakPositions; | |
73 | ||
74 | /** |
75 | * The number of elements in cachedBreakPositions. | |
76 | */ | |
77 | int32_t numCachedBreakPositions; | |
78 | ||
79 | /** |
80 | * If cachedBreakPositions is not null, this indicates which item in the | |
81 | * cache the current iteration position refers to. | |
82 | */ | |
83 | int32_t positionInCache; | |
84 | ||
85 | DictionaryBasedBreakIteratorTables *fTables; /* NOTE(review): presumably the tables/dictionary data used by this iterator - confirm against the implementation. */ | |
86 | ||
87 | /** |
88 | * Class ID. The address of this static member is the value returned by | |
89 | * getStaticClassID(). | |
90 | static const char fgClassID; | |
91 | ||
92 | /**======================================================================= |
93 | * Create a dictionary based break boundary detection iterator. | |
94 | * @param tablesImage The location for the dictionary to be loaded into memory | |
95 | * @param dictionaryFilename The name of the dictionary file | |
96 | * @param status the error code status | |
97 | * @return A dictionary based break detection iterator. The UErrorCode& status | |
98 | * parameter is used to return status information to the user. | |
99 | * To check whether the construction succeeded or not, you should check | |
100 | * the value of U_SUCCESS(status). If you wish more detailed information, you | |
101 | * can check for informational error results which still indicate success. For example, | |
102 | * U_FILE_ACCESS_ERROR will be returned if the file does not exist. | |
103 | * The caller owns the returned object and is responsible for deleting it. | |
104 | ======================================================================= */ | |
105 | DictionaryBasedBreakIterator(UDataMemory* tablesImage, const char* dictionaryFilename, UErrorCode& status); | |
106 | ||
107 | public: | |
108 | //======================================================================= | |
109 | // boilerplate | |
110 | //======================================================================= | |
111 | ||
112 | /** |
113 | * Destructor | |
114 | * @stable ICU 2.0 | |
115 | */ | |
116 | virtual ~DictionaryBasedBreakIterator(); | |
117 | ||
118 | /** |
119 | * Default constructor. Creates an "empty" break iterator. | |
120 | * Such an iterator can subsequently be assigned to. | |
121 | * @return the newly created DictionaryBasedBreakIterator. | |
122 | * @stable ICU 2.0 | |
123 | */ | |
124 | DictionaryBasedBreakIterator(); | |
125 | ||
126 | /** |
127 | * Copy constructor. | |
128 | * @param other The DictionaryBasedBreakIterator to be copied. | |
129 | * @return the newly created DictionaryBasedBreakIterator. | |
130 | * @stable ICU 2.0 | |
131 | */ | |
132 | DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other); | |
133 | ||
134 | /** |
135 | * Assignment operator. | |
136 | * @param that The object to be copied. | |
137 | * @return the newly set DictionaryBasedBreakIterator. | |
138 | * @stable ICU 2.0 | |
139 | */ | |
140 | DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that); | |
141 | ||
142 | /** |
143 | * Returns a newly-constructed RuleBasedBreakIterator with the same | |
144 | * behavior, and iterating over the same text, as this one. | |
145 | * @return Returns a newly-constructed RuleBasedBreakIterator. | |
146 | * @stable ICU 2.0 | |
147 | */ | |
148 | virtual BreakIterator* clone(void) const; | |
149 | ||
150 | //======================================================================= | |
151 | // BreakIterator overrides | |
152 | //======================================================================= | |
153 | /** |
154 | * Advances the iterator backwards, to the last boundary preceding this one. | |
155 | * @return The position of the last boundary position preceding this one. | |
156 | * @stable ICU 2.0 | |
157 | */ | |
158 | virtual int32_t previous(void); | |
159 | ||
160 | /** |
161 | * Sets the iterator to refer to the first boundary position following | |
162 | * the specified position. | |
163 | * @param offset The position from which to begin searching for a break position. | |
164 | * @return The position of the first break after the current position. | |
165 | * @stable ICU 2.0 | |
166 | */ | |
167 | virtual int32_t following(int32_t offset); | |
168 | ||
169 | /** |
170 | * Sets the iterator to refer to the last boundary position before the | |
171 | * specified position. | |
172 | * @param offset The position to begin searching for a break from. | |
173 | * @return The position of the last boundary before the starting position. | |
174 | * @stable ICU 2.0 | |
175 | */ | |
176 | virtual int32_t preceding(int32_t offset); | |
177 | ||
178 | /** |
179 | * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. | |
180 | * This method is to implement a simple version of RTTI, since not all | |
181 | * C++ compilers support genuine RTTI. Polymorphic operator==() and | |
182 | * clone() methods call this method. | |
183 | * | |
184 | * @return The class ID for this object. All objects of a | |
185 | * given class have the same class ID. Objects of | |
186 | * other classes have different class IDs. | |
187 | * @stable ICU 2.0 | |
188 | */ | |
189 | virtual UClassID getDynamicClassID(void) const; | |
190 | ||
191 | /** |
192 | * Returns the class ID for this class. This is useful only for | |
193 | * comparing to a return value from getDynamicClassID(). For example: | |
194 | * | |
195 | * Base* polymorphic_pointer = createPolymorphicObject(); | |
196 | * if (polymorphic_pointer->getDynamicClassID() == | |
197 | * Derived::getStaticClassID()) ... | |
198 | * | |
199 | * @return The class ID for all objects of this class. | |
200 | * @stable ICU 2.0 | |
201 | */ | |
202 | static inline UClassID getStaticClassID(void); | |
203 | ||
204 | protected: | |
205 | //======================================================================= | |
206 | // implementation | |
207 | //======================================================================= | |
208 | /** |
209 | * This method is the actual implementation of the next() method. All iteration | |
210 | * vectors through here. This method initializes the state machine to state 1 | |
211 | * and advances through the text character by character until we reach the end | |
212 | * of the text or the state machine transitions to state 0. We update our return | |
213 | * value every time the state machine passes through a possible end state. | |
214 | * @internal | |
215 | */ | |
216 | virtual int32_t handleNext(void); | |
217 | ||
218 | /** |
219 | * Removes the cache of break positions (usually in response to a change in | |
220 | * position of some sort). | |
221 | * @internal | |
222 | */ | |
223 | virtual void reset(void); | |
224 | ||
225 | /** |
226 | * Initialize a dbbi. Common routine for use by constructors. | |
227 | * @internal | |
228 | */ | |
229 | void init(); | |
230 | ||
231 | /** Creates a clone of this break iterator, using the caller-supplied buffer when it is large enough. |
232 | * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. | |
233 | * If buffer is not large enough, new memory will be allocated. | |
234 | * @param BufferSize reference to size of allocated space. | |
235 | * If BufferSize == 0, a sufficient size for use in cloning will | |
236 | * be returned ('pre-flighting') | |
237 | * If BufferSize is not enough for a stack-based safe clone, | |
238 | * new memory will be allocated. | |
239 | * @param status to indicate whether the operation went on smoothly or there were errors | |
240 | * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were | |
241 | * necessary. | |
242 | * @return pointer to the new clone | |
243 | * @internal | |
244 | */ | |
245 | virtual BreakIterator * createBufferClone(void *stackBuffer, | |
246 | int32_t &BufferSize, | |
247 | UErrorCode &status); | |
248 | ||
249 | ||
250 | private: | |
251 | /** |
252 | * This is the function that actually implements the dictionary-based | |
253 | * algorithm. Given the endpoints of a range of text, it uses the | |
254 | * dictionary to determine the positions of any boundaries in this | |
255 | * range. It stores all the boundary positions it discovers in | |
256 | * cachedBreakPositions so that we only have to do this work once | |
257 | * for each time we enter the range. | |
258 | * @param startPos The start position of a range of text | |
259 | * @param endPos The end position of a range of text | |
260 | * @param status The error code status | |
261 | */ | |
262 | void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status); | |
263 | ||
264 | ||
265 | /* |
266 | * HSYS : Please revisit with Rich, the ctors of the DBBI class are currently | |
267 | * marked as private. | |
268 | */ | |
269 | friend class DictionaryBasedBreakIteratorTables; | |
270 | friend class BreakIterator; | |
271 | }; | |
272 | ||
273 | /* Returns the class ID for DictionaryBasedBreakIterator: the address of the static member fgClassID, cast to UClassID. */ inline UClassID | |
274 | DictionaryBasedBreakIterator::getStaticClassID(void) | |
275 | { return (UClassID)(&fgClassID); } | |
276 | ||
277 | /* Virtual class-ID query; returns the same value as DictionaryBasedBreakIterator::getStaticClassID(). */ inline UClassID | |
278 | DictionaryBasedBreakIterator::getDynamicClassID(void) const | |
279 | { return DictionaryBasedBreakIterator::getStaticClassID(); } | |
280 | ||
281 | U_NAMESPACE_END | |
282 | ||
283 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
284 | ||
285 | #endif |