]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | *************************************************************************** | |
3 | * Copyright (C) 1999-2003 International Business Machines Corporation * | |
4 | * and others. All rights reserved. * | |
5 | *************************************************************************** | |
6 | ||
7 | ********************************************************************** | |
8 | * Date Name Description | |
9 | * 10/22/99 alan Creation. | |
10 | * 11/11/99 rgillam Complete port from Java. | |
11 | ********************************************************************** | |
12 | */ | |
13 | ||
14 | #ifndef RBBI_H | |
15 | #define RBBI_H | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_BREAK_ITERATION | |
20 | ||
21 | #include "unicode/brkiter.h" | |
22 | #include "unicode/udata.h" | |
23 | #include "unicode/parseerr.h" | |
24 | ||
25 | struct UTrie; | |
26 | ||
27 | U_NAMESPACE_BEGIN | |
28 | ||
29 | struct RBBIDataHeader; | |
30 | class RuleBasedBreakIteratorTables; | |
31 | class BreakIterator; | |
32 | class RBBIDataWrapper; | |
33 | ||
34 | ||
35 | ||
36 | /** | |
37 | * A subclass of BreakIterator whose behavior is specified using a list of rules. | |
38 | * <p>Instances of this class are most commonly created by the factory methods of | |
39 | * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc., | |
40 | * and then used via the abstract API in class BreakIterator.</p> | |
41 | * | |
42 | * <p>See the ICU User Guide for information on Break Iterator Rules.</p> | |
43 | * | |
44 | * <p>This class is not intended to be subclassed. (Class DictionaryBasedBreakIterator | |
45 | * is a subclass, but that relationship is effectively internal to the ICU | |
46 | * implementation. The subclassing interface to RuleBasedBreakIterator is | |
47 | * not part of the ICU API, and may not remain stable.)</p> | |
48 | * | |
49 | */ | |
50 | class U_COMMON_API RuleBasedBreakIterator : public BreakIterator { | |
51 | ||
52 | protected: | |
53 | /** | |
54 | * The character iterator through which this BreakIterator accesses the text | |
55 | * @internal | |
56 | */ | |
57 | CharacterIterator* fText; | |
58 | ||
59 | /** | |
60 | * The rule data for this BreakIterator instance | |
61 | * @internal | |
62 | */ | |
63 | RBBIDataWrapper *fData; | |
64 | /** @internal */ | |
65 | UTrie *fCharMappings; | |
66 | ||
67 | /** Rule {tag} value for the most recent match. | |
68 | * @internal | |
69 | */ | |
70 | int32_t fLastBreakTag; | |
71 | ||
72 | /** | |
73 | * Rule tag value valid flag. | |
74 | * Some iterator operations don't intrinsically set the correct tag value. | |
75 | * This flag lets us lazily compute the value if we are ever asked for it. | |
76 | * @internal | |
77 | */ | |
78 | UBool fLastBreakTagValid; | |
79 | ||
80 | /** | |
81 | * Counter for the number of characters encountered with the "dictionary" | |
82 | * flag set. Normal RBBI iterators don't use it, although the code | |
83 | * for updating it is live. Dictionary Based break iterators (a subclass | |
84 | * of us) access this field directly. | |
85 | * @internal | |
86 | */ | |
87 | uint32_t fDictionaryCharCount; | |
88 | ||
89 | /** | |
90 | * Debugging flag. Trace operation of state machine when true. | |
91 | * @internal | |
92 | */ | |
93 | static UBool fTrace; | |
94 | ||
95 | ||
96 | ||
97 | private: | |
98 | /** | |
99 | * Class ID. The address of this member is the value returned by getStaticClassID(). | |
100 | */ | |
101 | static const char fgClassID; | |
102 | ||
103 | protected: | |
104 | //======================================================================= | |
105 | // constructors | |
106 | //======================================================================= | |
107 | ||
108 | /** | |
109 | * This constructor uses the udata interface to create a BreakIterator | |
110 | * whose internal tables live in a memory-mapped file. "image" is a pointer | |
111 | * to the beginning of that file. | |
112 | * @internal | |
113 | */ | |
114 | RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status); | |
115 | ||
116 | /** | |
117 | * Constructor from a flattened set of RBBI data in malloced memory. | |
118 | * RuleBasedBreakIterators built from a custom set of rules | |
119 | * are created via this constructor; the rules are compiled | |
120 | * into memory, then the break iterator is constructed here. | |
121 | * | |
122 | * The break iterator adopts the memory, and will | |
123 | * free it when done. | |
124 | * @internal | |
125 | */ | |
126 | RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); | |
127 | ||
128 | friend class RBBIRuleBuilder; /** @internal */ | |
129 | friend class BreakIterator; | |
130 | ||
131 | ||
132 | ||
133 | public: | |
134 | ||
135 | /** Default constructor. Creates an empty shell of an iterator, with no | |
136 | * rules or text to iterate over. Object can subsequently be assigned to. | |
137 | * @draft ICU 2.2 | |
138 | */ | |
139 | RuleBasedBreakIterator(); | |
140 | ||
141 | /** | |
142 | * Copy constructor. Will produce a break iterator with the same behavior, | |
143 | * and which iterates over the same text, as the one passed in. | |
144 | * @param that The RuleBasedBreakIterator passed to be copied | |
145 | * @stable ICU 2.0 | |
146 | */ | |
147 | RuleBasedBreakIterator(const RuleBasedBreakIterator& that); | |
148 | ||
149 | /** | |
150 | * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. | |
151 | * @param rules The break rules to be used. | |
152 | * @param parseError In the event of a syntax error in the rules, provides the location | |
153 | * within the rules of the problem. | |
154 | * @param status Information on any errors encountered. | |
155 | * @draft ICU 2.2 | |
156 | */ | |
157 | RuleBasedBreakIterator( const UnicodeString &rules, | |
158 | UParseError &parseError, | |
159 | UErrorCode &status); | |
160 | /** | |
161 | * Destructor | |
162 | * @stable ICU 2.0 | |
163 | */ | |
164 | virtual ~RuleBasedBreakIterator(); | |
165 | ||
166 | /** | |
167 | * Assignment operator. Sets this iterator to have the same behavior, | |
168 | * and iterate over the same text, as the one passed in. | |
169 | * @param that The RuleBasedBreakIterator passed in | |
170 | * @return the newly created RuleBasedBreakIterator | |
171 | * @stable ICU 2.0 | |
172 | */ | |
173 | RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); | |
174 | ||
175 | /** | |
176 | * Equality operator. Returns TRUE if both BreakIterators are of the | |
177 | * same class, have the same behavior, and iterate over the same text. | |
178 | * @param that The BreakIterator to be compared for equality | |
179 | * @return TRUE if both BreakIterators are of the | |
180 | * same class, have the same behavior, and iterate over the same text. | |
181 | * @stable ICU 2.0 | |
182 | */ | |
183 | virtual UBool operator==(const BreakIterator& that) const; | |
184 | ||
185 | /** | |
186 | * Not-equal operator. If operator== returns TRUE, this returns FALSE, | |
187 | * and vice versa. | |
188 | * @param that The BreakIterator to be compared for inequality | |
189 | * @return TRUE if the two BreakIterators are not the same. | |
190 | * @stable ICU 2.0 | |
191 | */ | |
192 | UBool operator!=(const BreakIterator& that) const; | |
193 | ||
194 | /** | |
195 | * Returns a newly-constructed RuleBasedBreakIterator with the same | |
196 | * behavior, and iterating over the same text, as this one. | |
197 | * Differs from the copy constructor in that it is polymorphic, and | |
198 | * will correctly clone (copy) a derived class. | |
199 | * clone() is thread safe. Multiple threads may simultaneously | |
200 | * clone the same source break iterator. | |
201 | * @stable ICU 2.0 | |
202 | */ | |
203 | virtual BreakIterator* clone() const; | |
204 | ||
205 | /** | |
206 | * Compute a hash code for this BreakIterator | |
207 | * @return A hash code | |
208 | * @stable ICU 2.0 | |
209 | */ | |
210 | virtual int32_t hashCode(void) const; | |
211 | ||
212 | /** | |
213 | * Returns the description used to create this iterator | |
214 | * @return the description used to create this iterator | |
215 | * @stable ICU 2.0 | |
216 | */ | |
217 | virtual const UnicodeString& getRules(void) const; | |
218 | ||
219 | //======================================================================= | |
220 | // BreakIterator overrides | |
221 | //======================================================================= | |
222 | ||
223 | /** | |
224 | * Return a CharacterIterator over the text being analyzed. This version | |
225 | * of this method returns the actual CharacterIterator we're using internally. | |
226 | * Changing the state of this iterator can have undefined consequences. If | |
227 | * you need to change it, clone it first. | |
228 | * @return An iterator over the text being analyzed. | |
229 | * @stable ICU 2.0 | |
230 | */ | |
231 | virtual const CharacterIterator& getText(void) const; | |
232 | ||
233 | ||
234 | /** | |
235 | * Set the iterator to analyze a new piece of text. This function resets | |
236 | * the current iteration position to the beginning of the text. | |
237 | * @param newText An iterator over the text to analyze. The BreakIterator | |
238 | * takes ownership of the character iterator. The caller MUST NOT delete it! | |
239 | * @stable ICU 2.0 | |
240 | */ | |
241 | virtual void adoptText(CharacterIterator* newText); | |
242 | ||
243 | /** | |
244 | * Set the iterator to analyze a new piece of text. This function resets | |
245 | * the current iteration position to the beginning of the text. | |
246 | * @param newText The text to analyze. | |
247 | * @stable ICU 2.0 | |
248 | */ | |
249 | virtual void setText(const UnicodeString& newText); | |
250 | ||
251 | /** | |
252 | * Sets the current iteration position to the beginning of the text. | |
253 | * (i.e., the CharacterIterator's starting offset). | |
254 | * @return The offset of the beginning of the text. | |
255 | * @stable ICU 2.0 | |
256 | */ | |
257 | virtual int32_t first(void); | |
258 | ||
259 | /** | |
260 | * Sets the current iteration position to the end of the text. | |
261 | * (i.e., the CharacterIterator's ending offset). | |
262 | * @return The text's past-the-end offset. | |
263 | * @stable ICU 2.0 | |
264 | */ | |
265 | virtual int32_t last(void); | |
266 | ||
267 | /** | |
268 | * Advances the iterator either forward or backward the specified number of steps. | |
269 | * Negative values move backward, and positive values move forward. This is | |
270 | * equivalent to repeatedly calling next() or previous(). | |
271 | * @param n The number of steps to move. The sign indicates the direction | |
272 | * (negative is backwards, and positive is forwards). | |
273 | * @return The character offset of the boundary position n boundaries away from | |
274 | * the current one. | |
275 | * @stable ICU 2.0 | |
276 | */ | |
277 | virtual int32_t next(int32_t n); | |
278 | ||
279 | /** | |
280 | * Advances the iterator to the next boundary position. | |
281 | * @return The position of the first boundary after this one. | |
282 | * @stable ICU 2.0 | |
283 | */ | |
284 | virtual int32_t next(void); | |
285 | ||
286 | /** | |
287 | * Moves the iterator backwards, to the last boundary preceding this one. | |
288 | * @return The position of the last boundary position preceding this one. | |
289 | * @stable ICU 2.0 | |
290 | */ | |
291 | virtual int32_t previous(void); | |
292 | ||
293 | /** | |
294 | * Sets the iterator to refer to the first boundary position following | |
295 | * the specified position. | |
296 | * @param offset The position from which to begin searching for a break position. | |
297 | * @return The position of the first break after the specified position. | |
298 | * @stable ICU 2.0 | |
299 | */ | |
300 | virtual int32_t following(int32_t offset); | |
301 | ||
302 | /** | |
303 | * Sets the iterator to refer to the last boundary position before the | |
304 | * specified position. | |
305 | * @param offset The position to begin searching for a break from. | |
306 | * @return The position of the last boundary before the starting position. | |
307 | * @stable ICU 2.0 | |
308 | */ | |
309 | virtual int32_t preceding(int32_t offset); | |
310 | ||
311 | /** | |
312 | * Returns true if the specified position is a boundary position. As a side | |
313 | * effect, leaves the iterator pointing to the first boundary position at | |
314 | * or after "offset". | |
315 | * @param offset the offset to check. | |
316 | * @return True if "offset" is a boundary position. | |
317 | * @stable ICU 2.0 | |
318 | */ | |
319 | virtual UBool isBoundary(int32_t offset); | |
320 | ||
321 | /** | |
322 | * Returns the current iteration position. | |
323 | * @return The current iteration position. | |
324 | * @stable ICU 2.0 | |
325 | */ | |
326 | virtual int32_t current(void) const; | |
327 | ||
328 | ||
329 | /** | |
330 | * Return the status tag from the break rule that determined the most recently | |
331 | * returned break position. The values appear in the rule source | |
332 | * within brackets, {123}, for example. For rules that do not specify a | |
333 | * status, a default value of 0 is returned. | |
334 | * <p> | |
335 | * Of the standard types of ICU break iterators, only the word break | |
336 | * iterator provides status values. The values are defined in | |
337 | * <code>enum UWordBreak</code>, and allow distinguishing between words | |
338 | * that contain alphabetic letters, "words" that appear to be numbers, | |
339 | * punctuation and spaces, words containing ideographic characters, and | |
340 | * more. Call <code>getRuleStatus</code> after obtaining a boundary | |
341 | * position from <code>next()</code>, <code>previous()</code>, or | |
342 | * any other break iterator function that returns a boundary position. | |
343 | * <p> | |
344 | * @return the status from the break rule that determined the most recently | |
345 | * returned break position. | |
346 | * | |
347 | * @see UWordBreak | |
348 | * @draft ICU 2.2 | |
349 | */ | |
350 | virtual int32_t getRuleStatus() const; | |
351 | ||
352 | /** | |
353 | * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. | |
354 | * This method is to implement a simple version of RTTI, since not all | |
355 | * C++ compilers support genuine RTTI. Polymorphic operator==() and | |
356 | * clone() methods call this method. | |
357 | * | |
358 | * @return The class ID for this object. All objects of a | |
359 | * given class have the same class ID. Objects of | |
360 | * other classes have different class IDs. | |
361 | * @stable ICU 2.0 | |
362 | */ | |
363 | inline virtual UClassID getDynamicClassID(void) const; | |
364 | ||
365 | /** | |
366 | * Returns the class ID for this class. This is useful only for | |
367 | * comparing to a return value from getDynamicClassID(). For example: | |
368 | * | |
369 | * Base* polymorphic_pointer = createPolymorphicObject(); | |
370 | * if (polymorphic_pointer->getDynamicClassID() == | |
371 | * Derived::getStaticClassID()) ... | |
372 | * | |
373 | * @return The class ID for all objects of this class. | |
374 | * @stable ICU 2.0 | |
375 | */ | |
376 | inline static UClassID getStaticClassID(void); | |
377 | ||
378 | /** | |
379 | * Create a clone (copy) of this break iterator in memory provided | |
380 | * by the caller. The idea is to increase performance by avoiding | |
381 | * a storage allocation. Use of this function is NOT RECOMMENDED. | |
382 | * Performance gains are minimal, and correct buffer management is | |
383 | * tricky. Use clone() instead. | |
384 | * | |
385 | * @param stackBuffer The pointer to the memory into which the cloned object | |
386 | * should be placed. If NULL, allocate heap memory | |
387 | * for the cloned object. | |
388 | * @param BufferSize The size of the buffer. If zero, return the required | |
389 | * buffer size, but do not clone the object. If the | |
390 | * size was too small (but not zero), allocate heap | |
391 | * storage for the cloned object. | |
392 | * | |
393 | * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be | |
394 | * returned if the provided buffer was too small, and | |
395 | * the clone was therefore put on the heap. | |
396 | * | |
397 | * @return Pointer to the clone object. This may differ from the stackBuffer | |
398 | * address if the byte alignment of the stack buffer was not suitable | |
399 | * or if the stackBuffer was too small to hold the clone. | |
400 | * @stable ICU 2.0 | |
401 | */ | |
402 | virtual BreakIterator * createBufferClone(void *stackBuffer, | |
403 | int32_t &BufferSize, | |
404 | UErrorCode &status); | |
405 | ||
406 | ||
407 | /** | |
408 | * Return the binary form of compiled break rules, | |
409 | * which can then be used to create a new break iterator at some | |
410 | * time in the future. Creating a break iterator from pre-compiled rules | |
411 | * is much faster than building one from the source form of the | |
412 | * break rules. | |
413 | * | |
414 | * The binary data can only be used with the same version of ICU | |
415 | * and on the same platform type (processor endian-ness). | |
416 | * | |
417 | * @param length Returns the length of the binary data. (Out parameter.) | |
418 | * | |
419 | * @return A pointer to the binary (compiled) rule data. The storage | |
420 | * belongs to the RuleBasedBreakIterator object, not the | |
421 | * caller, and must not be modified or deleted. | |
422 | * @internal | |
423 | */ | |
424 | virtual const uint8_t *getBinaryRules(uint32_t &length); | |
425 | ||
426 | ||
427 | protected: | |
428 | //======================================================================= | |
429 | // implementation | |
430 | //======================================================================= | |
431 | /** | |
432 | * This method is the actual implementation of the next() method. All iteration | |
433 | * vectors through here. This method initializes the state machine to state 1 | |
434 | * and advances through the text character by character until we reach the end | |
435 | * of the text or the state machine transitions to state 0. We update our return | |
436 | * value every time the state machine passes through a possible end state. | |
437 | * @internal | |
438 | */ | |
439 | virtual int32_t handleNext(void); | |
440 | ||
441 | /** | |
442 | * This method backs the iterator back up to a "safe position" in the text. | |
443 | * This is a position that we know, without any context, must be a break position. | |
444 | * The various calling methods then iterate forward from this safe position to | |
445 | * the appropriate position to return. (For more information, see the description | |
446 | * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) | |
447 | * @internal | |
448 | */ | |
449 | virtual int32_t handlePrevious(void); | |
450 | ||
451 | /** | |
452 | * Dumps caches and performs other actions associated with a complete change | |
453 | * in text or iteration position. This function is a no-op in RuleBasedBreakIterator, | |
454 | * but subclasses can and do override it. | |
455 | * @internal | |
456 | */ | |
457 | virtual void reset(void); | |
458 | ||
459 | /** | |
460 | * Return true if the category lookup for this char | |
461 | * indicates that it is in the set of dictionary lookup chars. | |
462 | * This function is intended for use by dictionary based break iterators. | |
463 | * @return true if the category lookup for this char | |
464 | * indicates that it is in the set of dictionary lookup chars. | |
465 | * @internal | |
466 | */ | |
467 | virtual UBool isDictionaryChar(UChar32); | |
468 | ||
469 | /** | |
470 | * Common initialization function, used by constructors and bufferClone. | |
471 | * (Also used by DictionaryBasedBreakIterator::createBufferClone().) | |
472 | * @internal | |
473 | */ | |
474 | void init(); | |
475 | ||
476 | }; | |
477 | ||
478 | //---------------------------------------------------------------------------------- | |
479 | // | |
480 | // Inline Function Definitions ... | |
481 | // | |
482 | //---------------------------------------------------------------------------------- | |
483 | ||
484 | inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const { | |
485 | return !(this->operator==(that)); /* inequality is exactly the negation of operator== */ | |
486 | } | |
487 | ||
488 | inline UClassID RuleBasedBreakIterator::getStaticClassID(void) { | |
489 | return (UClassID)&RuleBasedBreakIterator::fgClassID; /* the address of fgClassID serves as the class ID */ | |
490 | } | |
491 | ||
492 | inline UClassID RuleBasedBreakIterator::getDynamicClassID(void) const { | |
493 | return getStaticClassID(); /* name lookup resolves to RuleBasedBreakIterator::getStaticClassID() */ | |
494 | } | |
495 | ||
496 | U_NAMESPACE_END | |
497 | ||
498 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
499 | ||
500 | #endif |