]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/rbbi.h
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / unicode / rbbi.h
CommitLineData
b75a7d8f
A
1/*
2***************************************************************************
3* Copyright (C) 1999-2003 International Business Machines Corporation *
4* and others. All rights reserved. *
5***************************************************************************
6
7**********************************************************************
8* Date Name Description
9* 10/22/99 alan Creation.
10* 11/11/99 rgillam Complete port from Java.
11**********************************************************************
12*/
13
14#ifndef RBBI_H
15#define RBBI_H
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include "unicode/brkiter.h"
22#include "unicode/udata.h"
23#include "unicode/parseerr.h"
24
25struct UTrie;
26
27U_NAMESPACE_BEGIN
28
29struct RBBIDataHeader;
30class RuleBasedBreakIteratorTables;
31class BreakIterator;
32class RBBIDataWrapper;
33
34
35
36/**
37 * A subclass of BreakIterator whose behavior is specified using a list of rules.
38 * <p>Instances of this class are most commonly created by the factory methods of
39 * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
40 * and then used via the abstract API in class BreakIterator</p>
41 *
42 * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
43 *
44 * <p>This class is not intended to be subclassed. (Class DictionaryBasedBreakIterator
45 * is a subclass, but that relationship is effectively internal to the ICU
46 * implementation. The subclassing interface to RuleBasedBreakIterator is
47 * not part of the ICU API, and may not remain stable.)</p>
48 *
49 */
50class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
51
52protected:
53 /**
54 * The character iterator through which this BreakIterator accesses the text
55 * @internal
56 */
57 CharacterIterator* fText;
58
59 /**
60 * The rule data for this BreakIterator instance
61 * @internal
62 */
63 RBBIDataWrapper *fData;
64 /** @internal */
65 UTrie *fCharMappings;
66
67 /** Rule {tag} value for the most recent match.
68 * @internal
69 */
70 int32_t fLastBreakTag;
71
72 /**
73 * Rule tag value valid flag.
74 * Some iterator operations don't intrinsically set the correct tag value.
75 * This flag lets us lazily compute the value if we are ever asked for it.
76 * @internal
77 */
78 UBool fLastBreakTagValid;
79
80 /**
81 * Counter for the number of characters encountered with the "dictionary"
82 * flag set. Normal RBBI iterators don't use it, although the code
83 * for updating it is live. Dictionary Based break iterators (a subclass
84 * of us) access this field directly.
85 * @internal
86 */
87 uint32_t fDictionaryCharCount;
88
89 /**
90 * Debugging flag. Trace operation of state machine when true.
91 * @internal
92 */
93 static UBool fTrace;
94
95
96
97private:
98 /**
99 * Class ID. The address of this static member is what getStaticClassID() returns.
100 */
101 static const char fgClassID;
102
103protected:
104 //=======================================================================
105 // constructors
106 //=======================================================================
107
108 /**
109 * This constructor uses the udata interface to create a BreakIterator
110 * whose internal tables live in a memory-mapped file. "image" is a pointer
111 * to the beginning of that file.
112 * @internal
113 */
114 RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
115
116 /**
117 * Constructor from a flattened set of RBBI data in malloced memory.
118 * RuleBasedBreakIterators built from a custom set of rules
119 * are created via this constructor; the rules are compiled
120 * into memory, then the break iterator is constructed here.
121 *
122 * The break iterator adopts the memory, and will
123 * free it when done.
124 * @internal
125 */
126 RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
127
128 friend class RBBIRuleBuilder; /** @internal */
129 friend class BreakIterator;
130
131
132
133public:
134
135 /** Default constructor. Creates an empty shell of an iterator, with no
136 * rules or text to iterate over. Object can subsequently be assigned to.
137 * @draft ICU 2.2
138 */
139 RuleBasedBreakIterator();
140
141 /**
142 * Copy constructor. Will produce a break iterator with the same behavior,
143 * and which iterates over the same text, as the one passed in.
144 * @param that The RuleBasedBreakIterator passed to be copied
145 * @stable ICU 2.0
146 */
147 RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
148
149 /**
150 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
151 * @param rules The break rules to be used.
152 * @param parseError In the event of a syntax error in the rules, provides the location
153 * within the rules of the problem.
154 * @param status Information on any errors encountered.
155 * @draft ICU 2.2
156 */
157 RuleBasedBreakIterator( const UnicodeString &rules,
158 UParseError &parseError,
159 UErrorCode &status);
160 /**
161 * Destructor
162 * @stable ICU 2.0
163 */
164 virtual ~RuleBasedBreakIterator();
165
166 /**
167 * Assignment operator. Sets this iterator to have the same behavior,
168 * and iterate over the same text, as the one passed in.
169 * @param that The RuleBasedBreakIterator passed in
170 * @return a RuleBasedBreakIterator reference (presumably this object, as is conventional for assignment; confirm in the implementation)
171 * @stable ICU 2.0
172 */
173 RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
174
175 /**
176 * Equality operator. Returns TRUE if both BreakIterators are of the
177 * same class, have the same behavior, and iterate over the same text.
178 * @param that The BreakIterator to be compared for equality
179 * @return TRUE if both BreakIterators are of the
180 * same class, have the same behavior, and iterate over the same text.
181 * @stable ICU 2.0
182 */
183 virtual UBool operator==(const BreakIterator& that) const;
184
185 /**
186 * Not-equal operator. If operator== returns TRUE, this returns FALSE,
187 * and vice versa.
188 * @param that The BreakIterator to be compared for inequality
189 * @return TRUE if both BreakIterators are not the same.
190 * @stable ICU 2.0
191 */
192 UBool operator!=(const BreakIterator& that) const;
193
194 /**
195 * Returns a newly-constructed RuleBasedBreakIterator with the same
196 * behavior, and iterating over the same text, as this one.
197 * Differs from the copy constructor in that it is polymorphic, and
198 * will correctly clone (copy) a derived class.
199 * clone() is thread safe. Multiple threads may simultaneously
200 * clone the same source break iterator.
201 * @stable ICU 2.0
202 */
203 virtual BreakIterator* clone() const;
204
205 /**
206 * Compute a hash code for this BreakIterator
207 * @return A hash code
208 * @stable ICU 2.0
209 */
210 virtual int32_t hashCode(void) const;
211
212 /**
213 * Returns the description used to create this iterator
214 * @return the description used to create this iterator
215 * @stable ICU 2.0
216 */
217 virtual const UnicodeString& getRules(void) const;
218
219 //=======================================================================
220 // BreakIterator overrides
221 //=======================================================================
222
223 /**
224 * Return a CharacterIterator over the text being analyzed. This version
225 * of this method returns the actual CharacterIterator we're using internally.
226 * Changing the state of this iterator can have undefined consequences. If
227 * you need to change it, clone it first.
228 * @return An iterator over the text being analyzed.
229 * @stable ICU 2.0
230 */
231 virtual const CharacterIterator& getText(void) const;
232
233
234 /**
235 * Set the iterator to analyze a new piece of text. This function resets
236 * the current iteration position to the beginning of the text.
237 * @param newText An iterator over the text to analyze. The BreakIterator
238 * takes ownership of the character iterator. The caller MUST NOT delete it!
239 * @stable ICU 2.0
240 */
241 virtual void adoptText(CharacterIterator* newText);
242
243 /**
244 * Set the iterator to analyze a new piece of text. This function resets
245 * the current iteration position to the beginning of the text.
246 * @param newText The text to analyze.
247 * @stable ICU 2.0
248 */
249 virtual void setText(const UnicodeString& newText);
250
251 /**
252 * Sets the current iteration position to the beginning of the text.
253 * (i.e., the CharacterIterator's starting offset).
254 * @return The offset of the beginning of the text.
255 * @stable ICU 2.0
256 */
257 virtual int32_t first(void);
258
259 /**
260 * Sets the current iteration position to the end of the text.
261 * (i.e., the CharacterIterator's ending offset).
262 * @return The text's past-the-end offset.
263 * @stable ICU 2.0
264 */
265 virtual int32_t last(void);
266
267 /**
268 * Advances the iterator either forward or backward the specified number of steps.
269 * Negative values move backward, and positive values move forward. This is
270 * equivalent to repeatedly calling next() or previous().
271 * @param n The number of steps to move. The sign indicates the direction
272 * (negative is backwards, and positive is forwards).
273 * @return The character offset of the boundary position n boundaries away from
274 * the current one.
275 * @stable ICU 2.0
276 */
277 virtual int32_t next(int32_t n);
278
279 /**
280 * Advances the iterator to the next boundary position.
281 * @return The position of the first boundary after this one.
282 * @stable ICU 2.0
283 */
284 virtual int32_t next(void);
285
286 /**
287 * Moves the iterator backwards, to the last boundary preceding this one.
288 * @return The position of the last boundary position preceding this one.
289 * @stable ICU 2.0
290 */
291 virtual int32_t previous(void);
292
293 /**
294 * Sets the iterator to refer to the first boundary position following
295 * the specified position.
296 * @param offset The position from which to begin searching for a break position.
297 * @return The position of the first break after the specified position.
298 * @stable ICU 2.0
299 */
300 virtual int32_t following(int32_t offset);
301
302 /**
303 * Sets the iterator to refer to the last boundary position before the
304 * specified position.
305 * @param offset The position to begin searching for a break from.
306 * @return The position of the last boundary before the starting position.
307 * @stable ICU 2.0
308 */
309 virtual int32_t preceding(int32_t offset);
310
311 /**
312 * Returns true if the specified position is a boundary position. As a side
313 * effect, leaves the iterator pointing to the first boundary position at
314 * or after "offset".
315 * @param offset the offset to check.
316 * @return True if "offset" is a boundary position.
317 * @stable ICU 2.0
318 */
319 virtual UBool isBoundary(int32_t offset);
320
321 /**
322 * Returns the current iteration position.
323 * @return The current iteration position.
324 * @stable ICU 2.0
325 */
326 virtual int32_t current(void) const;
327
328
329 /**
330 * Return the status tag from the break rule that determined the most recently
331 * returned break position. The values appear in the rule source
332 * within brackets, {123}, for example. For rules that do not specify a
333 * status, a default value of 0 is returned.
334 * <p>
335 * Of the standard types of ICU break iterators, only the word break
336 * iterator provides status values. The values are defined in
337 * <code>enum UWordBreak</code>, and allow distinguishing between words
338 * that contain alphabetic letters, "words" that appear to be numbers,
339 * punctuation and spaces, words containing ideographic characters, and
340 * more. Call <code>getRuleStatus</code> after obtaining a boundary
341 * position from <code>next()</code>, <code>previous()</code>, or
342 * any other break iterator function that returns a boundary position.
343 * <p>
344 * @return the status from the break rule that determined the most recently
345 * returned break position.
346 *
347 * @see UWordBreak
348 * @draft ICU 2.2
349 */
350 virtual int32_t getRuleStatus() const;
351
352 /**
353 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
354 * This method is to implement a simple version of RTTI, since not all
355 * C++ compilers support genuine RTTI. Polymorphic operator==() and
356 * clone() methods call this method.
357 *
358 * @return The class ID for this object. All objects of a
359 * given class have the same class ID. Objects of
360 * other classes have different class IDs.
361 * @stable ICU 2.0
362 */
363 inline virtual UClassID getDynamicClassID(void) const;
364
365 /**
366 * Returns the class ID for this class. This is useful only for
367 * comparing to a return value from getDynamicClassID(). For example:
368 *
369 * Base* polymorphic_pointer = createPolymorphicObject();
370 * if (polymorphic_pointer->getDynamicClassID() ==
371 * Derived::getStaticClassID()) ...
372 *
373 * @return The class ID for all objects of this class.
374 * @stable ICU 2.0
375 */
376 inline static UClassID getStaticClassID(void);
377
378 /**
379 * Create a clone (copy) of this break iterator in memory provided
380 * by the caller. The idea is to increase performance by avoiding
381 * a storage allocation. Use of this function is NOT RECOMMENDED.
382 * Performance gains are minimal, and correct buffer management is
383 * tricky. Use clone() instead.
384 *
385 * @param stackBuffer The pointer to the memory into which the cloned object
386 * should be placed. If NULL, allocate heap memory
387 * for the cloned object.
388 * @param BufferSize The size of the buffer. If zero, return the required
389 * buffer size, but do not clone the object. If the
390 * size was too small (but not zero), allocate heap
391 * storage for the cloned object.
392 *
393 * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
394 * returned if the provided buffer was too small, and
395 * the clone was therefore put on the heap.
396 *
397 * @return Pointer to the clone object. This may differ from the stackBuffer
398 * address if the byte alignment of the stack buffer was not suitable
399 * or if the stackBuffer was too small to hold the clone.
400 * @stable ICU 2.0
401 */
402 virtual BreakIterator * createBufferClone(void *stackBuffer,
403 int32_t &BufferSize,
404 UErrorCode &status);
405
406
407 /**
408 * Return the binary form of compiled break rules,
409 * which can then be used to create a new break iterator at some
410 * time in the future. Creating a break iterator from pre-compiled rules
411 * is much faster than building one from the source form of the
412 * break rules.
413 *
414 * The binary data can only be used with the same version of ICU
415 * and on the same platform type (processor endian-ness)
416 *
417 * @param length Returns the length of the binary data. (Out parameter.)
418 *
419 * @return A pointer to the binary (compiled) rule data. The storage
420 * belongs to the RuleBasedBreakIterator object, not the
421 * caller, and must not be modified or deleted.
422 * @internal
423 */
424 virtual const uint8_t *getBinaryRules(uint32_t &length);
425
426
427protected:
428 //=======================================================================
429 // implementation
430 //=======================================================================
431 /**
432 * This method is the actual implementation of the next() method. All iteration
433 * vectors through here. This method initializes the state machine to state 1
434 * and advances through the text character by character until we reach the end
435 * of the text or the state machine transitions to state 0. We update our return
436 * value every time the state machine passes through a possible end state.
437 * @internal
438 */
439 virtual int32_t handleNext(void);
440
441 /**
442 * This method backs the iterator back up to a "safe position" in the text.
443 * This is a position that we know, without any context, must be a break position.
444 * The various calling methods then iterate forward from this safe position to
445 * the appropriate position to return. (For more information, see the description
446 * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
447 * @internal
448 */
449 virtual int32_t handlePrevious(void);
450
451 /**
452 * Dumps caches and performs other actions associated with a complete change
453 * in text or iteration position. This function is a no-op in RuleBasedBreakIterator,
454 * but subclasses can and do override it.
455 * @internal
456 */
457 virtual void reset(void);
458
459 /**
460 * Return true if the category lookup for this char
461 * indicates that it is in the set of dictionary lookup chars.
462 * This function is intended for use by dictionary based break iterators.
463 * @return true if the category lookup for this char
464 * indicates that it is in the set of dictionary lookup chars.
465 * @internal
466 */
467 virtual UBool isDictionaryChar(UChar32);
468
469 /**
470 * Common initialization function, used by constructors and bufferClone.
471 * (Also used by DictionaryBasedBreakIterator::createBufferClone().)
472 * @internal
473 */
474 void init();
475
476};
477
478//----------------------------------------------------------------------------------
479//
480// Inline Functions Definitions ...
481//
482//----------------------------------------------------------------------------------
483
484inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
485 return !(this->operator==(that)); // inequality is defined as the negation of the virtual operator==
486}
487
488inline UClassID RuleBasedBreakIterator::getStaticClassID(void) {
489 return (UClassID)(&RuleBasedBreakIterator::fgClassID); // the address of the static fgClassID member serves as the class ID
490}
491
492inline UClassID RuleBasedBreakIterator::getDynamicClassID(void) const {
493 return getStaticClassID(); // resolves to this class's own static accessor, so both report the same ID
494}
495
496U_NAMESPACE_END
497
498#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
499
500#endif