1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ********************************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ********************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 02/18/97 aliu Added typedef for TextCount. Made DONE const.
15 * 05/07/97 aliu Fixed DLL declaration.
16 * 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK
17 * 08/11/98 helena Sync-up JDK1.2.
18 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
19 ********************************************************************************
25 #include "unicode/utypes.h"
29 * \brief C++ API: Break Iterator.
32 #if UCONFIG_NO_BREAK_ITERATION
34 #if U_SHOW_CPLUSPLUS_API
38 * Allow the declaration of APIs with pointers to BreakIterator
39 * even when break iteration is removed from the build.
44 #endif // U_SHOW_CPLUSPLUS_API
48 #include "unicode/uobject.h"
49 #include "unicode/unistr.h"
50 #include "unicode/chariter.h"
51 #include "unicode/locid.h"
52 #include "unicode/ubrk.h"
53 #include "unicode/strenum.h"
54 #include "unicode/utext.h"
55 #include "unicode/umisc.h"
57 #if U_SHOW_CPLUSPLUS_API
61 * The BreakIterator class implements methods for finding the location
62 * of boundaries in text. BreakIterator is an abstract base class.
63 * Instances of BreakIterator maintain a current position and scan over
64 * text returning the index of characters where boundaries occur.
66 * Line boundary analysis determines where a text string can be broken
67 * when line-wrapping. The mechanism correctly handles punctuation and hyphenated words.
70 * Sentence boundary analysis allows selection with correct
71 * interpretation of periods within numbers and abbreviations, and
72 * trailing punctuation marks such as quotation marks and parentheses.
74 * Word boundary analysis is used by search and replace functions, as
75 * well as within text editing applications that allow the user to
76 * select words with a double click. Word selection provides correct
77 * interpretation of punctuation marks within and following
78 * words. Characters that are not part of a word, such as symbols or
79 * punctuation marks, have word-breaks on both sides.
81 * Character boundary analysis allows users to interact with
82 * characters as they expect to, for example, when moving the cursor
83 * through a text string. Character boundary analysis provides correct
84 * navigation through character strings, regardless of how the
85 * character is stored. For example, an accented character might be
86 * stored as a base character and a diacritical mark. What users
87 * consider to be a character can differ between languages.
89 * The text boundary positions are found according to the rules
90 * described in Unicode Standard Annex #29, Text Boundaries, and
91 * Unicode Standard Annex #14, Line Breaking Properties. These
92 * are available at http://www.unicode.org/reports/tr29/ and
93 * http://www.unicode.org/reports/tr14/, respectively.
95 * In addition to the C++ API defined in this header file, a
96 * plain C API with equivalent functionality is defined in the file ubrk.h.
99 * Code snippets illustrating the use of the Break Iterator APIs
100 * are available in the ICU User Guide,
101 * http://icu-project.org/userguide/boundaryAnalysis.html
102 * and in the sample program icu/source/samples/break/break.cpp
105 class U_COMMON_API BreakIterator
: public UObject
{
111 virtual ~BreakIterator();
114 * Return true if another object is semantically equal to this
115 * one. The other object should be an instance of the same subclass of
116 * BreakIterator. Objects of different subclasses are considered unequal.
119 * Return true if this BreakIterator is at the same position in the
120 * same text, and is the same class and type (word, line, etc.) of
121 * BreakIterator, as the argument. Text is considered the same if
122 * it contains the same characters, it need not be the same
123 * object, and styles are not considered.
126 virtual UBool
operator==(const BreakIterator
&) const = 0;
129 * Returns the complement of the result of operator==
130 * @param rhs The BreakIterator to be compared for inequality
131 * @return the complement of the result of operator==
134 UBool
operator!=(const BreakIterator
& rhs
) const { return !operator==(rhs
); }
137 * Return a polymorphic copy of this object. This is an abstract
138 * method which subclasses implement.
141 virtual BreakIterator
* clone(void) const = 0;
144 * Return a polymorphic class ID for this object. Different subclasses
145 * will return distinct unequal values.
148 virtual UClassID
getDynamicClassID(void) const = 0;
151 * Return a CharacterIterator over the text being analyzed.
154 virtual CharacterIterator
& getText(void) const = 0;
158 * Get a UText for the text being analyzed.
159 * The returned UText is a shallow clone of the UText used internally
160 * by the break iterator implementation. It can safely be used to
161 * access the text without impacting any break iterator operations,
162 * but the underlying text itself must not be altered.
164 * @param fillIn A UText to be filled in. If NULL, a new UText will be
165 * allocated to hold the result.
166 * @param status receives any error codes.
167 * @return The current UText for this break iterator. If an input
168 * UText was provided, it will always be returned.
171 virtual UText
*getUText(UText
*fillIn
, UErrorCode
&status
) const = 0;
174 * Change the text over which this operates. The text boundary is
175 * reset to the start.
177 * The BreakIterator will retain a reference to the supplied string.
178 * The caller must not modify or delete the text while the BreakIterator
179 * retains the reference.
181 * @param text The UnicodeString used to change the text.
184 virtual void setText(const UnicodeString
&text
) = 0;
187 * Reset the break iterator to operate over the text represented by
188 * the UText. The iterator position is reset to the start.
190 * This function makes a shallow clone of the supplied UText. This means
191 * that the caller is free to immediately close or otherwise reuse the
192 * UText that was passed as a parameter, but that the underlying text itself
193 * must not be altered while being referenced by the break iterator.
195 * All index positions returned by break iterator functions are
196 * native indices from the UText. For example, when breaking UTF-8
197 * encoded text, the break positions returned by next(), previous(), etc.
198 * will be UTF-8 string indices, not UTF-16 positions.
200 * @param text The UText used to change the text.
201 * @param status receives any error codes.
204 virtual void setText(UText
*text
, UErrorCode
&status
) = 0;
207 * Change the text over which this operates. The text boundary is
208 * reset to the start.
209 * Note that setText(UText *) provides similar functionality to this function,
210 * and is more efficient.
211 * @param it The CharacterIterator used to change the text.
214 virtual void adoptText(CharacterIterator
* it
) = 0;
218 * DONE is returned by previous() and next() after all valid
219 * boundaries have been returned.
226 * Sets the current iteration position to the beginning of the text, position zero.
227 * @return The offset of the beginning of the text, zero.
230 virtual int32_t first(void) = 0;
233 * Set the iterator position to the index immediately BEYOND the last character in the text being scanned.
234 * @return The index immediately BEYOND the last character in the text being scanned.
237 virtual int32_t last(void) = 0;
240 * Set the iterator position to the boundary preceding the current boundary.
241 * @return The character index of the previous text boundary or DONE if all
242 * boundaries have been returned.
245 virtual int32_t previous(void) = 0;
248 * Advance the iterator to the boundary following the current boundary.
249 * @return The character index of the next text boundary or DONE if all
250 * boundaries have been returned.
253 virtual int32_t next(void) = 0;
256 * Return character index of the current iterator position within the text.
257 * @return The boundary most recently returned.
260 virtual int32_t current(void) const = 0;
263 * Advance the iterator to the first boundary following the specified offset.
264 * The value returned is always greater than the offset or
265 * the value BreakIterator::DONE
266 * @param offset the offset to begin scanning.
267 * @return The first boundary after the specified offset.
270 virtual int32_t following(int32_t offset
) = 0;
273 * Set the iterator position to the first boundary preceding the specified offset.
274 * The value returned is always smaller than the offset or
275 * the value BreakIterator::DONE
276 * @param offset the offset to begin scanning.
277 * @return The first boundary before the specified offset.
280 virtual int32_t preceding(int32_t offset
) = 0;
283 * Return true if the specified position is a boundary position.
284 * As a side effect, the current position of the iterator is set
285 * to the first boundary position at or following the specified offset.
286 * @param offset the offset to check.
287 * @return True if "offset" is a boundary position.
290 virtual UBool
isBoundary(int32_t offset
) = 0;
293 * Set the iterator position to the nth boundary from the current boundary
294 * @param n the number of boundaries to move by. A value of 0
295 * does nothing. Negative values move to previous boundaries
296 * and positive values move to later boundaries.
297 * @return The new iterator position, or
298 * DONE if there are fewer than |n| boundaries in the specified direction.
301 virtual int32_t next(int32_t n
) = 0;
304 * For RuleBasedBreakIterators, return the status tag from the break rule
305 * that determined the boundary at the current iteration position.
307 * For break iterator types that do not support a rule status,
308 * a default value of 0 is returned.
310 * @return the status from the break rule that determined the boundary at
311 * the current iteration position.
312 * @see RuleBasedBreakIterator::getRuleStatus()
316 virtual int32_t getRuleStatus() const;
319 * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s)
320 * that determined the boundary at the current iteration position.
322 * For break iterator types that do not support rule status,
323 * no values are returned.
325 * The returned status value(s) are stored into an array provided by the caller.
326 * The values are stored in sorted (ascending) order.
327 * If the capacity of the output array is insufficient to hold the data,
328 * the output will be truncated to the available length, and a
329 * U_BUFFER_OVERFLOW_ERROR will be signaled.
331 * @see RuleBasedBreakIterator::getRuleStatusVec
333 * @param fillInVec an array to be filled in with the status values.
334 * @param capacity the length of the supplied vector. A length of zero causes
335 * the function to return the number of status values, in the
336 * normal way, without attempting to store any values.
337 * @param status receives error codes.
338 * @return The number of rule status values from rules that determined
339 * the boundary at the current iteration position.
340 * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
341 * is the total number of status values that were available,
342 * not the reduced number that were actually returned.
346 virtual int32_t getRuleStatusVec(int32_t *fillInVec
, int32_t capacity
, UErrorCode
&status
);
349 * Create BreakIterator for word-breaks using the given locale.
350 * Returns an instance of a BreakIterator implementing word breaks.
351 * WordBreak is useful for word selection (ex. double click)
352 * @param where the locale.
353 * @param status the error code
354 * @return A BreakIterator for word-breaks. The UErrorCode& status
355 * parameter is used to return status information to the user.
356 * To check whether the construction succeeded or not, you should check
357 * the value of U_SUCCESS(status). If you wish more detailed information, you
358 * can check for informational error results which still indicate success.
359 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
360 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
361 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
362 * used; neither the requested locale nor any of its fall back locales could be found.
364 * The caller owns the returned object and is responsible for deleting it.
367 static BreakIterator
* U_EXPORT2
368 createWordInstance(const Locale
& where
, UErrorCode
& status
);
371 * Create BreakIterator for line-breaks using specified locale.
372 * Returns an instance of a BreakIterator implementing line breaks. Line
373 * breaks are logically possible line breaks, actual line breaks are
374 * usually determined based on display width.
375 * LineBreak is useful for word wrapping text.
376 * @param where the locale.
377 * @param status The error code.
378 * @return A BreakIterator for line-breaks. The UErrorCode& status
379 * parameter is used to return status information to the user.
380 * To check whether the construction succeeded or not, you should check
381 * the value of U_SUCCESS(status). If you wish more detailed information, you
382 * can check for informational error results which still indicate success.
383 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
384 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
385 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
386 * used; neither the requested locale nor any of its fall back locales could be found.
388 * The caller owns the returned object and is responsible for deleting it.
391 static BreakIterator
* U_EXPORT2
392 createLineInstance(const Locale
& where
, UErrorCode
& status
);
395 * Create BreakIterator for character-breaks using specified locale
396 * Returns an instance of a BreakIterator implementing character breaks.
397 * Character breaks are boundaries of combining character sequences.
398 * @param where the locale.
399 * @param status The error code.
400 * @return A BreakIterator for character-breaks. The UErrorCode& status
401 * parameter is used to return status information to the user.
402 * To check whether the construction succeeded or not, you should check
403 * the value of U_SUCCESS(status). If you wish more detailed information, you
404 * can check for informational error results which still indicate success.
405 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
406 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
407 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
408 * used; neither the requested locale nor any of its fall back locales could be found.
410 * The caller owns the returned object and is responsible for deleting it.
413 static BreakIterator
* U_EXPORT2
414 createCharacterInstance(const Locale
& where
, UErrorCode
& status
);
417 * Create BreakIterator for sentence-breaks using specified locale
418 * Returns an instance of a BreakIterator implementing sentence breaks.
419 * @param where the locale.
420 * @param status The error code.
421 * @return A BreakIterator for sentence-breaks. The UErrorCode& status
422 * parameter is used to return status information to the user.
423 * To check whether the construction succeeded or not, you should check
424 * the value of U_SUCCESS(status). If you wish more detailed information, you
425 * can check for informational error results which still indicate success.
426 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
427 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
428 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
429 * used; neither the requested locale nor any of its fall back locales could be found.
431 * The caller owns the returned object and is responsible for deleting it.
434 static BreakIterator
* U_EXPORT2
435 createSentenceInstance(const Locale
& where
, UErrorCode
& status
);
437 #ifndef U_HIDE_DEPRECATED_API
439 * Create BreakIterator for title-casing breaks using the specified locale
440 * Returns an instance of a BreakIterator implementing title breaks.
441 * The iterator returned locates title boundaries as described for
442 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
443 * please use a word boundary iterator. See {@link #createWordInstance }.
445 * @param where the locale.
446 * @param status The error code.
447 * @return A BreakIterator for title-breaks. The UErrorCode& status
448 * parameter is used to return status information to the user.
449 * To check whether the construction succeeded or not, you should check
450 * the value of U_SUCCESS(status). If you wish more detailed information, you
451 * can check for informational error results which still indicate success.
452 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
453 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
454 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
455 * used; neither the requested locale nor any of its fall back locales could be found.
457 * The caller owns the returned object and is responsible for deleting it.
458 * @deprecated ICU 64 Use createWordInstance instead.
460 static BreakIterator
* U_EXPORT2
461 createTitleInstance(const Locale
& where
, UErrorCode
& status
);
462 #endif /* U_HIDE_DEPRECATED_API */
465 * Get the set of Locales for which TextBoundaries are installed.
466 * <p><b>Note:</b> this will not return locales added through the register
467 * call. To see the registered locales too, use the getAvailableLocales
468 * function that returns a StringEnumeration object </p>
469 * @param count the output parameter of number of elements in the locale list
470 * @return available locales
473 static const Locale
* U_EXPORT2
getAvailableLocales(int32_t& count
);
476 * Get name of the object for the desired Locale, in the desired language.
477 * @param objectLocale must be from getAvailableLocales.
478 * @param displayLocale specifies the desired locale for output.
479 * @param name the fill-in parameter of the return value
481 * @return user-displayable name
484 static UnicodeString
& U_EXPORT2
getDisplayName(const Locale
& objectLocale
,
485 const Locale
& displayLocale
,
486 UnicodeString
& name
);
489 * Get name of the object for the desired Locale, in the language of the default locale.
491 * @param objectLocale must be from getAvailableLocales.
492 * @param name the fill-in parameter of the return value
493 * @return user-displayable name
496 static UnicodeString
& U_EXPORT2
getDisplayName(const Locale
& objectLocale
,
497 UnicodeString
& name
);
500 * Deprecated functionality. Use clone() instead.
502 * Thread safe client-buffer-based cloning operation
503 * Do NOT call delete on a safeclone, since 'new' is not used to create it.
504 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
505 * If buffer is not large enough, new memory will be allocated.
506 * @param BufferSize reference to size of allocated space.
507 * If BufferSize == 0, a sufficient size for use in cloning will
508 * be returned ('pre-flighting')
509 * If BufferSize is not enough for a stack-based safe clone,
510 * new memory will be allocated.
511 * @param status to indicate whether the operation went on smoothly or there were errors
512 * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
514 * @return pointer to the new clone
516 * @deprecated ICU 52. Use clone() instead.
518 virtual BreakIterator
* createBufferClone(void *stackBuffer
,
520 UErrorCode
&status
) = 0;
522 #ifndef U_HIDE_DEPRECATED_API
525 * Determine whether the BreakIterator was created in user memory by
526 * createBufferClone(), and thus should not be deleted. Such objects
527 * must be closed by an explicit call to the destructor (not delete).
528 * @deprecated ICU 52. Always delete the BreakIterator.
530 inline UBool
isBufferClone(void);
532 #endif /* U_HIDE_DEPRECATED_API */
534 #if !UCONFIG_NO_SERVICE
536 * Register a new break iterator of the indicated kind, to use in the given locale.
537 * The break iterator will be adopted. Clones of the iterator will be returned
538 * if a request for a break iterator of the given kind matches or falls back to this locale.
540 * Because ICU may choose to cache BreakIterators internally, this must
541 * be called at application startup, prior to any calls to
542 * BreakIterator::createXXXInstance to avoid undefined behavior.
543 * @param toAdopt the BreakIterator instance to be adopted
544 * @param locale the Locale for which this instance is to be registered
545 * @param kind the type of iterator for which this instance is to be registered
546 * @param status the in/out status code, no special meanings are assigned
547 * @return a registry key that can be used to unregister this instance
550 static URegistryKey U_EXPORT2
registerInstance(BreakIterator
* toAdopt
,
551 const Locale
& locale
,
552 UBreakIteratorType kind
,
556 * Unregister a previously-registered BreakIterator using the key returned from the
557 * register call. Key becomes invalid after a successful call and should not be used again.
558 * The BreakIterator corresponding to the key will be deleted.
559 * Because ICU may choose to cache BreakIterators internally, this should
560 * be called during application shutdown, after all calls to
561 * BreakIterator::createXXXInstance to avoid undefined behavior.
562 * @param key the registry key returned by a previous call to registerInstance
563 * @param status the in/out status code, no special meanings are assigned
564 * @return TRUE if the iterator for the key was successfully unregistered
567 static UBool U_EXPORT2
unregister(URegistryKey key
, UErrorCode
& status
);
570 * Return a StringEnumeration over the locales available at the time of the call,
571 * including registered locales.
572 * @return a StringEnumeration over the locales available at the time of the call
575 static StringEnumeration
* U_EXPORT2
getAvailableLocales(void);
579 * Returns the locale for this break iterator. Two flavors are available: valid and actual locale.
583 Locale
getLocale(ULocDataLocaleType type
, UErrorCode
& status
) const;
585 #ifndef U_HIDE_INTERNAL_API
586 /** Get the locale for this break iterator object. You can choose between valid and actual locale.
587 * @param type type of the locale we're looking for (valid or actual)
588 * @param status error code for the operation
592 const char *getLocaleID(ULocDataLocaleType type
, UErrorCode
& status
) const;
593 #endif /* U_HIDE_INTERNAL_API */
596 * Set the subject text string upon which the break iterator is operating
597 * without changing any other aspect of the matching state.
598 * The new and previous text strings must have the same content.
600 * This function is intended for use in environments where ICU is operating on
601 * strings that may move around in memory. It provides a mechanism for notifying
602 * ICU that the string has been relocated, and providing a new UText to access the
603 * string in its new position.
605 * Note that the break iterator implementation never copies the underlying text
606 * of a string being processed, but always operates directly on the original text
607 * provided by the user. Refreshing simply drops the references to the old text
608 * and replaces them with references to the new.
610 * Caution: this function is normally used only by very specialized,
611 * system-level code. One example use case is with garbage collection that moves
612 * the text in memory.
614 * @param input The new (moved) text string.
615 * @param status Receives errors detected by this function.
620 virtual BreakIterator
&refreshInputText(UText
*input
, UErrorCode
&status
) = 0;
622 #ifndef U_HIDE_INTERNAL_API
624 * Set the ULineWordOptions for this break iterator.
625 * @param lineWordOpts The ULineWordOptions to set.
626 * @internal Apple only
628 void setLineWordOpts(ULineWordOptions lineWordOpts
);
629 #endif /* U_HIDE_INTERNAL_API */
632 static BreakIterator
* buildInstance(const Locale
& loc
, const char *type
, UErrorCode
& status
);
633 static BreakIterator
* createInstance(const Locale
& loc
, int32_t kind
, UErrorCode
& status
);
634 static BreakIterator
* makeInstance(const Locale
& loc
, int32_t kind
, UErrorCode
& status
);
636 friend class ICUBreakIteratorFactory
;
637 friend class ICUBreakIteratorService
;
640 // Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API
641 // or else the compiler will create public ones.
645 BreakIterator (const BreakIterator
&other
);
646 #ifndef U_HIDE_INTERNAL_API
648 BreakIterator (const Locale
& valid
, const Locale
&actual
);
649 /** @internal. Assignment Operator, used by RuleBasedBreakIterator. */
650 BreakIterator
&operator = (const BreakIterator
&other
);
651 #endif /* U_HIDE_INTERNAL_API */
652 ULineWordOptions fLineWordOpts
;
656 /** @internal (private) */
657 char actualLocale
[ULOC_FULLNAME_CAPACITY
];
658 char validLocale
[ULOC_FULLNAME_CAPACITY
];
661 inline void BreakIterator::setLineWordOpts(ULineWordOptions lineWordOpts
)
663 fLineWordOpts
= lineWordOpts
;
666 #ifndef U_HIDE_DEPRECATED_API
668 inline UBool
BreakIterator::isBufferClone()
673 #endif /* U_HIDE_DEPRECATED_API */
676 #endif // U_SHOW_CPLUSPLUS_API
678 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */