]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/brkiter.h
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / common / unicode / brkiter.h
1 /*
2 ********************************************************************************
3 * Copyright (C) 1997-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ********************************************************************************
6 *
7 * File brkiter.h
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 02/18/97 aliu Added typedef for TextCount. Made DONE const.
13 * 05/07/97 aliu Fixed DLL declaration.
14 * 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK
15 * 08/11/98 helena Sync-up JDK1.2.
16 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
17 ********************************************************************************
18 */
19
20 #ifndef BRKITER_H
21 #define BRKITER_H
22
23 #include "unicode/utypes.h"
24
25 /**
26 * \file
27 * \brief C++ API: Break Iterator.
28 */
29
30 #if UCONFIG_NO_BREAK_ITERATION
31
32 U_NAMESPACE_BEGIN
33
34 /*
35 * Allow the declaration of APIs with pointers to BreakIterator
36 * even when break iteration is removed from the build.
37 */
38 class BreakIterator;
39
40 U_NAMESPACE_END
41
42 #else
43
44 #include "unicode/uobject.h"
45 #include "unicode/unistr.h"
46 #include "unicode/chariter.h"
47 #include "unicode/locid.h"
48 #include "unicode/ubrk.h"
49 #include "unicode/strenum.h"
50 #include "unicode/utext.h"
51 #include "unicode/umisc.h"
52
53 U_NAMESPACE_BEGIN
54
55 /**
56 * The BreakIterator class implements methods for finding the location
57 * of boundaries in text. BreakIterator is an abstract base class.
58 * Instances of BreakIterator maintain a current position and scan over
59 * text returning the index of characters where boundaries occur.
60 * <p>
61 * Line boundary analysis determines where a text string can be broken
62 * when line-wrapping. The mechanism correctly handles punctuation and
63 * hyphenated words.
64 * <p>
65 * Sentence boundary analysis allows selection with correct
66 * interpretation of periods within numbers and abbreviations, and
67 * trailing punctuation marks such as quotation marks and parentheses.
68 * <p>
69 * Word boundary analysis is used by search and replace functions, as
70 * well as within text editing applications that allow the user to
71 * select words with a double click. Word selection provides correct
72 * interpretation of punctuation marks within and following
73 * words. Characters that are not part of a word, such as symbols or
74 * punctuation marks, have word-breaks on both sides.
75 * <p>
76 * Character boundary analysis allows users to interact with
77 * characters as they expect to, for example, when moving the cursor
78 * through a text string. Character boundary analysis provides correct
79 * navigation through character strings, regardless of how the
80 * character is stored. For example, an accented character might be
81 * stored as a base character and a diacritical mark. What users
82 * consider to be a character can differ between languages.
83 * <p>
84 * The text boundary positions are found according to the rules
85 * described in Unicode Standard Annex #29, Text Boundaries, and
86 * Unicode Standard Annex #14, Line Breaking Properties. These
87 * are available at http://www.unicode.org/reports/tr14/ and
88 * http://www.unicode.org/reports/tr29/.
89 * <p>
90 * In addition to the C++ API defined in this header file, a
91 * plain C API with equivalent functionality is defined in the
92 * file ubrk.h
93 * <p>
94 * Code snippets illustrating the use of the Break Iterator APIs
95 * are available in the ICU User Guide,
96 * http://icu-project.org/userguide/boundaryAnalysis.html
97 * and in the sample program icu/source/samples/break/break.cpp
98 *
99 */
100 class U_COMMON_API BreakIterator : public UObject {
101 public:
102 /**
103 * destructor
104 * @stable ICU 2.0
105 */
106 virtual ~BreakIterator();
107
108 /**
109 * Return true if another object is semantically equal to this
110 * one. The other object should be an instance of the same subclass of
111 * BreakIterator. Objects of different subclasses are considered
112 * unequal.
113 * <P>
114 * Return true if this BreakIterator is at the same position in the
115 * same text, and is the same class and type (word, line, etc.) of
116 * BreakIterator, as the argument. Text is considered the same if
117 * it contains the same characters, it need not be the same
118 * object, and styles are not considered.
119 * @stable ICU 2.0
120 */
121 virtual UBool operator==(const BreakIterator&) const = 0;
122
123 /**
124 * Returns the complement of the result of operator==
125 * @param rhs The BreakIterator to be compared for inequality
126 * @return the complement of the result of operator==
127 * @stable ICU 2.0
128 */
129 UBool operator!=(const BreakIterator& rhs) const { return !this->operator==(rhs); }
130
131 /**
132 * Return a polymorphic copy of this object. This is an abstract
133 * method which subclasses implement.
134 * @stable ICU 2.0
135 */
136 virtual BreakIterator* clone(void) const = 0;
137
138 /**
139 * Return a polymorphic class ID for this object. Different subclasses
140 * will return distinct unequal values.
141 * @stable ICU 2.0
142 */
143 virtual UClassID getDynamicClassID(void) const = 0;
144
145 /**
146 * Return a CharacterIterator over the text being analyzed.
147 * @stable ICU 2.0
148 */
149 virtual CharacterIterator& getText(void) const = 0;
150
151
152 /**
153 * Get a UText for the text being analyzed.
154 * The returned UText is a shallow clone of the UText used internally
155 * by the break iterator implementation. It can safely be used to
156 * access the text without impacting any break iterator operations,
157 * but the underlying text itself must not be altered.
158 *
159 * @param fillIn A UText to be filled in. If NULL, a new UText will be
160 * allocated to hold the result.
161 * @param status receives any error codes.
162 * @return The current UText for this break iterator. If an input
163 * UText was provided, it will always be returned.
164 * @stable ICU 3.4
165 */
166 virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
167
168 /**
169 * Change the text over which this operates. The text boundary is
170 * reset to the start.
171 * @param text The UnicodeString used to change the text.
172 * @stable ICU 2.0
173 */
174 virtual void setText(const UnicodeString &text) = 0;
175
176 /**
177 * Reset the break iterator to operate over the text represented by
178 * the UText. The iterator position is reset to the start.
179 *
180 * This function makes a shallow clone of the supplied UText. This means
181 * that the caller is free to immediately close or otherwise reuse the
182 * UText that was passed as a parameter, but that the underlying text itself
183 * must not be altered while being referenced by the break iterator.
184 *
185 * @param text The UText used to change the text.
186 * @param status receives any error codes.
187 * @stable ICU 3.4
188 */
189 virtual void setText(UText *text, UErrorCode &status) = 0;
190
191 /**
192 * Change the text over which this operates. The text boundary is
193 * reset to the start.
194 * Note that setText(UText *) provides similar functionality to this function,
195 * and is more efficient.
196 * @param it The CharacterIterator used to change the text.
197 * @stable ICU 2.0
198 */
199 virtual void adoptText(CharacterIterator* it) = 0;
200
201 enum {
202 /**
203 * DONE is returned by previous() and next() after all valid
204 * boundaries have been returned.
205 * @stable ICU 2.0
206 */
207 DONE = (int32_t)-1
208 };
209
210 /**
211 * Return the index of the first character in the text being scanned.
212 * @stable ICU 2.0
213 */
214 virtual int32_t first(void) = 0;
215
216 /**
217 * Return the index immediately BEYOND the last character in the text being scanned.
218 * @stable ICU 2.0
219 */
220 virtual int32_t last(void) = 0;
221
222 /**
223 * Return the boundary preceding the current boundary.
224 * @return The character index of the previous text boundary or DONE if all
225 * boundaries have been returned.
226 * @stable ICU 2.0
227 */
228 virtual int32_t previous(void) = 0;
229
230 /**
231 * Return the boundary following the current boundary.
232 * @return The character index of the next text boundary or DONE if all
233 * boundaries have been returned.
234 * @stable ICU 2.0
235 */
236 virtual int32_t next(void) = 0;
237
238 /**
239 * Return the character index of the current iterator position within the text.
240 * @return The boundary most recently returned.
241 * @stable ICU 2.0
242 */
243 virtual int32_t current(void) const = 0;
244
245 /**
246 * Return the first boundary following the specified offset.
247 * The value returned is always greater than the offset or
248 * the value BreakIterator::DONE
249 * @param offset the offset to begin scanning.
250 * @return The first boundary after the specified offset.
251 * @stable ICU 2.0
252 */
253 virtual int32_t following(int32_t offset) = 0;
254
255 /**
256 * Return the first boundary preceding the specified offset.
257 * The value returned is always smaller than the offset or
258 * the value BreakIterator::DONE
259 * @param offset the offset to begin scanning.
260 * @return The first boundary before the specified offset.
261 * @stable ICU 2.0
262 */
263 virtual int32_t preceding(int32_t offset) = 0;
264
265 /**
266 * Return true if the specified position is a boundary position.
267 * As a side effect, the current position of the iterator is set
268 * to the first boundary position at or following the specified offset.
269 * @param offset the offset to check.
270 * @return True if "offset" is a boundary position.
271 * @stable ICU 2.0
272 */
273 virtual UBool isBoundary(int32_t offset) = 0;
274
275 /**
276 * Return the nth boundary from the current boundary
277 * @param n which boundary to return. A value of 0
278 * does nothing. Negative values move to previous boundaries
279 * and positive values move to later boundaries.
280 * @return The index of the nth boundary from the current position, or
281 * DONE if there are fewer than |n| boundaries in the specified direction.
282 * @stable ICU 2.0
283 */
284 virtual int32_t next(int32_t n) = 0;
285
286 /**
287 * Create BreakIterator for word-breaks using the given locale.
288 * Returns an instance of a BreakIterator implementing word breaks.
289 * WordBreak is useful for word selection (ex. double click)
290 * @param where the locale.
291 * @param status the error code
292 * @return A BreakIterator for word-breaks. The UErrorCode& status
293 * parameter is used to return status information to the user.
294 * To check whether the construction succeeded or not, you should check
295 * the value of U_SUCCESS(status). If you wish more detailed information, you
296 * can check for informational error results which still indicate success.
297 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
298 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
299 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
300 * used; neither the requested locale nor any of its fall back locales
301 * could be found.
302 * The caller owns the returned object and is responsible for deleting it.
303 * @stable ICU 2.0
304 */
305 static BreakIterator* U_EXPORT2
306 createWordInstance(const Locale& where, UErrorCode& status);
307
308 /**
309 * Create BreakIterator for line-breaks using specified locale.
310 * Returns an instance of a BreakIterator implementing line breaks. Line
311 * breaks are logically possible line breaks, actual line breaks are
312 * usually determined based on display width.
313 * LineBreak is useful for word wrapping text.
314 * @param where the locale.
315 * @param status The error code.
316 * @return A BreakIterator for line-breaks. The UErrorCode& status
317 * parameter is used to return status information to the user.
318 * To check whether the construction succeeded or not, you should check
319 * the value of U_SUCCESS(status). If you wish more detailed information, you
320 * can check for informational error results which still indicate success.
321 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
322 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
323 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
324 * used; neither the requested locale nor any of its fall back locales
325 * could be found.
326 * The caller owns the returned object and is responsible for deleting it.
327 * @stable ICU 2.0
328 */
329 static BreakIterator* U_EXPORT2
330 createLineInstance(const Locale& where, UErrorCode& status);
331
332 /**
333 * Create BreakIterator for character-breaks using specified locale
334 * Returns an instance of a BreakIterator implementing character breaks.
335 * Character breaks are boundaries of combining character sequences.
336 * @param where the locale.
337 * @param status The error code.
338 * @return A BreakIterator for character-breaks. The UErrorCode& status
339 * parameter is used to return status information to the user.
340 * To check whether the construction succeeded or not, you should check
341 * the value of U_SUCCESS(status). If you wish more detailed information, you
342 * can check for informational error results which still indicate success.
343 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
344 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
345 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
346 * used; neither the requested locale nor any of its fall back locales
347 * could be found.
348 * The caller owns the returned object and is responsible for deleting it.
349 * @stable ICU 2.0
350 */
351 static BreakIterator* U_EXPORT2
352 createCharacterInstance(const Locale& where, UErrorCode& status);
353
354 /**
355 * Create BreakIterator for sentence-breaks using specified locale
356 * Returns an instance of a BreakIterator implementing sentence breaks.
357 * @param where the locale.
358 * @param status The error code.
359 * @return A BreakIterator for sentence-breaks. The UErrorCode& status
360 * parameter is used to return status information to the user.
361 * To check whether the construction succeeded or not, you should check
362 * the value of U_SUCCESS(status). If you wish more detailed information, you
363 * can check for informational error results which still indicate success.
364 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
365 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
366 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
367 * used; neither the requested locale nor any of its fall back locales
368 * could be found.
369 * The caller owns the returned object and is responsible for deleting it.
370 * @stable ICU 2.0
371 */
372 static BreakIterator* U_EXPORT2
373 createSentenceInstance(const Locale& where, UErrorCode& status);
374
375 /**
376 * Create BreakIterator for title-casing breaks using the specified locale
377 * Returns an instance of a BreakIterator implementing title breaks.
378 * The iterator returned locates title boundaries as described for
379 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
380 * please use the word boundary iterator; see createWordInstance().
381 *
382 * @param where the locale.
383 * @param status The error code.
384 * @return A BreakIterator for title-breaks. The UErrorCode& status
385 * parameter is used to return status information to the user.
386 * To check whether the construction succeeded or not, you should check
387 * the value of U_SUCCESS(status). If you wish more detailed information, you
388 * can check for informational error results which still indicate success.
389 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
390 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
391 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
392 * used; neither the requested locale nor any of its fall back locales
393 * could be found.
394 * The caller owns the returned object and is responsible for deleting it.
395 * @stable ICU 2.1
396 */
397 static BreakIterator* U_EXPORT2
398 createTitleInstance(const Locale& where, UErrorCode& status);
399
400 /**
401 * Get the set of Locales for which TextBoundaries are installed.
402 * <p><b>Note:</b> this will not return locales added through the register
403 * call. To see the registered locales too, use the getAvailableLocales
404 * function that returns a StringEnumeration object </p>
405 * @param count the output parameter of number of elements in the locale list
406 * @return available locales
407 * @stable ICU 2.0
408 */
409 static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
410
411 /**
412 * Get name of the object for the desired Locale, in the desired language.
413 * @param objectLocale must be from getAvailableLocales.
414 * @param displayLocale specifies the desired locale for output.
415 * @param name the fill-in parameter of the return value
416 * Uses best match.
417 * @return user-displayable name
418 * @stable ICU 2.0
419 */
420 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
421 const Locale& displayLocale,
422 UnicodeString& name);
423
424 /**
425 * Get name of the object for the desired Locale, in the language of the
426 * default locale.
427 * @param objectLocale must be from getAvailableLocales
428 * @param name the fill-in parameter of the return value
429 * @return user-displayable name
430 * @stable ICU 2.0
431 */
432 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
433 UnicodeString& name);
434
435 /**
436 * Thread safe client-buffer-based cloning operation
437 * Do NOT call delete on a safeclone, since 'new' is not used to create it.
438 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
439 * If buffer is not large enough, new memory will be allocated.
440 * @param BufferSize reference to size of allocated space.
441 * If BufferSize == 0, a sufficient size for use in cloning will
442 * be returned ('pre-flighting')
443 * If BufferSize is not enough for a stack-based safe clone,
444 * new memory will be allocated.
445 * @param status to indicate whether the operation went on smoothly or there were errors
446 * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
447 * necessary.
448 * @return pointer to the new clone
449 *
450 * @stable ICU 2.0
451 */
452 virtual BreakIterator * createBufferClone(void *stackBuffer,
453 int32_t &BufferSize,
454 UErrorCode &status) = 0;
455
456 /**
457 * Determine whether the BreakIterator was created in user memory by
458 * createBufferClone(), and thus should not be deleted. Such objects
459 * must be closed by an explicit call to the destructor (not delete).
460 * @stable ICU 2.0
461 */
462 inline UBool isBufferClone(void);
463
464 #if !UCONFIG_NO_SERVICE
465 /**
466 * Register a new break iterator of the indicated kind, to use in the given locale.
467 * The break iterator will be adopted. Clones of the iterator will be returned
468 * if a request for a break iterator of the given kind matches or falls back to
469 * this locale.
470 * @param toAdopt the BreakIterator instance to be adopted
471 * @param locale the Locale for which this instance is to be registered
472 * @param kind the type of iterator for which this instance is to be registered
473 * @param status the in/out status code, no special meanings are assigned
474 * @return a registry key that can be used to unregister this instance
475 * @stable ICU 2.4
476 */
477 static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
478 const Locale& locale,
479 UBreakIteratorType kind,
480 UErrorCode& status);
481
482 /**
483 * Unregister a previously-registered BreakIterator using the key returned from the
484 * register call. Key becomes invalid after a successful call and should not be used again.
485 * The BreakIterator corresponding to the key will be deleted.
486 * @param key the registry key returned by a previous call to registerInstance
487 * @param status the in/out status code, no special meanings are assigned
488 * @return TRUE if the iterator for the key was successfully unregistered
489 * @stable ICU 2.4
490 */
491 static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
492
493 /**
494 * Return a StringEnumeration over the locales available at the time of the call,
495 * including registered locales.
496 * @return a StringEnumeration over the locales available at the time of the call
497 * @stable ICU 2.4
498 */
499 static StringEnumeration* U_EXPORT2 getAvailableLocales(void);
500 #endif
501
502 /**
503 * Returns the locale for this break iterator. Two flavors are available: valid and
504 * actual locale.
505 * @stable ICU 2.8
506 */
507 Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
508
509 /** Get the locale for this break iterator object. You can choose between valid and actual locale.
510 * @param type type of the locale we're looking for (valid or actual)
511 * @param status error code for the operation
512 * @return the locale
513 * @internal
514 */
515 const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
516
517 private:
518 static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
519 static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
520 static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
521
522 friend class ICUBreakIteratorFactory;
523 friend class ICUBreakIteratorService;
524
525 protected:
526 /** @internal */
527 BreakIterator();
528 /** @internal */
529 UBool fBufferClone;
530 /** @internal Copy constructor. Copies the actualLocale/validLocale buffers from other so that a copy does not carry uninitialized locale IDs; fBufferClone is always reset to FALSE. */
531 BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) { for (int32_t i = 0; i < ULOC_FULLNAME_CAPACITY; ++i) { actualLocale[i] = other.actualLocale[i]; validLocale[i] = other.validLocale[i]; } }
532
533 private:
534
535 /** @internal */
536 char actualLocale[ULOC_FULLNAME_CAPACITY];
537 char validLocale[ULOC_FULLNAME_CAPACITY];
538
539 /**
540 * The assignment operator has no real implementation.
541 * It's provided to make the compiler happy. Do not call.
542 */
543 BreakIterator& operator=(const BreakIterator&);
544 };
545
546 inline UBool BreakIterator::isBufferClone()
547 {
548 return this->fBufferClone; /* plain accessor for the protected fBufferClone flag */
549 }
550
551 U_NAMESPACE_END
552
553 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
554
555 #endif /* #ifndef BRKITER_H */
556 //eof
557