]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unicode/brkiter.h
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / unicode / brkiter.h
CommitLineData
b75a7d8f
A
1/*
2********************************************************************************
374ca955 3* Copyright (C) 1997-2004, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5********************************************************************************
6*
7* File brkiter.h
8*
9* Modification History:
10*
11* Date Name Description
12* 02/18/97 aliu Added typedef for TextCount. Made DONE const.
13* 05/07/97 aliu Fixed DLL declaration.
14* 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK
15* 08/11/98 helena Sync-up JDK1.2.
16* 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
17********************************************************************************
18*/
19
20#ifndef BRKITER_H
21#define BRKITER_H
22
23#include "unicode/utypes.h"
24
25#if UCONFIG_NO_BREAK_ITERATION
26
27U_NAMESPACE_BEGIN
28
29/*
30 * Allow the declaration of APIs with pointers to BreakIterator
31 * even when break iteration is removed from the build.
32 */
33class BreakIterator;
34
35U_NAMESPACE_END
36
37#else
38
39#include "unicode/uobject.h"
40#include "unicode/unistr.h"
41#include "unicode/chariter.h"
42#include "unicode/locid.h"
43#include "unicode/ubrk.h"
44#include "unicode/strenum.h"
45
46U_NAMESPACE_BEGIN
47
374ca955
A
48#if !UCONFIG_NO_SERVICE
49/**
50 * Opaque type returned by registerInstance.
51 * @stable
52 */
b75a7d8f 53typedef const void* URegistryKey;
374ca955 54#endif
b75a7d8f
A
55
56/**
57 * The BreakIterator class implements methods for finding the location
58 * of boundaries in text. BreakIterator is an abstract base class.
59 * Instances of BreakIterator maintain a current position and scan over
60 * text returning the index of characters where boundaries occur.
61 * <P>
62 * Line boundary analysis determines where a text string can be broken
63 * when line-wrapping. The mechanism correctly handles punctuation and
64 * hyphenated words.
65 * <P>
66 * Sentence boundary analysis allows selection with correct
67 * interpretation of periods within numbers and abbreviations, and
68 * trailing punctuation marks such as quotation marks and parentheses.
69 * <P>
70 * Word boundary analysis is used by search and replace functions, as
71 * well as within text editing applications that allow the user to
72 * select words with a double click. Word selection provides correct
73 * interpretation of punctuation marks within and following
74 * words. Characters that are not part of a word, such as symbols or
75 * punctuation marks, have word-breaks on both sides.
76 * <P>
77 * Character boundary analysis allows users to interact with
78 * characters as they expect to, for example, when moving the cursor
79 * through a text string. Character boundary analysis provides correct
80 * navigation through character strings, regardless of how the
81 * character is stored. For example, an accented character might be
82 * stored as a base character and a diacritical mark. What users
83 * consider to be a character can differ between languages.
84 * <P>
85 * This is the interface for all text boundaries.
86 * <P>
87 * Examples:
88 * <P>
89 * Helper function to output text
90 * <pre>
91 * \code
92 * void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
93 * {
94 * UnicodeString textBuffer, temp;
95 * CharacterIterator *strIter = iterator.createText();
96 * strIter->getText(temp);
97 * cout << " " << start << " " << end << " |"
98 * << temp.extractBetween(start, end, textBuffer)
99 * << "|" << endl;
100 * delete strIter;
101 * }
102 * \endcode
103 * </pre>
104 * Print each element in order:
105 * <pre>
106 * \code
107 * void printEachForward( BreakIterator& boundary)
108 * {
109 * int32_t start = boundary.first();
110 * for (int32_t end = boundary.next();
111 * end != BreakIterator::DONE;
112 * start = end, end = boundary.next())
113 * {
114 * printTextRange( boundary, start, end );
115 * }
116 * }
117 * \endcode
118 * </pre>
119 * Print each element in reverse order:
120 * <pre>
121 * \code
122 * void printEachBackward( BreakIterator& boundary)
123 * {
124 * int32_t end = boundary.last();
125 * for (int32_t start = boundary.previous();
126 * start != BreakIterator::DONE;
127 * end = start, start = boundary.previous())
128 * {
129 * printTextRange( boundary, start, end );
130 * }
131 * }
132 * \endcode
133 * </pre>
134 * Print first element
135 * <pre>
136 * \code
137 * void printFirst(BreakIterator& boundary)
138 * {
139 * int32_t start = boundary.first();
140 * int32_t end = boundary.next();
141 * printTextRange( boundary, start, end );
142 * }
143 * \endcode
144 * </pre>
145 * Print last element
146 * <pre>
147 * \code
148 * void printLast(BreakIterator& boundary)
149 * {
150 * int32_t end = boundary.last();
151 * int32_t start = boundary.previous();
152 * printTextRange( boundary, start, end );
153 * }
154 * \endcode
155 * </pre>
156 * Print the element at a specified position
157 * <pre>
158 * \code
159 * void printAt(BreakIterator &boundary, int32_t pos )
160 * {
161 * int32_t end = boundary.following(pos);
162 * int32_t start = boundary.previous();
163 * printTextRange( boundary, start, end );
164 * }
165 * \endcode
166 * </pre>
167 * Creating and using text boundaries
168 * <pre>
169 * \code
170 * void BreakIterator_Example( void )
171 * {
172 * BreakIterator* boundary;
173 * UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
174 * cout << "Examining: " << stringToExamine << endl;
175 *
176 * //print each sentence in forward and reverse order
177 * boundary = BreakIterator::createSentenceInstance( Locale::US );
178 * boundary->setText(stringToExamine);
179 * cout << "----- forward: -----------" << endl;
180 * printEachForward(*boundary);
181 * cout << "----- backward: ----------" << endl;
182 * printEachBackward(*boundary);
183 * delete boundary;
184 *
185 * //print each word in order
186 * boundary = BreakIterator::createWordInstance();
187 * boundary->setText(stringToExamine);
188 * cout << "----- forward: -----------" << endl;
189 * printEachForward(*boundary);
190 * //print first element
191 * cout << "----- first: -------------" << endl;
192 * printFirst(*boundary);
193 * //print last element
194 * cout << "----- last: --------------" << endl;
195 * printLast(*boundary);
196 * //print word at charpos 10
197 * cout << "----- at pos 10: ---------" << endl;
198 * printAt(*boundary, 10 );
199 *
200 * delete boundary;
201 * }
202 * \endcode
203 * </pre>
204 */
205class U_COMMON_API BreakIterator : public UObject {
206public:
207 /**
208 * destructor
209 * @stable ICU 2.0
210 */
211 virtual ~BreakIterator();
212
213 /**
214 * Return true if another object is semantically equal to this
215 * one. The other object should be an instance of the same subclass of
216 * BreakIterator. Objects of different subclasses are considered
217 * unequal.
218 * <P>
219 * Return true if this BreakIterator is at the same position in the
220 * same text, and is the same class and type (word, line, etc.) of
221 * BreakIterator, as the argument. Text is considered the same if
222 * it contains the same characters, it need not be the same
223 * object, and styles are not considered.
224 * @stable ICU 2.0
225 */
226 virtual UBool operator==(const BreakIterator&) const = 0;
227
228 /**
229 * Returns the complement of the result of operator==
230 * @param rhs The BreakIterator to be compared for inequality
231 * @return the complement of the result of operator==
232 * @stable ICU 2.0
233 */
234 UBool operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
235
236 /**
237 * Return a polymorphic copy of this object. This is an abstract
238 * method which subclasses implement.
239 * @stable ICU 2.0
240 */
241 virtual BreakIterator* clone(void) const = 0;
242
243 /**
244 * Return a polymorphic class ID for this object. Different subclasses
245 * will return distinct unequal values.
246 * @stable ICU 2.0
247 */
248 virtual UClassID getDynamicClassID(void) const = 0;
249
250 /**
251 * Return a CharacterIterator over the text being analyzed.
252 * Changing the state of the returned iterator can have undefined consequences
253 * on the operation of the break iterator. If you need to change it, clone it first.
254 * @stable ICU 2.0
255 */
256 virtual const CharacterIterator& getText(void) const = 0;
257
258 /**
259 * Change the text over which this operates. The text boundary is
260 * reset to the start.
261 * @param text The UnicodeString used to change the text.
262 * @stable ICU 2.0
263 */
264 virtual void setText(const UnicodeString &text) = 0;
265
266 /**
267 * Change the text over which this operates. The text boundary is
268 * reset to the start.
269 * @param it The CharacterIterator used to change the text.
270 * @stable ICU 2.0
271 */
272 virtual void adoptText(CharacterIterator* it) = 0;
273
274 /**
275 * DONE is returned by previous() and next() after all valid
276 * boundaries have been returned.
277 * @stable ICU 2.0
278 */
374ca955
A
279#ifdef U_CYGWIN
280 static U_COMMON_API const int32_t DONE;
281#else
b75a7d8f 282 static const int32_t DONE;
374ca955 283#endif
b75a7d8f
A
284
285 /**
286 * Return the index of the first character in the text being scanned.
287 * @stable ICU 2.0
288 */
289 virtual int32_t first(void) = 0;
290
291 /**
292 * Return the index immediately BEYOND the last character in the text being scanned.
293 * @stable ICU 2.0
294 */
295 virtual int32_t last(void) = 0;
296
297 /**
298 * Return the boundary preceding the current boundary.
299 * @return The character index of the previous text boundary or DONE if all
300 * boundaries have been returned.
301 * @stable ICU 2.0
302 */
303 virtual int32_t previous(void) = 0;
304
305 /**
306 * Return the boundary following the current boundary.
307 * @return The character index of the next text boundary or DONE if all
308 * boundaries have been returned.
309 * @stable ICU 2.0
310 */
311 virtual int32_t next(void) = 0;
312
313 /**
314 * Return character index of the current iterator position within the text.
315 * @return The boundary most recently returned.
316 * @stable ICU 2.0
317 */
318 virtual int32_t current(void) const = 0;
319
320 /**
321 * Return the first boundary following the specified offset.
322 * The value returned is always greater than the offset or
323 * the value BreakIterator::DONE.
324 * @param offset the offset to begin scanning.
325 * @return The first boundary after the specified offset.
326 * @stable ICU 2.0
327 */
328 virtual int32_t following(int32_t offset) = 0;
329
330 /**
331 * Return the first boundary preceding the specified offset.
332 * The value returned is always smaller than the offset or
333 * the value BreakIterator::DONE.
334 * @param offset the offset to begin scanning.
335 * @return The first boundary before the specified offset.
336 * @stable ICU 2.0
337 */
338 virtual int32_t preceding(int32_t offset) = 0;
339
340 /**
341 * Return true if the specified position is a boundary position.
342 * As a side effect, the current position of the iterator is set
343 * to the first boundary position at or following the specified offset.
344 * @param offset the offset to check.
345 * @return True if "offset" is a boundary position.
346 * @stable ICU 2.0
347 */
348 virtual UBool isBoundary(int32_t offset) = 0;
349
350 /**
351 * Return the nth boundary from the current boundary
352 * @param n which boundary to return. A value of 0
353 * does nothing. Negative values move to previous boundaries
354 * and positive values move to later boundaries.
355 * @return The index of the nth boundary from the current position, or
356 * DONE if there are fewer than |n| boundaries in the specified direction.
357 * @stable ICU 2.0
358 */
359 virtual int32_t next(int32_t n) = 0;
360
361 /**
362 * Create BreakIterator for word-breaks using the given locale.
363 * Returns an instance of a BreakIterator implementing word breaks.
364 * WordBreak is useful for word selection (ex. double click)
365 * @param where the locale.
366 * @param status the error code
367 * @return A BreakIterator for word-breaks. The UErrorCode& status
368 * parameter is used to return status information to the user.
369 * To check whether the construction succeeded or not, you should check
370 * the value of U_SUCCESS(status). If you wish more detailed information, you
371 * can check for informational error results which still indicate success.
372 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
373 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
374 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
375 * used; neither the requested locale nor any of its fall back locales
376 * could be found.
377 * The caller owns the returned object and is responsible for deleting it.
378 * @stable ICU 2.0
379 */
374ca955
A
380 static BreakIterator* U_EXPORT2
381 createWordInstance(const Locale& where, UErrorCode& status);
b75a7d8f
A
382
383 /**
384 * Create BreakIterator for line-breaks using specified locale.
385 * Returns an instance of a BreakIterator implementing line breaks. Line
386 * breaks are logically possible line breaks, actual line breaks are
387 * usually determined based on display width.
388 * LineBreak is useful for word wrapping text.
389 * @param where the locale.
390 * @param status The error code.
391 * @return A BreakIterator for line-breaks. The UErrorCode& status
392 * parameter is used to return status information to the user.
393 * To check whether the construction succeeded or not, you should check
394 * the value of U_SUCCESS(status). If you wish more detailed information, you
395 * can check for informational error results which still indicate success.
396 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
397 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
398 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
399 * used; neither the requested locale nor any of its fall back locales
400 * could be found.
401 * The caller owns the returned object and is responsible for deleting it.
402 * @stable ICU 2.0
403 */
374ca955
A
404 static BreakIterator* U_EXPORT2
405 createLineInstance(const Locale& where, UErrorCode& status);
b75a7d8f
A
406
407 /**
408 * Create BreakIterator for character-breaks using specified locale
409 * Returns an instance of a BreakIterator implementing character breaks.
410 * Character breaks are boundaries of combining character sequences.
411 * @param where the locale.
412 * @param status The error code.
413 * @return A BreakIterator for character-breaks. The UErrorCode& status
414 * parameter is used to return status information to the user.
415 * To check whether the construction succeeded or not, you should check
416 * the value of U_SUCCESS(status). If you wish more detailed information, you
417 * can check for informational error results which still indicate success.
418 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
419 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
420 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
421 * used; neither the requested locale nor any of its fall back locales
422 * could be found.
423 * The caller owns the returned object and is responsible for deleting it.
424 * @stable ICU 2.0
425 */
374ca955
A
426 static BreakIterator* U_EXPORT2
427 createCharacterInstance(const Locale& where, UErrorCode& status);
b75a7d8f
A
428
429 /**
430 * Create BreakIterator for sentence-breaks using specified locale
431 * Returns an instance of a BreakIterator implementing sentence breaks.
432 * @param where the locale.
433 * @param status The error code.
434 * @return A BreakIterator for sentence-breaks. The UErrorCode& status
435 * parameter is used to return status information to the user.
436 * To check whether the construction succeeded or not, you should check
437 * the value of U_SUCCESS(status). If you wish more detailed information, you
438 * can check for informational error results which still indicate success.
439 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
440 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
441 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
442 * used; neither the requested locale nor any of its fall back locales
443 * could be found.
444 * The caller owns the returned object and is responsible for deleting it.
445 * @stable ICU 2.0
446 */
374ca955
A
447 static BreakIterator* U_EXPORT2
448 createSentenceInstance(const Locale& where, UErrorCode& status);
b75a7d8f
A
449
450 /**
451 * Create BreakIterator for title-casing breaks using the specified locale
452 * Returns an instance of a BreakIterator implementing title breaks.
374ca955 453 * The iterator returned locates title boundaries as described for
b75a7d8f 454 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
374ca955 455 * please use Word Boundary iterator.{@link #createWordInstance }
b75a7d8f
A
456 *
457 * @param where the locale.
458 * @param status The error code.
459 * @return A BreakIterator for title-breaks. The UErrorCode& status
460 * parameter is used to return status information to the user.
461 * To check whether the construction succeeded or not, you should check
462 * the value of U_SUCCESS(status). If you wish more detailed information, you
463 * can check for informational error results which still indicate success.
464 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
465 * example, 'de_CH' was requested, but nothing was found there, so 'de' was
466 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
467 * used; neither the requested locale nor any of its fall back locales
468 * could be found.
469 * The caller owns the returned object and is responsible for deleting it.
470 * @stable ICU 2.1
471 */
374ca955
A
472 static BreakIterator* U_EXPORT2
473 createTitleInstance(const Locale& where, UErrorCode& status);
b75a7d8f
A
474
475 /**
476 * Get the set of Locales for which TextBoundaries are installed.
477 * <p><b>Note:</b> this will not return locales added through the register
374ca955
A
478 * call. To see the registered locales too, use the getAvailableLocales
479 * function that returns a StringEnumeration object </p>
b75a7d8f
A
480 * @param count the output parameter of number of elements in the locale list
481 * @return available locales
482 * @stable ICU 2.0
483 */
374ca955 484 static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
b75a7d8f
A
485
486 /**
487 * Get name of the object for the desired Locale, in the desired language.
488 * @param objectLocale must be from getAvailableLocales.
489 * @param displayLocale specifies the desired locale for output.
490 * @param name the fill-in parameter of the return value
491 * Uses best match.
492 * @return user-displayable name
493 * @stable ICU 2.0
494 */
374ca955 495 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
b75a7d8f
A
496 const Locale& displayLocale,
497 UnicodeString& name);
498
499 /**
500 * Get name of the object for the desired Locale, in the language of the
501 * default locale.
502 * @param objectLocale must be from getAvailableLocales.
503 * @param name the fill-in parameter of the return value
504 * @return user-displayable name
505 * @stable ICU 2.0
506 */
374ca955 507 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
b75a7d8f
A
508 UnicodeString& name);
509
510 /**
511 * Thread safe client-buffer-based cloning operation
512 * Do NOT call delete on a safeclone, since 'new' is not used to create it.
513 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
514 * If buffer is not large enough, new memory will be allocated.
515 * @param BufferSize reference to size of allocated space.
516 * If BufferSize == 0, a sufficient size for use in cloning will
517 * be returned ('pre-flighting')
518 * If BufferSize is not enough for a stack-based safe clone,
519 * new memory will be allocated.
520 * @param status to indicate whether the operation went on smoothly or there were errors
521 * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
522 * necessary.
523 * @return pointer to the new clone
524 *
525 * @stable ICU 2.0
526 */
527 virtual BreakIterator * createBufferClone(void *stackBuffer,
528 int32_t &BufferSize,
529 UErrorCode &status) = 0;
530
531 /**
532 * Determine whether the BreakIterator was created in user memory by
533 * createBufferClone(), and thus should not be deleted. Such objects
534 * must be closed by an explicit call to the destructor (not delete).
535 * @stable ICU 2.0
536 */
537 inline UBool isBufferClone(void);
538
374ca955 539#if !UCONFIG_NO_SERVICE
b75a7d8f
A
540 /**
541 * Register a new break iterator of the indicated kind, to use in the given locale.
374ca955 542 * The break iterator will be adopted. Clones of the iterator will be returned
b75a7d8f
A
543 * if a request for a break iterator of the given kind matches or falls back to
544 * this locale.
545 * @param toAdopt the BreakIterator instance to be adopted
546 * @param locale the Locale for which this instance is to be registered
547 * @param kind the type of iterator for which this instance is to be registered
548 * @param status the in/out status code, no special meanings are assigned
549 * @return a registry key that can be used to unregister this instance
374ca955 550 * @stable ICU 2.4
b75a7d8f 551 */
374ca955
A
552 static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
553 const Locale& locale,
554 UBreakIteratorType kind,
555 UErrorCode& status);
b75a7d8f
A
556
557 /**
558 * Unregister a previously-registered BreakIterator using the key returned from the
559 * register call. Key becomes invalid after a successful call and should not be used again.
560 * The BreakIterator corresponding to the key will be deleted.
561 * @param key the registry key returned by a previous call to registerInstance
562 * @param status the in/out status code, no special meanings are assigned
563 * @return TRUE if the iterator for the key was successfully unregistered
374ca955 564 * @stable ICU 2.4
b75a7d8f 565 */
374ca955 566 static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
b75a7d8f
A
567
568 /**
374ca955 569 * Return a StringEnumeration over the locales available at the time of the call,
b75a7d8f
A
570 * including registered locales.
571 * @return a StringEnumeration over the locales available at the time of the call
374ca955 572 * @stable ICU 2.4
b75a7d8f 573 */
374ca955
A
574 static StringEnumeration* U_EXPORT2 getAvailableLocales(void);
575#endif
b75a7d8f 576
374ca955
A
577 /**
578 * Returns the locale for this break iterator. Two flavors are available: valid and
579 * actual locale.
580 * @draft ICU 2.8 likely to change in ICU 3.0, based on feedback
581 */
582 Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
b75a7d8f 583
374ca955
A
584 /** Get the locale for this break iterator object. You can choose between valid and actual locale.
585 * @param type type of the locale we're looking for (valid or actual)
586 * @param status error code for the operation
587 * @return the locale
588 * @internal
589 */
590 const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
591
592 private:
593 static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status);
b75a7d8f
A
594 static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
595 static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
596
597 friend class ICUBreakIteratorFactory;
598 friend class ICUBreakIteratorService;
599
600protected:
601 /** @internal */
602 BreakIterator();
603 /** @internal */
604 UBool fBufferClone;
605 /** @internal */
606 BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
374ca955 607
b75a7d8f 608private:
374ca955
A
609
610 /** @internal */
611 char actualLocale[ULOC_FULLNAME_CAPACITY];
612 char validLocale[ULOC_FULLNAME_CAPACITY];
613
b75a7d8f
A
614 /**
615 * The assignment operator has no real implementation.
616 * It's provided to make the compiler happy. Do not call.
617 */
618 BreakIterator& operator=(const BreakIterator&) { return *this; }
619};
620
621inline UBool BreakIterator::isBufferClone()
622{
623 return fBufferClone;
624}
625
626U_NAMESPACE_END
627
628#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
629
630#endif // BRKITER_H
631//eof
632