]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | /* |
73c04bcf | 2 | ****************************************************************************** |
46f4442e | 3 | * Copyright (C) 1996-2007, International Business Machines Corporation and others. |
73c04bcf A |
4 | * All Rights Reserved. |
5 | ****************************************************************************** | |
b75a7d8f A |
6 | */ |
7 | ||
8 | #ifndef UBRK_H | |
9 | #define UBRK_H | |
10 | ||
11 | #include "unicode/utypes.h" | |
374ca955 | 12 | #include "unicode/uloc.h" |
73c04bcf | 13 | #include "unicode/utext.h" |
b75a7d8f A |
14 | |
15 | /** | |
16 | * A text-break iterator. | |
17 | * For usage in C programs. | |
18 | */ | |
19 | #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR | |
20 | # define UBRK_TYPEDEF_UBREAK_ITERATOR | |
21 | /** | |
22 | * Opaque type representing an ICU Break iterator object. | |
23 | * @stable ICU 2.0 | |
24 | */ | |
25 | typedef void UBreakIterator; | |
26 | #endif | |
27 | ||
28 | #if !UCONFIG_NO_BREAK_ITERATION | |
29 | ||
30 | #include "unicode/parseerr.h" | |
31 | ||
32 | /** | |
33 | * \file | |
34 | * \brief C API: BreakIterator | |
35 | * | |
36 | * <h2> BreakIterator C API </h2> | |
37 | * | |
38 | * The BreakIterator C API defines methods for finding the location | |
39 | * of boundaries in text. A UBreakIterator maintains a | |
40 | * current position and scans over the text, returning the index of characters | |
41 | * where boundaries occur. | |
73c04bcf | 42 | * <p> |
b75a7d8f A |
43 | * Line boundary analysis determines where a text string can be broken |
44 | * when line-wrapping. The mechanism correctly handles punctuation and | |
45 | * hyphenated words. | |
73c04bcf | 46 | * <p> |
b75a7d8f A |
47 | * Sentence boundary analysis allows selection with correct |
48 | * interpretation of periods within numbers and abbreviations, and | |
49 | * trailing punctuation marks such as quotation marks and parentheses. | |
73c04bcf | 50 | * <p> |
b75a7d8f A |
51 | * Word boundary analysis is used by search and replace functions, as |
52 | * well as within text editing applications that allow the user to | |
53 | * select words with a double click. Word selection provides correct | |
54 | * interpretation of punctuation marks within and following | |
55 | * words. Characters that are not part of a word, such as symbols or | |
56 | * punctuation marks, have word-breaks on both sides. | |
73c04bcf | 57 | * <p> |
b75a7d8f A |
58 | * Character boundary analysis allows users to interact with |
59 | * characters as they expect to, for example, when moving the cursor | |
60 | * through a text string. Character boundary analysis provides correct | |
61 | * navigation through character strings, regardless of how the | |
62 | * character is stored. For example, an accented character might be | |
63 | * stored as a base character and a diacritical mark. What users | |
64 | * consider to be a character can differ between languages. | |
73c04bcf | 65 | * <p> |
b75a7d8f A |
66 | * Title boundary analysis locates all positions, |
67 | * typically starts of words, that should be set to Title Case | |
68 | * when title casing the text. | |
73c04bcf A |
69 | * <p> |
70 | * The text boundary positions are found according to the rules | |
71 | * described in Unicode Standard Annex #29, Text Boundaries, and | |
72 | * Unicode Standard Annex #14, Line Breaking Properties. These | |
73 | * are available at http://www.unicode.org/reports/tr29/ and | |
74 | * http://www.unicode.org/reports/tr14/, respectively. | |
75 | * <p> | |
76 | * In addition to the plain C API defined in this header file, an | |
77 | * object oriented C++ API with equivalent functionality is defined in the | |
78 | * file brkiter.h. | |
79 | * <p> | |
80 | * Code snippets illustrating the use of the Break Iterator APIs | |
46f4442e A |
81 | * are available in the ICU User Guide, |
82 | * http://icu-project.org/userguide/boundaryAnalysis.html | |
73c04bcf | 83 | * and in the sample program icu/source/samples/break/break.cpp |
b75a7d8f A |
84 | */ |
85 | ||
86 | /** The possible types of text boundaries. @stable ICU 2.0 */ | |
87 | typedef enum UBreakIteratorType { | |
88 | /** Character breaks @stable ICU 2.0 */ | |
73c04bcf | 89 | UBRK_CHARACTER = 0, |
b75a7d8f | 90 | /** Word breaks @stable ICU 2.0 */ |
73c04bcf | 91 | UBRK_WORD = 1, |
b75a7d8f | 92 | /** Line breaks @stable ICU 2.0 */ |
73c04bcf | 93 | UBRK_LINE = 2, |
b75a7d8f | 94 | /** Sentence breaks @stable ICU 2.0 */ |
73c04bcf | 95 | UBRK_SENTENCE = 3, |
374ca955 A |
96 | |
97 | #ifndef U_HIDE_DEPRECATED_API | |
46f4442e A |
98 | /** |
99 | * Title Case breaks | |
100 | * The iterator created using this type locates title boundaries as described for | |
b75a7d8f | 101 | * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, |
374ca955 | 102 | * please use Word Boundary iterator. |
b75a7d8f | 103 | * |
374ca955 | 104 | * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later. |
b75a7d8f | 105 | */ |
73c04bcf | 106 | UBRK_TITLE = 4, |
374ca955 | 107 | #endif /* U_HIDE_DEPRECATED_API */ |
73c04bcf | 108 | UBRK_COUNT = 5 |
b75a7d8f A |
109 | } UBreakIteratorType; |
110 | ||
111 | /** Value indicating all text boundaries have been returned. | |
46f4442e | 112 | * @stable ICU 2.0 |
b75a7d8f A |
113 | */ |
114 | #define UBRK_DONE ((int32_t) -1) | |
115 | ||
116 | ||
117 | /** | |
118 | * Enum constants for the word break tags returned by | |
119 | * getRuleStatus(). A range of values is defined for each category of | |
120 | * word, to allow for further subdivisions of a category in future releases. | |
121 | * Applications should check for tag values falling within the range, rather | |
122 | * than for single individual values. | |
374ca955 | 123 | * @stable ICU 2.2 |
b75a7d8f A |
124 | */ |
125 | typedef enum UWordBreak { | |
46f4442e | 126 | /** Tag value for "words" that do not fit into any of other categories. |
b75a7d8f A |
127 | * Includes spaces and most punctuation. */ |
128 | UBRK_WORD_NONE = 0, | |
129 | /** Upper bound for tags for uncategorized words. */ | |
130 | UBRK_WORD_NONE_LIMIT = 100, | |
131 | /** Tag value for words that appear to be numbers, lower limit. */ | |
132 | UBRK_WORD_NUMBER = 100, | |
133 | /** Tag value for words that appear to be numbers, upper limit. */ | |
134 | UBRK_WORD_NUMBER_LIMIT = 200, | |
135 | /** Tag value for words that contain letters, excluding | |
136 | * hiragana, katakana or ideographic characters, lower limit. */ | |
137 | UBRK_WORD_LETTER = 200, | |
138 | /** Tag value for words containing letters, upper limit */ | |
139 | UBRK_WORD_LETTER_LIMIT = 300, | |
140 | /** Tag value for words containing kana characters, lower limit */ | |
141 | UBRK_WORD_KANA = 300, | |
142 | /** Tag value for words containing kana characters, upper limit */ | |
143 | UBRK_WORD_KANA_LIMIT = 400, | |
144 | /** Tag value for words containing ideographic characters, lower limit */ | |
145 | UBRK_WORD_IDEO = 400, | |
146 | /** Tag value for words containing ideographic characters, upper limit */ | |
147 | UBRK_WORD_IDEO_LIMIT = 500 | |
148 | } UWordBreak; | |
149 | ||
374ca955 A |
150 | /** |
151 | * Enum constants for the line break tags returned by getRuleStatus(). | |
152 | * A range of values is defined for each category of | |
153 | * line break, to allow for further subdivisions of a category in future releases. | |
154 | * Applications should check for tag values falling within the range, rather | |
155 | * than for single individual values. | |
73c04bcf | 156 | * @stable ICU 2.8 |
374ca955 A |
157 | */ |
158 | typedef enum ULineBreakTag { | |
159 | /** Tag value for soft line breaks, positions at which a line break | |
160 | * is acceptable but not required */ | |
161 | UBRK_LINE_SOFT = 0, | |
162 | /** Upper bound for soft line breaks. */ | |
163 | UBRK_LINE_SOFT_LIMIT = 100, | |
164 | /** Tag value for a hard, or mandatory line break */ | |
165 | UBRK_LINE_HARD = 100, | |
166 | /** Upper bound for hard line breaks. */ | |
167 | UBRK_LINE_HARD_LIMIT = 200 | |
168 | } ULineBreakTag; | |
169 | ||
170 | ||
171 | ||
172 | /** | |
173 | * Enum constants for the sentence break tags returned by getRuleStatus(). | |
174 | * A range of values is defined for each category of | |
175 | * sentence, to allow for further subdivisions of a category in future releases. | |
176 | * Applications should check for tag values falling within the range, rather | |
177 | * than for single individual values. | |
73c04bcf | 178 | * @stable ICU 2.8 |
374ca955 A |
179 | */ |
180 | typedef enum USentenceBreakTag { | |
181 | /** Tag value for sentences ending with a sentence terminator | |
182 | * ('.', '?', '!', etc.) character, possibly followed by a | |
183 | * hard separator (CR, LF, PS, etc.) | |
184 | */ | |
185 | UBRK_SENTENCE_TERM = 0, | |
186 | /** Upper bound for tags for sentences ended by sentence terminators. */ | |
187 | UBRK_SENTENCE_TERM_LIMIT = 100, | |
188 | /** Tag value for sentences that do not contain an ending | |
46f4442e | 189 | * sentence terminator ('.', '?', '!', etc.) character, but |
374ca955 A |
190 | * are ended only by a hard separator (CR, LF, PS, etc.) or end of input. |
191 | */ | |
192 | UBRK_SENTENCE_SEP = 100, | |
193 | /** Upper bound for tags for sentences ended by a separator. */ | |
194 | UBRK_SENTENCE_SEP_LIMIT = 200 | |
195 | /** Tag value for a hard, or mandatory line break */ | |
196 | } USentenceBreakTag; | |
197 | ||
b75a7d8f A |
198 | |
199 | /** | |
200 | * Open a new UBreakIterator for locating text boundaries for a specified locale. | |
201 | * A UBreakIterator may be used for detecting character, line, word, | |
202 | * and sentence breaks in text. | |
203 | * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD, | |
204 | * UBRK_LINE, UBRK_SENTENCE | |
205 | * @param locale The locale specifying the text-breaking conventions. | |
206 | * @param text The text to be iterated over. | |
207 | * @param textLength The number of characters in text, or -1 if null-terminated. | |
208 | * @param status A UErrorCode to receive any errors. | |
209 | * @return A UBreakIterator for the specified locale. | |
210 | * @see ubrk_openRules | |
211 | * @stable ICU 2.0 | |
212 | */ | |
374ca955 | 213 | U_STABLE UBreakIterator* U_EXPORT2 |
b75a7d8f A |
214 | ubrk_open(UBreakIteratorType type, |
215 | const char *locale, | |
216 | const UChar *text, | |
217 | int32_t textLength, | |
218 | UErrorCode *status); | |
219 | ||
220 | /** | |
221 | * Open a new UBreakIterator for locating text boundaries using specified breaking rules. | |
222 | * The rule syntax is ... (TBD) | |
223 | * @param rules A set of rules specifying the text breaking conventions. | |
224 | * @param rulesLength The number of characters in rules, or -1 if null-terminated. | |
225 | * @param text The text to be iterated over. May be null, in which case ubrk_setText() is | |
226 | * used to specify the text to be iterated. | |
227 | * @param textLength The number of characters in text, or -1 if null-terminated. | |
228 | * @param parseErr Receives position and context information for any syntax errors | |
229 | * detected while parsing the rules. | |
230 | * @param status A UErrorCode to receive any errors. | |
231 | * @return A UBreakIterator for the specified rules. | |
232 | * @see ubrk_open | |
374ca955 | 233 | * @stable ICU 2.2 |
b75a7d8f | 234 | */ |
374ca955 | 235 | U_STABLE UBreakIterator* U_EXPORT2 |
b75a7d8f A |
236 | ubrk_openRules(const UChar *rules, |
237 | int32_t rulesLength, | |
238 | const UChar *text, | |
239 | int32_t textLength, | |
240 | UParseError *parseErr, | |
241 | UErrorCode *status); | |
242 | ||
243 | /** | |
244 | * Thread safe cloning operation | |
245 | * @param bi iterator to be cloned | |
246 | * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. | |
247 | * If buffer is not large enough, new memory will be allocated. | |
248 | * Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations. | |
249 | * @param pBufferSize pointer to size of allocated space. | |
250 | * If *pBufferSize == 0, a sufficient size for use in cloning will | |
251 | * be returned ('pre-flighting') | |
252 | * If *pBufferSize is not enough for a stack-based safe clone, | |
253 | * new memory will be allocated. | |
254 | * @param status to indicate whether the operation went on smoothly or there were errors | |
255 | * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. | |
256 | * @return pointer to the new clone | |
257 | * @stable ICU 2.0 | |
258 | */ | |
374ca955 | 259 | U_STABLE UBreakIterator * U_EXPORT2 |
b75a7d8f A |
260 | ubrk_safeClone( |
261 | const UBreakIterator *bi, | |
262 | void *stackBuffer, | |
263 | int32_t *pBufferSize, | |
264 | UErrorCode *status); | |
265 | ||
266 | /** | |
267 | * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone(). | |
268 | * @stable ICU 2.0 | |
269 | */ | |
270 | #define U_BRK_SAFECLONE_BUFFERSIZE 512 | |
271 | ||
272 | /** | |
273 | * Close a UBreakIterator. | |
274 | * Once closed, a UBreakIterator may no longer be used. | |
275 | * @param bi The break iterator to close. | |
276 | * @stable ICU 2.0 | |
277 | */ | |
374ca955 | 278 | U_STABLE void U_EXPORT2 |
b75a7d8f A |
279 | ubrk_close(UBreakIterator *bi); |
280 | ||
281 | /** | |
282 | * Sets an existing iterator to point to a new piece of text | |
283 | * @param bi The iterator to use | |
284 | * @param text The text to be set | |
285 | * @param textLength The length of the text | |
286 | * @param status The error code | |
287 | * @stable ICU 2.0 | |
288 | */ | |
374ca955 | 289 | U_STABLE void U_EXPORT2 |
b75a7d8f A |
290 | ubrk_setText(UBreakIterator* bi, |
291 | const UChar* text, | |
292 | int32_t textLength, | |
293 | UErrorCode* status); | |
294 | ||
73c04bcf A |
295 | |
296 | /** | |
297 | * Sets an existing iterator to point to a new piece of text | |
298 | * @param bi The iterator to use | |
46f4442e A |
299 | * @param text The text to be set. |
300 | * This function makes a shallow clone of the supplied UText. This means | |
301 | * that the caller is free to immediately close or otherwise reuse the | |
302 | * UText that was passed as a parameter, but that the underlying text itself | |
303 | * must not be altered while being referenced by the break iterator. | |
73c04bcf | 304 | * @param status The error code |
46f4442e | 305 | * @stable ICU 3.4 |
73c04bcf | 306 | */ |
46f4442e | 307 | U_STABLE void U_EXPORT2 |
73c04bcf A |
308 | ubrk_setUText(UBreakIterator* bi, |
309 | UText* text, | |
310 | UErrorCode* status); | |
311 | ||
312 | ||
313 | ||
b75a7d8f A |
314 | /** |
315 | * Determine the most recently-returned text boundary. | |
316 | * | |
317 | * @param bi The break iterator to use. | |
374ca955 A |
318 | * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous, |
319 | * \ref ubrk_first, or \ref ubrk_last. | |
b75a7d8f A |
320 | * @stable ICU 2.0 |
321 | */ | |
374ca955 | 322 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
323 | ubrk_current(const UBreakIterator *bi); |
324 | ||
325 | /** | |
326 | * Determine the text boundary following the current text boundary. | |
327 | * | |
328 | * @param bi The break iterator to use. | |
329 | * @return The character index of the next text boundary, or UBRK_DONE | |
330 | * if all text boundaries have been returned. | |
331 | * @see ubrk_previous | |
332 | * @stable ICU 2.0 | |
333 | */ | |
374ca955 | 334 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
335 | ubrk_next(UBreakIterator *bi); |
336 | ||
337 | /** | |
338 | * Determine the text boundary preceding the current text boundary. | |
339 | * | |
340 | * @param bi The break iterator to use. | |
341 | * @return The character index of the preceding text boundary, or UBRK_DONE | |
342 | * if all text boundaries have been returned. | |
343 | * @see ubrk_next | |
344 | * @stable ICU 2.0 | |
345 | */ | |
374ca955 | 346 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
347 | ubrk_previous(UBreakIterator *bi); |
348 | ||
349 | /** | |
350 | * Determine the index of the first character in the text being scanned. | |
351 | * This is not always the same as index 0 of the text. | |
352 | * @param bi The break iterator to use. | |
353 | * @return The character index of the first character in the text being scanned. | |
354 | * @see ubrk_last | |
355 | * @stable ICU 2.0 | |
356 | */ | |
374ca955 | 357 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
358 | ubrk_first(UBreakIterator *bi); |
359 | ||
360 | /** | |
361 | * Determine the index immediately <EM>beyond</EM> the last character in the text being | |
362 | * scanned. | |
363 | * This is not the same as the last character. | |
364 | * @param bi The break iterator to use. | |
365 | * @return The character offset immediately <EM>beyond</EM> the last character in the | |
366 | * text being scanned. | |
367 | * @see ubrk_first | |
368 | * @stable ICU 2.0 | |
369 | */ | |
374ca955 | 370 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
371 | ubrk_last(UBreakIterator *bi); |
372 | ||
373 | /** | |
374 | * Determine the text boundary preceding the specified offset. | |
375 | * The value returned is always smaller than offset, or UBRK_DONE. | |
376 | * @param bi The break iterator to use. | |
377 | * @param offset The offset to begin scanning. | |
378 | * @return The text boundary preceding offset, or UBRK_DONE. | |
379 | * @see ubrk_following | |
380 | * @stable ICU 2.0 | |
381 | */ | |
374ca955 | 382 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
383 | ubrk_preceding(UBreakIterator *bi, |
384 | int32_t offset); | |
385 | ||
386 | /** | |
387 | * Determine the text boundary following the specified offset. | |
388 | * The value returned is always greater than offset, or UBRK_DONE. | |
389 | * @param bi The break iterator to use. | |
390 | * @param offset The offset to begin scanning. | |
391 | * @return The text boundary following offset, or UBRK_DONE. | |
392 | * @see ubrk_preceding | |
393 | * @stable ICU 2.0 | |
394 | */ | |
374ca955 | 395 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
396 | ubrk_following(UBreakIterator *bi, |
397 | int32_t offset); | |
398 | ||
399 | /** | |
400 | * Get a locale for which text breaking information is available. | |
401 | * A UBreakIterator in a locale returned by this function will perform the correct | |
402 | * text breaking for the locale. | |
403 | * @param index The index of the desired locale. | |
404 | * @return A locale for which text breaking information is available, or 0 if none. | |
405 | * @see ubrk_countAvailable | |
406 | * @stable ICU 2.0 | |
407 | */ | |
374ca955 | 408 | U_STABLE const char* U_EXPORT2 |
b75a7d8f A |
409 | ubrk_getAvailable(int32_t index); |
410 | ||
411 | /** | |
412 | * Determine how many locales have text breaking information available. | |
413 | * This function is most useful for determining the loop ending condition for | |
374ca955 | 414 | * calls to \ref ubrk_getAvailable. |
b75a7d8f A |
415 | * @return The number of locales for which text breaking information is available. |
416 | * @see ubrk_getAvailable | |
417 | * @stable ICU 2.0 | |
418 | */ | |
374ca955 | 419 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
420 | ubrk_countAvailable(void); |
421 | ||
422 | ||
423 | /** | |
424 | * Returns true if the specified position is a boundary position. As a side | |
425 | * effect, leaves the iterator pointing to the first boundary position at | |
426 | * or after "offset". | |
427 | * @param bi The break iterator to use. | |
428 | * @param offset the offset to check. | |
429 | * @return True if "offset" is a boundary position. | |
430 | * @stable ICU 2.0 | |
431 | */ | |
374ca955 | 432 | U_STABLE UBool U_EXPORT2 |
b75a7d8f A |
433 | ubrk_isBoundary(UBreakIterator *bi, int32_t offset); |
434 | ||
435 | /** | |
436 | * Return the status from the break rule that determined the most recently | |
437 | * returned break position. The values appear in the rule source | |
438 | * within brackets, {123}, for example. For rules that do not specify a | |
439 | * status, a default value of 0 is returned. | |
440 | * <p> | |
441 | * For word break iterators, the possible values are defined in enum UWordBreak. | |
374ca955 | 442 | * @stable ICU 2.2 |
b75a7d8f | 443 | */ |
374ca955 | 444 | U_STABLE int32_t U_EXPORT2 |
b75a7d8f A |
445 | ubrk_getRuleStatus(UBreakIterator *bi); |
446 | ||
374ca955 A |
447 | /** |
448 | * Get the statuses from the break rules that determined the most recently | |
449 | * returned break position. The values appear in the rule source | |
450 | * within brackets, {123}, for example. The default status value for rules | |
451 | * that do not explicitly provide one is zero. | |
452 | * <p> | |
453 | * For word break iterators, the possible values are defined in enum UWordBreak. | |
454 | * @param bi The break iterator to use | |
46f4442e | 455 | * @param fillInVec an array to be filled in with the status values. |
374ca955 A |
456 | * @param capacity the length of the supplied vector. A length of zero causes |
457 | * the function to return the number of status values, in the | |
458 | * normal way, without attempting to store any values. | |
46f4442e A |
459 | * @param status receives error codes. |
460 | * @return The number of rule status values from rules that determined | |
374ca955 | 461 | * the most recent boundary returned by the break iterator. |
73c04bcf | 462 | * @stable ICU 3.0 |
374ca955 | 463 | */ |
73c04bcf | 464 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
465 | ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status); |
466 | ||
467 | /** | |
468 | * Return the locale of the break iterator. You can choose between the valid and | |
469 | * the actual locale. | |
470 | * @param bi break iterator | |
471 | * @param type locale type (valid or actual) | |
472 | * @param status error code | |
473 | * @return locale string | |
73c04bcf | 474 | * @stable ICU 2.8 |
374ca955 | 475 | */ |
73c04bcf | 476 | U_STABLE const char* U_EXPORT2 |
374ca955 A |
477 | ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status); |
478 | ||
479 | ||
b75a7d8f A |
480 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
481 | ||
482 | #endif |