]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | * Copyright (C) 1996-2003, International Business Machines Corporation and others. All Rights Reserved. | |
3 | ***************************************************************************************** | |
4 | */ | |
5 | ||
6 | #ifndef UBRK_H | |
7 | #define UBRK_H | |
8 | ||
9 | #include "unicode/utypes.h" | |
10 | ||
11 | /** | |
12 | * A text-break iterator, for use in C programs. | |
13 | * The typedef below is skipped if UBRK_TYPEDEF_UBREAK_ITERATOR is already defined. | |
14 | */ | |
15 | #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR | |
16 | # define UBRK_TYPEDEF_UBREAK_ITERATOR | |
17 | /** | |
18 | * Opaque type representing an ICU break iterator object. | |
19 | * @stable ICU 2.0 | |
20 | */ | |
21 | typedef void UBreakIterator; | |
22 | #endif | |
23 | ||
24 | #if !UCONFIG_NO_BREAK_ITERATION | |
25 | ||
26 | #include "unicode/parseerr.h" | |
27 | ||
28 | /** | |
29 | * \file | |
30 | * \brief C API: BreakIterator | |
31 | * | |
32 | * <h2> BreakIterator C API </h2> | |
33 | * | |
34 | * The BreakIterator C API defines methods for finding the location | |
35 | * of boundaries in text. A UBreakIterator maintains a | |
36 | * current position and scans over text, returning the index of characters | |
37 | * where boundaries occur. | |
38 | * <P> | |
39 | * Line boundary analysis determines where a text string can be broken | |
40 | * when line-wrapping. The mechanism correctly handles punctuation and | |
41 | * hyphenated words. | |
42 | * <P> | |
43 | * Sentence boundary analysis allows selection with correct | |
44 | * interpretation of periods within numbers and abbreviations, and | |
45 | * trailing punctuation marks such as quotation marks and parentheses. | |
46 | * <P> | |
47 | * Word boundary analysis is used by search and replace functions, as | |
48 | * well as within text editing applications that allow the user to | |
49 | * select words with a double click. Word selection provides correct | |
50 | * interpretation of punctuation marks within and following | |
51 | * words. Characters that are not part of a word, such as symbols or | |
52 | * punctuation marks, have word-breaks on both sides. | |
53 | * <P> | |
54 | * Character boundary analysis allows users to interact with | |
55 | * characters as they expect to, for example, when moving the cursor | |
56 | * through a text string. Character boundary analysis provides correct | |
57 | * navigation through character strings, regardless of how the | |
58 | * character is stored. For example, an accented character might be | |
59 | * stored as a base character and a diacritical mark. What users | |
60 | * consider to be a character can differ between languages. | |
61 | * <P> | |
62 | * Title boundary analysis locates all positions, | |
63 | * typically starts of words, that should be set to Title Case | |
64 | * when title casing the text. | |
65 | * <P> | |
66 | * | |
67 | * This is the interface for all text boundaries. | |
68 | * <P> | |
69 | * Examples: | |
70 | * <P> | |
71 | * Helper function to output text | |
72 | * <pre> | |
73 | * \code | |
74 | * void printTextRange(UChar* str, int32_t start, int32_t end ) { | |
75 | * UChar* result; | |
76 | * char* res; | |
77 | * result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1)); | |
78 | * u_strncpy(result, &str[start], end-start); | |
79 | * result[end-start]=0; | |
80 | * res=(char*)malloc(sizeof(char) * (u_strlen(result)+1)); | |
81 | * u_austrcpy(res, result); | |
82 | * printf("%s\n", res); | |
83 | * free(res); | |
84 | * free(result); | |
85 | * } | |
86 | * \endcode | |
87 | * </pre> | |
88 | * Print each element in order: | |
89 | * <pre> | |
90 | * \code | |
91 | * void printEachForward( UBreakIterator* boundary, UChar* str) { | |
92 | * int32_t end; | |
93 | * int32_t start = ubrk_first(boundary); | |
94 | * for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) { | |
95 | * printTextRange(str, start, end ); | |
96 | * } | |
97 | * } | |
98 | * \endcode | |
99 | * </pre> | |
100 | * Print each element in reverse order: | |
101 | * <pre> | |
102 | * \code | |
103 | * void printEachBackward( UBreakIterator* boundary, UChar* str) { | |
104 | * int32_t start; | |
105 | * int32_t end = ubrk_last(boundary); | |
106 | * for (start = ubrk_previous(boundary); start != UBRK_DONE; end = start, start =ubrk_previous(boundary)) { | |
107 | * printTextRange( str, start, end ); | |
108 | * } | |
109 | * } | |
110 | * \endcode | |
111 | * </pre> | |
112 | * Print first element | |
113 | * <pre> | |
114 | * \code | |
115 | * void printFirst(UBreakIterator* boundary, UChar* str) { | |
116 | * int32_t end; | |
117 | * int32_t start = ubrk_first(boundary); | |
118 | * end = ubrk_next(boundary); | |
119 | * printTextRange( str, start, end ); | |
120 | * } | |
121 | * \endcode | |
122 | * </pre> | |
123 | * Print last element | |
124 | * <pre> | |
125 | * \code | |
126 | * void printLast(UBreakIterator* boundary, UChar* str) { | |
127 | * int32_t start; | |
128 | * int32_t end = ubrk_last(boundary); | |
129 | * start = ubrk_previous(boundary); | |
130 | * printTextRange(str, start, end ); | |
131 | * } | |
132 | * \endcode | |
133 | * </pre> | |
134 | * Print the element at a specified position | |
135 | * <pre> | |
136 | * \code | |
137 | * void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) { | |
138 | * int32_t start; | |
139 | * int32_t end = ubrk_following(boundary, pos); | |
140 | * start = ubrk_previous(boundary); | |
141 | * printTextRange(str, start, end ); | |
142 | * } | |
143 | * \endcode | |
144 | * </pre> | |
145 | * Creating and using text boundaries | |
146 | * <pre> | |
147 | * \code | |
148 | * void BreakIterator_Example( void ) { | |
149 | * UBreakIterator* boundary; | |
150 | * UErrorCode status = U_ZERO_ERROR; | |
151 | * UChar *stringToExamine; | |
152 | * stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) ); | |
153 | * u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff."); | |
154 | * printf("Examining: Aaa bbb ccc. Ddd eee fff.\n"); | |
155 | * //print each sentence in forward and reverse order | |
156 | * boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status); | |
157 | * printf("----- forward: -----------\n"); | |
158 | * printEachForward(boundary, stringToExamine); | |
159 | * printf("----- backward: ----------\n"); | |
160 | * printEachBackward(boundary, stringToExamine); | |
161 | * ubrk_close(boundary); | |
162 | * | |
163 | * //print each word in order | |
164 | * boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status); | |
165 | * printf("----- forward: -----------\n"); | |
166 | * printEachForward(boundary, stringToExamine); | |
167 | * printf("----- backward: ----------\n"); | |
168 | * printEachBackward(boundary, stringToExamine); | |
169 | * //print first element | |
170 | * printf("----- first: -------------\n"); | |
171 | * printFirst(boundary, stringToExamine); | |
172 | * //print last element | |
173 | * printf("----- last: --------------\n"); | |
174 | * printLast(boundary, stringToExamine); | |
175 | * //print word at charpos 10 | |
176 | * printf("----- at pos 10: ---------\n"); | |
177 | * printAt(boundary, 10 , stringToExamine); | |
178 | * | |
179 | * ubrk_close(boundary); | |
180 | * } | |
181 | * \endcode | |
182 | * </pre> | |
183 | */ | |
184 | ||
185 | /** The possible types of text boundaries, as passed to ubrk_open(). @stable ICU 2.0 */ | |
186 | typedef enum UBreakIteratorType { | |
187 | /** Character breaks @stable ICU 2.0 */ | |
188 | UBRK_CHARACTER, | |
189 | /** Word breaks @stable ICU 2.0 */ | |
190 | UBRK_WORD, | |
191 | /** Line breaks @stable ICU 2.0 */ | |
192 | UBRK_LINE, | |
193 | /** Sentence breaks @stable ICU 2.0 */ | |
194 | UBRK_SENTENCE, | |
195 | /** | |
196 | * Title Case breaks. | |
197 | * The iterator created using this type locates title boundaries as described for | |
198 | * Unicode 3.2 only. For title boundary iteration with Unicode 4.0 and above, | |
199 | * please use a word boundary iterator (UBRK_WORD). | |
200 | * @draft ICU 2.2 | |
201 | */ | |
202 | UBRK_TITLE | |
203 | } UBreakIteratorType; | |
204 | ||
205 | /** Value returned by the iteration functions (e.g. ubrk_next()) once all text boundaries have been returned. | |
206 | * @stable ICU 2.0 | |
207 | */ | |
208 | #define UBRK_DONE ((int32_t) -1) | |
209 | ||
210 | ||
211 | /** | |
212 | * Enum constants for the word break tags returned by ubrk_getRuleStatus(). | |
213 | * A range of values is defined for each category of word, to allow for further | |
214 | * subdivisions of a category in future releases. Applications should check for | |
215 | * tag values falling within a range (lower limit <= tag < upper limit; each | |
216 | * upper limit equals the next category's lower limit), not for single values. | |
217 | * @draft ICU 2.2 | |
218 | */ | |
219 | typedef enum UWordBreak { | |
220 | /** Tag value for "words" that do not fit into any of the other categories. | |
221 | * Includes spaces and most punctuation. Lower limit. */ | |
222 | UBRK_WORD_NONE = 0, | |
223 | /** Tag value for uncategorized words, upper limit. */ | |
224 | UBRK_WORD_NONE_LIMIT = 100, | |
225 | /** Tag value for words that appear to be numbers, lower limit. */ | |
226 | UBRK_WORD_NUMBER = 100, | |
227 | /** Tag value for words that appear to be numbers, upper limit. */ | |
228 | UBRK_WORD_NUMBER_LIMIT = 200, | |
229 | /** Tag value for words that contain letters, excluding | |
230 | * hiragana, katakana or ideographic characters, lower limit. */ | |
231 | UBRK_WORD_LETTER = 200, | |
232 | /** Tag value for words containing letters, upper limit. */ | |
233 | UBRK_WORD_LETTER_LIMIT = 300, | |
234 | /** Tag value for words containing kana characters, lower limit. */ | |
235 | UBRK_WORD_KANA = 300, | |
236 | /** Tag value for words containing kana characters, upper limit. */ | |
237 | UBRK_WORD_KANA_LIMIT = 400, | |
238 | /** Tag value for words containing ideographic characters, lower limit. */ | |
239 | UBRK_WORD_IDEO = 400, | |
240 | /** Tag value for words containing ideographic characters, upper limit. */ | |
241 | UBRK_WORD_IDEO_LIMIT = 500 | |
242 | } UWordBreak; | |
243 | ||
244 | ||
245 | /** | |
246 | * Open a new UBreakIterator for locating text boundaries for a specified locale. | |
247 | * A UBreakIterator may be used for detecting character, line, word, | |
248 | * and sentence breaks in text. | |
249 | * @param type The type of UBreakIterator to open: a UBreakIteratorType value, e.g. | |
250 | * UBRK_CHARACTER, UBRK_WORD, UBRK_LINE or UBRK_SENTENCE. | |
251 | * @param locale The locale specifying the text-breaking conventions. | |
252 | * @param text The text to be iterated over. | |
253 | * @param textLength The number of characters in text, or -1 if null-terminated. | |
254 | * @param status A UErrorCode to receive any errors. | |
255 | * @return A UBreakIterator for the specified locale; close it with ubrk_close() when done. | |
256 | * @see ubrk_openRules | |
257 | * @stable ICU 2.0 | |
258 | */ | |
259 | U_CAPI UBreakIterator* U_EXPORT2 | |
260 | ubrk_open(UBreakIteratorType type, | |
261 | const char *locale, | |
262 | const UChar *text, | |
263 | int32_t textLength, | |
264 | UErrorCode *status); | |
265 | ||
266 | /** | |
267 | * Open a new UBreakIterator for locating text boundaries using specified breaking rules. | |
268 | * The rule syntax is ... (TBD) | |
269 | * @param rules A set of rules specifying the text breaking conventions. | |
270 | * @param rulesLength The number of characters in rules, or -1 if null-terminated. | |
271 | * @param text The text to be iterated over. May be NULL, in which case ubrk_setText() | |
272 | * can be used later to specify the text to be iterated. | |
273 | * @param textLength The number of characters in text, or -1 if null-terminated. | |
274 | * @param parseErr Receives position and context information for any syntax errors | |
275 | * detected while parsing the rules. | |
276 | * @param status A UErrorCode to receive any errors. | |
277 | * @return A UBreakIterator for the specified rules; close it with ubrk_close() when done. | |
278 | * @see ubrk_open | |
279 | * @draft ICU 2.2 | |
280 | */ | |
281 | U_CAPI UBreakIterator* U_EXPORT2 | |
282 | ubrk_openRules(const UChar *rules, | |
283 | int32_t rulesLength, | |
284 | const UChar *text, | |
285 | int32_t textLength, | |
286 | UParseError *parseErr, | |
287 | UErrorCode *status); | |
288 | ||
289 | /** | |
290 | * Thread-safe cloning operation. | |
291 | * @param bi iterator to be cloned | |
292 | * @param stackBuffer user allocated space for the new clone. If NULL, new memory will be allocated. | |
293 | * If the buffer is not large enough, new memory will be allocated. | |
294 | * Clients can use U_BRK_SAFECLONE_BUFFERSIZE as the buffer size. This will probably be enough to avoid memory allocations. | |
295 | * @param pBufferSize pointer to size of allocated space. | |
296 | * If *pBufferSize == 0, a sufficient size for use in cloning will | |
297 | * be returned ('pre-flighting'). | |
298 | * If *pBufferSize is not enough for a stack-based safe clone, | |
299 | * new memory will be allocated. | |
300 | * @param status to indicate whether the operation went smoothly or there were errors. | |
301 | * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. | |
302 | * @return pointer to the new clone | |
303 | * @stable ICU 2.0 | |
304 | */ | |
305 | U_CAPI UBreakIterator * U_EXPORT2 | |
306 | ubrk_safeClone( | |
307 | const UBreakIterator *bi, | |
308 | void *stackBuffer, | |
309 | int32_t *pBufferSize, | |
310 | UErrorCode *status); | |
311 | ||
312 | /** | |
313 | * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone(). | |
314 | * @stable ICU 2.0 | |
315 | */ | |
316 | #define U_BRK_SAFECLONE_BUFFERSIZE 512 | |
317 | ||
318 | /** | |
319 | * Close a UBreakIterator. | |
320 | * Once closed, the UBreakIterator must not be used again. | |
321 | * @param bi The break iterator to close. | |
322 | * @stable ICU 2.0 | |
323 | */ | |
324 | U_CAPI void U_EXPORT2 | |
325 | ubrk_close(UBreakIterator *bi); | |
326 | ||
327 | /** | |
328 | * Set an existing iterator to iterate over a new piece of text. | |
329 | * @param bi The break iterator to use. | |
330 | * @param text The text to be iterated over. | |
331 | * @param textLength The length of the text. | |
332 | * @param status A UErrorCode to receive any errors. | |
333 | * @stable ICU 2.0 | |
334 | */ | |
335 | U_CAPI void U_EXPORT2 | |
336 | ubrk_setText(UBreakIterator* bi, | |
337 | const UChar* text, | |
338 | int32_t textLength, | |
339 | UErrorCode* status); | |
340 | ||
341 | /** | |
342 | * Determine the most recently-returned text boundary. | |
343 | * | |
344 | * @param bi The break iterator to use. | |
345 | * @return The character index most recently returned by ubrk_next(), ubrk_previous(), | |
346 | * ubrk_first(), or ubrk_last(). | |
347 | * @stable ICU 2.0 | |
348 | */ | |
349 | U_CAPI int32_t U_EXPORT2 | |
350 | ubrk_current(const UBreakIterator *bi); | |
351 | ||
352 | /** | |
353 | * Determine the text boundary following the current text boundary. | |
354 | * The boundary returned is subsequently reported by ubrk_current(). | |
355 | * @param bi The break iterator to use. | |
356 | * @return The character index of the next text boundary, or UBRK_DONE | |
357 | * if all text boundaries have been returned. | |
358 | * @see ubrk_previous | |
359 | * @stable ICU 2.0 | |
360 | */ | |
361 | U_CAPI int32_t U_EXPORT2 | |
362 | ubrk_next(UBreakIterator *bi); | |
363 | ||
364 | /** | |
365 | * Determine the text boundary preceding the current text boundary. | |
366 | * The boundary returned is subsequently reported by ubrk_current(). | |
367 | * @param bi The break iterator to use. | |
368 | * @return The character index of the preceding text boundary, or UBRK_DONE | |
369 | * if all text boundaries have been returned. | |
370 | * @see ubrk_next | |
371 | * @stable ICU 2.0 | |
372 | */ | |
373 | U_CAPI int32_t U_EXPORT2 | |
374 | ubrk_previous(UBreakIterator *bi); | |
375 | ||
376 | /** | |
377 | * Determine the index of the first character in the text being scanned. | |
378 | * This is not always the same as index 0 of the text. | |
379 | * @param bi The break iterator to use. | |
380 | * @return The character index of the first character in the text being scanned; also reported by a following ubrk_current(). | |
381 | * @see ubrk_last | |
382 | * @stable ICU 2.0 | |
383 | */ | |
384 | U_CAPI int32_t U_EXPORT2 | |
385 | ubrk_first(UBreakIterator *bi); | |
386 | ||
387 | /** | |
388 | * Determine the index immediately <EM>beyond</EM> the last character in the text being | |
389 | * scanned. | |
390 | * This is not the same as the index of the last character. | |
391 | * @param bi The break iterator to use. | |
392 | * @return The character offset immediately <EM>beyond</EM> the last character in the | |
393 | * text being scanned. | |
394 | * @see ubrk_first | |
395 | * @stable ICU 2.0 | |
396 | */ | |
397 | U_CAPI int32_t U_EXPORT2 | |
398 | ubrk_last(UBreakIterator *bi); | |
399 | ||
400 | /** | |
401 | * Determine the text boundary preceding the specified offset. | |
402 | * The value returned is always strictly smaller than offset, or is UBRK_DONE. | |
403 | * @param bi The break iterator to use. | |
404 | * @param offset The offset at which to begin scanning. | |
405 | * @return The text boundary preceding offset, or UBRK_DONE. | |
406 | * @see ubrk_following | |
407 | * @stable ICU 2.0 | |
408 | */ | |
409 | U_CAPI int32_t U_EXPORT2 | |
410 | ubrk_preceding(UBreakIterator *bi, | |
411 | int32_t offset); | |
412 | ||
413 | /** | |
414 | * Determine the text boundary following the specified offset. | |
415 | * The value returned is always strictly greater than offset, or is UBRK_DONE. | |
416 | * @param bi The break iterator to use. | |
417 | * @param offset The offset at which to begin scanning. | |
418 | * @return The text boundary following offset, or UBRK_DONE. | |
419 | * @see ubrk_preceding | |
420 | * @stable ICU 2.0 | |
421 | */ | |
422 | U_CAPI int32_t U_EXPORT2 | |
423 | ubrk_following(UBreakIterator *bi, | |
424 | int32_t offset); | |
425 | ||
426 | /** | |
427 | * Get a locale for which text breaking information is available. | |
428 | * A UBreakIterator in a locale returned by this function will perform the correct | |
429 | * text breaking for the locale. | |
430 | * @param index The index of the desired locale. | |
431 | * @return A locale for which text breaking information is available, or 0 if none. | |
432 | * @see ubrk_countAvailable | |
433 | * @stable ICU 2.0 | |
434 | */ | |
435 | U_CAPI const char* U_EXPORT2 | |
436 | ubrk_getAvailable(int32_t index); | |
437 | ||
438 | /** | |
439 | * Determine how many locales have text breaking information available. | |
440 | * This function is most useful for determining the loop ending condition for | |
441 | * calls to ubrk_getAvailable(). | |
442 | * @return The number of locales for which text breaking information is available. | |
443 | * @see ubrk_getAvailable | |
444 | * @stable ICU 2.0 | |
445 | */ | |
446 | U_CAPI int32_t U_EXPORT2 | |
447 | ubrk_countAvailable(void); | |
448 | ||
449 | ||
450 | /** | |
451 | * Returns true if the specified position is a boundary position. As a side | |
452 | * effect, leaves the iterator pointing to the first boundary position at | |
453 | * or after "offset". | |
454 | * @param bi The break iterator to use. | |
455 | * @param offset the offset to check. | |
456 | * @return True if "offset" is a boundary position. | |
457 | * @stable ICU 2.0 | |
458 | */ | |
459 | U_CAPI UBool U_EXPORT2 | |
460 | ubrk_isBoundary(UBreakIterator *bi, int32_t offset); | |
461 | ||
462 | /** | |
463 | * Return the status from the break rule that determined the most recently | |
464 | * returned break position. The values appear in the rule source within | |
465 | * brackets, {123}, for example. For word break iterators, the possible values | |
466 | * are defined in enum UWordBreak. | |
467 | * @param bi The break iterator to use. | |
468 | * @return The rule status value, or 0 for rules that do not specify a status. | |
469 | * @draft ICU 2.2 | |
470 | */ | |
471 | U_CAPI int32_t U_EXPORT2 | |
472 | ubrk_getRuleStatus(UBreakIterator *bi); | |
473 | ||
474 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
475 | ||
476 | #endif |