]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/ucasemap.h
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / common / unicode / ucasemap.h
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2005-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucasemap.h
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2005may06
14 * created by: Markus W. Scherer
15 *
16 * Case mapping service object and functions using it.
17 */
18
19 #ifndef __UCASEMAP_H__
20 #define __UCASEMAP_H__
21
22 #include "unicode/utypes.h"
23 #include "unicode/ustring.h"
24
25 /**
26 * \file
27 * \brief C API: Unicode case mapping functions using a UCaseMap service object.
28 *
29 * The service object takes care of memory allocations, data loading, and setup
30 * for the attributes, as usual.
31 *
32 * Currently, the functionality provided here does not overlap with uchar.h
33 * and ustring.h, except for ucasemap_toTitle().
34 *
35 * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
36 */
37
38 /**
39 * UCaseMap is an opaque service object for newer ICU case mapping functions;
40 * older functions did not use a service object. See ucasemap_open().
41 * @stable ICU 3.4
42 */
43 struct UCaseMap;
44 typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */
45
46 /**
47 * Open a UCaseMap service object for a locale and a set of options.
48 * The locale ID and options are preprocessed so that functions using the
49 * service object need not process them in each call.
50 *
51 * @param locale ICU locale ID, used for language-dependent
52 * upper-/lower-/title-casing according to the Unicode standard.
53 * Usual semantics: ""=root, NULL=default locale, etc.
54 * @param options Options bit set, used for case folding, string comparisons
55 * and titlecasing. Same flags as for u_foldCase(), u_strFoldCase(),
56 * u_strCaseCompare(), etc., plus the U_TITLECASE_ option bits.
57 * Use 0 or U_FOLD_CASE_DEFAULT for default behavior.
58 * @param pErrorCode Must be a valid pointer to an error code value,
59 * which must not indicate a failure before the function call.
60 * @return Pointer to a UCaseMap service object, if successful; close it with ucasemap_close().
61 *
62 * @see U_FOLD_CASE_DEFAULT
63 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
64 * @see U_TITLECASE_NO_LOWERCASE
65 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
66 * @stable ICU 3.4
67 */
68 U_STABLE UCaseMap * U_EXPORT2
69 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
70
71 /**
72 * Close a UCaseMap service object.
73 * @param csm Object to be closed. A break iterator adopted via ucasemap_setBreakIterator() is released as well.
74 * @stable ICU 3.4
75 */
76 U_STABLE void U_EXPORT2
77 ucasemap_close(UCaseMap *csm);
78
79 /**
80 * Get the locale ID that is used for language-dependent case mappings.
81 * @param csm UCaseMap service object.
82 * @return locale ID used by this service object; see ucasemap_open() and ucasemap_setLocale().
83 * @stable ICU 3.4
84 */
85 U_STABLE const char * U_EXPORT2
86 ucasemap_getLocale(const UCaseMap *csm);
87
88 /**
89 * Get the options bit set that is used for case folding, string comparisons and titlecasing.
90 * @param csm UCaseMap service object.
91 * @return options bit set used by this service object; see ucasemap_open() and ucasemap_setOptions().
92 * @stable ICU 3.4
93 */
94 U_STABLE uint32_t U_EXPORT2
95 ucasemap_getOptions(const UCaseMap *csm);
96
97 /**
98 * Set the locale ID that is used for language-dependent case mappings.
99 *
100 * @param csm UCaseMap service object.
101 * @param locale Locale ID, with the same semantics as the locale parameter of ucasemap_open().
102 * @param pErrorCode Must be a valid pointer to an error code value,
103 * which must not indicate a failure before the function call.
104 *
105 * @see ucasemap_open
106 * @stable ICU 3.4
107 */
108 U_STABLE void U_EXPORT2
109 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);
110
111 /**
112 * Set the options bit set that is used for case folding, string comparisons and titlecasing.
113 *
114 * @param csm UCaseMap service object.
115 * @param options Options bit set, with the same semantics as the options parameter of ucasemap_open().
116 * @param pErrorCode Must be a valid pointer to an error code value,
117 * which must not indicate a failure before the function call.
118 *
119 * @see ucasemap_open
120 * @stable ICU 3.4
121 */
122 U_STABLE void U_EXPORT2
123 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
124
125 /**
126 * Do not lowercase non-initial parts of words when titlecasing.
127 * Option bit for titlecasing APIs that take an options bit set.
128 *
129 * By default, titlecasing will titlecase the first cased character
130 * of a word and lowercase all other characters.
131 * With this option, the other characters are left unmodified.
132 *
133 * @see ucasemap_setOptions
134 * @see ucasemap_toTitle
135 * @see ucasemap_utf8ToTitle
136 * @see UnicodeString::toTitle
137 * @stable ICU 4.0
138 */
139 #define U_TITLECASE_NO_LOWERCASE 0x100
140
141 /**
142 * Do not adjust the titlecasing indexes obtained from BreakIterator::next();
143 * titlecase exactly the characters at breaks from the iterator.
144 * Option bit for titlecasing APIs that take an options bit set.
145 *
146 * By default, titlecasing will take each break iterator index,
147 * adjust it by looking for the next cased character, and titlecase that one.
148 * Other characters are lowercased.
149 *
150 * The default behavior follows Unicode 4 & 5 section 3.13 Default Case Operations:
151 *
152 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
153 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
154 * cased character F. If F exists, map F to default_title(F); then map each
155 * subsequent character C to default_lower(C).
156 *
157 * @see ucasemap_setOptions
158 * @see ucasemap_toTitle
159 * @see ucasemap_utf8ToTitle
160 * @see UnicodeString::toTitle
161 * @see U_TITLECASE_NO_LOWERCASE
162 * @stable ICU 4.0
163 */
164 #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
165
166 #if !UCONFIG_NO_BREAK_ITERATION
167
168 /**
169 * Get the break iterator that is used for titlecasing.
170 * Do not modify the returned break iterator.
171 * @param csm UCaseMap service object.
172 * @return titlecasing break iterator; see ucasemap_setBreakIterator()
173 * @stable ICU 4.0
174 */
175 U_STABLE const UBreakIterator * U_EXPORT2
176 ucasemap_getBreakIterator(const UCaseMap *csm);
177
178 /**
179 * Set the break iterator that is used for titlecasing.
180 * The UCaseMap service object releases a previously set break iterator
181 * and "adopts" this new one, taking ownership of it.
182 * The adopted iterator will be released in a subsequent call to
183 * ucasemap_setBreakIterator() or ucasemap_close().
184 *
185 * Break iterator operations are not thread-safe. Therefore, titlecasing
186 * functions use non-const UCaseMap objects. It is not possible to titlecase
187 * strings concurrently using the same UCaseMap.
188 *
189 * @param csm UCaseMap service object.
190 * @param iterToAdopt Break iterator to be adopted for titlecasing.
191 * @param pErrorCode Must be a valid pointer to an error code value,
192 * which must not indicate a failure before the function call.
193 *
194 * @see ucasemap_toTitle
195 * @see ucasemap_utf8ToTitle
196 * @stable ICU 4.0
197 */
198 U_STABLE void U_EXPORT2
199 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);
200
201 /**
202 * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
203 * except that it takes ucasemap_setOptions() into account and has performance
204 * advantages from being able to use a UCaseMap object for multiple case mapping
205 * operations, saving setup time.
206 *
207 * Casing is locale-dependent and context-sensitive.
208 * Titlecasing uses a break iterator to find the first characters of words
209 * that are to be titlecased. It titlecases those characters and lowercases
210 * all others. (This can be modified with ucasemap_setOptions().)
211 *
212 * The titlecase break iterator can be provided to customize for arbitrary
213 * styles, using rules and dictionaries beyond the standard iterators.
214 * It may be more efficient to always provide an iterator to avoid
215 * opening and closing one for each string.
216 * The standard titlecase iterator for the root locale implements the
217 * algorithm of Unicode TR 21.
218 *
219 * This function uses only the setText(), first() and next() methods of the
220 * provided break iterator.
221 *
222 * The result may be longer or shorter than the original.
223 * The source string and the destination buffer must not overlap.
224 *
225 * @param csm UCaseMap service object.
226 * @param dest A buffer for the result string. The result will be NUL-terminated if
227 * the buffer is large enough.
228 * The contents are undefined in case of failure.
229 * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
230 * dest may be NULL and the function will only return the length of the result
231 * without writing any of the result string.
232 * @param src The original string.
233 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
234 * @param pErrorCode Must be a valid pointer to an error code value,
235 * which must not indicate a failure before the function call.
236 * @return The length of the result string, if successful or in case of a buffer overflow;
237 * in the overflow case it will be greater than destCapacity.
238 *
239 * @see u_strToTitle
240 * @stable ICU 4.0
241 */
242 U_STABLE int32_t U_EXPORT2
243 ucasemap_toTitle(UCaseMap *csm,
244 UChar *dest, int32_t destCapacity,
245 const UChar *src, int32_t srcLength,
246 UErrorCode *pErrorCode);
247
248 #endif
249
250 /**
251 * Lowercase the characters in a UTF-8 string.
252 * Casing is locale-dependent and context-sensitive.
253 * The result may be longer or shorter than the original.
254 * The source string and the destination buffer must not overlap.
255 *
256 * @param csm UCaseMap service object.
257 * @param dest A buffer for the result string. The result will be NUL-terminated if
258 * the buffer is large enough.
259 * The contents are undefined in case of failure.
260 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
261 * dest may be NULL and the function will only return the length of the result
262 * without writing any of the result string.
263 * @param src The original string.
264 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
265 * @param pErrorCode Must be a valid pointer to an error code value,
266 * which must not indicate a failure before the function call.
267 * @return The length of the result string, if successful or in case of a buffer overflow;
268 * in the overflow case it will be greater than destCapacity.
269 *
270 * @see u_strToLower
271 * @stable ICU 3.4
272 */
273 U_STABLE int32_t U_EXPORT2
274 ucasemap_utf8ToLower(const UCaseMap *csm,
275 char *dest, int32_t destCapacity,
276 const char *src, int32_t srcLength,
277 UErrorCode *pErrorCode);
278
279 /**
280 * Uppercase the characters in a UTF-8 string.
281 * Casing is locale-dependent and context-sensitive.
282 * The result may be longer or shorter than the original.
283 * The source string and the destination buffer must not overlap.
284 *
285 * @param csm UCaseMap service object.
286 * @param dest A buffer for the result string. The result will be NUL-terminated if
287 * the buffer is large enough.
288 * The contents are undefined in case of failure.
289 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
290 * dest may be NULL and the function will only return the length of the result
291 * without writing any of the result string.
292 * @param src The original string.
293 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
294 * @param pErrorCode Must be a valid pointer to an error code value,
295 * which must not indicate a failure before the function call.
296 * @return The length of the result string, if successful or in case of a buffer overflow;
297 * in the overflow case it will be greater than destCapacity.
298 *
299 * @see u_strToUpper
300 * @stable ICU 3.4
301 */
302 U_STABLE int32_t U_EXPORT2
303 ucasemap_utf8ToUpper(const UCaseMap *csm,
304 char *dest, int32_t destCapacity,
305 const char *src, int32_t srcLength,
306 UErrorCode *pErrorCode);
307
308 #if !UCONFIG_NO_BREAK_ITERATION
309
310 /**
311 * Titlecase a UTF-8 string.
312 * Casing is locale-dependent and context-sensitive.
313 * Titlecasing uses a break iterator to find the first characters of words
314 * that are to be titlecased. It titlecases those characters and lowercases
315 * all others. (This can be modified with ucasemap_setOptions().)
316 *
317 * The titlecase break iterator can be provided to customize for arbitrary
318 * styles, using rules and dictionaries beyond the standard iterators.
319 * It may be more efficient to always provide an iterator to avoid
320 * opening and closing one for each string.
321 * The standard titlecase iterator for the root locale implements the
322 * algorithm of Unicode TR 21.
323 *
324 * This function uses only the setText(), first() and next() methods of the
325 * provided break iterator.
326 *
327 * The result may be longer or shorter than the original.
328 * The source string and the destination buffer must not overlap.
329 *
330 * @param csm UCaseMap service object.
331 * @param dest A buffer for the result string. The result will be NUL-terminated if
332 * the buffer is large enough.
333 * The contents are undefined in case of failure.
334 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
335 * dest may be NULL and the function will only return the length of the result
336 * without writing any of the result string.
337 * @param src The original string.
338 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
339 * @param pErrorCode Must be a valid pointer to an error code value,
340 * which must not indicate a failure before the function call.
341 * @return The length of the result string, if successful or in case of a buffer overflow;
342 * in the overflow case it will be greater than destCapacity.
343 *
344 * @see u_strToTitle
345 * @see U_TITLECASE_NO_LOWERCASE
346 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
347 * @stable ICU 4.0
348 */
349 U_STABLE int32_t U_EXPORT2
350 ucasemap_utf8ToTitle(UCaseMap *csm,
351 char *dest, int32_t destCapacity,
352 const char *src, int32_t srcLength,
353 UErrorCode *pErrorCode);
354
355 #endif
356
357 /**
358 * Case-fold the characters in a UTF-8 string.
359 * Case-folding is locale-independent and not context-sensitive,
360 * but there is an option for whether to include or exclude mappings for dotted I
361 * and dotless i that are marked with 'I' in CaseFolding.txt.
362 * The result may be longer or shorter than the original.
363 * The source string and the destination buffer must not overlap.
364 *
365 * @param csm UCaseMap service object.
366 * @param dest A buffer for the result string. The result will be NUL-terminated if
367 * the buffer is large enough.
368 * The contents are undefined in case of failure.
369 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
370 * dest may be NULL and the function will only return the length of the result
371 * without writing any of the result string.
372 * @param src The original string.
373 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
374 * @param pErrorCode Must be a valid pointer to an error code value,
375 * which must not indicate a failure before the function call.
376 * @return The length of the result string, if successful or in case of a buffer overflow;
377 * in the overflow case it will be greater than destCapacity.
378 *
379 * @see u_strFoldCase
380 * @see ucasemap_setOptions
381 * @see U_FOLD_CASE_DEFAULT
382 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
383 * @stable ICU 4.0
384 */
385 U_STABLE int32_t U_EXPORT2
386 ucasemap_utf8FoldCase(const UCaseMap *csm,
387 char *dest, int32_t destCapacity,
388 const char *src, int32_t srcLength,
389 UErrorCode *pErrorCode);
390
391 #endif