]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
374ca955 A |
3 | /* |
4 | ******************************************************************************* | |
5 | * | |
4388f060 | 6 | * Copyright (C) 2004-2012, International Business Machines |
374ca955 A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: ucase.h | |
f3c0d7a5 | 11 | * encoding: UTF-8 |
374ca955 A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2004aug30 | |
16 | * created by: Markus W. Scherer | |
17 | * | |
18 | * Low-level Unicode character/string case mapping code. | |
19 | */ | |
20 | ||
21 | #ifndef __UCASE_H__ | |
22 | #define __UCASE_H__ | |
23 | ||
24 | #include "unicode/utypes.h" | |
25 | #include "unicode/uset.h" | |
4388f060 | 26 | #include "putilimp.h" |
374ca955 A |
27 | #include "uset_imp.h" |
28 | #include "udataswp.h" | |
0f5d89e8 | 29 | #include "utrie2.h" |
374ca955 | 30 | |
4388f060 A |
31 | #ifdef __cplusplus |
32 | U_NAMESPACE_BEGIN | |
33 | ||
34 | class UnicodeString; | |
35 | ||
36 | U_NAMESPACE_END | |
37 | #endif | |
374ca955 A |
38 | |
39 | /* library API -------------------------------------------------------------- */ | |
40 | ||
46f4442e | 41 | U_CFUNC void U_EXPORT2 |
f3c0d7a5 | 42 | ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
73c04bcf A |
43 | |
44 | /** | |
45 | * Requires non-NULL locale ID but otherwise does the equivalent of | |
46 | * checking for language codes as if uloc_getLanguage() were called: | |
47 | * Accepts both 2- and 3-letter codes and accepts case variants. | |
48 | */ | |
49 | U_CFUNC int32_t | |
f3c0d7a5 | 50 | ucase_getCaseLocale(const char *locale); |
374ca955 | 51 | |
46f4442e A |
52 | /* Casing locale types for ucase_getCaseLocale */ |
53 | enum { | |
54 | UCASE_LOC_UNKNOWN, | |
55 | UCASE_LOC_ROOT, | |
56 | UCASE_LOC_TURKISH, | |
57 | UCASE_LOC_LITHUANIAN, | |
f3c0d7a5 | 58 | UCASE_LOC_GREEK, |
46f4442e A |
59 | UCASE_LOC_DUTCH |
60 | }; | |
61 | ||
374ca955 A |
62 | /** |
63 | * Bit mask for getting just the options from a string compare options word | |
64 | * that are relevant for case-insensitive string comparison. | |
0f5d89e8 | 65 | * See stringoptions.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER. |
374ca955 A |
66 | * @internal |
67 | */ | |
68 | #define _STRCASECMP_OPTIONS_MASK 0xffff | |
69 | ||
70 | /** | |
71 | * Bit mask for getting just the options from a string compare options word | |
72 | * that are relevant for case folding (of a single string or code point). | |
0f5d89e8 A |
73 | * |
74 | * The mask covers bits 0..2; currently only bit 0 is used, for U_FOLD_CASE_EXCLUDE_SPECIAL_I. | |
75 | * It is conceivable that at some point we might use one more bit for using uppercase sharp s. | |
76 | * It is conceivable that at some point we might want the option to use only simple case foldings | |
77 | * when operating on strings. | |
78 | * | |
79 | * See stringoptions.h. | |
374ca955 A |
80 | * @internal |
81 | */ | |
0f5d89e8 | 82 | #define _FOLD_CASE_OPTIONS_MASK 7 |
374ca955 A |
83 | |
84 | /* single-code point functions */ | |
85 | ||
86 | U_CAPI UChar32 U_EXPORT2 | |
f3c0d7a5 | 87 | ucase_tolower(UChar32 c); |
374ca955 A |
88 | |
89 | U_CAPI UChar32 U_EXPORT2 | |
f3c0d7a5 | 90 | ucase_toupper(UChar32 c); |
374ca955 A |
91 | |
92 | U_CAPI UChar32 U_EXPORT2 | |
f3c0d7a5 | 93 | ucase_totitle(UChar32 c); |
374ca955 A |
94 | |
95 | U_CAPI UChar32 U_EXPORT2 | |
f3c0d7a5 | 96 | ucase_fold(UChar32 c, uint32_t options); |
73c04bcf A |
97 | |
98 | /** | |
99 | * Adds all simple case mappings and the full case folding for c to sa, | |
100 | * and also adds special case closure mappings. | |
101 | * c itself is not added. | |
102 | * For example, the mappings | |
103 | * - for s include long s | |
104 | * - for sharp s include ss | |
105 | * - for k include the Kelvin sign | |
106 | */ | |
46f4442e | 107 | U_CFUNC void U_EXPORT2 |
f3c0d7a5 | 108 | ucase_addCaseClosure(UChar32 c, const USetAdder *sa); |
73c04bcf A |
109 | |
110 | /** | |
111 | * Maps the string to single code points and adds the associated case closure | |
112 | * mappings. | |
113 | * The string is mapped to code points if it is their full case folding string. | |
114 | * In other words, this performs a reverse full case folding and then | |
115 | * adds the case closure items of the resulting code points. | |
116 | * If the string is found and its closure applied, then | |
117 | * the string itself is added as well as part of its code points' closure. | |
118 | * The length argument must be >=0. | |
119 | * | |
120 | * @return TRUE if the string was found | |
121 | */ | |
46f4442e | 122 | U_CFUNC UBool U_EXPORT2 |
f3c0d7a5 | 123 | ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa); |
374ca955 | 124 | |
4388f060 A |
125 | #ifdef __cplusplus |
126 | U_NAMESPACE_BEGIN | |
127 | ||
128 | /** | |
129 | * Iterator over characters with more than one code point in the full default Case_Folding. | |
130 | */ | |
131 | class U_COMMON_API FullCaseFoldingIterator { | |
132 | public: | |
133 | /** Constructor. */ | |
134 | FullCaseFoldingIterator(); | |
135 | /** | |
136 | * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding. | |
137 | * Returns a negative cp value at the end of the iteration. | |
138 | */ | |
139 | UChar32 next(UnicodeString &full); | |
140 | private: | |
141 | FullCaseFoldingIterator(const FullCaseFoldingIterator &); // no copy | |
142 | FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &); // no assignment | |
143 | ||
144 | const UChar *unfold; | |
145 | int32_t unfoldRows; | |
146 | int32_t unfoldRowWidth; | |
147 | int32_t unfoldStringWidth; | |
148 | int32_t currentRow; | |
149 | int32_t rowCpIndex; | |
150 | }; | |
151 | ||
0f5d89e8 A |
152 | /** |
153 | * Fast case mapping data for ASCII/Latin. | |
154 | * Linear arrays of delta bytes: 0=no mapping; EXC=exception. | |
155 | * Deltas must not cross the ASCII boundary, or else they cannot be easily used | |
156 | * in simple UTF-8 code. | |
157 | */ | |
158 | namespace LatinCase { | |
159 | ||
160 | /** Case mapping/folding data for code points up to U+017F. */ | |
161 | constexpr UChar LIMIT = 0x180; | |
162 | /** U+017F case-folds and uppercases crossing the ASCII boundary. */ | |
163 | constexpr UChar LONG_S = 0x17f; | |
164 | /** Exception: Complex mapping, or too-large delta. */ | |
165 | constexpr int8_t EXC = -0x80; | |
166 | ||
167 | /** Deltas for lowercasing for most locales, and default case folding. */ | |
168 | extern const int8_t TO_LOWER_NORMAL[LIMIT]; | |
169 | /** Deltas for lowercasing for tr/az/lt, and Turkic case folding. */ | |
170 | extern const int8_t TO_LOWER_TR_LT[LIMIT]; | |
171 | ||
172 | /** Deltas for uppercasing for most locales. */ | |
173 | extern const int8_t TO_UPPER_NORMAL[LIMIT]; | |
174 | /** Deltas for uppercasing for tr/az. */ | |
175 | extern const int8_t TO_UPPER_TR[LIMIT]; | |
176 | ||
177 | } // namespace LatinCase | |
178 | ||
4388f060 A |
179 | U_NAMESPACE_END |
180 | #endif | |
181 | ||
374ca955 A |
182 | /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ |
183 | U_CAPI int32_t U_EXPORT2 | |
f3c0d7a5 | 184 | ucase_getType(UChar32 c); |
374ca955 | 185 | |
f3c0d7a5 | 186 | /** @return like ucase_getType() but also sets UCASE_IGNORABLE if c is case-ignorable */ |
374ca955 | 187 | U_CAPI int32_t U_EXPORT2 |
f3c0d7a5 | 188 | ucase_getTypeOrIgnorable(UChar32 c); |
374ca955 A |
189 | |
190 | U_CAPI UBool U_EXPORT2 | |
f3c0d7a5 | 191 | ucase_isSoftDotted(UChar32 c); |
374ca955 A |
192 | |
193 | U_CAPI UBool U_EXPORT2 | |
f3c0d7a5 | 194 | ucase_isCaseSensitive(UChar32 c); |
374ca955 A |
195 | |
196 | /* string case mapping functions */ | |
197 | ||
4388f060 A |
198 | U_CDECL_BEGIN |
199 | ||
374ca955 A |
200 | /** |
201 | * Iterator function for string case mappings, which need to look at the | |
202 | * context (surrounding text) of a given character for conditional mappings. | |
203 | * | |
204 | * The iterator only needs to go backward or forward away from the | |
205 | * character in question. It does not use any indexes on this interface. | |
206 | * It does not support random access or an arbitrary change of | |
207 | * iteration direction. | |
208 | * | |
73c04bcf A |
209 | * The code point being case-mapped itself is never returned by |
210 | * this iterator. | |
374ca955 A |
211 | * |
212 | * @param context A pointer to the iterator's working data. | |
213 | * @param dir If <0 then start iterating backward from the character; | |
214 | * if >0 then start iterating forward from the character; | |
215 | * if 0 then continue iterating in the current direction. | |
216 | * @return Next code point, or <0 when the iteration is done. | |
217 | */ | |
218 | typedef UChar32 U_CALLCONV | |
219 | UCaseContextIterator(void *context, int8_t dir); | |
220 | ||
221 | /** | |
222 | * Sample struct which may be used by some implementations of | |
223 | * UCaseContextIterator. | |
224 | */ | |
225 | struct UCaseContext { | |
226 | void *p; | |
227 | int32_t start, index, limit; | |
228 | int32_t cpStart, cpLimit; | |
229 | int8_t dir; | |
230 | int8_t b1, b2, b3; | |
231 | }; | |
232 | typedef struct UCaseContext UCaseContext; | |
233 | ||
4388f060 A |
234 | U_CDECL_END |
235 | ||
236 | #define UCASECONTEXT_INITIALIZER { NULL, 0, 0, 0, 0, 0, 0, 0, 0, 0 } | |
237 | ||
374ca955 A |
238 | enum { |
239 | /** | |
240 | * For string case mappings, a single character (a code point) is mapped | |
241 | * either to itself (in which case in-place mapping functions do nothing), | |
242 | * or to another single code point, or to a string. | |
243 | * Aside from the string contents, these are indicated with a single int32_t | |
244 | * value as follows: | |
245 | * | |
246 | * Mapping to self: Negative values (~self instead of -self to support U+0000) | |
247 | * | |
248 | * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH | |
249 | * | |
250 | * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is | |
251 | * returned. Note that the string result may indeed have zero length. | |
252 | */ | |
253 | UCASE_MAX_STRING_LENGTH=0x1f | |
254 | }; | |
255 | ||
256 | /** | |
257 | * Get the full lowercase mapping for c. | |
258 | * | |
259 | * ucase_toFullUpper() and ucase_toFullTitle() take the same parameter list. | |
260 | * @param c Character to be mapped. | |
261 | * @param iter Character iterator, used for context-sensitive mappings. | |
262 | * See UCaseContextIterator for details. | |
263 | * If iter==NULL then a context-independent result is returned. | |
264 | * @param context Pointer to be passed into iter. | |
265 | * @param pString If the mapping result is a string, then the pointer is | |
266 | * written to *pString. | |
f3c0d7a5 | 267 | * @param caseLocale Case locale value from ucase_getCaseLocale(). |
374ca955 A |
268 | * @return ~c if c maps to itself, else the output code point or string length; see UCASE_MAX_STRING_LENGTH. |
269 | * | |
270 | * @see UCaseContextIterator | |
271 | * @see UCASE_MAX_STRING_LENGTH | |
272 | * @internal | |
273 | */ | |
274 | U_CAPI int32_t U_EXPORT2 | |
f3c0d7a5 | 275 | ucase_toFullLower(UChar32 c, |
374ca955 A |
276 | UCaseContextIterator *iter, void *context, |
277 | const UChar **pString, | |
f3c0d7a5 | 278 | int32_t caseLocale); |
374ca955 A |
279 | |
280 | U_CAPI int32_t U_EXPORT2 | |
f3c0d7a5 | 281 | ucase_toFullUpper(UChar32 c, |
374ca955 A |
282 | UCaseContextIterator *iter, void *context, |
283 | const UChar **pString, | |
f3c0d7a5 | 284 | int32_t caseLocale); |
374ca955 A |
285 | |
286 | U_CAPI int32_t U_EXPORT2 | |
f3c0d7a5 | 287 | ucase_toFullTitle(UChar32 c, |
374ca955 A |
288 | UCaseContextIterator *iter, void *context, |
289 | const UChar **pString, | |
f3c0d7a5 | 290 | int32_t caseLocale); |
374ca955 A |
291 | |
292 | U_CAPI int32_t U_EXPORT2 | |
f3c0d7a5 | 293 | ucase_toFullFolding(UChar32 c, |
374ca955 A |
294 | const UChar **pString, |
295 | uint32_t options); | |
296 | ||
73c04bcf A |
297 | U_CFUNC int32_t U_EXPORT2 |
298 | ucase_hasBinaryProperty(UChar32 c, UProperty which); | |
299 | ||
46f4442e A |
300 | |
301 | U_CDECL_BEGIN | |
302 | ||
303 | /** Function type with the same parameter list and return type as ucase_toFullLower(), ucase_toFullUpper() and ucase_toFullTitle(). | |
304 | * @internal | |
305 | */ | |
306 | typedef int32_t U_CALLCONV | |
f3c0d7a5 | 307 | UCaseMapFull(UChar32 c, |
46f4442e A |
308 | UCaseContextIterator *iter, void *context, |
309 | const UChar **pString, | |
f3c0d7a5 | 310 | int32_t caseLocale); |
46f4442e A |
311 | |
312 | U_CDECL_END | |
313 | ||
374ca955 A |
314 | /* file definitions --------------------------------------------------------- */ |
315 | ||
316 | #define UCASE_DATA_NAME "ucase" | |
317 | #define UCASE_DATA_TYPE "icu" | |
318 | ||
319 | /* format "cAsE" */ | |
320 | #define UCASE_FMT_0 0x63 | |
321 | #define UCASE_FMT_1 0x41 | |
322 | #define UCASE_FMT_2 0x53 | |
323 | #define UCASE_FMT_3 0x45 | |
324 | ||
325 | /* indexes into indexes[] */ | |
326 | enum { | |
327 | UCASE_IX_INDEX_TOP, | |
328 | UCASE_IX_LENGTH, | |
329 | UCASE_IX_TRIE_SIZE, | |
330 | UCASE_IX_EXC_LENGTH, | |
73c04bcf | 331 | UCASE_IX_UNFOLD_LENGTH, |
374ca955 A |
332 | |
333 | UCASE_IX_MAX_FULL_LENGTH=15, | |
334 | UCASE_IX_TOP=16 | |
335 | }; | |
336 | ||
337 | /* definitions for 16-bit case properties word ------------------------------ */ | |
338 | ||
0f5d89e8 A |
339 | U_CFUNC const UTrie2 * U_EXPORT2 |
340 | ucase_getTrie(); | |
341 | ||
374ca955 A |
342 | /* 2-bit constants for types of cased characters */ |
343 | #define UCASE_TYPE_MASK 3 | |
344 | enum { | |
345 | UCASE_NONE, | |
346 | UCASE_LOWER, | |
347 | UCASE_UPPER, | |
348 | UCASE_TITLE | |
349 | }; | |
350 | ||
73c04bcf | 351 | #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK) |
4388f060 | 352 | #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7) /* 7 = UCASE_TYPE_MASK (3) plus UCASE_IGNORABLE (4) */ |
73c04bcf | 353 | |
0f5d89e8 A |
354 | #define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2) /* of the type values, only UCASE_UPPER (2) and UCASE_TITLE (3) have bit 1 set */ | |
355 | ||
4388f060 | 356 | #define UCASE_IGNORABLE 4 |
0f5d89e8 A |
357 | #define UCASE_EXCEPTION 8 |
358 | #define UCASE_SENSITIVE 0x10 | |
359 | ||
360 | #define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) | |
374ca955 | 361 | |
4388f060 | 362 | #define UCASE_DOT_MASK 0x60 |
374ca955 A |
363 | enum { |
364 | UCASE_NO_DOT=0, /* normal characters with cc=0 */ | |
4388f060 A |
365 | UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */ |
366 | UCASE_ABOVE=0x40, /* "above" accents with cc=230 */ | |
367 | UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */ | |
374ca955 A |
368 | }; |
369 | ||
4388f060 A |
370 | /* no exception: bits 15..7 are a 9-bit signed case mapping delta */ |
371 | #define UCASE_DELTA_SHIFT 7 | |
372 | #define UCASE_DELTA_MASK 0xff80 | |
373 | #define UCASE_MAX_DELTA 0xff | |
374ca955 A |
374 | #define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1) |
375 | ||
4388f060 A |
376 | #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC |
377 | # define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT) | |
378 | #else /* no arithmetic right shift: sign-extend the 9-bit delta manually by setting bits 15..9 (0xfe00) when bit 15 is set */ | |
379 | # define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT)) | |
380 | #endif | |
374ca955 | 381 | |
0f5d89e8 A |
382 | /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ |
383 | #define UCASE_EXC_SHIFT 4 | |
384 | #define UCASE_EXC_MASK 0xfff0 | |
4388f060 | 385 | #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) |
374ca955 A |
386 | |
387 | /* definitions for 16-bit main exceptions word ------------------------------ */ | |
388 | ||
389 | /* first 8 bits indicate values in optional slots */ | |
390 | enum { | |
391 | UCASE_EXC_LOWER, | |
392 | UCASE_EXC_FOLD, | |
393 | UCASE_EXC_UPPER, | |
394 | UCASE_EXC_TITLE, | |
0f5d89e8 | 395 | UCASE_EXC_DELTA, |
374ca955 | 396 | UCASE_EXC_5, /* reserved */ |
73c04bcf | 397 | UCASE_EXC_CLOSURE, |
374ca955 A |
398 | UCASE_EXC_FULL_MAPPINGS, |
399 | UCASE_EXC_ALL_SLOTS /* one past the last slot */ | |
400 | }; | |
401 | ||
402 | /* each slot is 2 uint16_t instead of 1 */ | |
403 | #define UCASE_EXC_DOUBLE_SLOTS 0x100 | |
404 | ||
0f5d89e8 A |
405 | enum { |
406 | UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200, | |
407 | UCASE_EXC_DELTA_IS_NEGATIVE=0x400, | |
408 | UCASE_EXC_SENSITIVE=0x800 | |
409 | }; | |
374ca955 A |
410 | |
411 | /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */ | |
4388f060 | 412 | #define UCASE_EXC_DOT_SHIFT 7 |
374ca955 A |
413 | |
414 | /* normally stored in the main word, but pushed out for larger exception indexes */ | |
415 | #define UCASE_EXC_DOT_MASK 0x3000 | |
416 | enum { | |
417 | UCASE_EXC_NO_DOT=0, /* normal characters with cc=0 */ | |
418 | UCASE_EXC_SOFT_DOTTED=0x1000, /* soft-dotted characters with cc=0 */ | |
419 | UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */ | |
420 | UCASE_EXC_OTHER_ACCENT=0x3000 /* other accent character (0<cc!=230) */ | |
421 | }; | |
422 | ||
423 | /* complex/conditional mappings */ | |
424 | #define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000 | |
425 | #define UCASE_EXC_CONDITIONAL_FOLD 0x8000 | |
426 | ||
427 | /* definitions for lengths word for full case mappings */ | |
428 | #define UCASE_FULL_LOWER 0xf | |
429 | #define UCASE_FULL_FOLDING 0xf0 | |
430 | #define UCASE_FULL_UPPER 0xf00 | |
431 | #define UCASE_FULL_TITLE 0xf000 | |
432 | ||
73c04bcf A |
433 | /* maximum lengths */ |
434 | #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf) | |
435 | #define UCASE_CLOSURE_MAX_LENGTH 0xf | |
436 | ||
437 | /* constants for reverse case folding ("unfold") data */ | |
438 | enum { | |
439 | UCASE_UNFOLD_ROWS, | |
440 | UCASE_UNFOLD_ROW_WIDTH, | |
441 | UCASE_UNFOLD_STRING_WIDTH | |
442 | }; | |
443 | ||
374ca955 | 444 | #endif |