]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2004, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: ucase.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004aug30 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Low-level Unicode character/string case mapping code. | |
17 | * Much code moved here (and modified) from uchar.c. | |
18 | */ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | #include "unicode/uset.h" | |
22 | #include "unicode/udata.h" /* UDataInfo */ | |
23 | #include "ucmndata.h" /* DataHeader */ | |
24 | #include "udatamem.h" | |
25 | #include "umutex.h" | |
26 | #include "uassert.h" | |
27 | #include "cmemory.h" | |
28 | #include "utrie.h" | |
29 | #include "ucase.h" | |
30 | #include "ucln_cmn.h" | |
31 | ||
32 | struct UCaseProps { /* runtime handle for one loaded set of case mapping data */ | |
33 | UDataMemory *mem; /* set from udata_openChoice() in ucase_open(); passed to udata_close() by ucase_close() */ | |
34 | const int32_t *indexes; /* start of the data body: the int32_t indexes[] array */ | |
35 | const uint16_t *exceptions; /* uint16_t exceptions[], located directly after the serialized trie */ | |
36 | ||
37 | UTrie trie; /* filled in by utrie_unserialize() in ucase_openData() */ | |
38 | uint8_t formatVersion[4]; /* copied from UDataInfo.formatVersion by isAcceptable() */ | |
39 | }; | |
40 | ||
41 | /* data loading etc. -------------------------------------------------------- */ | |
42 | ||
43 | static UBool U_CALLCONV | |
44 | isAcceptable(void *context, | |
45 | const char *type, const char *name, | |
46 | const UDataInfo *pInfo) { | |
47 | if( | |
48 | pInfo->size>=20 && | |
49 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
50 | pInfo->charsetFamily==U_CHARSET_FAMILY && | |
51 | pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */ | |
52 | pInfo->dataFormat[1]==UCASE_FMT_1 && | |
53 | pInfo->dataFormat[2]==UCASE_FMT_2 && | |
54 | pInfo->dataFormat[3]==UCASE_FMT_3 && | |
55 | pInfo->formatVersion[0]==1 && | |
56 | pInfo->formatVersion[2]==UTRIE_SHIFT && | |
57 | pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT | |
58 | ) { | |
59 | UCaseProps *csp=(UCaseProps *)context; | |
60 | uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4); | |
61 | return TRUE; | |
62 | } else { | |
63 | return FALSE; | |
64 | } | |
65 | } | |
66 | ||
67 | static UCaseProps * | |
68 | ucase_openData(UCaseProps *cspProto, | |
69 | const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) { | |
70 | UCaseProps *csp; | |
71 | int32_t size, trieSize; | |
72 | ||
73 | cspProto->indexes=(const int32_t *)bin; | |
74 | if( cspProto->indexes[UCASE_IX_INDEX_TOP]<16 || | |
75 | (length>=0 && length<cspProto->indexes[UCASE_IX_LENGTH]) | |
76 | ) { | |
77 | *pErrorCode=U_INVALID_FORMAT_ERROR; | |
78 | return NULL; | |
79 | } | |
80 | ||
81 | /* get the trie address, after indexes[] */ | |
82 | size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4; | |
83 | bin+=size; | |
84 | if(length>=0 && (length-=size)<16) { | |
85 | *pErrorCode=U_INVALID_FORMAT_ERROR; | |
86 | return NULL; | |
87 | } | |
88 | ||
89 | /* unserialize the trie */ | |
90 | trieSize=cspProto->indexes[UCASE_IX_TRIE_SIZE]; | |
91 | trieSize=utrie_unserialize(&cspProto->trie, bin, length>=0 ? length : trieSize, pErrorCode); | |
92 | if(U_FAILURE(*pErrorCode)) { | |
93 | return NULL; | |
94 | } | |
95 | ||
96 | /* get exceptions[] */ | |
97 | bin+=trieSize; | |
98 | if(length>=0 && (length-=trieSize)<2*cspProto->indexes[UCASE_IX_EXC_LENGTH]) { | |
99 | *pErrorCode=U_INVALID_FORMAT_ERROR; | |
100 | return NULL; | |
101 | } | |
102 | cspProto->exceptions=(const uint16_t *)bin; | |
103 | ||
104 | /* allocate, copy, and return the new UCaseProps */ | |
105 | csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)); | |
106 | if(csp==NULL) { | |
107 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
108 | return NULL; | |
109 | } else { | |
110 | uprv_memcpy(csp, cspProto, sizeof(UCaseProps)); | |
111 | return csp; | |
112 | } | |
113 | } | |
114 | ||
115 | U_CAPI UCaseProps * U_EXPORT2 | |
116 | ucase_open(UErrorCode *pErrorCode) { | |
117 | UCaseProps cspProto={ NULL }, *csp; | |
118 | ||
119 | cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode); | |
120 | if(U_FAILURE(*pErrorCode)) { | |
121 | return NULL; | |
122 | } | |
123 | ||
124 | csp=ucase_openData( | |
125 | &cspProto, | |
126 | udata_getMemory(cspProto.mem), | |
127 | udata_getLength(cspProto.mem), | |
128 | pErrorCode); | |
129 | if(U_FAILURE(*pErrorCode)) { | |
130 | udata_close(cspProto.mem); | |
131 | return NULL; | |
132 | } else { | |
133 | return csp; | |
134 | } | |
135 | } | |
136 | ||
137 | U_CAPI UCaseProps * U_EXPORT2 | |
138 | ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) { | |
139 | UCaseProps cspProto={ NULL }; | |
140 | const DataHeader *hdr; | |
141 | ||
142 | if(U_FAILURE(*pErrorCode)) { | |
143 | return NULL; | |
144 | } | |
145 | if(bin==NULL) { | |
146 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
147 | return NULL; | |
148 | } | |
149 | ||
150 | /* check the header */ | |
151 | if(length>=0 && length<20) { | |
152 | *pErrorCode=U_INVALID_FORMAT_ERROR; | |
153 | return NULL; | |
154 | } | |
155 | hdr=(const DataHeader *)bin; | |
156 | if( | |
157 | !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 && | |
158 | hdr->info.isBigEndian==U_IS_BIG_ENDIAN && | |
159 | isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info)) | |
160 | ) { | |
161 | *pErrorCode=U_INVALID_FORMAT_ERROR; | |
162 | return NULL; | |
163 | } | |
164 | ||
165 | bin+=hdr->dataHeader.headerSize; | |
166 | if(length>=0) { | |
167 | length-=hdr->dataHeader.headerSize; | |
168 | } | |
169 | return ucase_openData(&cspProto, bin, length, pErrorCode); | |
170 | } | |
171 | ||
172 | U_CAPI void U_EXPORT2 | |
173 | ucase_close(UCaseProps *csp) { | |
174 | if(csp!=NULL) { | |
175 | udata_close(csp->mem); | |
176 | uprv_free(csp); | |
177 | } | |
178 | } | |
179 | ||
180 | /* UCaseProps singleton ----------------------------------------------------- */ | |
181 | ||
182 | static UCaseProps *gCsp=NULL; | |
183 | static UErrorCode gErrorCode=U_ZERO_ERROR; | |
184 | static int8_t gHaveData=0; | |
185 | ||
186 | static UBool U_CALLCONV ucase_cleanup(void) { | |
187 | ucase_close(gCsp); | |
188 | gCsp=NULL; | |
189 | gErrorCode=U_ZERO_ERROR; | |
190 | gHaveData=0; | |
191 | return TRUE; | |
192 | } | |
193 | ||
194 | U_CAPI UCaseProps * U_EXPORT2 | |
195 | ucase_getSingleton(UErrorCode *pErrorCode) { | |
196 | int8_t haveData; | |
197 | ||
198 | if(U_FAILURE(*pErrorCode)) { | |
199 | return NULL; | |
200 | } | |
201 | ||
202 | UMTX_CHECK(NULL, gHaveData, haveData); | |
203 | ||
204 | if(haveData>0) { | |
205 | /* data was loaded */ | |
206 | return gCsp; | |
207 | } else if(haveData<0) { | |
208 | /* data loading failed */ | |
209 | *pErrorCode=gErrorCode; | |
210 | return NULL; | |
211 | } else /* haveData==0 */ { | |
212 | /* load the data */ | |
213 | UCaseProps *csp=ucase_open(pErrorCode); | |
214 | if(U_FAILURE(*pErrorCode)) { | |
215 | gHaveData=-1; | |
216 | gErrorCode=*pErrorCode; | |
217 | return NULL; | |
218 | } | |
219 | ||
220 | /* set the static variables */ | |
221 | umtx_lock(NULL); | |
222 | if(gCsp==NULL) { | |
223 | gCsp=csp; | |
224 | csp=NULL; | |
225 | gHaveData=1; | |
226 | ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup); | |
227 | } | |
228 | umtx_unlock(NULL); | |
229 | ||
230 | ucase_close(csp); | |
231 | return gCsp; | |
232 | } | |
233 | } | |
234 | ||
235 | /* Unicode case mapping data swapping --------------------------------------- */ | |
236 | ||
237 | U_CAPI int32_t U_EXPORT2 | |
238 | ucase_swap(const UDataSwapper *ds, | |
239 | const void *inData, int32_t length, void *outData, | |
240 | UErrorCode *pErrorCode) { | |
241 | const UDataInfo *pInfo; | |
242 | int32_t headerSize; | |
243 | ||
244 | const uint8_t *inBytes; | |
245 | uint8_t *outBytes; | |
246 | ||
247 | const int32_t *inIndexes; | |
248 | int32_t indexes[16]; | |
249 | ||
250 | int32_t i, offset, count, size; | |
251 | ||
252 | /* udata_swapDataHeader checks the arguments */ | |
253 | headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); | |
254 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | |
255 | return 0; | |
256 | } | |
257 | ||
258 | /* check data format and format version */ | |
259 | pInfo=(const UDataInfo *)((const char *)inData+4); | |
260 | if(!( | |
261 | pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */ | |
262 | pInfo->dataFormat[1]==UCASE_FMT_1 && | |
263 | pInfo->dataFormat[2]==UCASE_FMT_2 && | |
264 | pInfo->dataFormat[3]==UCASE_FMT_3 && | |
265 | pInfo->formatVersion[0]==1 && | |
266 | pInfo->formatVersion[2]==UTRIE_SHIFT && | |
267 | pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT | |
268 | )) { | |
269 | udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n", | |
270 | pInfo->dataFormat[0], pInfo->dataFormat[1], | |
271 | pInfo->dataFormat[2], pInfo->dataFormat[3], | |
272 | pInfo->formatVersion[0]); | |
273 | *pErrorCode=U_UNSUPPORTED_ERROR; | |
274 | return 0; | |
275 | } | |
276 | ||
277 | inBytes=(const uint8_t *)inData+headerSize; | |
278 | outBytes=(uint8_t *)outData+headerSize; | |
279 | ||
280 | inIndexes=(const int32_t *)inBytes; | |
281 | ||
282 | if(length>=0) { | |
283 | length-=headerSize; | |
284 | if(length<16*4) { | |
285 | udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n", | |
286 | length); | |
287 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
288 | return 0; | |
289 | } | |
290 | } | |
291 | ||
292 | /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */ | |
293 | for(i=0; i<16; ++i) { | |
294 | indexes[i]=udata_readInt32(ds, inIndexes[i]); | |
295 | } | |
296 | ||
297 | /* get the total length of the data */ | |
298 | size=indexes[UCASE_IX_LENGTH]; | |
299 | ||
300 | if(length>=0) { | |
301 | if(length<size) { | |
302 | udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n", | |
303 | length); | |
304 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
305 | return 0; | |
306 | } | |
307 | ||
308 | /* copy the data for inaccessible bytes */ | |
309 | if(inBytes!=outBytes) { | |
310 | uprv_memcpy(outBytes, inBytes, size); | |
311 | } | |
312 | ||
313 | offset=0; | |
314 | ||
315 | /* swap the int32_t indexes[] */ | |
316 | count=indexes[UCASE_IX_INDEX_TOP]*4; | |
317 | ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); | |
318 | offset+=count; | |
319 | ||
320 | /* swap the UTrie */ | |
321 | count=indexes[UCASE_IX_TRIE_SIZE]; | |
322 | utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); | |
323 | offset+=count; | |
324 | ||
325 | /* swap the uint16_t exceptions[] */ | |
326 | count=indexes[UCASE_IX_EXC_LENGTH]*2; | |
327 | ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); | |
328 | offset+=count; | |
329 | ||
330 | U_ASSERT(offset==size); | |
331 | } | |
332 | ||
333 | return headerSize+size; | |
334 | } | |
335 | ||
336 | /* set of property starts for UnicodeSet ------------------------------------ */ | |
337 | ||
338 | static UBool U_CALLCONV | |
339 | _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) { | |
340 | /* add the start code point to the USet */ | |
341 | USetAdder *sa=(USetAdder *)context; | |
342 | sa->add(sa->set, start); | |
343 | return TRUE; | |
344 | } | |
345 | ||
346 | U_CAPI void U_EXPORT2 | |
347 | ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode) { | |
348 | if(U_FAILURE(*pErrorCode)) { | |
349 | return; | |
350 | } | |
351 | ||
352 | /* add the start code point of each same-value range of the trie */ | |
353 | utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa); | |
354 | ||
355 | /* add code points with hardcoded properties, plus the ones following them */ | |
356 | ||
357 | /* (none right now, see comment below) */ | |
358 | ||
359 | /* | |
360 | * Omit code points with hardcoded specialcasing properties | |
361 | * because we do not build property UnicodeSets for them right now. | |
362 | */ | |
363 | } | |
364 | ||
365 | /* data access primitives --------------------------------------------------- */ | |
366 | ||
/*
 * Fetch the 16-bit properties word for code point c from csp's trie into result.
 * UTRIE_GET16() itself validates c.
 * The macro body deliberately does not end with a semicolon:
 * the caller's own ';' terminates the statement.
 */
#define GET_PROPS(csp, c, result) \
    UTRIE_GET16(&(csp)->trie, c, result)

/* case type bits of a properties word */
#define GET_CASE_TYPE(props) ((props)&UCASE_TYPE_MASK)
/* signed delta stored in a properties word: reinterpret as int16_t first so that the shift keeps the sign */
#define GET_SIGNED_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
/* pointer to the exception data that a properties word with the exception bit refers to */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* non-zero if the properties word refers to exception data instead of holding a delta */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
376 | ||
/*
 * Number of bits set in an 8-bit integer value.
 * Indexed with the flag bits below a slot index, this gives the number of
 * optional slots that are stored before that slot.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/* non-zero if the optional slot with this index is present */
#define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
/* number of present slots that precede the slot with this index */
#define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
 *
 * Expands to a single statement (do { } while(0)), so that it can be used
 * anywhere a statement is allowed, including as an unbraced if/else body.
 * The invocation must be followed by a semicolon.
 *
 * @param excWord (in) initial exceptions word
 * @param index (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, index, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            /* single slots: one uint16_t per value */ \
            (pExc16)+=SLOT_OFFSET(excWord, index); \
            (value)=*(pExc16); \
        } else { \
            /* double slots: two uint16_t per value, high half first */ \
            (pExc16)+=2*SLOT_OFFSET(excWord, index); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
418 | ||
419 | /* simple case mappings ----------------------------------------------------- */ | |
420 | ||
421 | U_CAPI UChar32 U_EXPORT2 | |
422 | ucase_tolower(const UCaseProps *csp, UChar32 c) { | |
423 | uint16_t props; | |
424 | GET_PROPS(csp, c, props); | |
425 | if(!PROPS_HAS_EXCEPTION(props)) { | |
426 | if(GET_CASE_TYPE(props)>=UCASE_UPPER) { | |
427 | c+=GET_SIGNED_DELTA(props); | |
428 | } | |
429 | } else { | |
430 | const uint16_t *pe=GET_EXCEPTIONS(csp, props); | |
431 | uint16_t excWord=*pe++; | |
432 | if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { | |
433 | GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); | |
434 | } | |
435 | } | |
436 | return c; | |
437 | } | |
438 | ||
439 | U_CAPI UChar32 U_EXPORT2 | |
440 | ucase_toupper(const UCaseProps *csp, UChar32 c) { | |
441 | uint16_t props; | |
442 | GET_PROPS(csp, c, props); | |
443 | if(!PROPS_HAS_EXCEPTION(props)) { | |
444 | if(GET_CASE_TYPE(props)==UCASE_LOWER) { | |
445 | c+=GET_SIGNED_DELTA(props); | |
446 | } | |
447 | } else { | |
448 | const uint16_t *pe=GET_EXCEPTIONS(csp, props); | |
449 | uint16_t excWord=*pe++; | |
450 | if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { | |
451 | GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); | |
452 | } | |
453 | } | |
454 | return c; | |
455 | } | |
456 | ||
457 | U_CAPI UChar32 U_EXPORT2 | |
458 | ucase_totitle(const UCaseProps *csp, UChar32 c) { | |
459 | uint16_t props; | |
460 | GET_PROPS(csp, c, props); | |
461 | if(!PROPS_HAS_EXCEPTION(props)) { | |
462 | if(GET_CASE_TYPE(props)==UCASE_LOWER) { | |
463 | c+=GET_SIGNED_DELTA(props); | |
464 | } | |
465 | } else { | |
466 | const uint16_t *pe=GET_EXCEPTIONS(csp, props); | |
467 | uint16_t excWord=*pe++; | |
468 | int32_t index; | |
469 | if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { | |
470 | index=UCASE_EXC_TITLE; | |
471 | } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { | |
472 | index=UCASE_EXC_UPPER; | |
473 | } else { | |
474 | return c; | |
475 | } | |
476 | GET_SLOT_VALUE(excWord, index, pe, c); | |
477 | } | |
478 | return c; | |
479 | } | |
480 | ||
481 | /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ | |
482 | U_CAPI int32_t U_EXPORT2 | |
483 | ucase_getType(const UCaseProps *csp, UChar32 c) { | |
484 | uint16_t props; | |
485 | GET_PROPS(csp, c, props); | |
486 | return GET_CASE_TYPE(props); | |
487 | } | |
488 | ||
489 | /** @return same as ucase_getType(), or <0 if c is case-ignorable */ | |
490 | U_CAPI int32_t U_EXPORT2 | |
491 | ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) { | |
492 | int32_t type; | |
493 | uint16_t props; | |
494 | GET_PROPS(csp, c, props); | |
495 | type=GET_CASE_TYPE(props); | |
496 | if(type!=UCASE_NONE) { | |
497 | return type; | |
498 | } else if( | |
499 | c==0x307 || | |
500 | (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE | |
501 | ) { | |
502 | return -1; /* case-ignorable */ | |
503 | } else { | |
504 | return 0; /* c is neither cased nor case-ignorable */ | |
505 | } | |
506 | } | |
507 | ||
508 | /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ | |
509 | static U_INLINE int32_t | |
510 | getDotType(const UCaseProps *csp, UChar32 c) { | |
511 | uint16_t props; | |
512 | GET_PROPS(csp, c, props); | |
513 | if(!PROPS_HAS_EXCEPTION(props)) { | |
514 | return props&UCASE_DOT_MASK; | |
515 | } else { | |
516 | const uint16_t *pe=GET_EXCEPTIONS(csp, props); | |
517 | return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; | |
518 | } | |
519 | } | |
520 | ||
521 | U_CAPI UBool U_EXPORT2 | |
522 | ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) { | |
523 | return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED); | |
524 | } | |
525 | ||
526 | U_CAPI UBool U_EXPORT2 | |
527 | ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) { | |
528 | uint16_t props; | |
529 | GET_PROPS(csp, c, props); | |
530 | return (UBool)((props&UCASE_SENSITIVE)!=0); | |
531 | } | |
532 | ||
533 | /* public API (see uchar.h) ------------------------------------------------- */ | |
534 | ||
535 | U_CAPI UBool U_EXPORT2 | |
536 | u_isULowercase(UChar32 c) { | |
537 | UErrorCode errorCode=U_ZERO_ERROR; | |
538 | UCaseProps *csp=ucase_getSingleton(&errorCode); | |
539 | return (UBool)(csp!=NULL && UCASE_LOWER==ucase_getType(csp, c)); | |
540 | } | |
541 | ||
542 | U_CAPI UBool U_EXPORT2 | |
543 | u_isUUppercase(UChar32 c) { | |
544 | UErrorCode errorCode=U_ZERO_ERROR; | |
545 | UCaseProps *csp=ucase_getSingleton(&errorCode); | |
546 | return (UBool)(csp!=NULL && UCASE_UPPER==ucase_getType(csp, c)); | |
547 | } | |
548 | ||
549 | /* Transforms the Unicode character to its lower case equivalent.*/ | |
550 | U_CAPI UChar32 U_EXPORT2 | |
551 | u_tolower(UChar32 c) { | |
552 | UErrorCode errorCode=U_ZERO_ERROR; | |
553 | UCaseProps *csp=ucase_getSingleton(&errorCode); | |
554 | if(csp!=NULL) { | |
555 | return ucase_tolower(csp, c); | |
556 | } else { | |
557 | return c; | |
558 | } | |
559 | } | |
560 | ||
561 | /* Transforms the Unicode character to its upper case equivalent.*/ | |
562 | U_CAPI UChar32 U_EXPORT2 | |
563 | u_toupper(UChar32 c) { | |
564 | UErrorCode errorCode=U_ZERO_ERROR; | |
565 | UCaseProps *csp=ucase_getSingleton(&errorCode); | |
566 | if(csp!=NULL) { | |
567 | return ucase_toupper(csp, c); | |
568 | } else { | |
569 | return c; | |
570 | } | |
571 | } | |
572 | ||
573 | /* Transforms the Unicode character to its title case equivalent.*/ | |
574 | U_CAPI UChar32 U_EXPORT2 | |
575 | u_totitle(UChar32 c) { | |
576 | UErrorCode errorCode=U_ZERO_ERROR; | |
577 | UCaseProps *csp=ucase_getSingleton(&errorCode); | |
578 | if(csp!=NULL) { | |
579 | return ucase_totitle(csp, c); | |
580 | } else { | |
581 | return c; | |
582 | } | |
583 | } | |
584 | ||
585 | /* return the simple case folding mapping for c */ | |
586 | U_CAPI UChar32 U_EXPORT2 | |
587 | u_foldCase(UChar32 c, uint32_t options) { | |
588 | UErrorCode errorCode=U_ZERO_ERROR; | |
589 | UCaseProps *csp=ucase_getSingleton(&errorCode); | |
590 | if(csp!=NULL) { | |
591 | return ucase_fold(csp, c, options); | |
592 | } else { | |
593 | return c; | |
594 | } | |
595 | } | |
596 | ||
597 | /* string casing ------------------------------------------------------------ */ | |
598 | ||
599 | /* | |
600 | * These internal functions form the core of string case mappings. | |
601 | * They map single code points to result code points or strings and take | |
602 | * all necessary conditions (context, locale ID, options) into account. | |
603 | * | |
604 | * They do not iterate over the source or write to the destination | |
605 | * so that the same functions are useful for non-standard string storage, | |
606 | * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. | |
607 | * For the same reason, the "surrounding text" context is passed in as a | |
608 | * UCaseContextIterator which does not make any assumptions about | |
609 | * the underlying storage. | |
610 | * | |
611 | * This section contains helper functions that check for conditions | |
612 | * in the input text surrounding the current code point | |
613 | * according to SpecialCasing.txt. | |
614 | * | |
615 | * Each helper function gets the index | |
616 | * - after the current code point if it looks at following text | |
617 | * - before the current code point if it looks at preceding text | |
618 | * | |
619 | * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: | |
620 | * | |
621 | * Final_Sigma | |
622 | * C is preceded by a sequence consisting of | |
623 | * a cased letter and a case-ignorable sequence, | |
624 | * and C is not followed by a sequence consisting of | |
625 | * an ignorable sequence and then a cased letter. | |
626 | * | |
627 | * More_Above | |
628 | * C is followed by one or more characters of combining class 230 (ABOVE) | |
629 | * in the combining character sequence. | |
630 | * | |
631 | * After_Soft_Dotted | |
632 | * The last preceding character with combining class of zero before C | |
633 | * was Soft_Dotted, | |
634 | * and there is no intervening combining character class 230 (ABOVE). | |
635 | * | |
636 | * Before_Dot | |
637 | * C is followed by combining dot above (U+0307). | |
638 | * Any sequence of characters with a combining class that is neither 0 nor 230 | |
639 | * may intervene between the current character and the combining dot above. | |
640 | * | |
641 | * The erratum from 2002-10-31 adds the condition | |
642 | * | |
643 | * After_I | |
644 | * The last preceding base character was an uppercase I, and there is no | |
645 | * intervening combining character class 230 (ABOVE). | |
646 | * | |
647 | * (See Jitterbug 2344 and the comments on After_I below.) | |
648 | * | |
649 | * Helper definitions in Unicode 3.2 UAX 21: | |
650 | * | |
651 | * D1. A character C is defined to be cased | |
652 | * if it meets any of the following criteria: | |
653 | * | |
654 | * - The general category of C is Titlecase Letter (Lt) | |
655 | * - In [CoreProps], C has one of the properties Uppercase, or Lowercase | |
656 | * - Given D = NFD(C), then it is not the case that: | |
657 | * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) | |
658 | * (This third criterion does not add any characters to the list | |
659 | * for Unicode 3.2. Ignored.) | |
660 | * | |
661 | * D2. A character C is defined to be case-ignorable | |
662 | * if it meets either of the following criteria: | |
663 | * | |
664 | * - The general category of C is | |
665 | * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or | |
666 | * Letter Modifier (Lm), or Symbol Modifier (Sk) | |
667 | * - C is one of the following characters | |
668 | * U+0027 APOSTROPHE | |
669 | * U+00AD SOFT HYPHEN (SHY) | |
670 | * U+2019 RIGHT SINGLE QUOTATION MARK | |
671 | * (the preferred character for apostrophe) | |
672 | * | |
673 | * D3. A case-ignorable sequence is a sequence of | |
674 | * zero or more case-ignorable characters. | |
675 | */ | |
676 | ||
/* case-mapping-relevant locale classes; LOC_UNKNOWN marks an empty cache */
enum {
    LOC_UNKNOWN,
    LOC_ROOT,
    LOC_TURKISH,
    LOC_LITHUANIAN
};

/* case-insensitive tests for single ASCII letters of a language code */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? (also true for the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)

/*
 * Classify a locale ID by its language code for case mapping purposes.
 *
 * Requires non-NULL locale ID but otherwise does the equivalent of
 * checking for language codes as if uloc_getLanguage() were called:
 * Accepts both 2- and 3-letter codes and accepts case variants.
 *
 * This function used to use uloc_getLanguage(), but the current code
 * removes the dependency of this low-level code on uloc implementation code
 * and is faster because not the whole locale ID has to be
 * examined and copied/transformed.
 * Because this code does not want to depend on uloc, the caller must
 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
 *
 * @param locale    locale ID; only its language code is examined
 * @param locCache  optional (may be NULL) cache: if it holds a value other
 *                  than LOC_UNKNOWN then that value is returned and locale is
 *                  not examined; otherwise it receives the result
 * @return LOC_TURKISH for tr/tur/az/aze, LOC_LITHUANIAN for lt/lit,
 *         otherwise LOC_ROOT
 */
static int32_t
getCaseLocale(const char *locale, int32_t *locCache) {
    const char *p=locale;
    int32_t caseLocale=LOC_ROOT;
    char ch;

    if(locCache!=NULL && *locCache!=LOC_UNKNOWN) {
        return *locCache;
    }

    ch=*p++;
    switch(ch) {
    case 't':
    case 'T':
        /* tr or tur? */
        ch=*p++;
        if(is_u(ch)) {
            ch=*p++;
        }
        /* the language code must end right after the r */
        if(is_r(ch) && is_sep(*p)) {
            caseLocale=LOC_TURKISH;
        }
        break;
    case 'a':
    case 'A':
        /* az or aze? */
        ch=*p++;
        if(is_z(ch)) {
            ch=*p++;
            if(is_e(ch)) {
                ch=*p;
            }
            /* ch is now the character following az or aze */
            if(is_sep(ch)) {
                caseLocale=LOC_TURKISH;
            }
        }
        break;
    case 'l':
    case 'L':
        /* lt or lit? */
        ch=*p++;
        if(is_i(ch)) {
            ch=*p++;
        }
        /* the language code must end right after the t */
        if(is_t(ch) && is_sep(*p)) {
            caseLocale=LOC_LITHUANIAN;
        }
        break;
    default:
        break;
    }

    if(locCache!=NULL) {
        *locCache=caseLocale;
    }
    return caseLocale;
}
765 | ||
766 | /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */ | |
767 | static UBool | |
768 | isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) { | |
769 | UChar32 c; | |
770 | uint16_t props; | |
771 | ||
772 | if(iter==NULL) { | |
773 | return FALSE; | |
774 | } | |
775 | ||
776 | for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { | |
777 | GET_PROPS(csp, c, props); | |
778 | if(GET_CASE_TYPE(props)!=UCASE_NONE) { | |
779 | return TRUE; /* followed by cased letter */ | |
780 | } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) { | |
781 | /* case-ignorable, continue with the loop */ | |
782 | } else { | |
783 | return FALSE; /* not ignorable */ | |
784 | } | |
785 | } | |
786 | ||
787 | return FALSE; /* not followed by cased letter */ | |
788 | } | |
789 | ||
790 | /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */ | |
791 | static UBool | |
792 | isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { | |
793 | UChar32 c; | |
794 | int32_t dotType; | |
795 | int8_t dir; | |
796 | ||
797 | if(iter==NULL) { | |
798 | return FALSE; | |
799 | } | |
800 | ||
801 | for(dir=-1; (c=iter(context, dir))>=0; dir=0) { | |
802 | dotType=getDotType(csp, c); | |
803 | if(dotType==UCASE_SOFT_DOTTED) { | |
804 | return TRUE; /* preceded by TYPE_i */ | |
805 | } else if(dotType!=UCASE_OTHER_ACCENT) { | |
806 | return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */ | |
807 | } | |
808 | } | |
809 | ||
810 | return FALSE; /* not preceded by TYPE_i */ | |
811 | } | |
812 | ||
813 | /* | |
814 | * See Jitterbug 2344: | |
815 | * The condition After_I for Turkic-lowercasing of U+0307 combining dot above | |
816 | * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because | |
817 | * we made those releases compatible with Unicode 3.2 which had not fixed | |
818 | * a related bug in SpecialCasing.txt. | |
819 | * | |
820 | * From the Jitterbug 2344 text: | |
821 | * ... this bug is listed as a Unicode erratum | |
822 | * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html | |
823 | * <quote> | |
824 | * There are two errors in SpecialCasing.txt. | |
825 | * 1. Missing semicolons on two lines. ... [irrelevant for ICU] | |
826 | * 2. An incorrect context definition. Correct as follows: | |
827 | * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE | |
828 | * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE | |
829 | * --- | |
830 | * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE | |
831 | * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE | |
832 | * where the context After_I is defined as: | |
833 | * The last preceding base character was an uppercase I, and there is no | |
834 | * intervening combining character class 230 (ABOVE). | |
835 | * </quote> | |
836 | * | |
837 | * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: | |
838 | * | |
839 | * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. | |
840 | * # This matches the behavior of the canonically equivalent I-dot_above | |
841 | * | |
842 | * See also the description in this place in older versions of uchar.c (revision 1.100). | |
843 | * | |
844 | * Markus W. Scherer 2003-feb-15 | |
845 | */ | |
846 | ||
847 | /* Is preceded by base character 'I' with no intervening cc=230 ? */ | |
848 | static UBool | |
849 | isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { | |
850 | UChar32 c; | |
851 | int32_t dotType; | |
852 | int8_t dir; | |
853 | ||
854 | if(iter==NULL) { | |
855 | return FALSE; | |
856 | } | |
857 | ||
858 | for(dir=-1; (c=iter(context, dir))>=0; dir=0) { | |
859 | if(c==0x49) { | |
860 | return TRUE; /* preceded by I */ | |
861 | } | |
862 | dotType=getDotType(csp, c); | |
863 | if(dotType!=UCASE_OTHER_ACCENT) { | |
864 | return FALSE; /* preceded by different base character (not I), or intervening cc==230 */ | |
865 | } | |
866 | } | |
867 | ||
868 | return FALSE; /* not preceded by I */ | |
869 | } | |
870 | ||
871 | /* Is followed by one or more cc==230 ? */ | |
872 | static UBool | |
873 | isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { | |
874 | UChar32 c; | |
875 | int32_t dotType; | |
876 | int8_t dir; | |
877 | ||
878 | if(iter==NULL) { | |
879 | return FALSE; | |
880 | } | |
881 | ||
882 | for(dir=1; (c=iter(context, dir))>=0; dir=0) { | |
883 | dotType=getDotType(csp, c); | |
884 | if(dotType==UCASE_ABOVE) { | |
885 | return TRUE; /* at least one cc==230 following */ | |
886 | } else if(dotType!=UCASE_OTHER_ACCENT) { | |
887 | return FALSE; /* next base character, no more cc==230 following */ | |
888 | } | |
889 | } | |
890 | ||
891 | return FALSE; /* no more cc==230 following */ | |
892 | } | |
893 | ||
894 | /* Is followed by a dot above (without cc==230 in between) ? */ | |
895 | static UBool | |
896 | isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { | |
897 | UChar32 c; | |
898 | int32_t dotType; | |
899 | int8_t dir; | |
900 | ||
901 | if(iter==NULL) { | |
902 | return FALSE; | |
903 | } | |
904 | ||
905 | for(dir=1; (c=iter(context, dir))>=0; dir=0) { | |
906 | if(c==0x307) { | |
907 | return TRUE; | |
908 | } | |
909 | dotType=getDotType(csp, c); | |
910 | if(dotType!=UCASE_OTHER_ACCENT) { | |
911 | return FALSE; /* next base character or cc==230 in between */ | |
912 | } | |
913 | } | |
914 | ||
915 | return FALSE; /* no dot above following */ | |
916 | } | |
917 | ||
918 | U_CAPI int32_t U_EXPORT2 | |
919 | ucase_toFullLower(const UCaseProps *csp, UChar32 c, | |
920 | UCaseContextIterator *iter, void *context, | |
921 | const UChar **pString, | |
922 | const char *locale, int32_t *locCache) { | |
923 | static const UChar | |
924 | iDot[2]= { 0x69, 0x307 }, | |
925 | jDot[2]= { 0x6a, 0x307 }, | |
926 | iOgonekDot[3]= { 0x12f, 0x307 }, | |
927 | iDotGrave[3]= { 0x69, 0x307, 0x300 }, | |
928 | iDotAcute[3]= { 0x69, 0x307, 0x301 }, | |
929 | iDotTilde[3]= { 0x69, 0x307, 0x303 }; | |
930 | ||
931 | UChar32 result; | |
932 | uint16_t props; | |
933 | ||
934 | result=c; | |
935 | GET_PROPS(csp, c, props); | |
936 | if(!PROPS_HAS_EXCEPTION(props)) { | |
937 | if(GET_CASE_TYPE(props)>=UCASE_UPPER) { | |
938 | result=c+GET_SIGNED_DELTA(props); | |
939 | } | |
940 | } else { | |
941 | const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; | |
942 | uint16_t excWord=*pe++; | |
943 | int32_t full; | |
944 | ||
945 | pe2=pe; | |
946 | ||
947 | if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { | |
948 | /* use hardcoded conditions and mappings */ | |
949 | int32_t loc=getCaseLocale(locale, locCache); | |
950 | ||
951 | /* | |
952 | * Test for conditional mappings first | |
953 | * (otherwise the unconditional default mappings are always taken), | |
954 | * then test for characters that have unconditional mappings in SpecialCasing.txt, | |
955 | * then get the UnicodeData.txt mappings. | |
956 | */ | |
957 | if( loc==LOC_LITHUANIAN && | |
958 | /* base characters, find accents above */ | |
959 | (((c==0x49 || c==0x4a || c==0x12e) && | |
960 | isFollowedByMoreAbove(csp, iter, context)) || | |
961 | /* precomposed with accent above, no need to find one */ | |
962 | (c==0xcc || c==0xcd || c==0x128)) | |
963 | ) { | |
964 | /* | |
965 | # Lithuanian | |
966 | ||
967 | # Lithuanian retains the dot in a lowercase i when followed by accents. | |
968 | ||
969 | # Introduce an explicit dot above when lowercasing capital I's and J's | |
970 | # whenever there are more accents above. | |
971 | # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) | |
972 | ||
973 | 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I | |
974 | 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J | |
975 | 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK | |
976 | 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE | |
977 | 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE | |
978 | 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE | |
979 | */ | |
980 | switch(c) { | |
981 | case 0x49: /* LATIN CAPITAL LETTER I */ | |
982 | *pString=iDot; | |
983 | return 2; | |
984 | case 0x4a: /* LATIN CAPITAL LETTER J */ | |
985 | *pString=jDot; | |
986 | return 2; | |
987 | case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ | |
988 | *pString=iOgonekDot; | |
989 | return 2; | |
990 | case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ | |
991 | *pString=iDotGrave; | |
992 | return 3; | |
993 | case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ | |
994 | *pString=iDotAcute; | |
995 | return 3; | |
996 | case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ | |
997 | *pString=iDotTilde; | |
998 | return 3; | |
999 | default: | |
1000 | return 0; /* will not occur */ | |
1001 | } | |
1002 | /* # Turkish and Azeri */ | |
1003 | } else if(loc==LOC_TURKISH && c==0x130) { | |
1004 | /* | |
1005 | # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri | |
1006 | # The following rules handle those cases. | |
1007 | ||
1008 | 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
1009 | 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
1010 | */ | |
1011 | return 0x69; | |
1012 | } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) { | |
1013 | /* | |
1014 | # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. | |
1015 | # This matches the behavior of the canonically equivalent I-dot_above | |
1016 | ||
1017 | 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE | |
1018 | 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE | |
1019 | */ | |
1020 | return 0; /* remove the dot (continue without output) */ | |
1021 | } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) { | |
1022 | /* | |
1023 | # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. | |
1024 | ||
1025 | 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I | |
1026 | 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I | |
1027 | */ | |
1028 | return 0x131; | |
1029 | } else if(c==0x130) { | |
1030 | /* | |
1031 | # Preserve canonical equivalence for I with dot. Turkic is handled below. | |
1032 | ||
1033 | 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
1034 | */ | |
1035 | *pString=iDot; | |
1036 | return 2; | |
1037 | } else if( c==0x3a3 && | |
1038 | !isFollowedByCasedLetter(csp, iter, context, 1) && | |
1039 | isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */ | |
1040 | ) { | |
1041 | /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ | |
1042 | /* | |
1043 | # Special case for final form of sigma | |
1044 | ||
1045 | 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA | |
1046 | */ | |
1047 | return 0x3c2; /* greek small final sigma */ | |
1048 | } else { | |
1049 | /* no known conditional special case mapping, use a normal mapping */ | |
1050 | } | |
1051 | } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { | |
1052 | GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); | |
1053 | full&=UCASE_FULL_LOWER; | |
1054 | if(full!=0) { | |
1055 | /* set the output pointer to the lowercase mapping */ | |
1056 | *pString=pe+1; | |
1057 | ||
1058 | /* return the string length */ | |
1059 | return full; | |
1060 | } | |
1061 | } | |
1062 | ||
1063 | if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { | |
1064 | GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); | |
1065 | } | |
1066 | } | |
1067 | ||
1068 | return (result==c) ? ~result : result; | |
1069 | } | |
1070 | ||
1071 | /* internal */ | |
1072 | static int32_t | |
1073 | toUpperOrTitle(const UCaseProps *csp, UChar32 c, | |
1074 | UCaseContextIterator *iter, void *context, | |
1075 | const UChar **pString, | |
1076 | const char *locale, int32_t *locCache, | |
1077 | UBool upperNotTitle) { | |
1078 | UChar32 result; | |
1079 | uint16_t props; | |
1080 | ||
1081 | result=c; | |
1082 | GET_PROPS(csp, c, props); | |
1083 | if(!PROPS_HAS_EXCEPTION(props)) { | |
1084 | if(GET_CASE_TYPE(props)==UCASE_LOWER) { | |
1085 | result=c+GET_SIGNED_DELTA(props); | |
1086 | } | |
1087 | } else { | |
1088 | const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; | |
1089 | uint16_t excWord=*pe++; | |
1090 | int32_t full, index; | |
1091 | ||
1092 | pe2=pe; | |
1093 | ||
1094 | if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { | |
1095 | /* use hardcoded conditions and mappings */ | |
1096 | int32_t loc=getCaseLocale(locale, locCache); | |
1097 | ||
1098 | if(loc==LOC_TURKISH && c==0x69) { | |
1099 | /* | |
1100 | # Turkish and Azeri | |
1101 | ||
1102 | # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri | |
1103 | # The following rules handle those cases. | |
1104 | ||
1105 | # When uppercasing, i turns into a dotted capital I | |
1106 | ||
1107 | 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I | |
1108 | 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I | |
1109 | */ | |
1110 | return 0x130; | |
1111 | } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) { | |
1112 | /* | |
1113 | # Lithuanian | |
1114 | ||
1115 | # Lithuanian retains the dot in a lowercase i when followed by accents. | |
1116 | ||
1117 | # Remove DOT ABOVE after "i" with upper or titlecase | |
1118 | ||
1119 | 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE | |
1120 | */ | |
1121 | return 0; /* remove the dot (continue without output) */ | |
1122 | } else { | |
1123 | /* no known conditional special case mapping, use a normal mapping */ | |
1124 | } | |
1125 | } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { | |
1126 | GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); | |
1127 | ||
1128 | /* start of full case mapping strings */ | |
1129 | ++pe; | |
1130 | ||
1131 | /* skip the lowercase and case-folding result strings */ | |
1132 | pe+=full&UCASE_FULL_LOWER; | |
1133 | full>>=4; | |
1134 | pe+=full&0xf; | |
1135 | full>>=4; | |
1136 | ||
1137 | if(upperNotTitle) { | |
1138 | full&=0xf; | |
1139 | } else { | |
1140 | /* skip the uppercase result string */ | |
1141 | pe+=full&0xf; | |
1142 | full=(full>>4)&0xf; | |
1143 | } | |
1144 | ||
1145 | if(full!=0) { | |
1146 | /* set the output pointer to the result string */ | |
1147 | *pString=pe; | |
1148 | ||
1149 | /* return the string length */ | |
1150 | return full; | |
1151 | } | |
1152 | } | |
1153 | ||
1154 | if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { | |
1155 | index=UCASE_EXC_TITLE; | |
1156 | } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { | |
1157 | /* here, titlecase is same as uppercase */ | |
1158 | index=UCASE_EXC_UPPER; | |
1159 | } else { | |
1160 | return ~c; | |
1161 | } | |
1162 | GET_SLOT_VALUE(excWord, index, pe2, result); | |
1163 | } | |
1164 | ||
1165 | return (result==c) ? ~result : result; | |
1166 | } | |
1167 | ||
1168 | U_CAPI int32_t U_EXPORT2 | |
1169 | ucase_toFullUpper(const UCaseProps *csp, UChar32 c, | |
1170 | UCaseContextIterator *iter, void *context, | |
1171 | const UChar **pString, | |
1172 | const char *locale, int32_t *locCache) { | |
1173 | return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE); | |
1174 | } | |
1175 | ||
1176 | U_CAPI int32_t U_EXPORT2 | |
1177 | ucase_toFullTitle(const UCaseProps *csp, UChar32 c, | |
1178 | UCaseContextIterator *iter, void *context, | |
1179 | const UChar **pString, | |
1180 | const char *locale, int32_t *locCache) { | |
1181 | return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE); | |
1182 | } | |
1183 | ||
1184 | /* case folding ------------------------------------------------------------- */ | |
1185 | ||
1186 | /* | |
1187 | * Case folding is similar to lowercasing. | |
1188 | * The result may be a simple mapping, i.e., a single code point, or | |
1189 | * a full mapping, i.e., a string. | |
1190 | * If the case folding for a code point is the same as its simple (1:1) lowercase mapping, | |
1191 | * then only the lowercase mapping is stored. | |
1192 | * | |
1193 | * Some special cases are hardcoded because their conditions cannot be | |
1194 | * parsed and processed from CaseFolding.txt. | |
1195 | * | |
1196 | * Unicode 3.2 CaseFolding.txt specifies for its status field: | |
1197 | ||
1198 | # C: common case folding, common mappings shared by both simple and full mappings. | |
1199 | # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. | |
1200 | # S: simple case folding, mappings to single characters where different from F. | |
1201 | # T: special case for uppercase I and dotted uppercase I | |
1202 | # - For non-Turkic languages, this mapping is normally not used. | |
1203 | # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. | |
1204 | # | |
1205 | # Usage: | |
1206 | # A. To do a simple case folding, use the mappings with status C + S. | |
1207 | # B. To do a full case folding, use the mappings with status C + F. | |
1208 | # | |
1209 | # The mappings with status T can be used or omitted depending on the desired case-folding | |
1210 | # behavior. (The default option is to exclude them.) | |
1211 | ||
1212 | * Unicode 3.2 has 'T' mappings as follows: | |
1213 | ||
1214 | 0049; T; 0131; # LATIN CAPITAL LETTER I | |
1215 | 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
1216 | ||
1217 | * while the default mappings for these code points are: | |
1218 | ||
1219 | 0049; C; 0069; # LATIN CAPITAL LETTER I | |
1220 | 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
1221 | ||
1222 | * U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt). | |
1223 | * | |
1224 | * In case this code is used with CaseFolding.txt from an older version of Unicode | |
1225 | * where CaseFolding.txt contains mappings with a status of 'I' that | |
1226 | * have the opposite polarity ('I' mappings are included by default but excluded for Turkic), | |
1227 | * we must also hardcode the Unicode 3.2 mappings for the code points | |
1228 | * with 'I' mappings. | |
1229 | * Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131. | |
1230 | * Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt). | |
1231 | */ | |
1232 | ||
1233 | /* return the simple case folding mapping for c */ | |
1234 | U_CAPI UChar32 U_EXPORT2 | |
1235 | ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options) { | |
1236 | uint16_t props; | |
1237 | GET_PROPS(csp, c, props); | |
1238 | if(!PROPS_HAS_EXCEPTION(props)) { | |
1239 | if(GET_CASE_TYPE(props)>=UCASE_UPPER) { | |
1240 | c+=GET_SIGNED_DELTA(props); | |
1241 | } | |
1242 | } else { | |
1243 | const uint16_t *pe=GET_EXCEPTIONS(csp, props); | |
1244 | uint16_t excWord=*pe++; | |
1245 | int32_t index; | |
1246 | if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { | |
1247 | /* special case folding mappings, hardcoded */ | |
1248 | if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { | |
1249 | /* default mappings */ | |
1250 | if(c==0x49) { | |
1251 | /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ | |
1252 | return 0x69; | |
1253 | } else if(c==0x130) { | |
1254 | /* no simple default mapping for U+0130, use UnicodeData.txt */ | |
1255 | return 0x69; | |
1256 | } | |
1257 | } else { | |
1258 | /* Turkic mappings */ | |
1259 | if(c==0x49) { | |
1260 | /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ | |
1261 | return 0x131; | |
1262 | } else if(c==0x130) { | |
1263 | /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ | |
1264 | return 0x69; | |
1265 | } | |
1266 | } | |
1267 | } | |
1268 | if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { | |
1269 | index=UCASE_EXC_FOLD; | |
1270 | } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { | |
1271 | index=UCASE_EXC_LOWER; | |
1272 | } else { | |
1273 | return c; | |
1274 | } | |
1275 | GET_SLOT_VALUE(excWord, index, pe, c); | |
1276 | } | |
1277 | return c; | |
1278 | } | |
1279 | ||
1280 | /* | |
1281 | * Issue for canonical caseless match (UAX #21): | |
1282 | * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve | |
1283 | * canonical equivalence, unlike default-option casefolding. | |
1284 | * For example, I-grave and I + grave fold to strings that are not canonically | |
1285 | * equivalent. | |
1286 | * For more details, see the comment in unorm_compare() in unorm.cpp | |
1287 | * and the intermediate prototype changes for Jitterbug 2021. | |
1288 | * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) | |
1289 | * | |
1290 | * This did not get fixed because it appears that it is not possible to fix | |
1291 | * it for uppercase and lowercase characters (I-grave vs. i-grave) | |
1292 | * together in a way that they still fold to common result strings. | |
1293 | */ | |
1294 | ||
1295 | U_CAPI int32_t U_EXPORT2 | |
1296 | ucase_toFullFolding(const UCaseProps *csp, UChar32 c, | |
1297 | const UChar **pString, | |
1298 | uint32_t options) { | |
1299 | static const UChar | |
1300 | iDot[2]= { 0x69, 0x307 }; | |
1301 | ||
1302 | UChar32 result; | |
1303 | uint16_t props; | |
1304 | ||
1305 | result=c; | |
1306 | GET_PROPS(csp, c, props); | |
1307 | if(!PROPS_HAS_EXCEPTION(props)) { | |
1308 | if(GET_CASE_TYPE(props)>=UCASE_UPPER) { | |
1309 | result=c+GET_SIGNED_DELTA(props); | |
1310 | } | |
1311 | } else { | |
1312 | const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; | |
1313 | uint16_t excWord=*pe++; | |
1314 | int32_t full, index; | |
1315 | ||
1316 | pe2=pe; | |
1317 | ||
1318 | if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { | |
1319 | /* use hardcoded conditions and mappings */ | |
1320 | if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { | |
1321 | /* default mappings */ | |
1322 | if(c==0x49) { | |
1323 | /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ | |
1324 | return 0x69; | |
1325 | } else if(c==0x130) { | |
1326 | /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ | |
1327 | *pString=iDot; | |
1328 | return 2; | |
1329 | } | |
1330 | } else { | |
1331 | /* Turkic mappings */ | |
1332 | if(c==0x49) { | |
1333 | /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ | |
1334 | return 0x131; | |
1335 | } else if(c==0x130) { | |
1336 | /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ | |
1337 | return 0x69; | |
1338 | } | |
1339 | } | |
1340 | } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { | |
1341 | GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); | |
1342 | ||
1343 | /* start of full case mapping strings */ | |
1344 | ++pe; | |
1345 | ||
1346 | /* skip the lowercase result string */ | |
1347 | pe+=full&UCASE_FULL_LOWER; | |
1348 | full=(full>>4)&0xf; | |
1349 | ||
1350 | if(full!=0) { | |
1351 | /* set the output pointer to the result string */ | |
1352 | *pString=pe; | |
1353 | ||
1354 | /* return the string length */ | |
1355 | return full; | |
1356 | } | |
1357 | } | |
1358 | ||
1359 | if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { | |
1360 | index=UCASE_EXC_FOLD; | |
1361 | } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { | |
1362 | index=UCASE_EXC_LOWER; | |
1363 | } else { | |
1364 | return ~c; | |
1365 | } | |
1366 | GET_SLOT_VALUE(excWord, index, pe2, result); | |
1367 | } | |
1368 | ||
1369 | return (result==c) ? ~result : result; | |
1370 | } |