]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucase.c
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / ucase.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucase.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug30
14 * created by: Markus W. Scherer
15 *
16 * Low-level Unicode character/string case mapping code.
17 * Much code moved here (and modified) from uchar.c.
18 */
19
20 #include "unicode/utypes.h"
21 #include "unicode/uset.h"
22 #include "unicode/udata.h" /* UDataInfo */
23 #include "ucmndata.h" /* DataHeader */
24 #include "udatamem.h"
25 #include "umutex.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "utrie.h"
29 #include "ucase.h"
30 #include "ucln_cmn.h"
31
/*
 * State for one opened set of case mapping data.
 * indexes and exceptions point into the binary data itself
 * (see ucase_openData()); they are not separate allocations.
 */
struct UCaseProps {
    /* data handle from udata_openChoice(); stays NULL in ucase_openBinary() */
    UDataMemory *mem;
    /* start of the data body: the int32_t indexes[] array */
    const int32_t *indexes;
    /* uint16_t exceptions[] array that follows the serialized trie */
    const uint16_t *exceptions;

    /* trie unserialized from the bytes after indexes[] */
    UTrie trie;
    /* copied from UDataInfo.formatVersion by isAcceptable() */
    uint8_t formatVersion[4];
};
40
41 /* data loading etc. -------------------------------------------------------- */
42
43 static UBool U_CALLCONV
44 isAcceptable(void *context,
45 const char *type, const char *name,
46 const UDataInfo *pInfo) {
47 if(
48 pInfo->size>=20 &&
49 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
50 pInfo->charsetFamily==U_CHARSET_FAMILY &&
51 pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */
52 pInfo->dataFormat[1]==UCASE_FMT_1 &&
53 pInfo->dataFormat[2]==UCASE_FMT_2 &&
54 pInfo->dataFormat[3]==UCASE_FMT_3 &&
55 pInfo->formatVersion[0]==1 &&
56 pInfo->formatVersion[2]==UTRIE_SHIFT &&
57 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
58 ) {
59 UCaseProps *csp=(UCaseProps *)context;
60 uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
61 return TRUE;
62 } else {
63 return FALSE;
64 }
65 }
66
67 static UCaseProps *
68 ucase_openData(UCaseProps *cspProto,
69 const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
70 UCaseProps *csp;
71 int32_t size, trieSize;
72
73 cspProto->indexes=(const int32_t *)bin;
74 if( cspProto->indexes[UCASE_IX_INDEX_TOP]<16 ||
75 (length>=0 && length<cspProto->indexes[UCASE_IX_LENGTH])
76 ) {
77 *pErrorCode=U_INVALID_FORMAT_ERROR;
78 return NULL;
79 }
80
81 /* get the trie address, after indexes[] */
82 size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
83 bin+=size;
84 if(length>=0 && (length-=size)<16) {
85 *pErrorCode=U_INVALID_FORMAT_ERROR;
86 return NULL;
87 }
88
89 /* unserialize the trie */
90 trieSize=cspProto->indexes[UCASE_IX_TRIE_SIZE];
91 trieSize=utrie_unserialize(&cspProto->trie, bin, length>=0 ? length : trieSize, pErrorCode);
92 if(U_FAILURE(*pErrorCode)) {
93 return NULL;
94 }
95
96 /* get exceptions[] */
97 bin+=trieSize;
98 if(length>=0 && (length-=trieSize)<2*cspProto->indexes[UCASE_IX_EXC_LENGTH]) {
99 *pErrorCode=U_INVALID_FORMAT_ERROR;
100 return NULL;
101 }
102 cspProto->exceptions=(const uint16_t *)bin;
103
104 /* allocate, copy, and return the new UCaseProps */
105 csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
106 if(csp==NULL) {
107 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
108 return NULL;
109 } else {
110 uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
111 return csp;
112 }
113 }
114
115 U_CAPI UCaseProps * U_EXPORT2
116 ucase_open(UErrorCode *pErrorCode) {
117 UCaseProps cspProto={ NULL }, *csp;
118
119 cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
120 if(U_FAILURE(*pErrorCode)) {
121 return NULL;
122 }
123
124 csp=ucase_openData(
125 &cspProto,
126 udata_getMemory(cspProto.mem),
127 udata_getLength(cspProto.mem),
128 pErrorCode);
129 if(U_FAILURE(*pErrorCode)) {
130 udata_close(cspProto.mem);
131 return NULL;
132 } else {
133 return csp;
134 }
135 }
136
137 U_CAPI UCaseProps * U_EXPORT2
138 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
139 UCaseProps cspProto={ NULL };
140 const DataHeader *hdr;
141
142 if(U_FAILURE(*pErrorCode)) {
143 return NULL;
144 }
145 if(bin==NULL) {
146 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
147 return NULL;
148 }
149
150 /* check the header */
151 if(length>=0 && length<20) {
152 *pErrorCode=U_INVALID_FORMAT_ERROR;
153 return NULL;
154 }
155 hdr=(const DataHeader *)bin;
156 if(
157 !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
158 hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
159 isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
160 ) {
161 *pErrorCode=U_INVALID_FORMAT_ERROR;
162 return NULL;
163 }
164
165 bin+=hdr->dataHeader.headerSize;
166 if(length>=0) {
167 length-=hdr->dataHeader.headerSize;
168 }
169 return ucase_openData(&cspProto, bin, length, pErrorCode);
170 }
171
172 U_CAPI void U_EXPORT2
173 ucase_close(UCaseProps *csp) {
174 if(csp!=NULL) {
175 udata_close(csp->mem);
176 uprv_free(csp);
177 }
178 }
179
180 /* UCaseProps singleton ----------------------------------------------------- */
181
182 static UCaseProps *gCsp=NULL;
183 static UErrorCode gErrorCode=U_ZERO_ERROR;
184 static int8_t gHaveData=0;
185
186 static UBool U_CALLCONV ucase_cleanup(void) {
187 ucase_close(gCsp);
188 gCsp=NULL;
189 gErrorCode=U_ZERO_ERROR;
190 gHaveData=0;
191 return TRUE;
192 }
193
194 U_CAPI UCaseProps * U_EXPORT2
195 ucase_getSingleton(UErrorCode *pErrorCode) {
196 int8_t haveData;
197
198 if(U_FAILURE(*pErrorCode)) {
199 return NULL;
200 }
201
202 UMTX_CHECK(NULL, gHaveData, haveData);
203
204 if(haveData>0) {
205 /* data was loaded */
206 return gCsp;
207 } else if(haveData<0) {
208 /* data loading failed */
209 *pErrorCode=gErrorCode;
210 return NULL;
211 } else /* haveData==0 */ {
212 /* load the data */
213 UCaseProps *csp=ucase_open(pErrorCode);
214 if(U_FAILURE(*pErrorCode)) {
215 gHaveData=-1;
216 gErrorCode=*pErrorCode;
217 return NULL;
218 }
219
220 /* set the static variables */
221 umtx_lock(NULL);
222 if(gCsp==NULL) {
223 gCsp=csp;
224 csp=NULL;
225 gHaveData=1;
226 ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
227 }
228 umtx_unlock(NULL);
229
230 ucase_close(csp);
231 return gCsp;
232 }
233 }
234
235 /* Unicode case mapping data swapping --------------------------------------- */
236
237 U_CAPI int32_t U_EXPORT2
238 ucase_swap(const UDataSwapper *ds,
239 const void *inData, int32_t length, void *outData,
240 UErrorCode *pErrorCode) {
241 const UDataInfo *pInfo;
242 int32_t headerSize;
243
244 const uint8_t *inBytes;
245 uint8_t *outBytes;
246
247 const int32_t *inIndexes;
248 int32_t indexes[16];
249
250 int32_t i, offset, count, size;
251
252 /* udata_swapDataHeader checks the arguments */
253 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
254 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
255 return 0;
256 }
257
258 /* check data format and format version */
259 pInfo=(const UDataInfo *)((const char *)inData+4);
260 if(!(
261 pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */
262 pInfo->dataFormat[1]==UCASE_FMT_1 &&
263 pInfo->dataFormat[2]==UCASE_FMT_2 &&
264 pInfo->dataFormat[3]==UCASE_FMT_3 &&
265 pInfo->formatVersion[0]==1 &&
266 pInfo->formatVersion[2]==UTRIE_SHIFT &&
267 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
268 )) {
269 udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n",
270 pInfo->dataFormat[0], pInfo->dataFormat[1],
271 pInfo->dataFormat[2], pInfo->dataFormat[3],
272 pInfo->formatVersion[0]);
273 *pErrorCode=U_UNSUPPORTED_ERROR;
274 return 0;
275 }
276
277 inBytes=(const uint8_t *)inData+headerSize;
278 outBytes=(uint8_t *)outData+headerSize;
279
280 inIndexes=(const int32_t *)inBytes;
281
282 if(length>=0) {
283 length-=headerSize;
284 if(length<16*4) {
285 udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n",
286 length);
287 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
288 return 0;
289 }
290 }
291
292 /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */
293 for(i=0; i<16; ++i) {
294 indexes[i]=udata_readInt32(ds, inIndexes[i]);
295 }
296
297 /* get the total length of the data */
298 size=indexes[UCASE_IX_LENGTH];
299
300 if(length>=0) {
301 if(length<size) {
302 udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n",
303 length);
304 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
305 return 0;
306 }
307
308 /* copy the data for inaccessible bytes */
309 if(inBytes!=outBytes) {
310 uprv_memcpy(outBytes, inBytes, size);
311 }
312
313 offset=0;
314
315 /* swap the int32_t indexes[] */
316 count=indexes[UCASE_IX_INDEX_TOP]*4;
317 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
318 offset+=count;
319
320 /* swap the UTrie */
321 count=indexes[UCASE_IX_TRIE_SIZE];
322 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
323 offset+=count;
324
325 /* swap the uint16_t exceptions[] */
326 count=indexes[UCASE_IX_EXC_LENGTH]*2;
327 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
328 offset+=count;
329
330 U_ASSERT(offset==size);
331 }
332
333 return headerSize+size;
334 }
335
336 /* set of property starts for UnicodeSet ------------------------------------ */
337
338 static UBool U_CALLCONV
339 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
340 /* add the start code point to the USet */
341 USetAdder *sa=(USetAdder *)context;
342 sa->add(sa->set, start);
343 return TRUE;
344 }
345
346 U_CAPI void U_EXPORT2
347 ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode) {
348 if(U_FAILURE(*pErrorCode)) {
349 return;
350 }
351
352 /* add the start code point of each same-value range of the trie */
353 utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
354
355 /* add code points with hardcoded properties, plus the ones following them */
356
357 /* (none right now, see comment below) */
358
359 /*
360 * Omit code points with hardcoded specialcasing properties
361 * because we do not build property UnicodeSets for them right now.
362 */
363 }
364
365 /* data access primitives --------------------------------------------------- */
366
/*
 * Look up the 16-bit properties word for c in the trie of csp and store
 * it in result. UTRIE_GET16() itself validates c.
 * The macro does not supply its own trailing semicolon; the caller writes it.
 */
#define GET_PROPS(csp, c, result) \
    UTRIE_GET16(&(csp)->trie, c, result)

/* case type bits of a properties word */
#define GET_CASE_TYPE(props) ((props)&UCASE_TYPE_MASK)
/*
 * signed case mapping delta stored in the upper bits of a properties word;
 * the word is reinterpreted as int16_t before shifting to keep the sign
 */
#define GET_SIGNED_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
/* address of the exceptions entry that a properties word refers to */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* non-zero if the properties word refers to an exceptions entry */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
376
/*
 * flagsOffset[b] is the number of set bits in the 8-bit value b.
 *
 * The 256 entries are generated: each helper level handles two more bits,
 * which add 0, 1, 1 and 2 to the count for the bit pairs 00, 01, 10 and 11.
 */
#define FLAGS_OFFSET_BITS2(n) (n), (n)+1, (n)+1, (n)+2
#define FLAGS_OFFSET_BITS4(n) \
    FLAGS_OFFSET_BITS2(n), FLAGS_OFFSET_BITS2((n)+1), \
    FLAGS_OFFSET_BITS2((n)+1), FLAGS_OFFSET_BITS2((n)+2)
#define FLAGS_OFFSET_BITS6(n) \
    FLAGS_OFFSET_BITS4(n), FLAGS_OFFSET_BITS4((n)+1), \
    FLAGS_OFFSET_BITS4((n)+1), FLAGS_OFFSET_BITS4((n)+2)

static const uint8_t flagsOffset[256]={
    FLAGS_OFFSET_BITS6(0), FLAGS_OFFSET_BITS6(1),
    FLAGS_OFFSET_BITS6(1), FLAGS_OFFSET_BITS6(2)
};

#undef FLAGS_OFFSET_BITS6
#undef FLAGS_OFFSET_BITS4
#undef FLAGS_OFFSET_BITS2
396
/* non-zero if the exceptions word has the optional slot with this index */
#define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
/* number of slots that are present before the slot with this index */
#define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
 *
 * Slots are single uint16_t values, or pairs of them (high half first)
 * if the exceptions word has UCASE_EXC_DOUBLE_SLOTS set.
 *
 * The macro expands to a single statement (do-while(0)), so it can be used
 * as the body of an unbraced if/else; the caller writes the semicolon.
 *
 * @param excWord (in) initial exceptions word
 * @param index (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output
 */
#define GET_SLOT_VALUE(excWord, index, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, index); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, index); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
418
419 /* simple case mappings ----------------------------------------------------- */
420
421 U_CAPI UChar32 U_EXPORT2
422 ucase_tolower(const UCaseProps *csp, UChar32 c) {
423 uint16_t props;
424 GET_PROPS(csp, c, props);
425 if(!PROPS_HAS_EXCEPTION(props)) {
426 if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
427 c+=GET_SIGNED_DELTA(props);
428 }
429 } else {
430 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
431 uint16_t excWord=*pe++;
432 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
433 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
434 }
435 }
436 return c;
437 }
438
439 U_CAPI UChar32 U_EXPORT2
440 ucase_toupper(const UCaseProps *csp, UChar32 c) {
441 uint16_t props;
442 GET_PROPS(csp, c, props);
443 if(!PROPS_HAS_EXCEPTION(props)) {
444 if(GET_CASE_TYPE(props)==UCASE_LOWER) {
445 c+=GET_SIGNED_DELTA(props);
446 }
447 } else {
448 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
449 uint16_t excWord=*pe++;
450 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
451 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
452 }
453 }
454 return c;
455 }
456
457 U_CAPI UChar32 U_EXPORT2
458 ucase_totitle(const UCaseProps *csp, UChar32 c) {
459 uint16_t props;
460 GET_PROPS(csp, c, props);
461 if(!PROPS_HAS_EXCEPTION(props)) {
462 if(GET_CASE_TYPE(props)==UCASE_LOWER) {
463 c+=GET_SIGNED_DELTA(props);
464 }
465 } else {
466 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
467 uint16_t excWord=*pe++;
468 int32_t index;
469 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
470 index=UCASE_EXC_TITLE;
471 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
472 index=UCASE_EXC_UPPER;
473 } else {
474 return c;
475 }
476 GET_SLOT_VALUE(excWord, index, pe, c);
477 }
478 return c;
479 }
480
481 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
482 U_CAPI int32_t U_EXPORT2
483 ucase_getType(const UCaseProps *csp, UChar32 c) {
484 uint16_t props;
485 GET_PROPS(csp, c, props);
486 return GET_CASE_TYPE(props);
487 }
488
489 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
490 U_CAPI int32_t U_EXPORT2
491 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
492 int32_t type;
493 uint16_t props;
494 GET_PROPS(csp, c, props);
495 type=GET_CASE_TYPE(props);
496 if(type!=UCASE_NONE) {
497 return type;
498 } else if(
499 c==0x307 ||
500 (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE
501 ) {
502 return -1; /* case-ignorable */
503 } else {
504 return 0; /* c is neither cased nor case-ignorable */
505 }
506 }
507
508 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
509 static U_INLINE int32_t
510 getDotType(const UCaseProps *csp, UChar32 c) {
511 uint16_t props;
512 GET_PROPS(csp, c, props);
513 if(!PROPS_HAS_EXCEPTION(props)) {
514 return props&UCASE_DOT_MASK;
515 } else {
516 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
517 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
518 }
519 }
520
521 U_CAPI UBool U_EXPORT2
522 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
523 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
524 }
525
526 U_CAPI UBool U_EXPORT2
527 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
528 uint16_t props;
529 GET_PROPS(csp, c, props);
530 return (UBool)((props&UCASE_SENSITIVE)!=0);
531 }
532
533 /* public API (see uchar.h) ------------------------------------------------- */
534
535 U_CAPI UBool U_EXPORT2
536 u_isULowercase(UChar32 c) {
537 UErrorCode errorCode=U_ZERO_ERROR;
538 UCaseProps *csp=ucase_getSingleton(&errorCode);
539 return (UBool)(csp!=NULL && UCASE_LOWER==ucase_getType(csp, c));
540 }
541
542 U_CAPI UBool U_EXPORT2
543 u_isUUppercase(UChar32 c) {
544 UErrorCode errorCode=U_ZERO_ERROR;
545 UCaseProps *csp=ucase_getSingleton(&errorCode);
546 return (UBool)(csp!=NULL && UCASE_UPPER==ucase_getType(csp, c));
547 }
548
549 /* Transforms the Unicode character to its lower case equivalent.*/
550 U_CAPI UChar32 U_EXPORT2
551 u_tolower(UChar32 c) {
552 UErrorCode errorCode=U_ZERO_ERROR;
553 UCaseProps *csp=ucase_getSingleton(&errorCode);
554 if(csp!=NULL) {
555 return ucase_tolower(csp, c);
556 } else {
557 return c;
558 }
559 }
560
561 /* Transforms the Unicode character to its upper case equivalent.*/
562 U_CAPI UChar32 U_EXPORT2
563 u_toupper(UChar32 c) {
564 UErrorCode errorCode=U_ZERO_ERROR;
565 UCaseProps *csp=ucase_getSingleton(&errorCode);
566 if(csp!=NULL) {
567 return ucase_toupper(csp, c);
568 } else {
569 return c;
570 }
571 }
572
573 /* Transforms the Unicode character to its title case equivalent.*/
574 U_CAPI UChar32 U_EXPORT2
575 u_totitle(UChar32 c) {
576 UErrorCode errorCode=U_ZERO_ERROR;
577 UCaseProps *csp=ucase_getSingleton(&errorCode);
578 if(csp!=NULL) {
579 return ucase_totitle(csp, c);
580 } else {
581 return c;
582 }
583 }
584
585 /* return the simple case folding mapping for c */
586 U_CAPI UChar32 U_EXPORT2
587 u_foldCase(UChar32 c, uint32_t options) {
588 UErrorCode errorCode=U_ZERO_ERROR;
589 UCaseProps *csp=ucase_getSingleton(&errorCode);
590 if(csp!=NULL) {
591 return ucase_fold(csp, c, options);
592 } else {
593 return c;
594 }
595 }
596
597 /* string casing ------------------------------------------------------------ */
598
599 /*
600 * These internal functions form the core of string case mappings.
601 * They map single code points to result code points or strings and take
602 * all necessary conditions (context, locale ID, options) into account.
603 *
604 * They do not iterate over the source or write to the destination
605 * so that the same functions are useful for non-standard string storage,
606 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
607 * For the same reason, the "surrounding text" context is passed in as a
608 * UCaseContextIterator which does not make any assumptions about
609 * the underlying storage.
610 *
611 * This section contains helper functions that check for conditions
612 * in the input text surrounding the current code point
613 * according to SpecialCasing.txt.
614 *
615 * Each helper function gets the index
616 * - after the current code point if it looks at following text
617 * - before the current code point if it looks at preceding text
618 *
619 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
620 *
621 * Final_Sigma
622 * C is preceded by a sequence consisting of
623 * a cased letter and a case-ignorable sequence,
624 * and C is not followed by a sequence consisting of
625 * an ignorable sequence and then a cased letter.
626 *
627 * More_Above
628 * C is followed by one or more characters of combining class 230 (ABOVE)
629 * in the combining character sequence.
630 *
631 * After_Soft_Dotted
632 * The last preceding character with combining class of zero before C
633 * was Soft_Dotted,
634 * and there is no intervening combining character class 230 (ABOVE).
635 *
636 * Before_Dot
637 * C is followed by combining dot above (U+0307).
638 * Any sequence of characters with a combining class that is neither 0 nor 230
639 * may intervene between the current character and the combining dot above.
640 *
641 * The erratum from 2002-10-31 adds the condition
642 *
643 * After_I
644 * The last preceding base character was an uppercase I, and there is no
645 * intervening combining character class 230 (ABOVE).
646 *
647 * (See Jitterbug 2344 and the comments on After_I below.)
648 *
649 * Helper definitions in Unicode 3.2 UAX 21:
650 *
651 * D1. A character C is defined to be cased
652 * if it meets any of the following criteria:
653 *
654 * - The general category of C is Titlecase Letter (Lt)
655 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
656 * - Given D = NFD(C), then it is not the case that:
657 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
658 * (This third criterion does not add any characters to the list
659 * for Unicode 3.2. Ignored.)
660 *
661 * D2. A character C is defined to be case-ignorable
662 * if it meets either of the following criteria:
663 *
664 * - The general category of C is
665 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
666 * Letter Modifier (Lm), or Symbol Modifier (Sk)
667 * - C is one of the following characters
668 * U+0027 APOSTROPHE
669 * U+00AD SOFT HYPHEN (SHY)
670 * U+2019 RIGHT SINGLE QUOTATION MARK
671 * (the preferred character for apostrophe)
672 *
673 * D3. A case-ignorable sequence is a sequence of
674 * zero or more case-ignorable characters.
675 */
676
/* locale classes that matter for the hardcoded conditional case mappings */
enum {
    LOC_UNKNOWN=0,      /* not determined yet (initial cache value) */
    LOC_ROOT=1,         /* no language-specific behavior */
    LOC_TURKISH=2,      /* tr/tur and az/aze */
    LOC_LITHUANIAN=3    /* lt/lit */
};

/* case-insensitive tests for single letters of language codes */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* end of the language code: separator or end of string? */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)

/*
 * Classify a locale ID by its language code, without using the uloc
 * implementation: recognizes the 2- and 3-letter codes tr/tur, az/aze
 * and lt/lit in any letter case, when followed by '_', '-' or the end
 * of the string. Only the first few characters of the ID are examined.
 *
 * @param locale   locale ID; must not be NULL
 * @param locCache optional cache for the result; a value other than
 *                 LOC_UNKNOWN is returned as-is without looking at locale
 * @return LOC_ROOT, LOC_TURKISH or LOC_LITHUANIAN
 */
static int32_t
getCaseLocale(const char *locale, int32_t *locCache) {
    const char *cursor=locale;
    int32_t caseLocale;
    char first, ch;

    /* use the cached value if there is one */
    if(locCache!=NULL) {
        caseLocale=*locCache;
        if(caseLocale!=LOC_UNKNOWN) {
            return caseLocale;
        }
    }

    caseLocale=LOC_ROOT;

    first=*cursor++;
    if(is_t(first)) {
        /* tr or tur? skip an optional 'u' */
        ch=*cursor++;
        if(is_u(ch)) {
            ch=*cursor++;
        }
        if(is_r(ch) && is_sep(*cursor)) {
            caseLocale=LOC_TURKISH;
        }
    } else if(is_a(first)) {
        /* az or aze? skip an optional trailing 'e' */
        ch=*cursor++;
        if(is_z(ch)) {
            ch=*cursor++;
            if(is_e(ch)) {
                ch=*cursor;
            }
            if(is_sep(ch)) {
                caseLocale=LOC_TURKISH;
            }
        }
    } else if(is_l(first)) {
        /* lt or lit? skip an optional 'i' */
        ch=*cursor++;
        if(is_i(ch)) {
            ch=*cursor++;
        }
        if(is_t(ch) && is_sep(*cursor)) {
            caseLocale=LOC_LITHUANIAN;
        }
    }

    if(locCache!=NULL) {
        *locCache=caseLocale;
    }
    return caseLocale;
}
765
766 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
767 static UBool
768 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
769 UChar32 c;
770 uint16_t props;
771
772 if(iter==NULL) {
773 return FALSE;
774 }
775
776 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
777 GET_PROPS(csp, c, props);
778 if(GET_CASE_TYPE(props)!=UCASE_NONE) {
779 return TRUE; /* followed by cased letter */
780 } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
781 /* case-ignorable, continue with the loop */
782 } else {
783 return FALSE; /* not ignorable */
784 }
785 }
786
787 return FALSE; /* not followed by cased letter */
788 }
789
790 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
791 static UBool
792 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
793 UChar32 c;
794 int32_t dotType;
795 int8_t dir;
796
797 if(iter==NULL) {
798 return FALSE;
799 }
800
801 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
802 dotType=getDotType(csp, c);
803 if(dotType==UCASE_SOFT_DOTTED) {
804 return TRUE; /* preceded by TYPE_i */
805 } else if(dotType!=UCASE_OTHER_ACCENT) {
806 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
807 }
808 }
809
810 return FALSE; /* not preceded by TYPE_i */
811 }
812
813 /*
814 * See Jitterbug 2344:
815 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
816 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
817 * we made those releases compatible with Unicode 3.2 which had not fixed
818 * a related bug in SpecialCasing.txt.
819 *
820 * From the Jitterbug 2344 text:
821 * ... this bug is listed as a Unicode erratum
822 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
823 * <quote>
824 * There are two errors in SpecialCasing.txt.
825 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
826 * 2. An incorrect context definition. Correct as follows:
827 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
828 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
829 * ---
830 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
831 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
832 * where the context After_I is defined as:
833 * The last preceding base character was an uppercase I, and there is no
834 * intervening combining character class 230 (ABOVE).
835 * </quote>
836 *
837 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
838 *
839 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
840 * # This matches the behavior of the canonically equivalent I-dot_above
841 *
842 * See also the description in this place in older versions of uchar.c (revision 1.100).
843 *
844 * Markus W. Scherer 2003-feb-15
845 */
846
847 /* Is preceded by base character 'I' with no intervening cc=230 ? */
848 static UBool
849 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
850 UChar32 c;
851 int32_t dotType;
852 int8_t dir;
853
854 if(iter==NULL) {
855 return FALSE;
856 }
857
858 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
859 if(c==0x49) {
860 return TRUE; /* preceded by I */
861 }
862 dotType=getDotType(csp, c);
863 if(dotType!=UCASE_OTHER_ACCENT) {
864 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
865 }
866 }
867
868 return FALSE; /* not preceded by I */
869 }
870
871 /* Is followed by one or more cc==230 ? */
872 static UBool
873 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
874 UChar32 c;
875 int32_t dotType;
876 int8_t dir;
877
878 if(iter==NULL) {
879 return FALSE;
880 }
881
882 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
883 dotType=getDotType(csp, c);
884 if(dotType==UCASE_ABOVE) {
885 return TRUE; /* at least one cc==230 following */
886 } else if(dotType!=UCASE_OTHER_ACCENT) {
887 return FALSE; /* next base character, no more cc==230 following */
888 }
889 }
890
891 return FALSE; /* no more cc==230 following */
892 }
893
894 /* Is followed by a dot above (without cc==230 in between) ? */
895 static UBool
896 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
897 UChar32 c;
898 int32_t dotType;
899 int8_t dir;
900
901 if(iter==NULL) {
902 return FALSE;
903 }
904
905 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
906 if(c==0x307) {
907 return TRUE;
908 }
909 dotType=getDotType(csp, c);
910 if(dotType!=UCASE_OTHER_ACCENT) {
911 return FALSE; /* next base character or cc==230 in between */
912 }
913 }
914
915 return FALSE; /* no dot above following */
916 }
917
918 U_CAPI int32_t U_EXPORT2
919 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
920 UCaseContextIterator *iter, void *context,
921 const UChar **pString,
922 const char *locale, int32_t *locCache) {
923 static const UChar
924 iDot[2]= { 0x69, 0x307 },
925 jDot[2]= { 0x6a, 0x307 },
926 iOgonekDot[3]= { 0x12f, 0x307 },
927 iDotGrave[3]= { 0x69, 0x307, 0x300 },
928 iDotAcute[3]= { 0x69, 0x307, 0x301 },
929 iDotTilde[3]= { 0x69, 0x307, 0x303 };
930
931 UChar32 result;
932 uint16_t props;
933
934 result=c;
935 GET_PROPS(csp, c, props);
936 if(!PROPS_HAS_EXCEPTION(props)) {
937 if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
938 result=c+GET_SIGNED_DELTA(props);
939 }
940 } else {
941 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
942 uint16_t excWord=*pe++;
943 int32_t full;
944
945 pe2=pe;
946
947 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
948 /* use hardcoded conditions and mappings */
949 int32_t loc=getCaseLocale(locale, locCache);
950
951 /*
952 * Test for conditional mappings first
953 * (otherwise the unconditional default mappings are always taken),
954 * then test for characters that have unconditional mappings in SpecialCasing.txt,
955 * then get the UnicodeData.txt mappings.
956 */
957 if( loc==LOC_LITHUANIAN &&
958 /* base characters, find accents above */
959 (((c==0x49 || c==0x4a || c==0x12e) &&
960 isFollowedByMoreAbove(csp, iter, context)) ||
961 /* precomposed with accent above, no need to find one */
962 (c==0xcc || c==0xcd || c==0x128))
963 ) {
964 /*
965 # Lithuanian
966
967 # Lithuanian retains the dot in a lowercase i when followed by accents.
968
969 # Introduce an explicit dot above when lowercasing capital I's and J's
970 # whenever there are more accents above.
971 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
972
973 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
974 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
975 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
976 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
977 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
978 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
979 */
980 switch(c) {
981 case 0x49: /* LATIN CAPITAL LETTER I */
982 *pString=iDot;
983 return 2;
984 case 0x4a: /* LATIN CAPITAL LETTER J */
985 *pString=jDot;
986 return 2;
987 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
988 *pString=iOgonekDot;
989 return 2;
990 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
991 *pString=iDotGrave;
992 return 3;
993 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
994 *pString=iDotAcute;
995 return 3;
996 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
997 *pString=iDotTilde;
998 return 3;
999 default:
1000 return 0; /* will not occur */
1001 }
1002 /* # Turkish and Azeri */
1003 } else if(loc==LOC_TURKISH && c==0x130) {
1004 /*
1005 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1006 # The following rules handle those cases.
1007
1008 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1009 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1010 */
1011 return 0x69;
1012 } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
1013 /*
1014 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1015 # This matches the behavior of the canonically equivalent I-dot_above
1016
1017 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1018 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1019 */
1020 return 0; /* remove the dot (continue without output) */
1021 } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
1022 /*
1023 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1024
1025 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1026 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1027 */
1028 return 0x131;
1029 } else if(c==0x130) {
1030 /*
1031 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1032
1033 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1034 */
1035 *pString=iDot;
1036 return 2;
1037 } else if( c==0x3a3 &&
1038 !isFollowedByCasedLetter(csp, iter, context, 1) &&
1039 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
1040 ) {
1041 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1042 /*
1043 # Special case for final form of sigma
1044
1045 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1046 */
1047 return 0x3c2; /* greek small final sigma */
1048 } else {
1049 /* no known conditional special case mapping, use a normal mapping */
1050 }
1051 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1052 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1053 full&=UCASE_FULL_LOWER;
1054 if(full!=0) {
1055 /* set the output pointer to the lowercase mapping */
1056 *pString=pe+1;
1057
1058 /* return the string length */
1059 return full;
1060 }
1061 }
1062
1063 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1064 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1065 }
1066 }
1067
1068 return (result==c) ? ~result : result;
1069 }
1070
1071 /* internal */
1072 static int32_t
1073 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
1074 UCaseContextIterator *iter, void *context,
1075 const UChar **pString,
1076 const char *locale, int32_t *locCache,
1077 UBool upperNotTitle) {
1078 UChar32 result;
1079 uint16_t props;
1080
1081 result=c;
1082 GET_PROPS(csp, c, props);
1083 if(!PROPS_HAS_EXCEPTION(props)) {
1084 if(GET_CASE_TYPE(props)==UCASE_LOWER) {
1085 result=c+GET_SIGNED_DELTA(props);
1086 }
1087 } else {
1088 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1089 uint16_t excWord=*pe++;
1090 int32_t full, index;
1091
1092 pe2=pe;
1093
1094 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1095 /* use hardcoded conditions and mappings */
1096 int32_t loc=getCaseLocale(locale, locCache);
1097
1098 if(loc==LOC_TURKISH && c==0x69) {
1099 /*
1100 # Turkish and Azeri
1101
1102 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1103 # The following rules handle those cases.
1104
1105 # When uppercasing, i turns into a dotted capital I
1106
1107 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1108 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1109 */
1110 return 0x130;
1111 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
1112 /*
1113 # Lithuanian
1114
1115 # Lithuanian retains the dot in a lowercase i when followed by accents.
1116
1117 # Remove DOT ABOVE after "i" with upper or titlecase
1118
1119 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1120 */
1121 return 0; /* remove the dot (continue without output) */
1122 } else {
1123 /* no known conditional special case mapping, use a normal mapping */
1124 }
1125 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1126 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1127
1128 /* start of full case mapping strings */
1129 ++pe;
1130
1131 /* skip the lowercase and case-folding result strings */
1132 pe+=full&UCASE_FULL_LOWER;
1133 full>>=4;
1134 pe+=full&0xf;
1135 full>>=4;
1136
1137 if(upperNotTitle) {
1138 full&=0xf;
1139 } else {
1140 /* skip the uppercase result string */
1141 pe+=full&0xf;
1142 full=(full>>4)&0xf;
1143 }
1144
1145 if(full!=0) {
1146 /* set the output pointer to the result string */
1147 *pString=pe;
1148
1149 /* return the string length */
1150 return full;
1151 }
1152 }
1153
1154 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1155 index=UCASE_EXC_TITLE;
1156 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1157 /* here, titlecase is same as uppercase */
1158 index=UCASE_EXC_UPPER;
1159 } else {
1160 return ~c;
1161 }
1162 GET_SLOT_VALUE(excWord, index, pe2, result);
1163 }
1164
1165 return (result==c) ? ~result : result;
1166 }
1167
1168 U_CAPI int32_t U_EXPORT2
1169 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1170 UCaseContextIterator *iter, void *context,
1171 const UChar **pString,
1172 const char *locale, int32_t *locCache) {
1173 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1174 }
1175
1176 U_CAPI int32_t U_EXPORT2
1177 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1178 UCaseContextIterator *iter, void *context,
1179 const UChar **pString,
1180 const char *locale, int32_t *locCache) {
1181 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1182 }
1183
1184 /* case folding ------------------------------------------------------------- */
1185
1186 /*
1187 * Case folding is similar to lowercasing.
1188 * The result may be a simple mapping, i.e., a single code point, or
1189 * a full mapping, i.e., a string.
1190 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1191 * then only the lowercase mapping is stored.
1192 *
1193 * Some special cases are hardcoded because their conditions cannot be
1194 * parsed and processed from CaseFolding.txt.
1195 *
1196 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1197
1198 # C: common case folding, common mappings shared by both simple and full mappings.
1199 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1200 # S: simple case folding, mappings to single characters where different from F.
1201 # T: special case for uppercase I and dotted uppercase I
1202 # - For non-Turkic languages, this mapping is normally not used.
1203 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1204 #
1205 # Usage:
1206 # A. To do a simple case folding, use the mappings with status C + S.
1207 # B. To do a full case folding, use the mappings with status C + F.
1208 #
1209 # The mappings with status T can be used or omitted depending on the desired case-folding
1210 # behavior. (The default option is to exclude them.)
1211
1212 * Unicode 3.2 has 'T' mappings as follows:
1213
1214 0049; T; 0131; # LATIN CAPITAL LETTER I
1215 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1216
1217 * while the default mappings for these code points are:
1218
1219 0049; C; 0069; # LATIN CAPITAL LETTER I
1220 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1221
1222 * U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt).
1223 *
1224 * In case this code is used with CaseFolding.txt from an older version of Unicode
1225 * where CaseFolding.txt contains mappings with a status of 'I' that
1226 * have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
1227 * we must also hardcode the Unicode 3.2 mappings for the code points
1228 * with 'I' mappings.
1229 * Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
1230 * Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
1231 */
1232
1233 /* return the simple case folding mapping for c */
1234 U_CAPI UChar32 U_EXPORT2
1235 ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options) {
1236 uint16_t props;
1237 GET_PROPS(csp, c, props);
1238 if(!PROPS_HAS_EXCEPTION(props)) {
1239 if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
1240 c+=GET_SIGNED_DELTA(props);
1241 }
1242 } else {
1243 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1244 uint16_t excWord=*pe++;
1245 int32_t index;
1246 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1247 /* special case folding mappings, hardcoded */
1248 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1249 /* default mappings */
1250 if(c==0x49) {
1251 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1252 return 0x69;
1253 } else if(c==0x130) {
1254 /* no simple default mapping for U+0130, use UnicodeData.txt */
1255 return 0x69;
1256 }
1257 } else {
1258 /* Turkic mappings */
1259 if(c==0x49) {
1260 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1261 return 0x131;
1262 } else if(c==0x130) {
1263 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1264 return 0x69;
1265 }
1266 }
1267 }
1268 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1269 index=UCASE_EXC_FOLD;
1270 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1271 index=UCASE_EXC_LOWER;
1272 } else {
1273 return c;
1274 }
1275 GET_SLOT_VALUE(excWord, index, pe, c);
1276 }
1277 return c;
1278 }
1279
1280 /*
1281 * Issue for canonical caseless match (UAX #21):
1282 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1283 * canonical equivalence, unlike default-option casefolding.
1284 * For example, I-grave and I + grave fold to strings that are not canonically
1285 * equivalent.
1286 * For more details, see the comment in unorm_compare() in unorm.cpp
1287 * and the intermediate prototype changes for Jitterbug 2021.
1288 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1289 *
1290 * This did not get fixed because it appears that it is not possible to fix
1291 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1292 * together in a way that they still fold to common result strings.
1293 */
1294
1295 U_CAPI int32_t U_EXPORT2
1296 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1297 const UChar **pString,
1298 uint32_t options) {
1299 static const UChar
1300 iDot[2]= { 0x69, 0x307 };
1301
1302 UChar32 result;
1303 uint16_t props;
1304
1305 result=c;
1306 GET_PROPS(csp, c, props);
1307 if(!PROPS_HAS_EXCEPTION(props)) {
1308 if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
1309 result=c+GET_SIGNED_DELTA(props);
1310 }
1311 } else {
1312 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1313 uint16_t excWord=*pe++;
1314 int32_t full, index;
1315
1316 pe2=pe;
1317
1318 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1319 /* use hardcoded conditions and mappings */
1320 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1321 /* default mappings */
1322 if(c==0x49) {
1323 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1324 return 0x69;
1325 } else if(c==0x130) {
1326 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1327 *pString=iDot;
1328 return 2;
1329 }
1330 } else {
1331 /* Turkic mappings */
1332 if(c==0x49) {
1333 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1334 return 0x131;
1335 } else if(c==0x130) {
1336 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1337 return 0x69;
1338 }
1339 }
1340 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1341 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1342
1343 /* start of full case mapping strings */
1344 ++pe;
1345
1346 /* skip the lowercase result string */
1347 pe+=full&UCASE_FULL_LOWER;
1348 full=(full>>4)&0xf;
1349
1350 if(full!=0) {
1351 /* set the output pointer to the result string */
1352 *pString=pe;
1353
1354 /* return the string length */
1355 return full;
1356 }
1357 }
1358
1359 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1360 index=UCASE_EXC_FOLD;
1361 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1362 index=UCASE_EXC_LOWER;
1363 } else {
1364 return ~c;
1365 }
1366 GET_SLOT_VALUE(excWord, index, pe2, result);
1367 }
1368
1369 return (result==c) ? ~result : result;
1370 }