]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucase.c
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / ucase.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucase.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug30
14 * created by: Markus W. Scherer
15 *
16 * Low-level Unicode character/string case mapping code.
17 * Much code moved here (and modified) from uchar.c.
18 */
19
20 #include "unicode/utypes.h"
21 #include "unicode/uset.h"
22 #include "unicode/udata.h" /* UDataInfo */
23 #include "ucmndata.h" /* DataHeader */
24 #include "udatamem.h"
25 #include "umutex.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "utrie.h"
29 #include "ucase.h"
30 #include "ucln_cmn.h"
31
32 struct UCaseProps {
33 UDataMemory *mem;
34 const int32_t *indexes;
35 const uint16_t *exceptions;
36 const UChar *unfold;
37
38 UTrie trie;
39 uint8_t formatVersion[4];
40 };
41
42 /* data loading etc. -------------------------------------------------------- */
43
44 #define UCASE_HARDCODE_DATA 1
45
46 #if UCASE_HARDCODE_DATA
47
48 /* ucase_props_data.c is machine-generated by gencase --csource */
49 #include "ucase_props_data.c"
50
51 #else
52
53 static UBool U_CALLCONV
54 isAcceptable(void *context,
55 const char *type, const char *name,
56 const UDataInfo *pInfo) {
57 if(
58 pInfo->size>=20 &&
59 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
60 pInfo->charsetFamily==U_CHARSET_FAMILY &&
61 pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */
62 pInfo->dataFormat[1]==UCASE_FMT_1 &&
63 pInfo->dataFormat[2]==UCASE_FMT_2 &&
64 pInfo->dataFormat[3]==UCASE_FMT_3 &&
65 pInfo->formatVersion[0]==1 &&
66 pInfo->formatVersion[2]==UTRIE_SHIFT &&
67 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
68 ) {
69 UCaseProps *csp=(UCaseProps *)context;
70 uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
71 return TRUE;
72 } else {
73 return FALSE;
74 }
75 }
76
77 static UCaseProps *
78 ucase_openData(UCaseProps *cspProto,
79 const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
80 UCaseProps *csp;
81 int32_t size;
82
83 cspProto->indexes=(const int32_t *)bin;
84 if( (length>=0 && length<16*4) ||
85 cspProto->indexes[UCASE_IX_INDEX_TOP]<16
86 ) {
87 /* length or indexes[] too short for minimum indexes[] length of 16 */
88 *pErrorCode=U_INVALID_FORMAT_ERROR;
89 return NULL;
90 }
91 size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
92 if(length>=0) {
93 if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
94 length-=size;
95 } else {
96 /* length too short for indexes[] or for the whole data length */
97 *pErrorCode=U_INVALID_FORMAT_ERROR;
98 return NULL;
99 }
100 }
101 bin+=size;
102 /* from here on, assume that the sizes of the items fit into the total length */
103
104 /* unserialize the trie, after indexes[] */
105 size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
106 utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
107 if(U_FAILURE(*pErrorCode)) {
108 return NULL;
109 }
110 bin+=size;
111
112 /* get exceptions[] */
113 size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
114 cspProto->exceptions=(const uint16_t *)bin;
115 bin+=size;
116
117 /* get unfold[] */
118 size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
119 if(size!=0) {
120 cspProto->unfold=(const UChar *)bin;
121 bin+=size;
122 } else {
123 cspProto->unfold=NULL;
124 }
125
126 /* allocate, copy, and return the new UCaseProps */
127 csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
128 if(csp==NULL) {
129 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
130 return NULL;
131 } else {
132 uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
133 return csp;
134 }
135 }
136
137 U_CAPI UCaseProps * U_EXPORT2
138 ucase_open(UErrorCode *pErrorCode) {
139 UCaseProps cspProto={ NULL }, *csp;
140
141 cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
142 if(U_FAILURE(*pErrorCode)) {
143 return NULL;
144 }
145
146 csp=ucase_openData(
147 &cspProto,
148 udata_getMemory(cspProto.mem),
149 udata_getLength(cspProto.mem),
150 pErrorCode);
151 if(U_FAILURE(*pErrorCode)) {
152 udata_close(cspProto.mem);
153 return NULL;
154 } else {
155 return csp;
156 }
157 }
158
159 U_CAPI UCaseProps * U_EXPORT2
160 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
161 UCaseProps cspProto={ NULL };
162 const DataHeader *hdr;
163
164 if(U_FAILURE(*pErrorCode)) {
165 return NULL;
166 }
167 if(bin==NULL) {
168 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
169 return NULL;
170 }
171
172 /* check the header */
173 if(length>=0 && length<20) {
174 *pErrorCode=U_INVALID_FORMAT_ERROR;
175 return NULL;
176 }
177 hdr=(const DataHeader *)bin;
178 if(
179 !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
180 hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
181 isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
182 ) {
183 *pErrorCode=U_INVALID_FORMAT_ERROR;
184 return NULL;
185 }
186
187 bin+=hdr->dataHeader.headerSize;
188 if(length>=0) {
189 length-=hdr->dataHeader.headerSize;
190 }
191 return ucase_openData(&cspProto, bin, length, pErrorCode);
192 }
193
194 #endif
195
196 U_CAPI void U_EXPORT2
197 ucase_close(UCaseProps *csp) {
198 if(csp!=NULL) {
199 #if !UCASE_HARDCODE_DATA
200 udata_close(csp->mem);
201 #endif
202 uprv_free(csp);
203 }
204 }
205
206 /* UCaseProps singleton ----------------------------------------------------- */
207
208 static UCaseProps *gCsp=NULL, *gCspDummy=NULL;
209 #if !UCASE_HARDCODE_DATA
210 static UErrorCode gErrorCode=U_ZERO_ERROR;
211 static int8_t gHaveData=0;
212 #endif
213
214 static UBool U_CALLCONV ucase_cleanup(void) {
215 ucase_close(gCsp);
216 gCsp=NULL;
217 ucase_close(gCspDummy);
218 gCspDummy=NULL;
219 #if !UCASE_HARDCODE_DATA
220 gErrorCode=U_ZERO_ERROR;
221 gHaveData=0;
222 #endif
223 return TRUE;
224 }
225
226 U_CAPI const UCaseProps * U_EXPORT2
227 ucase_getSingleton(UErrorCode *pErrorCode) {
228 #if UCASE_HARDCODE_DATA
229 if(U_FAILURE(*pErrorCode)) {
230 return NULL;
231 }
232 return &ucase_props_singleton;
233 #else
234 int8_t haveData;
235
236 if(U_FAILURE(*pErrorCode)) {
237 return NULL;
238 }
239
240 UMTX_CHECK(NULL, gHaveData, haveData);
241
242 if(haveData>0) {
243 /* data was loaded */
244 return gCsp;
245 } else if(haveData<0) {
246 /* data loading failed */
247 *pErrorCode=gErrorCode;
248 return NULL;
249 } else /* haveData==0 */ {
250 /* load the data */
251 UCaseProps *csp=ucase_open(pErrorCode);
252 if(U_FAILURE(*pErrorCode)) {
253 gHaveData=-1;
254 gErrorCode=*pErrorCode;
255 return NULL;
256 }
257
258 /* set the static variables */
259 umtx_lock(NULL);
260 if(gCsp==NULL) {
261 gCsp=csp;
262 csp=NULL;
263 gHaveData=1;
264 ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
265 }
266 umtx_unlock(NULL);
267
268 ucase_close(csp);
269 return gCsp;
270 }
271 #endif
272 }
273
274 U_CAPI const UCaseProps * U_EXPORT2
275 ucase_getDummy(UErrorCode *pErrorCode) {
276 UCaseProps *csp;
277
278 if(U_FAILURE(*pErrorCode)) {
279 return NULL;
280 }
281
282 UMTX_CHECK(NULL, gCspDummy, csp);
283
284 if(csp!=NULL) {
285 /* the dummy object was already created */
286 return csp;
287 } else /* csp==NULL */ {
288 /* create the dummy object */
289 int32_t *indexes;
290
291 csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)+UCASE_IX_TOP*4+UTRIE_DUMMY_SIZE);
292 if(csp==NULL) {
293 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
294 return NULL;
295 }
296 uprv_memset(csp, 0, sizeof(UCaseProps)+UCASE_IX_TOP*4);
297
298 csp->indexes=indexes=(int32_t *)(csp+1);
299 indexes[UCASE_IX_INDEX_TOP]=UCASE_IX_TOP;
300
301 indexes[UCASE_IX_TRIE_SIZE]=
302 utrie_unserializeDummy(&csp->trie, indexes+UCASE_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode);
303 if(U_FAILURE(*pErrorCode)) {
304 uprv_free(csp);
305 return NULL;
306 }
307
308 csp->formatVersion[0]=1;
309 csp->formatVersion[2]=UTRIE_SHIFT;
310 csp->formatVersion[3]=UTRIE_INDEX_SHIFT;
311
312 /* set the static variables */
313 umtx_lock(NULL);
314 if(gCspDummy==NULL) {
315 gCspDummy=csp;
316 csp=NULL;
317 ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
318 }
319 umtx_unlock(NULL);
320
321 uprv_free(csp);
322 return gCspDummy;
323 }
324 }
325
326 /* set of property starts for UnicodeSet ------------------------------------ */
327
328 static UBool U_CALLCONV
329 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
330 /* add the start code point to the USet */
331 const USetAdder *sa=(const USetAdder *)context;
332 sa->add(sa->set, start);
333 return TRUE;
334 }
335
336 U_CAPI void U_EXPORT2
337 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
338 if(U_FAILURE(*pErrorCode)) {
339 return;
340 }
341
342 /* add the start code point of each same-value range of the trie */
343 utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
344
345 /* add code points with hardcoded properties, plus the ones following them */
346
347 /* (none right now, see comment below) */
348
349 /*
350 * Omit code points with hardcoded specialcasing properties
351 * because we do not build property UnicodeSets for them right now.
352 */
353 }
354
355 /* data access primitives --------------------------------------------------- */
356
/*
 * Fetch the 16-bit properties word for code point c into result.
 * UTRIE_GET16() itself validates c.
 * The definition deliberately does not end with a semicolon: the caller
 * writes the terminating one, so that no extra empty statement is generated.
 */
#define GET_PROPS(csp, c, result) \
    UTRIE_GET16(&(csp)->trie, c, result)

/* pointer to the first exceptions[] word for a properties word that has an exception */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* non-zero if the properties word refers to exceptions[] */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
364
/*
 * flagsOffset[b] is the number of set bits in the 8-bit value b.
 *
 * The table is generated by macros: the entry for a value is the bit count
 * of its upper bits plus the bit count (0, 1, 1, 2) of its lowest two bits.
 */
#define UCASE_BITS_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS_4(n) UCASE_BITS_2(n), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+2)
#define UCASE_BITS_6(n) UCASE_BITS_4(n), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+2)

static const uint8_t flagsOffset[256]={
    UCASE_BITS_6(0), UCASE_BITS_6(1), UCASE_BITS_6(1), UCASE_BITS_6(2)
};

#undef UCASE_BITS_6
#undef UCASE_BITS_4
#undef UCASE_BITS_2
384
/* non-zero if the exceptions word has the optional-value slot with this index */
#define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
/* number of slots that are present before the slot with this index */
#define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]

/*
 * Get the value of an optional-value slot.
 * The macro does not test for the slot itself: the caller must make sure
 * that HAS_SLOT(excWord, index) before using it.
 *
 * Expands to an if/else statement; always use it inside braces.
 *
 * @param excWord (in) initial exceptions word
 * @param index (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output: one uint16_t, or two
 *              combined high-then-low if excWord has UCASE_EXC_DOUBLE_SLOTS
 */
#define GET_SLOT_VALUE(excWord, index, pExc16, value) \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, index); \
        (value)=*(pExc16); \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, index); \
        (value)=*(pExc16)++; \
        (value)=((value)<<16)|*(pExc16); \
    }
406
407 /* simple case mappings ----------------------------------------------------- */
408
409 U_CAPI UChar32 U_EXPORT2
410 ucase_tolower(const UCaseProps *csp, UChar32 c) {
411 uint16_t props;
412 GET_PROPS(csp, c, props);
413 if(!PROPS_HAS_EXCEPTION(props)) {
414 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
415 c+=UCASE_GET_DELTA(props);
416 }
417 } else {
418 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
419 uint16_t excWord=*pe++;
420 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
421 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
422 }
423 }
424 return c;
425 }
426
427 U_CAPI UChar32 U_EXPORT2
428 ucase_toupper(const UCaseProps *csp, UChar32 c) {
429 uint16_t props;
430 GET_PROPS(csp, c, props);
431 if(!PROPS_HAS_EXCEPTION(props)) {
432 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
433 c+=UCASE_GET_DELTA(props);
434 }
435 } else {
436 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
437 uint16_t excWord=*pe++;
438 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
439 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
440 }
441 }
442 return c;
443 }
444
445 U_CAPI UChar32 U_EXPORT2
446 ucase_totitle(const UCaseProps *csp, UChar32 c) {
447 uint16_t props;
448 GET_PROPS(csp, c, props);
449 if(!PROPS_HAS_EXCEPTION(props)) {
450 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
451 c+=UCASE_GET_DELTA(props);
452 }
453 } else {
454 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
455 uint16_t excWord=*pe++;
456 int32_t index;
457 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
458 index=UCASE_EXC_TITLE;
459 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
460 index=UCASE_EXC_UPPER;
461 } else {
462 return c;
463 }
464 GET_SLOT_VALUE(excWord, index, pe, c);
465 }
466 return c;
467 }
468
469 U_CAPI void U_EXPORT2
470 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
471 uint16_t props;
472
473 /*
474 * Hardcode the case closure of i and its relatives and ignore the
475 * data file data for these characters.
476 * The Turkic dotless i and dotted I with their case mapping conditions
477 * and case folding option make the related characters behave specially.
478 * This code matches their closure behavior to their case folding behavior.
479 */
480 static const UChar
481 iDot[2]= { 0x69, 0x307 };
482
483 switch(c) {
484 case 0x49:
485 /* regular i and I are in one equivalence class */
486 sa->add(sa->set, 0x69);
487 return;
488 case 0x69:
489 sa->add(sa->set, 0x49);
490 return;
491 case 0x130:
492 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
493 sa->addString(sa->set, iDot, 2);
494 return;
495 case 0x131:
496 /* dotless i is in a class by itself */
497 return;
498 default:
499 /* otherwise use the data file data */
500 break;
501 }
502
503 GET_PROPS(csp, c, props);
504 if(!PROPS_HAS_EXCEPTION(props)) {
505 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
506 /* add the one simple case mapping, no matter what type it is */
507 int32_t delta=UCASE_GET_DELTA(props);
508 if(delta!=0) {
509 sa->add(sa->set, c+delta);
510 }
511 }
512 } else {
513 /*
514 * c has exceptions, so there may be multiple simple and/or
515 * full case mappings. Add them all.
516 */
517 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
518 const UChar *closure;
519 uint16_t excWord=*pe++;
520 int32_t index, closureLength, fullLength, length;
521
522 pe0=pe;
523
524 /* add all simple case mappings */
525 for(index=UCASE_EXC_LOWER; index<=UCASE_EXC_TITLE; ++index) {
526 if(HAS_SLOT(excWord, index)) {
527 pe=pe0;
528 GET_SLOT_VALUE(excWord, index, pe, c);
529 sa->add(sa->set, c);
530 }
531 }
532
533 /* get the closure string pointer & length */
534 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
535 pe=pe0;
536 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
537 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
538 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
539 } else {
540 closureLength=0;
541 closure=NULL;
542 }
543
544 /* add the full case folding */
545 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
546 pe=pe0;
547 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
548
549 /* start of full case mapping strings */
550 ++pe;
551
552 fullLength&=0xffff; /* bits 16 and higher are reserved */
553
554 /* skip the lowercase result string */
555 pe+=fullLength&UCASE_FULL_LOWER;
556 fullLength>>=4;
557
558 /* add the full case folding string */
559 length=fullLength&0xf;
560 if(length!=0) {
561 sa->addString(sa->set, (const UChar *)pe, length);
562 pe+=length;
563 }
564
565 /* skip the uppercase and titlecase strings */
566 fullLength>>=4;
567 pe+=fullLength&0xf;
568 fullLength>>=4;
569 pe+=fullLength;
570
571 closure=(const UChar *)pe; /* behind full case mappings */
572 }
573
574 /* add each code point in the closure string */
575 for(index=0; index<closureLength;) {
576 U16_NEXT_UNSAFE(closure, index, c);
577 sa->add(sa->set, c);
578 }
579 }
580 }
581
582 /*
583 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
584 * must be length>0 and max>0 and length<=max
585 */
586 static U_INLINE int32_t
587 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
588 int32_t c1, c2;
589
590 max-=length; /* we require length<=max, so no need to decrement max in the loop */
591 do {
592 c1=*s++;
593 c2=*t++;
594 if(c2==0) {
595 return 1; /* reached the end of t but not of s */
596 }
597 c1-=c2;
598 if(c1!=0) {
599 return c1; /* return difference result */
600 }
601 } while(--length>0);
602 /* ends with length==0 */
603
604 if(max==0 || *t==0) {
605 return 0; /* equal to length of both strings */
606 } else {
607 return -max; /* return lengh difference */
608 }
609 }
610
611 U_CAPI UBool U_EXPORT2
612 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
613 const UChar *unfold, *p;
614 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
615
616 if(csp->unfold==NULL || s==NULL) {
617 return FALSE; /* no reverse case folding data, or no string */
618 }
619 if(length<=1) {
620 /* the string is too short to find any match */
621 /*
622 * more precise would be:
623 * if(!u_strHasMoreChar32Than(s, length, 1))
624 * but this does not make much practical difference because
625 * a single supplementary code point would just not be found
626 */
627 return FALSE;
628 }
629
630 unfold=csp->unfold;
631 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
632 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
633 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
634 unfold+=unfoldRowWidth;
635
636 if(length>unfoldStringWidth) {
637 /* the string is too long to find any match */
638 return FALSE;
639 }
640
641 /* do a binary search for the string */
642 start=0;
643 limit=unfoldRows;
644 while(start<limit) {
645 i=(start+limit)/2;
646 p=unfold+(i*unfoldRowWidth);
647 result=strcmpMax(s, length, p, unfoldStringWidth);
648
649 if(result==0) {
650 /* found the string: add each code point, and its case closure */
651 UChar32 c;
652
653 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
654 U16_NEXT_UNSAFE(p, i, c);
655 sa->add(sa->set, c);
656 ucase_addCaseClosure(csp, c, sa);
657 }
658 return TRUE;
659 } else if(result<0) {
660 limit=i;
661 } else /* result>0 */ {
662 start=i+1;
663 }
664 }
665
666 return FALSE; /* string not found */
667 }
668
669 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
670 U_CAPI int32_t U_EXPORT2
671 ucase_getType(const UCaseProps *csp, UChar32 c) {
672 uint16_t props;
673 GET_PROPS(csp, c, props);
674 return UCASE_GET_TYPE(props);
675 }
676
677 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
678 U_CAPI int32_t U_EXPORT2
679 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
680 int32_t type;
681 uint16_t props;
682 GET_PROPS(csp, c, props);
683 type=UCASE_GET_TYPE(props);
684 if(type!=UCASE_NONE) {
685 return type;
686 } else if(
687 c==0x307 ||
688 (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE
689 ) {
690 return -1; /* case-ignorable */
691 } else {
692 return 0; /* c is neither cased nor case-ignorable */
693 }
694 }
695
696 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
697 static U_INLINE int32_t
698 getDotType(const UCaseProps *csp, UChar32 c) {
699 uint16_t props;
700 GET_PROPS(csp, c, props);
701 if(!PROPS_HAS_EXCEPTION(props)) {
702 return props&UCASE_DOT_MASK;
703 } else {
704 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
705 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
706 }
707 }
708
709 U_CAPI UBool U_EXPORT2
710 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
711 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
712 }
713
714 U_CAPI UBool U_EXPORT2
715 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
716 uint16_t props;
717 GET_PROPS(csp, c, props);
718 return (UBool)((props&UCASE_SENSITIVE)!=0);
719 }
720
721 /* string casing ------------------------------------------------------------ */
722
723 /*
724 * These internal functions form the core of string case mappings.
725 * They map single code points to result code points or strings and take
726 * all necessary conditions (context, locale ID, options) into account.
727 *
728 * They do not iterate over the source or write to the destination
729 * so that the same functions are useful for non-standard string storage,
730 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
731 * For the same reason, the "surrounding text" context is passed in as a
732 * UCaseContextIterator which does not make any assumptions about
733 * the underlying storage.
734 *
735 * This section contains helper functions that check for conditions
736 * in the input text surrounding the current code point
737 * according to SpecialCasing.txt.
738 *
739 * Each helper function gets the index
740 * - after the current code point if it looks at following text
741 * - before the current code point if it looks at preceding text
742 *
743 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
744 *
745 * Final_Sigma
746 * C is preceded by a sequence consisting of
747 * a cased letter and a case-ignorable sequence,
748 * and C is not followed by a sequence consisting of
749 * an ignorable sequence and then a cased letter.
750 *
751 * More_Above
752 * C is followed by one or more characters of combining class 230 (ABOVE)
753 * in the combining character sequence.
754 *
755 * After_Soft_Dotted
756 * The last preceding character with combining class of zero before C
757 * was Soft_Dotted,
758 * and there is no intervening combining character class 230 (ABOVE).
759 *
760 * Before_Dot
761 * C is followed by combining dot above (U+0307).
762 * Any sequence of characters with a combining class that is neither 0 nor 230
763 * may intervene between the current character and the combining dot above.
764 *
765 * The erratum from 2002-10-31 adds the condition
766 *
767 * After_I
768 * The last preceding base character was an uppercase I, and there is no
769 * intervening combining character class 230 (ABOVE).
770 *
771 * (See Jitterbug 2344 and the comments on After_I below.)
772 *
773 * Helper definitions in Unicode 3.2 UAX 21:
774 *
775 * D1. A character C is defined to be cased
776 * if it meets any of the following criteria:
777 *
778 * - The general category of C is Titlecase Letter (Lt)
779 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
780 * - Given D = NFD(C), then it is not the case that:
781 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
782 * (This third criterion does not add any characters to the list
783 * for Unicode 3.2. Ignored.)
784 *
785 * D2. A character C is defined to be case-ignorable
786 * if it meets either of the following criteria:
787 *
788 * - The general category of C is
789 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
790 * Letter Modifier (Lm), or Symbol Modifier (Sk)
791 * - C is one of the following characters
792 * U+0027 APOSTROPHE
793 * U+00AD SOFT HYPHEN (SHY)
794 * U+2019 RIGHT SINGLE QUOTATION MARK
795 * (the preferred character for apostrophe)
796 *
797 * D3. A case-ignorable sequence is a sequence of
798 * zero or more case-ignorable characters.
799 */
800
/* locale classes for case mapping, as returned and cached by ucase_getCaseLocale() */
enum {
    /* cache value: the locale has not been examined yet */
    LOC_UNKNOWN,
    /* any language that is not recognized below */
    LOC_ROOT,
    /* language code tr/tur or az/aze */
    LOC_TURKISH,
    /* language code lt/lit */
    LOC_LITHUANIAN
};
807
/*
 * Case-insensitive tests for single letters of a locale ID.
 * Note: each macro may evaluate its argument more than once.
 */
#define UCASE_IS_LETTER(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_a(c) UCASE_IS_LETTER(c, 'a', 'A')
#define is_e(c) UCASE_IS_LETTER(c, 'e', 'E')
#define is_i(c) UCASE_IS_LETTER(c, 'i', 'I')
#define is_l(c) UCASE_IS_LETTER(c, 'l', 'L')
#define is_r(c) UCASE_IS_LETTER(c, 'r', 'R')
#define is_t(c) UCASE_IS_LETTER(c, 't', 'T')
#define is_u(c) UCASE_IS_LETTER(c, 'u', 'U')
#define is_z(c) UCASE_IS_LETTER(c, 'z', 'Z')

/* separator? (underscore, hyphen, or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
819
820 /**
821 * Requires non-NULL locale ID but otherwise does the equivalent of
822 * checking for language codes as if uloc_getLanguage() were called:
823 * Accepts both 2- and 3-letter codes and accepts case variants.
824 */
825 U_CFUNC int32_t
826 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
827 int32_t result;
828 char c;
829
830 if(locCache!=NULL && (result=*locCache)!=LOC_UNKNOWN) {
831 return result;
832 }
833
834 result=LOC_ROOT;
835
836 /*
837 * This function used to use uloc_getLanguage(), but the current code
838 * removes the dependency of this low-level code on uloc implementation code
839 * and is faster because not the whole locale ID has to be
840 * examined and copied/transformed.
841 *
842 * Because this code does not want to depend on uloc, the caller must
843 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
844 */
845 c=*locale++;
846 if(is_t(c)) {
847 /* tr or tur? */
848 c=*locale++;
849 if(is_u(c)) {
850 c=*locale++;
851 }
852 if(is_r(c)) {
853 c=*locale;
854 if(is_sep(c)) {
855 result=LOC_TURKISH;
856 }
857 }
858 } else if(is_a(c)) {
859 /* az or aze? */
860 c=*locale++;
861 if(is_z(c)) {
862 c=*locale++;
863 if(is_e(c)) {
864 c=*locale;
865 }
866 if(is_sep(c)) {
867 result=LOC_TURKISH;
868 }
869 }
870 } else if(is_l(c)) {
871 /* lt or lit? */
872 c=*locale++;
873 if(is_i(c)) {
874 c=*locale++;
875 }
876 if(is_t(c)) {
877 c=*locale;
878 if(is_sep(c)) {
879 result=LOC_LITHUANIAN;
880 }
881 }
882 }
883
884 if(locCache!=NULL) {
885 *locCache=result;
886 }
887 return result;
888 }
889
890 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
891 static UBool
892 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
893 UChar32 c;
894 uint16_t props;
895
896 if(iter==NULL) {
897 return FALSE;
898 }
899
900 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
901 GET_PROPS(csp, c, props);
902 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
903 return TRUE; /* followed by cased letter */
904 } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
905 /* case-ignorable, continue with the loop */
906 } else {
907 return FALSE; /* not ignorable */
908 }
909 }
910
911 return FALSE; /* not followed by cased letter */
912 }
913
914 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
915 static UBool
916 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
917 UChar32 c;
918 int32_t dotType;
919 int8_t dir;
920
921 if(iter==NULL) {
922 return FALSE;
923 }
924
925 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
926 dotType=getDotType(csp, c);
927 if(dotType==UCASE_SOFT_DOTTED) {
928 return TRUE; /* preceded by TYPE_i */
929 } else if(dotType!=UCASE_OTHER_ACCENT) {
930 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
931 }
932 }
933
934 return FALSE; /* not preceded by TYPE_i */
935 }
936
937 /*
938 * See Jitterbug 2344:
939 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
940 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
941 * we made those releases compatible with Unicode 3.2 which had not fixed
942 * a related bug in SpecialCasing.txt.
943 *
944 * From the Jitterbug 2344 text:
945 * ... this bug is listed as a Unicode erratum
946 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
947 * <quote>
948 * There are two errors in SpecialCasing.txt.
949 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
950 * 2. An incorrect context definition. Correct as follows:
951 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
952 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
953 * ---
954 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
955 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
956 * where the context After_I is defined as:
957 * The last preceding base character was an uppercase I, and there is no
958 * intervening combining character class 230 (ABOVE).
959 * </quote>
960 *
961 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
962 *
963 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
964 * # This matches the behavior of the canonically equivalent I-dot_above
965 *
966 * See also the description in this place in older versions of uchar.c (revision 1.100).
967 *
968 * Markus W. Scherer 2003-feb-15
969 */
970
971 /* Is preceded by base character 'I' with no intervening cc=230 ? */
972 static UBool
973 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
974 UChar32 c;
975 int32_t dotType;
976 int8_t dir;
977
978 if(iter==NULL) {
979 return FALSE;
980 }
981
982 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
983 if(c==0x49) {
984 return TRUE; /* preceded by I */
985 }
986 dotType=getDotType(csp, c);
987 if(dotType!=UCASE_OTHER_ACCENT) {
988 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
989 }
990 }
991
992 return FALSE; /* not preceded by I */
993 }
994
995 /* Is followed by one or more cc==230 ? */
996 static UBool
997 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
998 UChar32 c;
999 int32_t dotType;
1000 int8_t dir;
1001
1002 if(iter==NULL) {
1003 return FALSE;
1004 }
1005
1006 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1007 dotType=getDotType(csp, c);
1008 if(dotType==UCASE_ABOVE) {
1009 return TRUE; /* at least one cc==230 following */
1010 } else if(dotType!=UCASE_OTHER_ACCENT) {
1011 return FALSE; /* next base character, no more cc==230 following */
1012 }
1013 }
1014
1015 return FALSE; /* no more cc==230 following */
1016 }
1017
1018 /* Is followed by a dot above (without cc==230 in between) ? */
1019 static UBool
1020 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
1021 UChar32 c;
1022 int32_t dotType;
1023 int8_t dir;
1024
1025 if(iter==NULL) {
1026 return FALSE;
1027 }
1028
1029 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1030 if(c==0x307) {
1031 return TRUE;
1032 }
1033 dotType=getDotType(csp, c);
1034 if(dotType!=UCASE_OTHER_ACCENT) {
1035 return FALSE; /* next base character or cc==230 in between */
1036 }
1037 }
1038
1039 return FALSE; /* no dot above following */
1040 }
1041
1042 U_CAPI int32_t U_EXPORT2
1043 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
1044 UCaseContextIterator *iter, void *context,
1045 const UChar **pString,
1046 const char *locale, int32_t *locCache) {
1047 static const UChar
1048 iDot[2]= { 0x69, 0x307 },
1049 jDot[2]= { 0x6a, 0x307 },
1050 iOgonekDot[3]= { 0x12f, 0x307 },
1051 iDotGrave[3]= { 0x69, 0x307, 0x300 },
1052 iDotAcute[3]= { 0x69, 0x307, 0x301 },
1053 iDotTilde[3]= { 0x69, 0x307, 0x303 };
1054
1055 UChar32 result;
1056 uint16_t props;
1057
1058 result=c;
1059 GET_PROPS(csp, c, props);
1060 if(!PROPS_HAS_EXCEPTION(props)) {
1061 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1062 result=c+UCASE_GET_DELTA(props);
1063 }
1064 } else {
1065 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1066 uint16_t excWord=*pe++;
1067 int32_t full;
1068
1069 pe2=pe;
1070
1071 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1072 /* use hardcoded conditions and mappings */
1073 int32_t loc=ucase_getCaseLocale(locale, locCache);
1074
1075 /*
1076 * Test for conditional mappings first
1077 * (otherwise the unconditional default mappings are always taken),
1078 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1079 * then get the UnicodeData.txt mappings.
1080 */
1081 if( loc==LOC_LITHUANIAN &&
1082 /* base characters, find accents above */
1083 (((c==0x49 || c==0x4a || c==0x12e) &&
1084 isFollowedByMoreAbove(csp, iter, context)) ||
1085 /* precomposed with accent above, no need to find one */
1086 (c==0xcc || c==0xcd || c==0x128))
1087 ) {
1088 /*
1089 # Lithuanian
1090
1091 # Lithuanian retains the dot in a lowercase i when followed by accents.
1092
1093 # Introduce an explicit dot above when lowercasing capital I's and J's
1094 # whenever there are more accents above.
1095 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1096
1097 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1098 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1099 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1100 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1101 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1102 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1103 */
1104 switch(c) {
1105 case 0x49: /* LATIN CAPITAL LETTER I */
1106 *pString=iDot;
1107 return 2;
1108 case 0x4a: /* LATIN CAPITAL LETTER J */
1109 *pString=jDot;
1110 return 2;
1111 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1112 *pString=iOgonekDot;
1113 return 2;
1114 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1115 *pString=iDotGrave;
1116 return 3;
1117 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1118 *pString=iDotAcute;
1119 return 3;
1120 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1121 *pString=iDotTilde;
1122 return 3;
1123 default:
1124 return 0; /* will not occur */
1125 }
1126 /* # Turkish and Azeri */
1127 } else if(loc==LOC_TURKISH && c==0x130) {
1128 /*
1129 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1130 # The following rules handle those cases.
1131
1132 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1133 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1134 */
1135 return 0x69;
1136 } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
1137 /*
1138 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1139 # This matches the behavior of the canonically equivalent I-dot_above
1140
1141 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1142 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1143 */
1144 return 0; /* remove the dot (continue without output) */
1145 } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
1146 /*
1147 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1148
1149 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1150 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1151 */
1152 return 0x131;
1153 } else if(c==0x130) {
1154 /*
1155 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1156
1157 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1158 */
1159 *pString=iDot;
1160 return 2;
1161 } else if( c==0x3a3 &&
1162 !isFollowedByCasedLetter(csp, iter, context, 1) &&
1163 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
1164 ) {
1165 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1166 /*
1167 # Special case for final form of sigma
1168
1169 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1170 */
1171 return 0x3c2; /* greek small final sigma */
1172 } else {
1173 /* no known conditional special case mapping, use a normal mapping */
1174 }
1175 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1176 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1177 full&=UCASE_FULL_LOWER;
1178 if(full!=0) {
1179 /* set the output pointer to the lowercase mapping */
1180 *pString=pe+1;
1181
1182 /* return the string length */
1183 return full;
1184 }
1185 }
1186
1187 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1188 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1189 }
1190 }
1191
1192 return (result==c) ? ~result : result;
1193 }
1194
1195 /* internal */
1196 static int32_t
1197 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
1198 UCaseContextIterator *iter, void *context,
1199 const UChar **pString,
1200 const char *locale, int32_t *locCache,
1201 UBool upperNotTitle) {
1202 UChar32 result;
1203 uint16_t props;
1204
1205 result=c;
1206 GET_PROPS(csp, c, props);
1207 if(!PROPS_HAS_EXCEPTION(props)) {
1208 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1209 result=c+UCASE_GET_DELTA(props);
1210 }
1211 } else {
1212 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1213 uint16_t excWord=*pe++;
1214 int32_t full, index;
1215
1216 pe2=pe;
1217
1218 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1219 /* use hardcoded conditions and mappings */
1220 int32_t loc=ucase_getCaseLocale(locale, locCache);
1221
1222 if(loc==LOC_TURKISH && c==0x69) {
1223 /*
1224 # Turkish and Azeri
1225
1226 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1227 # The following rules handle those cases.
1228
1229 # When uppercasing, i turns into a dotted capital I
1230
1231 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1232 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1233 */
1234 return 0x130;
1235 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
1236 /*
1237 # Lithuanian
1238
1239 # Lithuanian retains the dot in a lowercase i when followed by accents.
1240
1241 # Remove DOT ABOVE after "i" with upper or titlecase
1242
1243 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1244 */
1245 return 0; /* remove the dot (continue without output) */
1246 } else {
1247 /* no known conditional special case mapping, use a normal mapping */
1248 }
1249 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1250 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1251
1252 /* start of full case mapping strings */
1253 ++pe;
1254
1255 /* skip the lowercase and case-folding result strings */
1256 pe+=full&UCASE_FULL_LOWER;
1257 full>>=4;
1258 pe+=full&0xf;
1259 full>>=4;
1260
1261 if(upperNotTitle) {
1262 full&=0xf;
1263 } else {
1264 /* skip the uppercase result string */
1265 pe+=full&0xf;
1266 full=(full>>4)&0xf;
1267 }
1268
1269 if(full!=0) {
1270 /* set the output pointer to the result string */
1271 *pString=pe;
1272
1273 /* return the string length */
1274 return full;
1275 }
1276 }
1277
1278 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1279 index=UCASE_EXC_TITLE;
1280 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1281 /* here, titlecase is same as uppercase */
1282 index=UCASE_EXC_UPPER;
1283 } else {
1284 return ~c;
1285 }
1286 GET_SLOT_VALUE(excWord, index, pe2, result);
1287 }
1288
1289 return (result==c) ? ~result : result;
1290 }
1291
1292 U_CAPI int32_t U_EXPORT2
1293 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1294 UCaseContextIterator *iter, void *context,
1295 const UChar **pString,
1296 const char *locale, int32_t *locCache) {
1297 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1298 }
1299
1300 U_CAPI int32_t U_EXPORT2
1301 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1302 UCaseContextIterator *iter, void *context,
1303 const UChar **pString,
1304 const char *locale, int32_t *locCache) {
1305 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1306 }
1307
1308 /* case folding ------------------------------------------------------------- */
1309
1310 /*
1311 * Case folding is similar to lowercasing.
1312 * The result may be a simple mapping, i.e., a single code point, or
1313 * a full mapping, i.e., a string.
1314 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1315 * then only the lowercase mapping is stored.
1316 *
1317 * Some special cases are hardcoded because their conditions cannot be
1318 * parsed and processed from CaseFolding.txt.
1319 *
1320 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1321
1322 # C: common case folding, common mappings shared by both simple and full mappings.
1323 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1324 # S: simple case folding, mappings to single characters where different from F.
1325 # T: special case for uppercase I and dotted uppercase I
1326 # - For non-Turkic languages, this mapping is normally not used.
1327 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1328 #
1329 # Usage:
1330 # A. To do a simple case folding, use the mappings with status C + S.
1331 # B. To do a full case folding, use the mappings with status C + F.
1332 #
1333 # The mappings with status T can be used or omitted depending on the desired case-folding
1334 # behavior. (The default option is to exclude them.)
1335
1336 * Unicode 3.2 has 'T' mappings as follows:
1337
1338 0049; T; 0131; # LATIN CAPITAL LETTER I
1339 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1340
1341 * while the default mappings for these code points are:
1342
1343 0049; C; 0069; # LATIN CAPITAL LETTER I
1344 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1345
1346 * U+0130 has no simple case folding (simple-case-folds to itself).
1347 */
1348
1349 /* return the simple case folding mapping for c */
1350 U_CAPI UChar32 U_EXPORT2
1351 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1352 uint16_t props;
1353 GET_PROPS(csp, c, props);
1354 if(!PROPS_HAS_EXCEPTION(props)) {
1355 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1356 c+=UCASE_GET_DELTA(props);
1357 }
1358 } else {
1359 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1360 uint16_t excWord=*pe++;
1361 int32_t index;
1362 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1363 /* special case folding mappings, hardcoded */
1364 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1365 /* default mappings */
1366 if(c==0x49) {
1367 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1368 return 0x69;
1369 } else if(c==0x130) {
1370 /* no simple case folding for U+0130 */
1371 return c;
1372 }
1373 } else {
1374 /* Turkic mappings */
1375 if(c==0x49) {
1376 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1377 return 0x131;
1378 } else if(c==0x130) {
1379 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1380 return 0x69;
1381 }
1382 }
1383 }
1384 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1385 index=UCASE_EXC_FOLD;
1386 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1387 index=UCASE_EXC_LOWER;
1388 } else {
1389 return c;
1390 }
1391 GET_SLOT_VALUE(excWord, index, pe, c);
1392 }
1393 return c;
1394 }
1395
1396 /*
1397 * Issue for canonical caseless match (UAX #21):
1398 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1399 * canonical equivalence, unlike default-option casefolding.
1400 * For example, I-grave and I + grave fold to strings that are not canonically
1401 * equivalent.
1402 * For more details, see the comment in unorm_compare() in unorm.cpp
1403 * and the intermediate prototype changes for Jitterbug 2021.
1404 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1405 *
1406 * This did not get fixed because it appears that it is not possible to fix
1407 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1408 * together in a way that they still fold to common result strings.
1409 */
1410
1411 U_CAPI int32_t U_EXPORT2
1412 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1413 const UChar **pString,
1414 uint32_t options) {
1415 static const UChar
1416 iDot[2]= { 0x69, 0x307 };
1417
1418 UChar32 result;
1419 uint16_t props;
1420
1421 result=c;
1422 GET_PROPS(csp, c, props);
1423 if(!PROPS_HAS_EXCEPTION(props)) {
1424 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1425 result=c+UCASE_GET_DELTA(props);
1426 }
1427 } else {
1428 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1429 uint16_t excWord=*pe++;
1430 int32_t full, index;
1431
1432 pe2=pe;
1433
1434 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1435 /* use hardcoded conditions and mappings */
1436 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1437 /* default mappings */
1438 if(c==0x49) {
1439 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1440 return 0x69;
1441 } else if(c==0x130) {
1442 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1443 *pString=iDot;
1444 return 2;
1445 }
1446 } else {
1447 /* Turkic mappings */
1448 if(c==0x49) {
1449 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1450 return 0x131;
1451 } else if(c==0x130) {
1452 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1453 return 0x69;
1454 }
1455 }
1456 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1457 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1458
1459 /* start of full case mapping strings */
1460 ++pe;
1461
1462 /* skip the lowercase result string */
1463 pe+=full&UCASE_FULL_LOWER;
1464 full=(full>>4)&0xf;
1465
1466 if(full!=0) {
1467 /* set the output pointer to the result string */
1468 *pString=pe;
1469
1470 /* return the string length */
1471 return full;
1472 }
1473 }
1474
1475 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1476 index=UCASE_EXC_FOLD;
1477 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1478 index=UCASE_EXC_LOWER;
1479 } else {
1480 return ~c;
1481 }
1482 GET_SLOT_VALUE(excWord, index, pe2, result);
1483 }
1484
1485 return (result==c) ? ~result : result;
1486 }
1487
1488 /* case mapping properties API ---------------------------------------------- */
1489
1490 /* get the UCaseProps singleton, or else its dummy, once and for all */
1491 static const UCaseProps *
1492 getCaseProps() {
1493 /*
1494 * This lazy intialization with double-checked locking (without mutex protection for
1495 * the initial check) is transiently unsafe under certain circumstances.
1496 * Check the readme and use u_init() if necessary.
1497 */
1498
1499 /* the initial check is performed by the GET_CASE_PROPS() macro */
1500 const UCaseProps *csp;
1501 UErrorCode errorCode=U_ZERO_ERROR;
1502
1503 csp=ucase_getSingleton(&errorCode);
1504 if(U_FAILURE(errorCode)) {
1505 errorCode=U_ZERO_ERROR;
1506 csp=ucase_getDummy(&errorCode);
1507 if(U_FAILURE(errorCode)) {
1508 return NULL;
1509 }
1510 }
1511
1512 return csp;
1513 }
1514
1515 /*
1516 * In ICU 3.0, most Unicode properties were loaded from uprops.icu.
1517 * ICU 3.2 adds ucase.icu for case mapping properties.
1518 * ICU 3.4 adds ubidi.icu for bidi/shaping properties and
1519 * removes case/bidi/shaping properties from uprops.icu.
1520 *
1521 * Loading of uprops.icu was never mutex-protected and required u_init()
1522 * for thread safety.
1523 * In order to maintain performance for all such properties,
1524 * ucase.icu and ubidi.icu are loaded lazily, without mutexing.
1525 * u_init() will try to load them for thread safety,
1526 * but u_init() will not fail if they are missing.
1527 *
1528 * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
1529 * and an error code for load failure.
1530 * Instead, here we try to load at most once.
1531 * If it works, we use the resulting singleton object.
1532 * If it fails, then we get a dummy object, which always works unless
1533 * we are seriously out of memory.
1534 * After the first try, we have a never-changing pointer to either the
1535 * real singleton or the dummy.
1536 *
1537 * This method is used in Unicode properties APIs (uchar.h) that
1538 * do not have a service object and also do not have an error code parameter.
1539 * Other API implementations get the singleton themselves
1540 * (with mutexing), store it in the service object, and report errors.
1541 */
/* Fast path: use gCsp once it is non-NULL; call getCaseProps() only while it is still NULL. */
#define GET_CASE_PROPS() ((gCsp==NULL) ? getCaseProps() : gCsp)
1543
1544 /* public API (see uchar.h) */
1545
1546 U_CAPI UBool U_EXPORT2
1547 u_isULowercase(UChar32 c) {
1548 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1549 }
1550
1551 U_CAPI UBool U_EXPORT2
1552 u_isUUppercase(UChar32 c) {
1553 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1554 }
1555
1556 /* Transforms the Unicode character to its lower case equivalent.*/
1557 U_CAPI UChar32 U_EXPORT2
1558 u_tolower(UChar32 c) {
1559 return ucase_tolower(GET_CASE_PROPS(), c);
1560 }
1561
1562 /* Transforms the Unicode character to its upper case equivalent.*/
1563 U_CAPI UChar32 U_EXPORT2
1564 u_toupper(UChar32 c) {
1565 return ucase_toupper(GET_CASE_PROPS(), c);
1566 }
1567
1568 /* Transforms the Unicode character to its title case equivalent.*/
1569 U_CAPI UChar32 U_EXPORT2
1570 u_totitle(UChar32 c) {
1571 return ucase_totitle(GET_CASE_PROPS(), c);
1572 }
1573
1574 /* return the simple case folding mapping for c */
1575 U_CAPI UChar32 U_EXPORT2
1576 u_foldCase(UChar32 c, uint32_t options) {
1577 return ucase_fold(GET_CASE_PROPS(), c, options);
1578 }
1579
1580 U_CFUNC int32_t U_EXPORT2
1581 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1582 /* case mapping properties */
1583 const UCaseProps *csp=GET_CASE_PROPS();
1584 if(csp==NULL) {
1585 return FALSE;
1586 }
1587 switch(which) {
1588 case UCHAR_LOWERCASE:
1589 return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1590 case UCHAR_UPPERCASE:
1591 return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1592 case UCHAR_SOFT_DOTTED:
1593 return ucase_isSoftDotted(csp, c);
1594 case UCHAR_CASE_SENSITIVE:
1595 return ucase_isCaseSensitive(csp, c);
1596 default:
1597 return FALSE;
1598 }
1599 }