]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucase.c
ICU-400.38.tar.gz
[apple/icu.git] / icuSources / common / ucase.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2004-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucase.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug30
14 * created by: Markus W. Scherer
15 *
16 * Low-level Unicode character/string case mapping code.
17 * Much code moved here (and modified) from uchar.c.
18 */
19
20 #include "unicode/utypes.h"
21 #include "unicode/uset.h"
22 #include "unicode/udata.h" /* UDataInfo */
23 #include "ucmndata.h" /* DataHeader */
24 #include "udatamem.h"
25 #include "umutex.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "utrie.h"
29 #include "ucase.h"
30 #include "ucln_cmn.h"
31
/*
 * Handle for one set of case-mapping properties data.
 * The pointer fields refer into the loaded data image; they are filled in
 * by ucase_openData() (or by ucase_getDummy() for the empty dummy object).
 */
32 struct UCaseProps {
33 UDataMemory *mem; /* result of udata_openChoice(); NULL when opened from a caller-supplied binary */
34 const int32_t *indexes; /* start of the data: array of int32_t sizes/lengths (UCASE_IX_...) */
35 const uint16_t *exceptions; /* exceptions[] array, located after the serialized trie */
36 const UChar *unfold; /* reverse case folding table, or NULL if its length is 0 */
37
38 UTrie trie; /* code point -> 16-bit properties word */
39 uint8_t formatVersion[4]; /* copied from UDataInfo.formatVersion by isAcceptable() */
40 };
41
42 /* data loading etc. -------------------------------------------------------- */
43
44 #if UCASE_HARDCODE_DATA
45
46 /* ucase_props_data.c is machine-generated by gencase --csource */
47 #include "ucase_props_data.c"
48
49 #else
50
51 static UBool U_CALLCONV
52 isAcceptable(void *context,
53 const char *type, const char *name,
54 const UDataInfo *pInfo) {
55 if(
56 pInfo->size>=20 &&
57 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
58 pInfo->charsetFamily==U_CHARSET_FAMILY &&
59 pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */
60 pInfo->dataFormat[1]==UCASE_FMT_1 &&
61 pInfo->dataFormat[2]==UCASE_FMT_2 &&
62 pInfo->dataFormat[3]==UCASE_FMT_3 &&
63 pInfo->formatVersion[0]==1 &&
64 pInfo->formatVersion[2]==UTRIE_SHIFT &&
65 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
66 ) {
67 UCaseProps *csp=(UCaseProps *)context;
68 uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
69 return TRUE;
70 } else {
71 return FALSE;
72 }
73 }
74
75 static UCaseProps *
76 ucase_openData(UCaseProps *cspProto,
77 const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
78 UCaseProps *csp;
79 int32_t size;
80
81 cspProto->indexes=(const int32_t *)bin;
82 if( (length>=0 && length<16*4) ||
83 cspProto->indexes[UCASE_IX_INDEX_TOP]<16
84 ) {
85 /* length or indexes[] too short for minimum indexes[] length of 16 */
86 *pErrorCode=U_INVALID_FORMAT_ERROR;
87 return NULL;
88 }
89 size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
90 if(length>=0) {
91 if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
92 length-=size;
93 } else {
94 /* length too short for indexes[] or for the whole data length */
95 *pErrorCode=U_INVALID_FORMAT_ERROR;
96 return NULL;
97 }
98 }
99 bin+=size;
100 /* from here on, assume that the sizes of the items fit into the total length */
101
102 /* unserialize the trie, after indexes[] */
103 size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
104 utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
105 if(U_FAILURE(*pErrorCode)) {
106 return NULL;
107 }
108 bin+=size;
109
110 /* get exceptions[] */
111 size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
112 cspProto->exceptions=(const uint16_t *)bin;
113 bin+=size;
114
115 /* get unfold[] */
116 size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
117 if(size!=0) {
118 cspProto->unfold=(const UChar *)bin;
119 bin+=size;
120 } else {
121 cspProto->unfold=NULL;
122 }
123
124 /* allocate, copy, and return the new UCaseProps */
125 csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
126 if(csp==NULL) {
127 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
128 return NULL;
129 } else {
130 uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
131 return csp;
132 }
133 }
134
135 U_CAPI UCaseProps * U_EXPORT2
136 ucase_open(UErrorCode *pErrorCode) {
137 UCaseProps cspProto={ NULL }, *csp;
138
139 cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
140 if(U_FAILURE(*pErrorCode)) {
141 return NULL;
142 }
143
144 csp=ucase_openData(
145 &cspProto,
146 udata_getMemory(cspProto.mem),
147 udata_getLength(cspProto.mem),
148 pErrorCode);
149 if(U_FAILURE(*pErrorCode)) {
150 udata_close(cspProto.mem);
151 return NULL;
152 } else {
153 return csp;
154 }
155 }
156
157 U_CAPI UCaseProps * U_EXPORT2
158 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
159 UCaseProps cspProto={ NULL };
160 const DataHeader *hdr;
161
162 if(U_FAILURE(*pErrorCode)) {
163 return NULL;
164 }
165 if(bin==NULL) {
166 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
167 return NULL;
168 }
169
170 /* check the header */
171 if(length>=0 && length<20) {
172 *pErrorCode=U_INVALID_FORMAT_ERROR;
173 return NULL;
174 }
175 hdr=(const DataHeader *)bin;
176 if(
177 !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
178 hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
179 isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
180 ) {
181 *pErrorCode=U_INVALID_FORMAT_ERROR;
182 return NULL;
183 }
184
185 bin+=hdr->dataHeader.headerSize;
186 if(length>=0) {
187 length-=hdr->dataHeader.headerSize;
188 }
189 return ucase_openData(&cspProto, bin, length, pErrorCode);
190 }
191
192 #endif
193
194 U_CAPI void U_EXPORT2
195 ucase_close(UCaseProps *csp) {
196 if(csp!=NULL) {
197 #if !UCASE_HARDCODE_DATA
198 udata_close(csp->mem);
199 #endif
200 uprv_free(csp);
201 }
202 }
203
204 /* UCaseProps singleton ----------------------------------------------------- */
205
206 #if !UCASE_HARDCODE_DATA
207 static UCaseProps *gCsp=NULL;
208 static UCaseProps *gCspDummy=NULL;
209 static UErrorCode gErrorCode=U_ZERO_ERROR;
210 static int8_t gHaveData=0;
211 #endif
212
213 #if !UCASE_HARDCODE_DATA
214 static UBool U_CALLCONV ucase_cleanup(void) {
215 ucase_close(gCsp);
216 gCsp=NULL;
217 ucase_close(gCspDummy);
218 gCspDummy=NULL;
219 gErrorCode=U_ZERO_ERROR;
220 gHaveData=0;
221 return TRUE;
222 }
223 #endif
224
225 U_CAPI const UCaseProps * U_EXPORT2
226 ucase_getSingleton(UErrorCode *pErrorCode) {
227 #if UCASE_HARDCODE_DATA
228 if(U_FAILURE(*pErrorCode)) {
229 return NULL;
230 }
231 return &ucase_props_singleton;
232 #else
233 int8_t haveData;
234
235 if(U_FAILURE(*pErrorCode)) {
236 return NULL;
237 }
238
239 UMTX_CHECK(NULL, gHaveData, haveData);
240
241 if(haveData>0) {
242 /* data was loaded */
243 return gCsp;
244 } else if(haveData<0) {
245 /* data loading failed */
246 *pErrorCode=gErrorCode;
247 return NULL;
248 } else /* haveData==0 */ {
249 /* load the data */
250 UCaseProps *csp=ucase_open(pErrorCode);
251 if(U_FAILURE(*pErrorCode)) {
252 gHaveData=-1;
253 gErrorCode=*pErrorCode;
254 return NULL;
255 }
256
257 /* set the static variables */
258 umtx_lock(NULL);
259 if(gCsp==NULL) {
260 gCsp=csp;
261 csp=NULL;
262 gHaveData=1;
263 ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
264 }
265 umtx_unlock(NULL);
266
267 ucase_close(csp);
268 return gCsp;
269 }
270 #endif
271 }
272
273 #if !UCASE_HARDCODE_DATA
274 U_CAPI const UCaseProps * U_EXPORT2
275 ucase_getDummy(UErrorCode *pErrorCode) {
276 UCaseProps *csp;
277
278 if(U_FAILURE(*pErrorCode)) {
279 return NULL;
280 }
281
282 UMTX_CHECK(NULL, gCspDummy, csp);
283
284 if(csp!=NULL) {
285 /* the dummy object was already created */
286 return csp;
287 } else /* csp==NULL */ {
288 /* create the dummy object */
289 int32_t *indexes;
290
291 csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)+UCASE_IX_TOP*4+UTRIE_DUMMY_SIZE);
292 if(csp==NULL) {
293 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
294 return NULL;
295 }
296 uprv_memset(csp, 0, sizeof(UCaseProps)+UCASE_IX_TOP*4);
297
298 csp->indexes=indexes=(int32_t *)(csp+1);
299 indexes[UCASE_IX_INDEX_TOP]=UCASE_IX_TOP;
300
301 indexes[UCASE_IX_TRIE_SIZE]=
302 utrie_unserializeDummy(&csp->trie, indexes+UCASE_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode);
303 if(U_FAILURE(*pErrorCode)) {
304 uprv_free(csp);
305 return NULL;
306 }
307
308 csp->formatVersion[0]=1;
309 csp->formatVersion[2]=UTRIE_SHIFT;
310 csp->formatVersion[3]=UTRIE_INDEX_SHIFT;
311
312 /* set the static variables */
313 umtx_lock(NULL);
314 if(gCspDummy==NULL) {
315 gCspDummy=csp;
316 csp=NULL;
317 ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
318 }
319 umtx_unlock(NULL);
320
321 uprv_free(csp);
322 return gCspDummy;
323 }
324 }
325 #endif
326
327 /* set of property starts for UnicodeSet ------------------------------------ */
328
329 static UBool U_CALLCONV
330 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
331 /* add the start code point to the USet */
332 const USetAdder *sa=(const USetAdder *)context;
333 sa->add(sa->set, start);
334 return TRUE;
335 }
336
337 U_CFUNC void U_EXPORT2
338 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
339 if(U_FAILURE(*pErrorCode)) {
340 return;
341 }
342
343 /* add the start code point of each same-value range of the trie */
344 utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
345
346 /* add code points with hardcoded properties, plus the ones following them */
347
348 /* (none right now, see comment below) */
349
350 /*
351 * Omit code points with hardcoded specialcasing properties
352 * because we do not build property UnicodeSets for them right now.
353 */
354 }
355
356 /* data access primitives --------------------------------------------------- */
357
358 /* UTRIE_GET16() itself validates c */
/*
 * GET_PROPS: look up the 16-bit properties word for code point c in the
 * trie of csp and store it in result.
 * Note that the expansion already ends with a semicolon; the call sites in
 * this file use it as a statement of its own.
 */
359 #define GET_PROPS(csp, c, result) \
360 UTRIE_GET16(&(csp)->trie, c, result);
361
/* GET_EXCEPTIONS: pointer to the exceptions[] entry selected by props (index = props>>UCASE_EXC_SHIFT) */
362 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
363
/* PROPS_HAS_EXCEPTION: non-zero if the UCASE_EXCEPTION bit is set in props */
364 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
365
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 * The table is generated by the helper macros below: each level covers two
 * more bits of the index and adds 0, 1, 1, 2 for the bit patterns
 * 00, 01, 10, 11 of those two bits.
 */
#define FLAGS_OFFSET_BITS2(n) (n), (n)+1, (n)+1, (n)+2
#define FLAGS_OFFSET_BITS4(n) FLAGS_OFFSET_BITS2(n), FLAGS_OFFSET_BITS2((n)+1), FLAGS_OFFSET_BITS2((n)+1), FLAGS_OFFSET_BITS2((n)+2)
#define FLAGS_OFFSET_BITS6(n) FLAGS_OFFSET_BITS4(n), FLAGS_OFFSET_BITS4((n)+1), FLAGS_OFFSET_BITS4((n)+1), FLAGS_OFFSET_BITS4((n)+2)

static const uint8_t flagsOffset[256]={
    FLAGS_OFFSET_BITS6(0), FLAGS_OFFSET_BITS6(1), FLAGS_OFFSET_BITS6(1), FLAGS_OFFSET_BITS6(2)
};

#undef FLAGS_OFFSET_BITS6
#undef FLAGS_OFFSET_BITS4
#undef FLAGS_OFFSET_BITS2
385
/* HAS_SLOT: non-zero if bit number index is set in flags, i.e. the optional slot is present */
386 #define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
/*
 * SLOT_OFFSET: number of set bits in flags below bit number index
 * (looked up in flagsOffset[]), i.e. how many present slots precede this one
 */
387 #define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]
388
389 /*
390 * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
391 *
392 * @param excWord (in) initial exceptions word
393 * @param index (in) desired slot index
394 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
395 * moved to the last uint16_t of the value, use +1 for beginning of next slot
396 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
397 */
/*
 * Implementation notes:
 * - Without UCASE_EXC_DOUBLE_SLOTS each slot is one uint16_t; with it each
 *   slot is two uint16_t which are combined as (first<<16)|second.
 * - pExc16 is modified and evaluated more than once, so it must be a plain
 *   pointer variable without side effects.
 * - The expansion is a bare if/else statement (not wrapped in do{}while(0));
 *   use it as a statement of its own, inside braces when it is the body of
 *   an if or else.
 */
398 #define GET_SLOT_VALUE(excWord, index, pExc16, value) \
399 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
400 (pExc16)+=SLOT_OFFSET(excWord, index); \
401 (value)=*pExc16; \
402 } else { \
403 (pExc16)+=2*SLOT_OFFSET(excWord, index); \
404 (value)=*pExc16++; \
405 (value)=((value)<<16)|*pExc16; \
406 }
407
408 /* simple case mappings ----------------------------------------------------- */
409
410 U_CAPI UChar32 U_EXPORT2
411 ucase_tolower(const UCaseProps *csp, UChar32 c) {
412 uint16_t props;
413 GET_PROPS(csp, c, props);
414 if(!PROPS_HAS_EXCEPTION(props)) {
415 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
416 c+=UCASE_GET_DELTA(props);
417 }
418 } else {
419 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
420 uint16_t excWord=*pe++;
421 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
422 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
423 }
424 }
425 return c;
426 }
427
428 U_CAPI UChar32 U_EXPORT2
429 ucase_toupper(const UCaseProps *csp, UChar32 c) {
430 uint16_t props;
431 GET_PROPS(csp, c, props);
432 if(!PROPS_HAS_EXCEPTION(props)) {
433 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
434 c+=UCASE_GET_DELTA(props);
435 }
436 } else {
437 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
438 uint16_t excWord=*pe++;
439 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
440 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
441 }
442 }
443 return c;
444 }
445
446 U_CAPI UChar32 U_EXPORT2
447 ucase_totitle(const UCaseProps *csp, UChar32 c) {
448 uint16_t props;
449 GET_PROPS(csp, c, props);
450 if(!PROPS_HAS_EXCEPTION(props)) {
451 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
452 c+=UCASE_GET_DELTA(props);
453 }
454 } else {
455 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
456 uint16_t excWord=*pe++;
457 int32_t index;
458 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
459 index=UCASE_EXC_TITLE;
460 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
461 index=UCASE_EXC_UPPER;
462 } else {
463 return c;
464 }
465 GET_SLOT_VALUE(excWord, index, pe, c);
466 }
467 return c;
468 }
469
470 static const UChar iDot[2] = { 0x69, 0x307 };
471 static const UChar jDot[2] = { 0x6a, 0x307 };
472 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
473 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
474 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
475 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
476
477
478 U_CFUNC void U_EXPORT2
479 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
480 uint16_t props;
481
482 /*
483 * Hardcode the case closure of i and its relatives and ignore the
484 * data file data for these characters.
485 * The Turkic dotless i and dotted I with their case mapping conditions
486 * and case folding option make the related characters behave specially.
487 * This code matches their closure behavior to their case folding behavior.
488 */
489
490 switch(c) {
491 case 0x49:
492 /* regular i and I are in one equivalence class */
493 sa->add(sa->set, 0x69);
494 return;
495 case 0x69:
496 sa->add(sa->set, 0x49);
497 return;
498 case 0x130:
499 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
500 sa->addString(sa->set, iDot, 2);
501 return;
502 case 0x131:
503 /* dotless i is in a class by itself */
504 return;
505 default:
506 /* otherwise use the data file data */
507 break;
508 }
509
510 GET_PROPS(csp, c, props);
511 if(!PROPS_HAS_EXCEPTION(props)) {
512 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
513 /* add the one simple case mapping, no matter what type it is */
514 int32_t delta=UCASE_GET_DELTA(props);
515 if(delta!=0) {
516 sa->add(sa->set, c+delta);
517 }
518 }
519 } else {
520 /*
521 * c has exceptions, so there may be multiple simple and/or
522 * full case mappings. Add them all.
523 */
524 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
525 const UChar *closure;
526 uint16_t excWord=*pe++;
527 int32_t index, closureLength, fullLength, length;
528
529 pe0=pe;
530
531 /* add all simple case mappings */
532 for(index=UCASE_EXC_LOWER; index<=UCASE_EXC_TITLE; ++index) {
533 if(HAS_SLOT(excWord, index)) {
534 pe=pe0;
535 GET_SLOT_VALUE(excWord, index, pe, c);
536 sa->add(sa->set, c);
537 }
538 }
539
540 /* get the closure string pointer & length */
541 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
542 pe=pe0;
543 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
544 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
545 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
546 } else {
547 closureLength=0;
548 closure=NULL;
549 }
550
551 /* add the full case folding */
552 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
553 pe=pe0;
554 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
555
556 /* start of full case mapping strings */
557 ++pe;
558
559 fullLength&=0xffff; /* bits 16 and higher are reserved */
560
561 /* skip the lowercase result string */
562 pe+=fullLength&UCASE_FULL_LOWER;
563 fullLength>>=4;
564
565 /* add the full case folding string */
566 length=fullLength&0xf;
567 if(length!=0) {
568 sa->addString(sa->set, (const UChar *)pe, length);
569 pe+=length;
570 }
571
572 /* skip the uppercase and titlecase strings */
573 fullLength>>=4;
574 pe+=fullLength&0xf;
575 fullLength>>=4;
576 pe+=fullLength;
577
578 closure=(const UChar *)pe; /* behind full case mappings */
579 }
580
581 /* add each code point in the closure string */
582 for(index=0; index<closureLength;) {
583 U16_NEXT_UNSAFE(closure, index, c);
584 sa->add(sa->set, c);
585 }
586 }
587 }
588
589 /*
590 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
591 * must be length>0 and max>0 and length<=max
592 */
593 static U_INLINE int32_t
594 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
595 int32_t c1, c2;
596
597 max-=length; /* we require length<=max, so no need to decrement max in the loop */
598 do {
599 c1=*s++;
600 c2=*t++;
601 if(c2==0) {
602 return 1; /* reached the end of t but not of s */
603 }
604 c1-=c2;
605 if(c1!=0) {
606 return c1; /* return difference result */
607 }
608 } while(--length>0);
609 /* ends with length==0 */
610
611 if(max==0 || *t==0) {
612 return 0; /* equal to length of both strings */
613 } else {
614 return -max; /* return lengh difference */
615 }
616 }
617
618 U_CFUNC UBool U_EXPORT2
619 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
620 const UChar *unfold, *p;
621 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
622
623 if(csp->unfold==NULL || s==NULL) {
624 return FALSE; /* no reverse case folding data, or no string */
625 }
626 if(length<=1) {
627 /* the string is too short to find any match */
628 /*
629 * more precise would be:
630 * if(!u_strHasMoreChar32Than(s, length, 1))
631 * but this does not make much practical difference because
632 * a single supplementary code point would just not be found
633 */
634 return FALSE;
635 }
636
637 unfold=csp->unfold;
638 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
639 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
640 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
641 unfold+=unfoldRowWidth;
642
643 if(length>unfoldStringWidth) {
644 /* the string is too long to find any match */
645 return FALSE;
646 }
647
648 /* do a binary search for the string */
649 start=0;
650 limit=unfoldRows;
651 while(start<limit) {
652 i=(start+limit)/2;
653 p=unfold+(i*unfoldRowWidth);
654 result=strcmpMax(s, length, p, unfoldStringWidth);
655
656 if(result==0) {
657 /* found the string: add each code point, and its case closure */
658 UChar32 c;
659
660 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
661 U16_NEXT_UNSAFE(p, i, c);
662 sa->add(sa->set, c);
663 ucase_addCaseClosure(csp, c, sa);
664 }
665 return TRUE;
666 } else if(result<0) {
667 limit=i;
668 } else /* result>0 */ {
669 start=i+1;
670 }
671 }
672
673 return FALSE; /* string not found */
674 }
675
676 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
677 U_CAPI int32_t U_EXPORT2
678 ucase_getType(const UCaseProps *csp, UChar32 c) {
679 uint16_t props;
680 GET_PROPS(csp, c, props);
681 return UCASE_GET_TYPE(props);
682 }
683
684 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
685 U_CAPI int32_t U_EXPORT2
686 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
687 int32_t type;
688 uint16_t props;
689 GET_PROPS(csp, c, props);
690 type=UCASE_GET_TYPE(props);
691 if(type!=UCASE_NONE) {
692 return type;
693 } else if(
694 c==0x307 ||
695 (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE
696 ) {
697 return -1; /* case-ignorable */
698 } else {
699 return 0; /* c is neither cased nor case-ignorable */
700 }
701 }
702
703 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
704 static U_INLINE int32_t
705 getDotType(const UCaseProps *csp, UChar32 c) {
706 uint16_t props;
707 GET_PROPS(csp, c, props);
708 if(!PROPS_HAS_EXCEPTION(props)) {
709 return props&UCASE_DOT_MASK;
710 } else {
711 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
712 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
713 }
714 }
715
716 U_CAPI UBool U_EXPORT2
717 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
718 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
719 }
720
721 U_CAPI UBool U_EXPORT2
722 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
723 uint16_t props;
724 GET_PROPS(csp, c, props);
725 return (UBool)((props&UCASE_SENSITIVE)!=0);
726 }
727
728 /* string casing ------------------------------------------------------------ */
729
730 /*
731 * These internal functions form the core of string case mappings.
732 * They map single code points to result code points or strings and take
733 * all necessary conditions (context, locale ID, options) into account.
734 *
735 * They do not iterate over the source or write to the destination
736 * so that the same functions are useful for non-standard string storage,
737 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
738 * For the same reason, the "surrounding text" context is passed in as a
739 * UCaseContextIterator which does not make any assumptions about
740 * the underlying storage.
741 *
742 * This section contains helper functions that check for conditions
743 * in the input text surrounding the current code point
744 * according to SpecialCasing.txt.
745 *
746 * Each helper function gets the index
747 * - after the current code point if it looks at following text
748 * - before the current code point if it looks at preceding text
749 *
750 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
751 *
752 * Final_Sigma
753 * C is preceded by a sequence consisting of
754 * a cased letter and a case-ignorable sequence,
755 * and C is not followed by a sequence consisting of
756 * an ignorable sequence and then a cased letter.
757 *
758 * More_Above
759 * C is followed by one or more characters of combining class 230 (ABOVE)
760 * in the combining character sequence.
761 *
762 * After_Soft_Dotted
763 * The last preceding character with combining class of zero before C
764 * was Soft_Dotted,
765 * and there is no intervening combining character class 230 (ABOVE).
766 *
767 * Before_Dot
768 * C is followed by combining dot above (U+0307).
769 * Any sequence of characters with a combining class that is neither 0 nor 230
770 * may intervene between the current character and the combining dot above.
771 *
772 * The erratum from 2002-10-31 adds the condition
773 *
774 * After_I
775 * The last preceding base character was an uppercase I, and there is no
776 * intervening combining character class 230 (ABOVE).
777 *
778 * (See Jitterbug 2344 and the comments on After_I below.)
779 *
780 * Helper definitions in Unicode 3.2 UAX 21:
781 *
782 * D1. A character C is defined to be cased
783 * if it meets any of the following criteria:
784 *
785 * - The general category of C is Titlecase Letter (Lt)
786 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
787 * - Given D = NFD(C), then it is not the case that:
788 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
789 * (This third criterion does not add any characters to the list
790 * for Unicode 3.2. Ignored.)
791 *
792 * D2. A character C is defined to be case-ignorable
793 * if it meets either of the following criteria:
794 *
795 * - The general category of C is
796 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
797 * Letter Modifier (Lm), or Symbol Modifier (Sk)
798 * - C is one of the following characters
799 * U+0027 APOSTROPHE
800 * U+00AD SOFT HYPHEN (SHY)
801 * U+2019 RIGHT SINGLE QUOTATION MARK
802 * (the preferred character for apostrophe)
803 *
804 * D3. A case-ignorable sequence is a sequence of
805 * zero or more case-ignorable characters.
806 */
807
/* does c match one of the two spellings (lowercase/uppercase) of a letter? */
#define is_either(c, lc, uc) ((c)==(lc) || (c)==(uc))

/* case-insensitive tests for the letters that occur in the special language codes */
#define is_a(c) is_either(c, 'a', 'A')
#define is_d(c) is_either(c, 'd', 'D')
#define is_e(c) is_either(c, 'e', 'E')
#define is_i(c) is_either(c, 'i', 'I')
#define is_l(c) is_either(c, 'l', 'L')
#define is_n(c) is_either(c, 'n', 'N')
#define is_r(c) is_either(c, 'r', 'R')
#define is_t(c) is_either(c, 't', 'T')
#define is_u(c) is_either(c, 'u', 'U')
#define is_z(c) is_either(c, 'z', 'Z')

/* separator? ('_', '-', or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
821
822 /**
823 * Requires non-NULL locale ID but otherwise does the equivalent of
824 * checking for language codes as if uloc_getLanguage() were called:
825 * Accepts both 2- and 3-letter codes and accepts case variants.
826 */
827 U_CFUNC int32_t
828 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
829 int32_t result;
830 char c;
831
832 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
833 return result;
834 }
835
836 result=UCASE_LOC_ROOT;
837
838 /*
839 * This function used to use uloc_getLanguage(), but the current code
840 * removes the dependency of this low-level code on uloc implementation code
841 * and is faster because not the whole locale ID has to be
842 * examined and copied/transformed.
843 *
844 * Because this code does not want to depend on uloc, the caller must
845 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
846 */
847 c=*locale++;
848 if(is_t(c)) {
849 /* tr or tur? */
850 c=*locale++;
851 if(is_u(c)) {
852 c=*locale++;
853 }
854 if(is_r(c)) {
855 c=*locale;
856 if(is_sep(c)) {
857 result=UCASE_LOC_TURKISH;
858 }
859 }
860 } else if(is_a(c)) {
861 /* az or aze? */
862 c=*locale++;
863 if(is_z(c)) {
864 c=*locale++;
865 if(is_e(c)) {
866 c=*locale;
867 }
868 if(is_sep(c)) {
869 result=UCASE_LOC_TURKISH;
870 }
871 }
872 } else if(is_l(c)) {
873 /* lt or lit? */
874 c=*locale++;
875 if(is_i(c)) {
876 c=*locale++;
877 }
878 if(is_t(c)) {
879 c=*locale;
880 if(is_sep(c)) {
881 result=UCASE_LOC_LITHUANIAN;
882 }
883 }
884 } else if(is_n(c)) {
885 /* nl or nld? */
886 c=*locale++;
887 if(is_l(c)) {
888 c=*locale++;
889 if(is_d(c)) {
890 c=*locale;
891 }
892 if(is_sep(c)) {
893 result=UCASE_LOC_DUTCH;
894 }
895 }
896 }
897
898 if(locCache!=NULL) {
899 *locCache=result;
900 }
901 return result;
902 }
903
904 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
905 static UBool
906 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
907 UChar32 c;
908 uint16_t props;
909
910 if(iter==NULL) {
911 return FALSE;
912 }
913
914 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
915 GET_PROPS(csp, c, props);
916 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
917 return TRUE; /* followed by cased letter */
918 } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
919 /* case-ignorable, continue with the loop */
920 } else {
921 return FALSE; /* not ignorable */
922 }
923 }
924
925 return FALSE; /* not followed by cased letter */
926 }
927
928 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
929 static UBool
930 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
931 UChar32 c;
932 int32_t dotType;
933 int8_t dir;
934
935 if(iter==NULL) {
936 return FALSE;
937 }
938
939 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
940 dotType=getDotType(csp, c);
941 if(dotType==UCASE_SOFT_DOTTED) {
942 return TRUE; /* preceded by TYPE_i */
943 } else if(dotType!=UCASE_OTHER_ACCENT) {
944 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
945 }
946 }
947
948 return FALSE; /* not preceded by TYPE_i */
949 }
950
951 /*
952 * See Jitterbug 2344:
953 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
954 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
955 * we made those releases compatible with Unicode 3.2 which had not fixed
956 * a related bug in SpecialCasing.txt.
957 *
958 * From the Jitterbug 2344 text:
959 * ... this bug is listed as a Unicode erratum
960 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
961 * <quote>
962 * There are two errors in SpecialCasing.txt.
963 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
964 * 2. An incorrect context definition. Correct as follows:
965 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
966 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
967 * ---
968 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
969 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
970 * where the context After_I is defined as:
971 * The last preceding base character was an uppercase I, and there is no
972 * intervening combining character class 230 (ABOVE).
973 * </quote>
974 *
975 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
976 *
977 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
978 * # This matches the behavior of the canonically equivalent I-dot_above
979 *
980 * See also the description in this place in older versions of uchar.c (revision 1.100).
981 *
982 * Markus W. Scherer 2003-feb-15
983 */
984
985 /* Is preceded by base character 'I' with no intervening cc=230 ? */
986 static UBool
987 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
988 UChar32 c;
989 int32_t dotType;
990 int8_t dir;
991
992 if(iter==NULL) {
993 return FALSE;
994 }
995
996 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
997 if(c==0x49) {
998 return TRUE; /* preceded by I */
999 }
1000 dotType=getDotType(csp, c);
1001 if(dotType!=UCASE_OTHER_ACCENT) {
1002 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
1003 }
1004 }
1005
1006 return FALSE; /* not preceded by I */
1007 }
1008
1009 /* Is followed by one or more cc==230 ? */
1010 static UBool
1011 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
1012 UChar32 c;
1013 int32_t dotType;
1014 int8_t dir;
1015
1016 if(iter==NULL) {
1017 return FALSE;
1018 }
1019
1020 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1021 dotType=getDotType(csp, c);
1022 if(dotType==UCASE_ABOVE) {
1023 return TRUE; /* at least one cc==230 following */
1024 } else if(dotType!=UCASE_OTHER_ACCENT) {
1025 return FALSE; /* next base character, no more cc==230 following */
1026 }
1027 }
1028
1029 return FALSE; /* no more cc==230 following */
1030 }
1031
1032 /* Is followed by a dot above (without cc==230 in between) ? */
1033 static UBool
1034 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
1035 UChar32 c;
1036 int32_t dotType;
1037 int8_t dir;
1038
1039 if(iter==NULL) {
1040 return FALSE;
1041 }
1042
1043 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1044 if(c==0x307) {
1045 return TRUE;
1046 }
1047 dotType=getDotType(csp, c);
1048 if(dotType!=UCASE_OTHER_ACCENT) {
1049 return FALSE; /* next base character or cc==230 in between */
1050 }
1051 }
1052
1053 return FALSE; /* no dot above following */
1054 }
1055
1056 U_CAPI int32_t U_EXPORT2
1057 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
1058 UCaseContextIterator *iter, void *context,
1059 const UChar **pString,
1060 const char *locale, int32_t *locCache)
1061 {
1062 UChar32 result;
1063 uint16_t props;
1064
1065 result=c;
1066 GET_PROPS(csp, c, props);
1067 if(!PROPS_HAS_EXCEPTION(props)) {
1068 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1069 result=c+UCASE_GET_DELTA(props);
1070 }
1071 } else {
1072 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1073 uint16_t excWord=*pe++;
1074 int32_t full;
1075
1076 pe2=pe;
1077
1078 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1079 /* use hardcoded conditions and mappings */
1080 int32_t loc=ucase_getCaseLocale(locale, locCache);
1081
1082 /*
1083 * Test for conditional mappings first
1084 * (otherwise the unconditional default mappings are always taken),
1085 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1086 * then get the UnicodeData.txt mappings.
1087 */
1088 if( loc==UCASE_LOC_LITHUANIAN &&
1089 /* base characters, find accents above */
1090 (((c==0x49 || c==0x4a || c==0x12e) &&
1091 isFollowedByMoreAbove(csp, iter, context)) ||
1092 /* precomposed with accent above, no need to find one */
1093 (c==0xcc || c==0xcd || c==0x128))
1094 ) {
1095 /*
1096 # Lithuanian
1097
1098 # Lithuanian retains the dot in a lowercase i when followed by accents.
1099
1100 # Introduce an explicit dot above when lowercasing capital I's and J's
1101 # whenever there are more accents above.
1102 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1103
1104 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1105 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1106 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1107 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1108 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1109 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1110 */
1111 switch(c) {
1112 case 0x49: /* LATIN CAPITAL LETTER I */
1113 *pString=iDot;
1114 return 2;
1115 case 0x4a: /* LATIN CAPITAL LETTER J */
1116 *pString=jDot;
1117 return 2;
1118 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1119 *pString=iOgonekDot;
1120 return 2;
1121 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1122 *pString=iDotGrave;
1123 return 3;
1124 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1125 *pString=iDotAcute;
1126 return 3;
1127 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1128 *pString=iDotTilde;
1129 return 3;
1130 default:
1131 return 0; /* will not occur */
1132 }
1133 /* # Turkish and Azeri */
1134 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1135 /*
1136 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1137 # The following rules handle those cases.
1138
1139 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1140 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1141 */
1142 return 0x69;
1143 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
1144 /*
1145 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1146 # This matches the behavior of the canonically equivalent I-dot_above
1147
1148 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1149 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1150 */
1151 return 0; /* remove the dot (continue without output) */
1152 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
1153 /*
1154 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1155
1156 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1157 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1158 */
1159 return 0x131;
1160 } else if(c==0x130) {
1161 /*
1162 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1163
1164 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1165 */
1166 *pString=iDot;
1167 return 2;
1168 } else if( c==0x3a3 &&
1169 !isFollowedByCasedLetter(csp, iter, context, 1) &&
1170 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
1171 ) {
1172 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1173 /*
1174 # Special case for final form of sigma
1175
1176 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1177 */
1178 return 0x3c2; /* greek small final sigma */
1179 } else {
1180 /* no known conditional special case mapping, use a normal mapping */
1181 }
1182 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1183 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1184 full&=UCASE_FULL_LOWER;
1185 if(full!=0) {
1186 /* set the output pointer to the lowercase mapping */
1187 *pString=pe+1;
1188
1189 /* return the string length */
1190 return full;
1191 }
1192 }
1193
1194 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1195 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1196 }
1197 }
1198
1199 return (result==c) ? ~result : result;
1200 }
1201
1202 /* internal */
1203 static int32_t
1204 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
1205 UCaseContextIterator *iter, void *context,
1206 const UChar **pString,
1207 const char *locale, int32_t *locCache,
1208 UBool upperNotTitle) {
1209 UChar32 result;
1210 uint16_t props;
1211
1212 result=c;
1213 GET_PROPS(csp, c, props);
1214 if(!PROPS_HAS_EXCEPTION(props)) {
1215 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1216 result=c+UCASE_GET_DELTA(props);
1217 }
1218 } else {
1219 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1220 uint16_t excWord=*pe++;
1221 int32_t full, index;
1222
1223 pe2=pe;
1224
1225 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1226 /* use hardcoded conditions and mappings */
1227 int32_t loc=ucase_getCaseLocale(locale, locCache);
1228
1229 if(loc==UCASE_LOC_TURKISH && c==0x69) {
1230 /*
1231 # Turkish and Azeri
1232
1233 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1234 # The following rules handle those cases.
1235
1236 # When uppercasing, i turns into a dotted capital I
1237
1238 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1239 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1240 */
1241 return 0x130;
1242 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
1243 /*
1244 # Lithuanian
1245
1246 # Lithuanian retains the dot in a lowercase i when followed by accents.
1247
1248 # Remove DOT ABOVE after "i" with upper or titlecase
1249
1250 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1251 */
1252 return 0; /* remove the dot (continue without output) */
1253 } else {
1254 /* no known conditional special case mapping, use a normal mapping */
1255 }
1256 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1257 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1258
1259 /* start of full case mapping strings */
1260 ++pe;
1261
1262 /* skip the lowercase and case-folding result strings */
1263 pe+=full&UCASE_FULL_LOWER;
1264 full>>=4;
1265 pe+=full&0xf;
1266 full>>=4;
1267
1268 if(upperNotTitle) {
1269 full&=0xf;
1270 } else {
1271 /* skip the uppercase result string */
1272 pe+=full&0xf;
1273 full=(full>>4)&0xf;
1274 }
1275
1276 if(full!=0) {
1277 /* set the output pointer to the result string */
1278 *pString=pe;
1279
1280 /* return the string length */
1281 return full;
1282 }
1283 }
1284
1285 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1286 index=UCASE_EXC_TITLE;
1287 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1288 /* here, titlecase is same as uppercase */
1289 index=UCASE_EXC_UPPER;
1290 } else {
1291 return ~c;
1292 }
1293 GET_SLOT_VALUE(excWord, index, pe2, result);
1294 }
1295
1296 return (result==c) ? ~result : result;
1297 }
1298
1299 U_CAPI int32_t U_EXPORT2
1300 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1301 UCaseContextIterator *iter, void *context,
1302 const UChar **pString,
1303 const char *locale, int32_t *locCache) {
1304 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1305 }
1306
1307 U_CAPI int32_t U_EXPORT2
1308 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1309 UCaseContextIterator *iter, void *context,
1310 const UChar **pString,
1311 const char *locale, int32_t *locCache) {
1312 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1313 }
1314
1315 /* case folding ------------------------------------------------------------- */
1316
/*
 * Case folding is similar to lowercasing.
 * The result may be a simple mapping, i.e., a single code point, or
 * a full mapping, i.e., a string.
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
 * then only the lowercase mapping is stored.
 *
 * Some special cases are hardcoded because their conditions cannot be
 * parsed and processed from CaseFolding.txt.
 *
 * Unicode 3.2 CaseFolding.txt specifies for its status field:

# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
#    - For non-Turkic languages, this mapping is normally not used.
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
#
# Usage:
#  A. To do a simple case folding, use the mappings with status C + S.
#  B. To do a full case folding, use the mappings with status C + F.
#
#    The mappings with status T can be used or omitted depending on the desired case-folding
#    behavior. (The default option is to exclude them.)

 * Unicode 3.2 has 'T' mappings as follows:

0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * while the default mappings for these code points are:

0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * U+0130 has no simple case folding (simple-case-folds to itself).
 */
1355
1356 /* return the simple case folding mapping for c */
1357 U_CAPI UChar32 U_EXPORT2
1358 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1359 uint16_t props;
1360 GET_PROPS(csp, c, props);
1361 if(!PROPS_HAS_EXCEPTION(props)) {
1362 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1363 c+=UCASE_GET_DELTA(props);
1364 }
1365 } else {
1366 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1367 uint16_t excWord=*pe++;
1368 int32_t index;
1369 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1370 /* special case folding mappings, hardcoded */
1371 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1372 /* default mappings */
1373 if(c==0x49) {
1374 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1375 return 0x69;
1376 } else if(c==0x130) {
1377 /* no simple case folding for U+0130 */
1378 return c;
1379 }
1380 } else {
1381 /* Turkic mappings */
1382 if(c==0x49) {
1383 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1384 return 0x131;
1385 } else if(c==0x130) {
1386 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1387 return 0x69;
1388 }
1389 }
1390 }
1391 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1392 index=UCASE_EXC_FOLD;
1393 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1394 index=UCASE_EXC_LOWER;
1395 } else {
1396 return c;
1397 }
1398 GET_SLOT_VALUE(excWord, index, pe, c);
1399 }
1400 return c;
1401 }
1402
/*
 * Issue for canonical caseless match (UAX #21):
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
 * canonical equivalence, unlike default-option casefolding.
 * For example, I-grave and I + grave fold to strings that are not canonically
 * equivalent.
 * For more details, see the comment in unorm_compare() in unorm.cpp
 * and the intermediate prototype changes for Jitterbug 2021.
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
 *
 * This did not get fixed because it appears that it is not possible to fix
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
 * together in a way that they still fold to common result strings.
 */
1417
1418 U_CAPI int32_t U_EXPORT2
1419 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1420 const UChar **pString,
1421 uint32_t options)
1422 {
1423 UChar32 result;
1424 uint16_t props;
1425
1426 result=c;
1427 GET_PROPS(csp, c, props);
1428 if(!PROPS_HAS_EXCEPTION(props)) {
1429 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1430 result=c+UCASE_GET_DELTA(props);
1431 }
1432 } else {
1433 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1434 uint16_t excWord=*pe++;
1435 int32_t full, index;
1436
1437 pe2=pe;
1438
1439 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1440 /* use hardcoded conditions and mappings */
1441 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1442 /* default mappings */
1443 if(c==0x49) {
1444 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1445 return 0x69;
1446 } else if(c==0x130) {
1447 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1448 *pString=iDot;
1449 return 2;
1450 }
1451 } else {
1452 /* Turkic mappings */
1453 if(c==0x49) {
1454 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1455 return 0x131;
1456 } else if(c==0x130) {
1457 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1458 return 0x69;
1459 }
1460 }
1461 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1462 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1463
1464 /* start of full case mapping strings */
1465 ++pe;
1466
1467 /* skip the lowercase result string */
1468 pe+=full&UCASE_FULL_LOWER;
1469 full=(full>>4)&0xf;
1470
1471 if(full!=0) {
1472 /* set the output pointer to the result string */
1473 *pString=pe;
1474
1475 /* return the string length */
1476 return full;
1477 }
1478 }
1479
1480 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1481 index=UCASE_EXC_FOLD;
1482 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1483 index=UCASE_EXC_LOWER;
1484 } else {
1485 return ~c;
1486 }
1487 GET_SLOT_VALUE(excWord, index, pe2, result);
1488 }
1489
1490 return (result==c) ? ~result : result;
1491 }
1492
1493 /* case mapping properties API ---------------------------------------------- */
1494
1495 /* get the UCaseProps singleton, or else its dummy, once and for all */
1496 #if !UCASE_HARDCODE_DATA
1497 static const UCaseProps *
1498 getCaseProps() {
1499 /*
1500 * This lazy intialization with double-checked locking (without mutex protection for
1501 * the initial check) is transiently unsafe under certain circumstances.
1502 * Check the readme and use u_init() if necessary.
1503 */
1504
1505 /* the initial check is performed by the GET_CASE_PROPS() macro */
1506 const UCaseProps *csp;
1507 UErrorCode errorCode=U_ZERO_ERROR;
1508
1509 csp=ucase_getSingleton(&errorCode);
1510 if(U_FAILURE(errorCode)) {
1511 errorCode=U_ZERO_ERROR;
1512 csp=ucase_getDummy(&errorCode);
1513 if(U_FAILURE(errorCode)) {
1514 return NULL;
1515 }
1516 }
1517
1518 return csp;
1519 }
1520 #endif
1521
/*
 * In ICU 3.0, most Unicode properties were loaded from uprops.icu.
 * ICU 3.2 adds ucase.icu for case mapping properties.
 * ICU 3.4 adds ubidi.icu for bidi/shaping properties and
 * removes case/bidi/shaping properties from uprops.icu.
 *
 * Loading of uprops.icu was never mutex-protected and required u_init()
 * for thread safety.
 * In order to maintain performance for all such properties,
 * ucase.icu and ubidi.icu are loaded lazily, without mutexing.
 * u_init() will try to load them for thread safety,
 * but u_init() will not fail if they are missing.
 *
 * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
 * and an error code for load failure.
 * Instead, here we try to load at most once.
 * If it works, we use the resulting singleton object.
 * If it fails, then we get a dummy object, which always works unless
 * we are seriously out of memory.
 * After the first try, we have a never-changing pointer to either the
 * real singleton or the dummy.
 *
 * This method is used in Unicode properties APIs (uchar.h) that
 * do not have a service object and also do not have an error code parameter.
 * Other API implementations get the singleton themselves
 * (with mutexing), store it in the service object, and report errors.
 */
/*
 * GET_CASE_PROPS() evaluates to a pointer to the case properties to use.
 * Without hardcoded data it is gCsp when that is non-NULL, otherwise the
 * result of getCaseProps(); with hardcoded data it is the address of
 * ucase_props_singleton.
 * Both expansions are fully parenthesized so that the macro behaves like a
 * single primary expression wherever it is used.
 */
#if !UCASE_HARDCODE_DATA
#define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
#else
#define GET_CASE_PROPS() (&ucase_props_singleton)
#endif
1554
1555 /* public API (see uchar.h) */
1556
1557 U_CAPI UBool U_EXPORT2
1558 u_isULowercase(UChar32 c) {
1559 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1560 }
1561
1562 U_CAPI UBool U_EXPORT2
1563 u_isUUppercase(UChar32 c) {
1564 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1565 }
1566
1567 /* Transforms the Unicode character to its lower case equivalent.*/
1568 U_CAPI UChar32 U_EXPORT2
1569 u_tolower(UChar32 c) {
1570 return ucase_tolower(GET_CASE_PROPS(), c);
1571 }
1572
1573 /* Transforms the Unicode character to its upper case equivalent.*/
1574 U_CAPI UChar32 U_EXPORT2
1575 u_toupper(UChar32 c) {
1576 return ucase_toupper(GET_CASE_PROPS(), c);
1577 }
1578
1579 /* Transforms the Unicode character to its title case equivalent.*/
1580 U_CAPI UChar32 U_EXPORT2
1581 u_totitle(UChar32 c) {
1582 return ucase_totitle(GET_CASE_PROPS(), c);
1583 }
1584
1585 /* return the simple case folding mapping for c */
1586 U_CAPI UChar32 U_EXPORT2
1587 u_foldCase(UChar32 c, uint32_t options) {
1588 return ucase_fold(GET_CASE_PROPS(), c, options);
1589 }
1590
1591 U_CFUNC int32_t U_EXPORT2
1592 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1593 /* case mapping properties */
1594 const UCaseProps *csp=GET_CASE_PROPS();
1595 if(csp==NULL) {
1596 return FALSE;
1597 }
1598 switch(which) {
1599 case UCHAR_LOWERCASE:
1600 return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1601 case UCHAR_UPPERCASE:
1602 return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1603 case UCHAR_SOFT_DOTTED:
1604 return ucase_isSoftDotted(csp, c);
1605 case UCHAR_CASE_SENSITIVE:
1606 return ucase_isCaseSensitive(csp, c);
1607 default:
1608 return FALSE;
1609 }
1610 }