2 *******************************************************************************
4 * Copyright (C) 2004-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug30
14 * created by: Markus W. Scherer
16 * Low-level Unicode character/string case mapping code.
17 * Much code moved here (and modified) from uchar.c.
20 #include "unicode/utypes.h"
21 #include "unicode/uset.h"
22 #include "unicode/udata.h" /* UDataInfo */
23 #include "ucmndata.h" /* DataHeader */
34 const int32_t *indexes
;
35 const uint16_t *exceptions
;
39 uint8_t formatVersion
[4];
42 /* data loading etc. -------------------------------------------------------- */
44 #define UCASE_HARDCODE_DATA 1
46 #if UCASE_HARDCODE_DATA
48 /* ucase_props_data.c is machine-generated by gencase --csource */
49 #include "ucase_props_data.c"
53 static UBool U_CALLCONV
54 isAcceptable(void *context
,
55 const char *type
, const char *name
,
56 const UDataInfo
*pInfo
) {
59 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
60 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
61 pInfo
->dataFormat
[0]==UCASE_FMT_0
&& /* dataFormat="cAsE" */
62 pInfo
->dataFormat
[1]==UCASE_FMT_1
&&
63 pInfo
->dataFormat
[2]==UCASE_FMT_2
&&
64 pInfo
->dataFormat
[3]==UCASE_FMT_3
&&
65 pInfo
->formatVersion
[0]==1 &&
66 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
67 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
69 UCaseProps
*csp
=(UCaseProps
*)context
;
70 uprv_memcpy(csp
->formatVersion
, pInfo
->formatVersion
, 4);
78 ucase_openData(UCaseProps
*cspProto
,
79 const uint8_t *bin
, int32_t length
, UErrorCode
*pErrorCode
) {
83 cspProto
->indexes
=(const int32_t *)bin
;
84 if( (length
>=0 && length
<16*4) ||
85 cspProto
->indexes
[UCASE_IX_INDEX_TOP
]<16
87 /* length or indexes[] too short for minimum indexes[] length of 16 */
88 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
91 size
=cspProto
->indexes
[UCASE_IX_INDEX_TOP
]*4;
93 if(length
>=size
&& length
>=cspProto
->indexes
[UCASE_IX_LENGTH
]) {
96 /* length too short for indexes[] or for the whole data length */
97 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
102 /* from here on, assume that the sizes of the items fit into the total length */
104 /* unserialize the trie, after indexes[] */
105 size
=cspProto
->indexes
[UCASE_IX_TRIE_SIZE
];
106 utrie_unserialize(&cspProto
->trie
, bin
, size
, pErrorCode
);
107 if(U_FAILURE(*pErrorCode
)) {
112 /* get exceptions[] */
113 size
=2*cspProto
->indexes
[UCASE_IX_EXC_LENGTH
];
114 cspProto
->exceptions
=(const uint16_t *)bin
;
118 size
=2*cspProto
->indexes
[UCASE_IX_UNFOLD_LENGTH
];
120 cspProto
->unfold
=(const UChar
*)bin
;
123 cspProto
->unfold
=NULL
;
126 /* allocate, copy, and return the new UCaseProps */
127 csp
=(UCaseProps
*)uprv_malloc(sizeof(UCaseProps
));
129 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
132 uprv_memcpy(csp
, cspProto
, sizeof(UCaseProps
));
137 U_CAPI UCaseProps
* U_EXPORT2
138 ucase_open(UErrorCode
*pErrorCode
) {
139 UCaseProps cspProto
={ NULL
}, *csp
;
141 cspProto
.mem
=udata_openChoice(NULL
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, isAcceptable
, &cspProto
, pErrorCode
);
142 if(U_FAILURE(*pErrorCode
)) {
148 udata_getMemory(cspProto
.mem
),
149 udata_getLength(cspProto
.mem
),
151 if(U_FAILURE(*pErrorCode
)) {
152 udata_close(cspProto
.mem
);
159 U_CAPI UCaseProps
* U_EXPORT2
160 ucase_openBinary(const uint8_t *bin
, int32_t length
, UErrorCode
*pErrorCode
) {
161 UCaseProps cspProto
={ NULL
};
162 const DataHeader
*hdr
;
164 if(U_FAILURE(*pErrorCode
)) {
168 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
172 /* check the header */
173 if(length
>=0 && length
<20) {
174 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
177 hdr
=(const DataHeader
*)bin
;
179 !(hdr
->dataHeader
.magic1
==0xda && hdr
->dataHeader
.magic2
==0x27 &&
180 hdr
->info
.isBigEndian
==U_IS_BIG_ENDIAN
&&
181 isAcceptable(&cspProto
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, &hdr
->info
))
183 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
187 bin
+=hdr
->dataHeader
.headerSize
;
189 length
-=hdr
->dataHeader
.headerSize
;
191 return ucase_openData(&cspProto
, bin
, length
, pErrorCode
);
196 U_CAPI
void U_EXPORT2
197 ucase_close(UCaseProps
*csp
) {
199 #if !UCASE_HARDCODE_DATA
200 udata_close(csp
->mem
);
206 /* UCaseProps singleton ----------------------------------------------------- */
208 static UCaseProps
*gCsp
=NULL
, *gCspDummy
=NULL
;
209 #if !UCASE_HARDCODE_DATA
210 static UErrorCode gErrorCode
=U_ZERO_ERROR
;
211 static int8_t gHaveData
=0;
214 static UBool U_CALLCONV
ucase_cleanup(void) {
217 ucase_close(gCspDummy
);
219 #if !UCASE_HARDCODE_DATA
220 gErrorCode
=U_ZERO_ERROR
;
226 U_CAPI
const UCaseProps
* U_EXPORT2
227 ucase_getSingleton(UErrorCode
*pErrorCode
) {
228 #if UCASE_HARDCODE_DATA
229 if(U_FAILURE(*pErrorCode
)) {
232 return &ucase_props_singleton
;
236 if(U_FAILURE(*pErrorCode
)) {
240 UMTX_CHECK(NULL
, gHaveData
, haveData
);
243 /* data was loaded */
245 } else if(haveData
<0) {
246 /* data loading failed */
247 *pErrorCode
=gErrorCode
;
249 } else /* haveData==0 */ {
251 UCaseProps
*csp
=ucase_open(pErrorCode
);
252 if(U_FAILURE(*pErrorCode
)) {
254 gErrorCode
=*pErrorCode
;
258 /* set the static variables */
264 ucln_common_registerCleanup(UCLN_COMMON_UCASE
, ucase_cleanup
);
274 U_CAPI
const UCaseProps
* U_EXPORT2
275 ucase_getDummy(UErrorCode
*pErrorCode
) {
278 if(U_FAILURE(*pErrorCode
)) {
282 UMTX_CHECK(NULL
, gCspDummy
, csp
);
285 /* the dummy object was already created */
287 } else /* csp==NULL */ {
288 /* create the dummy object */
291 csp
=(UCaseProps
*)uprv_malloc(sizeof(UCaseProps
)+UCASE_IX_TOP
*4+UTRIE_DUMMY_SIZE
);
293 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
296 uprv_memset(csp
, 0, sizeof(UCaseProps
)+UCASE_IX_TOP
*4);
298 csp
->indexes
=indexes
=(int32_t *)(csp
+1);
299 indexes
[UCASE_IX_INDEX_TOP
]=UCASE_IX_TOP
;
301 indexes
[UCASE_IX_TRIE_SIZE
]=
302 utrie_unserializeDummy(&csp
->trie
, indexes
+UCASE_IX_TOP
, UTRIE_DUMMY_SIZE
, 0, 0, TRUE
, pErrorCode
);
303 if(U_FAILURE(*pErrorCode
)) {
308 csp
->formatVersion
[0]=1;
309 csp
->formatVersion
[2]=UTRIE_SHIFT
;
310 csp
->formatVersion
[3]=UTRIE_INDEX_SHIFT
;
312 /* set the static variables */
314 if(gCspDummy
==NULL
) {
317 ucln_common_registerCleanup(UCLN_COMMON_UCASE
, ucase_cleanup
);
326 /* set of property starts for UnicodeSet ------------------------------------ */
328 static UBool U_CALLCONV
329 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
330 /* add the start code point to the USet */
331 const USetAdder
*sa
=(const USetAdder
*)context
;
332 sa
->add(sa
->set
, start
);
336 U_CAPI
void U_EXPORT2
337 ucase_addPropertyStarts(const UCaseProps
*csp
, const USetAdder
*sa
, UErrorCode
*pErrorCode
) {
338 if(U_FAILURE(*pErrorCode
)) {
342 /* add the start code point of each same-value range of the trie */
343 utrie_enum(&csp
->trie
, NULL
, _enumPropertyStartsRange
, sa
);
345 /* add code points with hardcoded properties, plus the ones following them */
347 /* (none right now, see comment below) */
350 * Omit code points with hardcoded specialcasing properties
351 * because we do not build property UnicodeSets for them right now.
355 /* data access primitives --------------------------------------------------- */
357 /* UTRIE_GET16() itself validates c */
/*
 * GET_PROPS(csp, c, result): passes &(csp)->trie, c and result to
 * UTRIE_GET16(); callers in this file then treat result as the 16-bit
 * properties word for c (tested with PROPS_HAS_EXCEPTION(), UCASE_GET_TYPE()).
 * NOTE(review): the expansion already ends with ';', so the usual call form
 * GET_PROPS(...); produces an extra empty statement. Do not use this macro as
 * the unbraced body of an if/else.
 */
358 #define GET_PROPS(csp, c, result) \
359 UTRIE_GET16(&(csp)->trie, c, result);
/*
 * GET_EXCEPTIONS(csp, props): pointer into (csp)->exceptions at the index
 * stored in props above UCASE_EXC_SHIFT. Callers in this file read the first
 * uint16_t at that address as the exceptions word (excWord=*pe++).
 */
361 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
/*
 * PROPS_HAS_EXCEPTION(props): nonzero if the UCASE_EXCEPTION bit is set in
 * props. Callers in this file then go through GET_EXCEPTIONS() instead of
 * using the type/delta taken directly from props.
 */
363 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
365 /* number of set bits (population count) for each possible 8-bit integer value */
366 static const uint8_t flagsOffset
[256]={
367 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
368 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
369 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
370 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
371 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
372 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
373 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
374 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
375 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
376 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
377 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
378 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
379 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
380 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
381 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
382 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
/* HAS_SLOT(flags, index): nonzero if bit number index is set in flags. */
385 #define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
/*
 * SLOT_OFFSET(flags, index): number of bits set in flags below bit number
 * index, looked up in flagsOffset[].
 * The mask (1<<index)-1 must stay below 256 (index<=8) because flagsOffset[]
 * has 256 entries.
 */
386 #define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]
389 * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
391 * @param excWord (in) initial exceptions word
392 * @param index (in) desired slot index
393 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
394 * moved to the last uint16_t of the value, use +1 for beginning of next slot
395 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
397 #define GET_SLOT_VALUE(excWord, index, pExc16, value) \
398 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
399 (pExc16)+=SLOT_OFFSET(excWord, index); \
402 (pExc16)+=2*SLOT_OFFSET(excWord, index); \
404 (value)=((value)<<16)|*pExc16; \
407 /* simple case mappings ----------------------------------------------------- */
409 U_CAPI UChar32 U_EXPORT2
410 ucase_tolower(const UCaseProps
*csp
, UChar32 c
) {
412 GET_PROPS(csp
, c
, props
);
413 if(!PROPS_HAS_EXCEPTION(props
)) {
414 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
415 c
+=UCASE_GET_DELTA(props
);
418 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
419 uint16_t excWord
=*pe
++;
420 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
421 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe
, c
);
427 U_CAPI UChar32 U_EXPORT2
428 ucase_toupper(const UCaseProps
*csp
, UChar32 c
) {
430 GET_PROPS(csp
, c
, props
);
431 if(!PROPS_HAS_EXCEPTION(props
)) {
432 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
433 c
+=UCASE_GET_DELTA(props
);
436 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
437 uint16_t excWord
=*pe
++;
438 if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
439 GET_SLOT_VALUE(excWord
, UCASE_EXC_UPPER
, pe
, c
);
445 U_CAPI UChar32 U_EXPORT2
446 ucase_totitle(const UCaseProps
*csp
, UChar32 c
) {
448 GET_PROPS(csp
, c
, props
);
449 if(!PROPS_HAS_EXCEPTION(props
)) {
450 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
451 c
+=UCASE_GET_DELTA(props
);
454 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
455 uint16_t excWord
=*pe
++;
457 if(HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
458 index
=UCASE_EXC_TITLE
;
459 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
460 index
=UCASE_EXC_UPPER
;
464 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
469 U_CAPI
void U_EXPORT2
470 ucase_addCaseClosure(const UCaseProps
*csp
, UChar32 c
, const USetAdder
*sa
) {
474 * Hardcode the case closure of i and its relatives and ignore the
475 * data file data for these characters.
476 * The Turkic dotless i and dotted I with their case mapping conditions
477 * and case folding option make the related characters behave specially.
478 * This code matches their closure behavior to their case folding behavior.
481 iDot
[2]= { 0x69, 0x307 };
485 /* regular i and I are in one equivalence class */
486 sa
->add(sa
->set
, 0x69);
489 sa
->add(sa
->set
, 0x49);
492 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
493 sa
->addString(sa
->set
, iDot
, 2);
496 /* dotless i is in a class by itself */
499 /* otherwise use the data file data */
503 GET_PROPS(csp
, c
, props
);
504 if(!PROPS_HAS_EXCEPTION(props
)) {
505 if(UCASE_GET_TYPE(props
)!=UCASE_NONE
) {
506 /* add the one simple case mapping, no matter what type it is */
507 int32_t delta
=UCASE_GET_DELTA(props
);
509 sa
->add(sa
->set
, c
+delta
);
514 * c has exceptions, so there may be multiple simple and/or
515 * full case mappings. Add them all.
517 const uint16_t *pe0
, *pe
=GET_EXCEPTIONS(csp
, props
);
518 const UChar
*closure
;
519 uint16_t excWord
=*pe
++;
520 int32_t index
, closureLength
, fullLength
, length
;
524 /* add all simple case mappings */
525 for(index
=UCASE_EXC_LOWER
; index
<=UCASE_EXC_TITLE
; ++index
) {
526 if(HAS_SLOT(excWord
, index
)) {
528 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
533 /* get the closure string pointer & length */
534 if(HAS_SLOT(excWord
, UCASE_EXC_CLOSURE
)) {
536 GET_SLOT_VALUE(excWord
, UCASE_EXC_CLOSURE
, pe
, closureLength
);
537 closureLength
&=UCASE_CLOSURE_MAX_LENGTH
; /* higher bits are reserved */
538 closure
=(const UChar
*)pe
+1; /* behind this slot, unless there are full case mappings */
544 /* add the full case folding */
545 if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
547 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, fullLength
);
549 /* start of full case mapping strings */
552 fullLength
&=0xffff; /* bits 16 and higher are reserved */
554 /* skip the lowercase result string */
555 pe
+=fullLength
&UCASE_FULL_LOWER
;
558 /* add the full case folding string */
559 length
=fullLength
&0xf;
561 sa
->addString(sa
->set
, (const UChar
*)pe
, length
);
565 /* skip the uppercase and titlecase strings */
571 closure
=(const UChar
*)pe
; /* behind full case mappings */
574 /* add each code point in the closure string */
575 for(index
=0; index
<closureLength
;) {
576 U16_NEXT_UNSAFE(closure
, index
, c
);
583 * Compare s, which has an explicit length, with t, which has a maximum length or is NUL-terminated.
584 * Preconditions: length>0, max>0, and length<=max.
586 static U_INLINE
int32_t
587 strcmpMax(const UChar
*s
, int32_t length
, const UChar
*t
, int32_t max
) {
590 max
-=length
; /* we require length<=max, so no need to decrement max in the loop */
595 return 1; /* reached the end of t but not of s */
599 return c1
; /* return difference result */
602 /* ends with length==0 */
604 if(max
==0 || *t
==0) {
605 return 0; /* equal to length of both strings */
607 return -max
; /* return length difference */
611 U_CAPI UBool U_EXPORT2
612 ucase_addStringCaseClosure(const UCaseProps
*csp
, const UChar
*s
, int32_t length
, const USetAdder
*sa
) {
613 const UChar
*unfold
, *p
;
614 int32_t i
, start
, limit
, result
, unfoldRows
, unfoldRowWidth
, unfoldStringWidth
;
616 if(csp
->unfold
==NULL
|| s
==NULL
) {
617 return FALSE
; /* no reverse case folding data, or no string */
620 /* the string is too short to find any match */
622 * more precise would be:
623 * if(!u_strHasMoreChar32Than(s, length, 1))
624 * but this does not make much practical difference because
625 * a single supplementary code point would just not be found
631 unfoldRows
=unfold
[UCASE_UNFOLD_ROWS
];
632 unfoldRowWidth
=unfold
[UCASE_UNFOLD_ROW_WIDTH
];
633 unfoldStringWidth
=unfold
[UCASE_UNFOLD_STRING_WIDTH
];
634 unfold
+=unfoldRowWidth
;
636 if(length
>unfoldStringWidth
) {
637 /* the string is too long to find any match */
641 /* do a binary search for the string */
646 p
=unfold
+(i
*unfoldRowWidth
);
647 result
=strcmpMax(s
, length
, p
, unfoldStringWidth
);
650 /* found the string: add each code point, and its case closure */
653 for(i
=unfoldStringWidth
; i
<unfoldRowWidth
&& p
[i
]!=0;) {
654 U16_NEXT_UNSAFE(p
, i
, c
);
656 ucase_addCaseClosure(csp
, c
, sa
);
659 } else if(result
<0) {
661 } else /* result>0 */ {
666 return FALSE
; /* string not found */
669 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
670 U_CAPI
int32_t U_EXPORT2
671 ucase_getType(const UCaseProps
*csp
, UChar32 c
) {
673 GET_PROPS(csp
, c
, props
);
674 return UCASE_GET_TYPE(props
);
677 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
678 U_CAPI
int32_t U_EXPORT2
679 ucase_getTypeOrIgnorable(const UCaseProps
*csp
, UChar32 c
) {
682 GET_PROPS(csp
, c
, props
);
683 type
=UCASE_GET_TYPE(props
);
684 if(type
!=UCASE_NONE
) {
688 (props
&(UCASE_EXCEPTION
|UCASE_CASE_IGNORABLE
))==UCASE_CASE_IGNORABLE
690 return -1; /* case-ignorable */
692 return 0; /* c is neither cased nor case-ignorable */
696 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
697 static U_INLINE
int32_t
698 getDotType(const UCaseProps
*csp
, UChar32 c
) {
700 GET_PROPS(csp
, c
, props
);
701 if(!PROPS_HAS_EXCEPTION(props
)) {
702 return props
&UCASE_DOT_MASK
;
704 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
705 return (*pe
>>UCASE_EXC_DOT_SHIFT
)&UCASE_DOT_MASK
;
709 U_CAPI UBool U_EXPORT2
710 ucase_isSoftDotted(const UCaseProps
*csp
, UChar32 c
) {
711 return (UBool
)(getDotType(csp
, c
)==UCASE_SOFT_DOTTED
);
714 U_CAPI UBool U_EXPORT2
715 ucase_isCaseSensitive(const UCaseProps
*csp
, UChar32 c
) {
717 GET_PROPS(csp
, c
, props
);
718 return (UBool
)((props
&UCASE_SENSITIVE
)!=0);
721 /* string casing ------------------------------------------------------------ */
724 * These internal functions form the core of string case mappings.
725 * They map single code points to result code points or strings and take
726 * all necessary conditions (context, locale ID, options) into account.
728 * They do not iterate over the source or write to the destination
729 * so that the same functions are useful for non-standard string storage,
730 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
731 * For the same reason, the "surrounding text" context is passed in as a
732 * UCaseContextIterator which does not make any assumptions about
733 * the underlying storage.
735 * This section contains helper functions that check for conditions
736 * in the input text surrounding the current code point
737 * according to SpecialCasing.txt.
739 * Each helper function gets the index
740 * - after the current code point if it looks at following text
741 * - before the current code point if it looks at preceding text
743 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
746 * C is preceded by a sequence consisting of
747 * a cased letter and a case-ignorable sequence,
748 * and C is not followed by a sequence consisting of
749 * an ignorable sequence and then a cased letter.
752 * C is followed by one or more characters of combining class 230 (ABOVE)
753 * in the combining character sequence.
756 * The last preceding character with combining class of zero before C
758 * and there is no intervening combining character class 230 (ABOVE).
761 * C is followed by combining dot above (U+0307).
762 * Any sequence of characters with a combining class that is neither 0 nor 230
763 * may intervene between the current character and the combining dot above.
765 * The erratum from 2002-10-31 adds the condition
768 * The last preceding base character was an uppercase I, and there is no
769 * intervening combining character class 230 (ABOVE).
771 * (See Jitterbug 2344 and the comments on After_I below.)
773 * Helper definitions in Unicode 3.2 UAX 21:
775 * D1. A character C is defined to be cased
776 * if it meets any of the following criteria:
778 * - The general category of C is Titlecase Letter (Lt)
779 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
780 * - Given D = NFD(C), then it is not the case that:
781 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
782 * (This third criterion does not add any characters to the list
783 * for Unicode 3.2. Ignored.)
785 * D2. A character C is defined to be case-ignorable
786 * if it meets either of the following criteria:
788 * - The general category of C is
789 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
790 * Letter Modifier (Lm), or Symbol Modifier (Sk)
791 * - C is one of the following characters
793 * U+00AD SOFT HYPHEN (SHY)
794 * U+2019 RIGHT SINGLE QUOTATION MARK
795 * (the preferred character for apostrophe)
797 * D3. A case-ignorable sequence is a sequence of
798 * zero or more case-ignorable characters.
/*
 * Case-insensitive tests for single letters: each macro is true if c equals
 * the lowercase or the uppercase character literal of the named letter.
 * The argument c is evaluated twice, so it must not have side effects.
 * NOTE(review): presumably used by ucase_getCaseLocale() to match language
 * codes regardless of case -- confirm against the function body.
 */
808 #define is_a(c) ((c)=='a' || (c)=='A')
809 #define is_e(c) ((c)=='e' || (c)=='E')
810 #define is_i(c) ((c)=='i' || (c)=='I')
811 #define is_l(c) ((c)=='l' || (c)=='L')
812 #define is_r(c) ((c)=='r' || (c)=='R')
813 #define is_t(c) ((c)=='t' || (c)=='T')
814 #define is_u(c) ((c)=='u' || (c)=='U')
815 #define is_z(c) ((c)=='z' || (c)=='Z')
/*
 * is_sep(c): true if c is '_', '-', or the terminating NUL (0).
 * The argument c is evaluated up to three times.
 * NOTE(review): presumably marks the end of the language code within a
 * locale ID -- confirm against ucase_getCaseLocale().
 */
818 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
821 * Requires non-NULL locale ID but otherwise does the equivalent of
822 * checking for language codes as if uloc_getLanguage() were called:
823 * Accepts both 2- and 3-letter codes and accepts case variants.
826 ucase_getCaseLocale(const char *locale
, int32_t *locCache
) {
830 if(locCache
!=NULL
&& (result
=*locCache
)!=LOC_UNKNOWN
) {
837 * This function used to use uloc_getLanguage(), but the current code
838 * removes the dependency of this low-level code on uloc implementation code
839 * and is faster because the whole locale ID does not have to be
840 * examined and copied/transformed.
842 * Because this code does not want to depend on uloc, the caller must
843 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
879 result
=LOC_LITHUANIAN
;
890 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
892 isFollowedByCasedLetter(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
, int8_t dir
) {
900 for(/* dir!=0 sets direction */; (c
=iter(context
, dir
))>=0; dir
=0) {
901 GET_PROPS(csp
, c
, props
);
902 if(UCASE_GET_TYPE(props
)!=UCASE_NONE
) {
903 return TRUE
; /* followed by cased letter */
904 } else if(c
==0x307 || (props
&(UCASE_EXCEPTION
|UCASE_CASE_IGNORABLE
))==UCASE_CASE_IGNORABLE
) {
905 /* case-ignorable, continue with the loop */
907 return FALSE
; /* not ignorable */
911 return FALSE
; /* not followed by cased letter */
914 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
916 isPrecededBySoftDotted(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
925 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
926 dotType
=getDotType(csp
, c
);
927 if(dotType
==UCASE_SOFT_DOTTED
) {
928 return TRUE
; /* preceded by TYPE_i */
929 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
930 return FALSE
; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
934 return FALSE
; /* not preceded by TYPE_i */
938 * See Jitterbug 2344:
939 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
940 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
941 * we made those releases compatible with Unicode 3.2 which had not fixed
942 * a related bug in SpecialCasing.txt.
944 * From the Jitterbug 2344 text:
945 * ... this bug is listed as a Unicode erratum
946 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
948 * There are two errors in SpecialCasing.txt.
949 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
950 * 2. An incorrect context definition. Correct as follows:
951 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
952 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
954 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
955 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
956 * where the context After_I is defined as:
957 * The last preceding base character was an uppercase I, and there is no
958 * intervening combining character class 230 (ABOVE).
961 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
963 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
964 * # This matches the behavior of the canonically equivalent I-dot_above
966 * See also the description in this place in older versions of uchar.c (revision 1.100).
968 * Markus W. Scherer 2003-feb-15
971 /* Is preceded by base character 'I' with no intervening cc=230 ? */
973 isPrecededBy_I(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
982 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
984 return TRUE
; /* preceded by I */
986 dotType
=getDotType(csp
, c
);
987 if(dotType
!=UCASE_OTHER_ACCENT
) {
988 return FALSE
; /* preceded by different base character (not I), or intervening cc==230 */
992 return FALSE
; /* not preceded by I */
995 /* Is followed by one or more cc==230 ? */
997 isFollowedByMoreAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
1006 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
1007 dotType
=getDotType(csp
, c
);
1008 if(dotType
==UCASE_ABOVE
) {
1009 return TRUE
; /* at least one cc==230 following */
1010 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
1011 return FALSE
; /* next base character, no more cc==230 following */
1015 return FALSE
; /* no more cc==230 following */
1018 /* Is followed by a dot above (without cc==230 in between) ? */
1020 isFollowedByDotAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
1029 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
1033 dotType
=getDotType(csp
, c
);
1034 if(dotType
!=UCASE_OTHER_ACCENT
) {
1035 return FALSE
; /* next base character or cc==230 in between */
1039 return FALSE
; /* no dot above following */
1042 U_CAPI
int32_t U_EXPORT2
1043 ucase_toFullLower(const UCaseProps
*csp
, UChar32 c
,
1044 UCaseContextIterator
*iter
, void *context
,
1045 const UChar
**pString
,
1046 const char *locale
, int32_t *locCache
) {
1048 iDot
[2]= { 0x69, 0x307 },
1049 jDot
[2]= { 0x6a, 0x307 },
1050 iOgonekDot
[3]= { 0x12f, 0x307 },
1051 iDotGrave
[3]= { 0x69, 0x307, 0x300 },
1052 iDotAcute
[3]= { 0x69, 0x307, 0x301 },
1053 iDotTilde
[3]= { 0x69, 0x307, 0x303 };
1059 GET_PROPS(csp
, c
, props
);
1060 if(!PROPS_HAS_EXCEPTION(props
)) {
1061 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1062 result
=c
+UCASE_GET_DELTA(props
);
1065 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1066 uint16_t excWord
=*pe
++;
1071 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1072 /* use hardcoded conditions and mappings */
1073 int32_t loc
=ucase_getCaseLocale(locale
, locCache
);
1076 * Test for conditional mappings first
1077 * (otherwise the unconditional default mappings are always taken),
1078 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1079 * then get the UnicodeData.txt mappings.
1081 if( loc
==LOC_LITHUANIAN
&&
1082 /* base characters, find accents above */
1083 (((c
==0x49 || c
==0x4a || c
==0x12e) &&
1084 isFollowedByMoreAbove(csp
, iter
, context
)) ||
1085 /* precomposed with accent above, no need to find one */
1086 (c
==0xcc || c
==0xcd || c
==0x128))
1091 # Lithuanian retains the dot in a lowercase i when followed by accents.
1093 # Introduce an explicit dot above when lowercasing capital I's and J's
1094 # whenever there are more accents above.
1095 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1097 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1098 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1099 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1100 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1101 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1102 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1105 case 0x49: /* LATIN CAPITAL LETTER I */
1108 case 0x4a: /* LATIN CAPITAL LETTER J */
1111 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1112 *pString
=iOgonekDot
;
1114 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1117 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1120 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1124 return 0; /* will not occur */
1126 /* # Turkish and Azeri */
1127 } else if(loc
==LOC_TURKISH
&& c
==0x130) {
1129 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1130 # The following rules handle those cases.
1132 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1133 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1136 } else if(loc
==LOC_TURKISH
&& c
==0x307 && isPrecededBy_I(csp
, iter
, context
)) {
1138 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1139 # This matches the behavior of the canonically equivalent I-dot_above
1141 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1142 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1144 return 0; /* remove the dot (continue without output) */
1145 } else if(loc
==LOC_TURKISH
&& c
==0x49 && !isFollowedByDotAbove(csp
, iter
, context
)) {
1147 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1149 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1150 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1153 } else if(c
==0x130) {
1155 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1157 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1161 } else if( c
==0x3a3 &&
1162 !isFollowedByCasedLetter(csp
, iter
, context
, 1) &&
1163 isFollowedByCasedLetter(csp
, iter
, context
, -1) /* -1=preceded */
1165 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1167 # Special case for final form of sigma
1169 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1171 return 0x3c2; /* greek small final sigma */
1173 /* no known conditional special case mapping, use a normal mapping */
1175 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1176 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1177 full
&=UCASE_FULL_LOWER
;
1179 /* set the output pointer to the lowercase mapping */
1182 /* return the string length */
1187 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1188 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe2
, result
);
1192 return (result
==c
) ? ~result
: result
;
1197 toUpperOrTitle(const UCaseProps
*csp
, UChar32 c
,
1198 UCaseContextIterator
*iter
, void *context
,
1199 const UChar
**pString
,
1200 const char *locale
, int32_t *locCache
,
1201 UBool upperNotTitle
) {
1206 GET_PROPS(csp
, c
, props
);
1207 if(!PROPS_HAS_EXCEPTION(props
)) {
1208 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
1209 result
=c
+UCASE_GET_DELTA(props
);
1212 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1213 uint16_t excWord
=*pe
++;
1214 int32_t full
, index
;
1218 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1219 /* use hardcoded conditions and mappings */
1220 int32_t loc
=ucase_getCaseLocale(locale
, locCache
);
1222 if(loc
==LOC_TURKISH
&& c
==0x69) {
1226 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1227 # The following rules handle those cases.
1229 # When uppercasing, i turns into a dotted capital I
1231 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1232 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1235 } else if(loc
==LOC_LITHUANIAN
&& c
==0x307 && isPrecededBySoftDotted(csp
, iter
, context
)) {
1239 # Lithuanian retains the dot in a lowercase i when followed by accents.
1241 # Remove DOT ABOVE after "i" with upper or titlecase
1243 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1245 return 0; /* remove the dot (continue without output) */
1247 /* no known conditional special case mapping, use a normal mapping */
1249 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1250 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1252 /* start of full case mapping strings */
1255 /* skip the lowercase and case-folding result strings */
1256 pe
+=full
&UCASE_FULL_LOWER
;
1264 /* skip the uppercase result string */
1270 /* set the output pointer to the result string */
1273 /* return the string length */
1278 if(!upperNotTitle
&& HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
1279 index
=UCASE_EXC_TITLE
;
1280 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
1281 /* here, titlecase is same as uppercase */
1282 index
=UCASE_EXC_UPPER
;
1286 GET_SLOT_VALUE(excWord
, index
, pe2
, result
);
1289 return (result
==c
) ? ~result
: result
;
1292 U_CAPI
int32_t U_EXPORT2
1293 ucase_toFullUpper(const UCaseProps
*csp
, UChar32 c
,
1294 UCaseContextIterator
*iter
, void *context
,
1295 const UChar
**pString
,
1296 const char *locale
, int32_t *locCache
) {
1297 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, TRUE
);
1300 U_CAPI
int32_t U_EXPORT2
1301 ucase_toFullTitle(const UCaseProps
*csp
, UChar32 c
,
1302 UCaseContextIterator
*iter
, void *context
,
1303 const UChar
**pString
,
1304 const char *locale
, int32_t *locCache
) {
1305 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, FALSE
);
/* case folding ------------------------------------------------------------- */

/*
 * Case folding is similar to lowercasing.
 * The result may be a simple mapping, i.e., a single code point, or
 * a full mapping, i.e., a string.
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
 * then only the lowercase mapping is stored.
 *
 * Some special cases are hardcoded because their conditions cannot be
 * parsed and processed from CaseFolding.txt.
 *
 * Unicode 3.2 CaseFolding.txt specifies for its status field:

# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
#    - For non-Turkic languages, this mapping is normally not used.
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
#
# Usage:
#  A. To do a simple case folding, use the mappings with status C + S.
#  B. To do a full case folding, use the mappings with status C + F.
#
#    The mappings with status T can be used or omitted depending on the desired case-folding
#    behavior. (The default option is to exclude them.)

 * Unicode 3.2 has 'T' mappings as follows:

0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * while the default mappings for these code points are:

0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * U+0130 has no simple case folding (simple-case-folds to itself).
 */
1349 /* return the simple case folding mapping for c */
1350 U_CAPI UChar32 U_EXPORT2
1351 ucase_fold(const UCaseProps
*csp
, UChar32 c
, uint32_t options
) {
1353 GET_PROPS(csp
, c
, props
);
1354 if(!PROPS_HAS_EXCEPTION(props
)) {
1355 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1356 c
+=UCASE_GET_DELTA(props
);
1359 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
1360 uint16_t excWord
=*pe
++;
1362 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1363 /* special case folding mappings, hardcoded */
1364 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1365 /* default mappings */
1367 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1369 } else if(c
==0x130) {
1370 /* no simple case folding for U+0130 */
1374 /* Turkic mappings */
1376 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1378 } else if(c
==0x130) {
1379 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1384 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1385 index
=UCASE_EXC_FOLD
;
1386 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1387 index
=UCASE_EXC_LOWER
;
1391 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
/*
 * Issue for canonical caseless match (UAX #21):
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
 * canonical equivalence, unlike default-option casefolding.
 * For example, I-grave and I + grave fold to strings that are not canonically
 * equivalent.
 * For more details, see the comment in unorm_compare() in unorm.cpp
 * and the intermediate prototype changes for Jitterbug 2021.
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
 *
 * This did not get fixed because it appears that it is not possible to fix
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
 * together in a way that they still fold to common result strings.
 */
1411 U_CAPI
int32_t U_EXPORT2
1412 ucase_toFullFolding(const UCaseProps
*csp
, UChar32 c
,
1413 const UChar
**pString
,
1416 iDot
[2]= { 0x69, 0x307 };
1422 GET_PROPS(csp
, c
, props
);
1423 if(!PROPS_HAS_EXCEPTION(props
)) {
1424 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1425 result
=c
+UCASE_GET_DELTA(props
);
1428 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1429 uint16_t excWord
=*pe
++;
1430 int32_t full
, index
;
1434 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1435 /* use hardcoded conditions and mappings */
1436 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1437 /* default mappings */
1439 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1441 } else if(c
==0x130) {
1442 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1447 /* Turkic mappings */
1449 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1451 } else if(c
==0x130) {
1452 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1456 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1457 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1459 /* start of full case mapping strings */
1462 /* skip the lowercase result string */
1463 pe
+=full
&UCASE_FULL_LOWER
;
1467 /* set the output pointer to the result string */
1470 /* return the string length */
1475 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1476 index
=UCASE_EXC_FOLD
;
1477 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1478 index
=UCASE_EXC_LOWER
;
1482 GET_SLOT_VALUE(excWord
, index
, pe2
, result
);
1485 return (result
==c
) ? ~result
: result
;
1488 /* case mapping properties API ---------------------------------------------- */
1490 /* get the UCaseProps singleton, or else its dummy, once and for all */
1491 static const UCaseProps
*
1494 * This lazy intialization with double-checked locking (without mutex protection for
1495 * the initial check) is transiently unsafe under certain circumstances.
1496 * Check the readme and use u_init() if necessary.
1499 /* the initial check is performed by the GET_CASE_PROPS() macro */
1500 const UCaseProps
*csp
;
1501 UErrorCode errorCode
=U_ZERO_ERROR
;
1503 csp
=ucase_getSingleton(&errorCode
);
1504 if(U_FAILURE(errorCode
)) {
1505 errorCode
=U_ZERO_ERROR
;
1506 csp
=ucase_getDummy(&errorCode
);
1507 if(U_FAILURE(errorCode
)) {
/*
 * In ICU 3.0, most Unicode properties were loaded from uprops.icu.
 * ICU 3.2 adds ucase.icu for case mapping properties.
 * ICU 3.4 adds ubidi.icu for bidi/shaping properties and
 * removes case/bidi/shaping properties from uprops.icu.
 *
 * Loading of uprops.icu was never mutex-protected and required u_init()
 * for thread safety.
 * In order to maintain performance for all such properties,
 * ucase.icu and ubidi.icu are loaded lazily, without mutexing.
 * u_init() will try to load them for thread safety,
 * but u_init() will not fail if they are missing.
 *
 * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
 * and an error code for load failure.
 * Instead, here we try to load at most once.
 * If it works, we use the resulting singleton object.
 * If it fails, then we get a dummy object, which always works unless
 * we are seriously out of memory.
 * After the first try, we have a never-changing pointer to either the
 * real singleton or the dummy.
 *
 * This method is used in Unicode properties APIs (uchar.h) that
 * do not have a service object and also do not have an error code parameter.
 * Other API implementations get the singleton themselves
 * (with mutexing), store it in the service object, and report errors.
 */
1542 #define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
1544 /* public API (see uchar.h) */
1546 U_CAPI UBool U_EXPORT2
1547 u_isULowercase(UChar32 c
) {
1548 return (UBool
)(UCASE_LOWER
==ucase_getType(GET_CASE_PROPS(), c
));
1551 U_CAPI UBool U_EXPORT2
1552 u_isUUppercase(UChar32 c
) {
1553 return (UBool
)(UCASE_UPPER
==ucase_getType(GET_CASE_PROPS(), c
));
1556 /* Transforms the Unicode character to its lower case equivalent.*/
1557 U_CAPI UChar32 U_EXPORT2
1558 u_tolower(UChar32 c
) {
1559 return ucase_tolower(GET_CASE_PROPS(), c
);
1562 /* Transforms the Unicode character to its upper case equivalent.*/
1563 U_CAPI UChar32 U_EXPORT2
1564 u_toupper(UChar32 c
) {
1565 return ucase_toupper(GET_CASE_PROPS(), c
);
1568 /* Transforms the Unicode character to its title case equivalent.*/
1569 U_CAPI UChar32 U_EXPORT2
1570 u_totitle(UChar32 c
) {
1571 return ucase_totitle(GET_CASE_PROPS(), c
);
1574 /* return the simple case folding mapping for c */
1575 U_CAPI UChar32 U_EXPORT2
1576 u_foldCase(UChar32 c
, uint32_t options
) {
1577 return ucase_fold(GET_CASE_PROPS(), c
, options
);
1580 U_CFUNC
int32_t U_EXPORT2
1581 ucase_hasBinaryProperty(UChar32 c
, UProperty which
) {
1582 /* case mapping properties */
1583 const UCaseProps
*csp
=GET_CASE_PROPS();
1588 case UCHAR_LOWERCASE
:
1589 return (UBool
)(UCASE_LOWER
==ucase_getType(csp
, c
));
1590 case UCHAR_UPPERCASE
:
1591 return (UBool
)(UCASE_UPPER
==ucase_getType(csp
, c
));
1592 case UCHAR_SOFT_DOTTED
:
1593 return ucase_isSoftDotted(csp
, c
);
1594 case UCHAR_CASE_SENSITIVE
:
1595 return ucase_isCaseSensitive(csp
, c
);