2 *******************************************************************************
4 * Copyright (C) 2004-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug30
14 * created by: Markus W. Scherer
16 * Low-level Unicode character/string case mapping code.
17 * Much code moved here (and modified) from uchar.c.
20 #include "unicode/utypes.h"
21 #include "unicode/uset.h"
22 #include "unicode/udata.h" /* UDataInfo */
23 #include "ucmndata.h" /* DataHeader */
34 const int32_t *indexes
;
35 const uint16_t *exceptions
;
39 uint8_t formatVersion
[4];
42 /* data loading etc. -------------------------------------------------------- */
44 #if UCASE_HARDCODE_DATA
46 /* ucase_props_data.c is machine-generated by gencase --csource */
47 #include "ucase_props_data.c"
51 static UBool U_CALLCONV
52 isAcceptable(void *context
,
53 const char *type
, const char *name
,
54 const UDataInfo
*pInfo
) {
57 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
58 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
59 pInfo
->dataFormat
[0]==UCASE_FMT_0
&& /* dataFormat="cAsE" */
60 pInfo
->dataFormat
[1]==UCASE_FMT_1
&&
61 pInfo
->dataFormat
[2]==UCASE_FMT_2
&&
62 pInfo
->dataFormat
[3]==UCASE_FMT_3
&&
63 pInfo
->formatVersion
[0]==1 &&
64 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
65 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
67 UCaseProps
*csp
=(UCaseProps
*)context
;
68 uprv_memcpy(csp
->formatVersion
, pInfo
->formatVersion
, 4);
76 ucase_openData(UCaseProps
*cspProto
,
77 const uint8_t *bin
, int32_t length
, UErrorCode
*pErrorCode
) {
81 cspProto
->indexes
=(const int32_t *)bin
;
82 if( (length
>=0 && length
<16*4) ||
83 cspProto
->indexes
[UCASE_IX_INDEX_TOP
]<16
85 /* length or indexes[] too short for minimum indexes[] length of 16 */
86 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
89 size
=cspProto
->indexes
[UCASE_IX_INDEX_TOP
]*4;
91 if(length
>=size
&& length
>=cspProto
->indexes
[UCASE_IX_LENGTH
]) {
94 /* length too short for indexes[] or for the whole data length */
95 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
100 /* from here on, assume that the sizes of the items fit into the total length */
102 /* unserialize the trie, after indexes[] */
103 size
=cspProto
->indexes
[UCASE_IX_TRIE_SIZE
];
104 utrie_unserialize(&cspProto
->trie
, bin
, size
, pErrorCode
);
105 if(U_FAILURE(*pErrorCode
)) {
110 /* get exceptions[] */
111 size
=2*cspProto
->indexes
[UCASE_IX_EXC_LENGTH
];
112 cspProto
->exceptions
=(const uint16_t *)bin
;
116 size
=2*cspProto
->indexes
[UCASE_IX_UNFOLD_LENGTH
];
118 cspProto
->unfold
=(const UChar
*)bin
;
121 cspProto
->unfold
=NULL
;
124 /* allocate, copy, and return the new UCaseProps */
125 csp
=(UCaseProps
*)uprv_malloc(sizeof(UCaseProps
));
127 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
130 uprv_memcpy(csp
, cspProto
, sizeof(UCaseProps
));
135 U_CAPI UCaseProps
* U_EXPORT2
136 ucase_open(UErrorCode
*pErrorCode
) {
137 UCaseProps cspProto
={ NULL
}, *csp
;
139 cspProto
.mem
=udata_openChoice(NULL
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, isAcceptable
, &cspProto
, pErrorCode
);
140 if(U_FAILURE(*pErrorCode
)) {
146 udata_getMemory(cspProto
.mem
),
147 udata_getLength(cspProto
.mem
),
149 if(U_FAILURE(*pErrorCode
)) {
150 udata_close(cspProto
.mem
);
157 U_CAPI UCaseProps
* U_EXPORT2
158 ucase_openBinary(const uint8_t *bin
, int32_t length
, UErrorCode
*pErrorCode
) {
159 UCaseProps cspProto
={ NULL
};
160 const DataHeader
*hdr
;
162 if(U_FAILURE(*pErrorCode
)) {
166 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
170 /* check the header */
171 if(length
>=0 && length
<20) {
172 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
175 hdr
=(const DataHeader
*)bin
;
177 !(hdr
->dataHeader
.magic1
==0xda && hdr
->dataHeader
.magic2
==0x27 &&
178 hdr
->info
.isBigEndian
==U_IS_BIG_ENDIAN
&&
179 isAcceptable(&cspProto
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, &hdr
->info
))
181 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
185 bin
+=hdr
->dataHeader
.headerSize
;
187 length
-=hdr
->dataHeader
.headerSize
;
189 return ucase_openData(&cspProto
, bin
, length
, pErrorCode
);
194 U_CAPI
void U_EXPORT2
195 ucase_close(UCaseProps
*csp
) {
197 #if !UCASE_HARDCODE_DATA
198 udata_close(csp
->mem
);
204 /* UCaseProps singleton ----------------------------------------------------- */
206 #if !UCASE_HARDCODE_DATA
207 static UCaseProps
*gCsp
=NULL
;
208 static UCaseProps
*gCspDummy
=NULL
;
209 static UErrorCode gErrorCode
=U_ZERO_ERROR
;
210 static int8_t gHaveData
=0;
213 #if !UCASE_HARDCODE_DATA
214 static UBool U_CALLCONV
ucase_cleanup(void) {
217 ucase_close(gCspDummy
);
219 gErrorCode
=U_ZERO_ERROR
;
225 U_CAPI
const UCaseProps
* U_EXPORT2
226 ucase_getSingleton(UErrorCode
*pErrorCode
) {
227 #if UCASE_HARDCODE_DATA
228 if(U_FAILURE(*pErrorCode
)) {
231 return &ucase_props_singleton
;
235 if(U_FAILURE(*pErrorCode
)) {
239 UMTX_CHECK(NULL
, gHaveData
, haveData
);
242 /* data was loaded */
244 } else if(haveData
<0) {
245 /* data loading failed */
246 *pErrorCode
=gErrorCode
;
248 } else /* haveData==0 */ {
250 UCaseProps
*csp
=ucase_open(pErrorCode
);
251 if(U_FAILURE(*pErrorCode
)) {
253 gErrorCode
=*pErrorCode
;
257 /* set the static variables */
263 ucln_common_registerCleanup(UCLN_COMMON_UCASE
, ucase_cleanup
);
273 #if !UCASE_HARDCODE_DATA
274 U_CAPI
const UCaseProps
* U_EXPORT2
275 ucase_getDummy(UErrorCode
*pErrorCode
) {
278 if(U_FAILURE(*pErrorCode
)) {
282 UMTX_CHECK(NULL
, gCspDummy
, csp
);
285 /* the dummy object was already created */
287 } else /* csp==NULL */ {
288 /* create the dummy object */
291 csp
=(UCaseProps
*)uprv_malloc(sizeof(UCaseProps
)+UCASE_IX_TOP
*4+UTRIE_DUMMY_SIZE
);
293 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
296 uprv_memset(csp
, 0, sizeof(UCaseProps
)+UCASE_IX_TOP
*4);
298 csp
->indexes
=indexes
=(int32_t *)(csp
+1);
299 indexes
[UCASE_IX_INDEX_TOP
]=UCASE_IX_TOP
;
301 indexes
[UCASE_IX_TRIE_SIZE
]=
302 utrie_unserializeDummy(&csp
->trie
, indexes
+UCASE_IX_TOP
, UTRIE_DUMMY_SIZE
, 0, 0, TRUE
, pErrorCode
);
303 if(U_FAILURE(*pErrorCode
)) {
308 csp
->formatVersion
[0]=1;
309 csp
->formatVersion
[2]=UTRIE_SHIFT
;
310 csp
->formatVersion
[3]=UTRIE_INDEX_SHIFT
;
312 /* set the static variables */
314 if(gCspDummy
==NULL
) {
317 ucln_common_registerCleanup(UCLN_COMMON_UCASE
, ucase_cleanup
);
327 /* set of property starts for UnicodeSet ------------------------------------ */
329 static UBool U_CALLCONV
330 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
331 /* add the start code point to the USet */
332 const USetAdder
*sa
=(const USetAdder
*)context
;
333 sa
->add(sa
->set
, start
);
337 U_CFUNC
void U_EXPORT2
338 ucase_addPropertyStarts(const UCaseProps
*csp
, const USetAdder
*sa
, UErrorCode
*pErrorCode
) {
339 if(U_FAILURE(*pErrorCode
)) {
343 /* add the start code point of each same-value range of the trie */
344 utrie_enum(&csp
->trie
, NULL
, _enumPropertyStartsRange
, sa
);
346 /* add code points with hardcoded properties, plus the ones following them */
348 /* (none right now, see comment below) */
351 * Omit code points with hardcoded specialcasing properties
352 * because we do not build property UnicodeSets for them right now.
356 /* data access primitives --------------------------------------------------- */
/*
 * Fetch the trie value for code point c from (csp)->trie into result
 * by way of UTRIE_GET16().
 * UTRIE_GET16() itself validates c.
 *
 * The definition deliberately does not end in a semicolon: every call site
 * writes its own, so the expansion does not leave a stray empty statement
 * behind the call.
 */
#define GET_PROPS(csp, c, result) \
    UTRIE_GET16(&(csp)->trie, c, result)
/*
 * Address of the exceptions[] entry selected by a properties word:
 * the entry index is the properties word shifted right by UCASE_EXC_SHIFT.
 */
#define GET_EXCEPTIONS(caseProps, propsWord) (&(caseProps)->exceptions[(propsWord)>>UCASE_EXC_SHIFT])
/* Non-zero if the UCASE_EXCEPTION bit is set in the properties word. */
#define PROPS_HAS_EXCEPTION(propsWord) (UCASE_EXCEPTION&(propsWord))
366 /* flagsOffset[b] is the number of set (1) bits in the 8-bit integer value b */
367 static const uint8_t flagsOffset
[256]={
368 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
369 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
370 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
371 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
372 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
373 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
374 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
375 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
376 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
377 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
378 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
379 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
380 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
381 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
382 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
383 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
/* Non-zero if bit number `slot` is set in the exception flags. */
#define HAS_SLOT(excFlags, slot) ((1<<(slot))&(excFlags))
/*
 * flagsOffset[] entry for the flag bits below bit number `slot`:
 * the flags are masked down to bits 0..slot-1 and used as the table index.
 */
#define SLOT_OFFSET(excFlags, slot) (flagsOffset[((1<<(slot))-1)&(excFlags)])
390 * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
392 * @param excWord (in) initial exceptions word
393 * @param index (in) desired slot index
394 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
395 * moved to the last uint16_t of the value, use +1 for beginning of next slot
396 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
398 #define GET_SLOT_VALUE(excWord, index, pExc16, value) \
399 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
400 (pExc16)+=SLOT_OFFSET(excWord, index); \
403 (pExc16)+=2*SLOT_OFFSET(excWord, index); \
405 (value)=((value)<<16)|*pExc16; \
408 /* simple case mappings ----------------------------------------------------- */
410 U_CAPI UChar32 U_EXPORT2
411 ucase_tolower(const UCaseProps
*csp
, UChar32 c
) {
413 GET_PROPS(csp
, c
, props
);
414 if(!PROPS_HAS_EXCEPTION(props
)) {
415 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
416 c
+=UCASE_GET_DELTA(props
);
419 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
420 uint16_t excWord
=*pe
++;
421 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
422 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe
, c
);
428 U_CAPI UChar32 U_EXPORT2
429 ucase_toupper(const UCaseProps
*csp
, UChar32 c
) {
431 GET_PROPS(csp
, c
, props
);
432 if(!PROPS_HAS_EXCEPTION(props
)) {
433 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
434 c
+=UCASE_GET_DELTA(props
);
437 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
438 uint16_t excWord
=*pe
++;
439 if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
440 GET_SLOT_VALUE(excWord
, UCASE_EXC_UPPER
, pe
, c
);
446 U_CAPI UChar32 U_EXPORT2
447 ucase_totitle(const UCaseProps
*csp
, UChar32 c
) {
449 GET_PROPS(csp
, c
, props
);
450 if(!PROPS_HAS_EXCEPTION(props
)) {
451 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
452 c
+=UCASE_GET_DELTA(props
);
455 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
456 uint16_t excWord
=*pe
++;
458 if(HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
459 index
=UCASE_EXC_TITLE
;
460 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
461 index
=UCASE_EXC_UPPER
;
465 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
470 static const UChar iDot
[2] = { 0x69, 0x307 };
471 static const UChar jDot
[2] = { 0x6a, 0x307 };
472 static const UChar iOgonekDot
[3] = { 0x12f, 0x307 };
473 static const UChar iDotGrave
[3] = { 0x69, 0x307, 0x300 };
474 static const UChar iDotAcute
[3] = { 0x69, 0x307, 0x301 };
475 static const UChar iDotTilde
[3] = { 0x69, 0x307, 0x303 };
478 U_CFUNC
void U_EXPORT2
479 ucase_addCaseClosure(const UCaseProps
*csp
, UChar32 c
, const USetAdder
*sa
) {
483 * Hardcode the case closure of i and its relatives and ignore the
484 * data file data for these characters.
485 * The Turkic dotless i and dotted I with their case mapping conditions
486 * and case folding option make the related characters behave specially.
487 * This code matches their closure behavior to their case folding behavior.
492 /* regular i and I are in one equivalence class */
493 sa
->add(sa
->set
, 0x69);
496 sa
->add(sa
->set
, 0x49);
499 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
500 sa
->addString(sa
->set
, iDot
, 2);
503 /* dotless i is in a class by itself */
506 /* otherwise use the data file data */
510 GET_PROPS(csp
, c
, props
);
511 if(!PROPS_HAS_EXCEPTION(props
)) {
512 if(UCASE_GET_TYPE(props
)!=UCASE_NONE
) {
513 /* add the one simple case mapping, no matter what type it is */
514 int32_t delta
=UCASE_GET_DELTA(props
);
516 sa
->add(sa
->set
, c
+delta
);
521 * c has exceptions, so there may be multiple simple and/or
522 * full case mappings. Add them all.
524 const uint16_t *pe0
, *pe
=GET_EXCEPTIONS(csp
, props
);
525 const UChar
*closure
;
526 uint16_t excWord
=*pe
++;
527 int32_t index
, closureLength
, fullLength
, length
;
531 /* add all simple case mappings */
532 for(index
=UCASE_EXC_LOWER
; index
<=UCASE_EXC_TITLE
; ++index
) {
533 if(HAS_SLOT(excWord
, index
)) {
535 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
540 /* get the closure string pointer & length */
541 if(HAS_SLOT(excWord
, UCASE_EXC_CLOSURE
)) {
543 GET_SLOT_VALUE(excWord
, UCASE_EXC_CLOSURE
, pe
, closureLength
);
544 closureLength
&=UCASE_CLOSURE_MAX_LENGTH
; /* higher bits are reserved */
545 closure
=(const UChar
*)pe
+1; /* behind this slot, unless there are full case mappings */
551 /* add the full case folding */
552 if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
554 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, fullLength
);
556 /* start of full case mapping strings */
559 fullLength
&=0xffff; /* bits 16 and higher are reserved */
561 /* skip the lowercase result string */
562 pe
+=fullLength
&UCASE_FULL_LOWER
;
565 /* add the full case folding string */
566 length
=fullLength
&0xf;
568 sa
->addString(sa
->set
, (const UChar
*)pe
, length
);
572 /* skip the uppercase and titlecase strings */
578 closure
=(const UChar
*)pe
; /* behind full case mappings */
581 /* add each code point in the closure string */
582 for(index
=0; index
<closureLength
;) {
583 U16_NEXT_UNSAFE(closure
, index
, c
);
590 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
591 * must be length>0 and max>0 and length<=max
593 static U_INLINE
int32_t
594 strcmpMax(const UChar
*s
, int32_t length
, const UChar
*t
, int32_t max
) {
597 max
-=length
; /* we require length<=max, so no need to decrement max in the loop */
602 return 1; /* reached the end of t but not of s */
606 return c1
; /* return difference result */
609 /* ends with length==0 */
611 if(max
==0 || *t
==0) {
612 return 0; /* equal to length of both strings */
614 return -max
; /* return lengh difference */
618 U_CFUNC UBool U_EXPORT2
619 ucase_addStringCaseClosure(const UCaseProps
*csp
, const UChar
*s
, int32_t length
, const USetAdder
*sa
) {
620 const UChar
*unfold
, *p
;
621 int32_t i
, start
, limit
, result
, unfoldRows
, unfoldRowWidth
, unfoldStringWidth
;
623 if(csp
->unfold
==NULL
|| s
==NULL
) {
624 return FALSE
; /* no reverse case folding data, or no string */
627 /* the string is too short to find any match */
629 * more precise would be:
630 * if(!u_strHasMoreChar32Than(s, length, 1))
631 * but this does not make much practical difference because
632 * a single supplementary code point would just not be found
638 unfoldRows
=unfold
[UCASE_UNFOLD_ROWS
];
639 unfoldRowWidth
=unfold
[UCASE_UNFOLD_ROW_WIDTH
];
640 unfoldStringWidth
=unfold
[UCASE_UNFOLD_STRING_WIDTH
];
641 unfold
+=unfoldRowWidth
;
643 if(length
>unfoldStringWidth
) {
644 /* the string is too long to find any match */
648 /* do a binary search for the string */
653 p
=unfold
+(i
*unfoldRowWidth
);
654 result
=strcmpMax(s
, length
, p
, unfoldStringWidth
);
657 /* found the string: add each code point, and its case closure */
660 for(i
=unfoldStringWidth
; i
<unfoldRowWidth
&& p
[i
]!=0;) {
661 U16_NEXT_UNSAFE(p
, i
, c
);
663 ucase_addCaseClosure(csp
, c
, sa
);
666 } else if(result
<0) {
668 } else /* result>0 */ {
673 return FALSE
; /* string not found */
676 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
677 U_CAPI
int32_t U_EXPORT2
678 ucase_getType(const UCaseProps
*csp
, UChar32 c
) {
680 GET_PROPS(csp
, c
, props
);
681 return UCASE_GET_TYPE(props
);
684 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
685 U_CAPI
int32_t U_EXPORT2
686 ucase_getTypeOrIgnorable(const UCaseProps
*csp
, UChar32 c
) {
689 GET_PROPS(csp
, c
, props
);
690 type
=UCASE_GET_TYPE(props
);
691 if(type
!=UCASE_NONE
) {
695 (props
&(UCASE_EXCEPTION
|UCASE_CASE_IGNORABLE
))==UCASE_CASE_IGNORABLE
697 return -1; /* case-ignorable */
699 return 0; /* c is neither cased nor case-ignorable */
703 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
704 static U_INLINE
int32_t
705 getDotType(const UCaseProps
*csp
, UChar32 c
) {
707 GET_PROPS(csp
, c
, props
);
708 if(!PROPS_HAS_EXCEPTION(props
)) {
709 return props
&UCASE_DOT_MASK
;
711 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
712 return (*pe
>>UCASE_EXC_DOT_SHIFT
)&UCASE_DOT_MASK
;
716 U_CAPI UBool U_EXPORT2
717 ucase_isSoftDotted(const UCaseProps
*csp
, UChar32 c
) {
718 return (UBool
)(getDotType(csp
, c
)==UCASE_SOFT_DOTTED
);
721 U_CAPI UBool U_EXPORT2
722 ucase_isCaseSensitive(const UCaseProps
*csp
, UChar32 c
) {
724 GET_PROPS(csp
, c
, props
);
725 return (UBool
)((props
&UCASE_SENSITIVE
)!=0);
728 /* string casing ------------------------------------------------------------ */
731 * These internal functions form the core of string case mappings.
732 * They map single code points to result code points or strings and take
733 * all necessary conditions (context, locale ID, options) into account.
735 * They do not iterate over the source or write to the destination
736 * so that the same functions are useful for non-standard string storage,
737 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
738 * For the same reason, the "surrounding text" context is passed in as a
739 * UCaseContextIterator which does not make any assumptions about
740 * the underlying storage.
742 * This section contains helper functions that check for conditions
743 * in the input text surrounding the current code point
744 * according to SpecialCasing.txt.
746 * Each helper function gets the index
747 * - after the current code point if it looks at following text
748 * - before the current code point if it looks at preceding text
750 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
753 * C is preceded by a sequence consisting of
754 * a cased letter and a case-ignorable sequence,
755 * and C is not followed by a sequence consisting of
756 * an ignorable sequence and then a cased letter.
759 * C is followed by one or more characters of combining class 230 (ABOVE)
760 * in the combining character sequence.
763 * The last preceding character with combining class of zero before C
765 * and there is no intervening combining character class 230 (ABOVE).
768 * C is followed by combining dot above (U+0307).
769 * Any sequence of characters with a combining class that is neither 0 nor 230
770 * may intervene between the current character and the combining dot above.
772 * The erratum from 2002-10-31 adds the condition
775 * The last preceding base character was an uppercase I, and there is no
776 * intervening combining character class 230 (ABOVE).
778 * (See Jitterbug 2344 and the comments on After_I below.)
780 * Helper definitions in Unicode 3.2 UAX 21:
782 * D1. A character C is defined to be cased
783 * if it meets any of the following criteria:
785 * - The general category of C is Titlecase Letter (Lt)
786 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
787 * - Given D = NFD(C), then it is not the case that:
788 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
789 * (This third criterion does not add any characters to the list
790 * for Unicode 3.2. Ignored.)
792 * D2. A character C is defined to be case-ignorable
793 * if it meets either of the following criteria:
795 * - The general category of C is
796 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
797 * Letter Modifier (Lm), or Symbol Modifier (Sk)
798 * - C is one of the following characters
800 * U+00AD SOFT HYPHEN (SHY)
801 * U+2019 RIGHT SINGLE QUOTATION MARK
802 * (the preferred character for apostrophe)
804 * D3. A case-ignorable sequence is a sequence of
805 * zero or more case-ignorable characters.
/*
 * Case-insensitive tests for single letters of a locale ID.
 * Each macro compares its argument against the uppercase and the lowercase
 * character literal; the argument may be evaluated more than once, so it
 * must not have side effects.
 */
#define is_a(ch) ((ch)=='A' || (ch)=='a')
#define is_d(ch) ((ch)=='D' || (ch)=='d')
#define is_e(ch) ((ch)=='E' || (ch)=='e')
#define is_i(ch) ((ch)=='I' || (ch)=='i')
#define is_l(ch) ((ch)=='L' || (ch)=='l')
#define is_n(ch) ((ch)=='N' || (ch)=='n')
#define is_r(ch) ((ch)=='R' || (ch)=='r')
#define is_t(ch) ((ch)=='T' || (ch)=='t')
#define is_u(ch) ((ch)=='U' || (ch)=='u')
#define is_z(ch) ((ch)=='Z' || (ch)=='z')

/* Separator test: NUL, hyphen-minus, or underscore. */
#define is_sep(ch) ((ch)==0 || (ch)=='-' || (ch)=='_')
823 * Requires non-NULL locale ID but otherwise does the equivalent of
824 * checking for language codes as if uloc_getLanguage() were called:
825 * Accepts both 2- and 3-letter codes and accepts case variants.
828 ucase_getCaseLocale(const char *locale
, int32_t *locCache
) {
832 if(locCache
!=NULL
&& (result
=*locCache
)!=UCASE_LOC_UNKNOWN
) {
836 result
=UCASE_LOC_ROOT
;
839 * This function used to use uloc_getLanguage(), but the current code
840 * removes the dependency of this low-level code on uloc implementation code
841 * and is faster because not the whole locale ID has to be
842 * examined and copied/transformed.
844 * Because this code does not want to depend on uloc, the caller must
845 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
857 result
=UCASE_LOC_TURKISH
;
869 result
=UCASE_LOC_TURKISH
;
881 result
=UCASE_LOC_LITHUANIAN
;
893 result
=UCASE_LOC_DUTCH
;
904 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
906 isFollowedByCasedLetter(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
, int8_t dir
) {
914 for(/* dir!=0 sets direction */; (c
=iter(context
, dir
))>=0; dir
=0) {
915 GET_PROPS(csp
, c
, props
);
916 if(UCASE_GET_TYPE(props
)!=UCASE_NONE
) {
917 return TRUE
; /* followed by cased letter */
918 } else if(c
==0x307 || (props
&(UCASE_EXCEPTION
|UCASE_CASE_IGNORABLE
))==UCASE_CASE_IGNORABLE
) {
919 /* case-ignorable, continue with the loop */
921 return FALSE
; /* not ignorable */
925 return FALSE
; /* not followed by cased letter */
928 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
930 isPrecededBySoftDotted(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
939 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
940 dotType
=getDotType(csp
, c
);
941 if(dotType
==UCASE_SOFT_DOTTED
) {
942 return TRUE
; /* preceded by TYPE_i */
943 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
944 return FALSE
; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
948 return FALSE
; /* not preceded by TYPE_i */
952 * See Jitterbug 2344:
953 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
954 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
955 * we made those releases compatible with Unicode 3.2 which had not fixed
956 * a related bug in SpecialCasing.txt.
958 * From the Jitterbug 2344 text:
959 * ... this bug is listed as a Unicode erratum
960 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
962 * There are two errors in SpecialCasing.txt.
963 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
964 * 2. An incorrect context definition. Correct as follows:
965 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
966 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
968 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
969 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
970 * where the context After_I is defined as:
971 * The last preceding base character was an uppercase I, and there is no
972 * intervening combining character class 230 (ABOVE).
975 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
977 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
978 * # This matches the behavior of the canonically equivalent I-dot_above
980 * See also the description in this place in older versions of uchar.c (revision 1.100).
982 * Markus W. Scherer 2003-feb-15
985 /* Is preceded by base character 'I' with no intervening cc=230 ? */
987 isPrecededBy_I(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
996 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
998 return TRUE
; /* preceded by I */
1000 dotType
=getDotType(csp
, c
);
1001 if(dotType
!=UCASE_OTHER_ACCENT
) {
1002 return FALSE
; /* preceded by different base character (not I), or intervening cc==230 */
1006 return FALSE
; /* not preceded by I */
1009 /* Is followed by one or more cc==230 ? */
1011 isFollowedByMoreAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
1020 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
1021 dotType
=getDotType(csp
, c
);
1022 if(dotType
==UCASE_ABOVE
) {
1023 return TRUE
; /* at least one cc==230 following */
1024 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
1025 return FALSE
; /* next base character, no more cc==230 following */
1029 return FALSE
; /* no more cc==230 following */
1032 /* Is followed by a dot above (without cc==230 in between) ? */
1034 isFollowedByDotAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
1043 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
1047 dotType
=getDotType(csp
, c
);
1048 if(dotType
!=UCASE_OTHER_ACCENT
) {
1049 return FALSE
; /* next base character or cc==230 in between */
1053 return FALSE
; /* no dot above following */
1056 U_CAPI
int32_t U_EXPORT2
1057 ucase_toFullLower(const UCaseProps
*csp
, UChar32 c
,
1058 UCaseContextIterator
*iter
, void *context
,
1059 const UChar
**pString
,
1060 const char *locale
, int32_t *locCache
)
1066 GET_PROPS(csp
, c
, props
);
1067 if(!PROPS_HAS_EXCEPTION(props
)) {
1068 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1069 result
=c
+UCASE_GET_DELTA(props
);
1072 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1073 uint16_t excWord
=*pe
++;
1078 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1079 /* use hardcoded conditions and mappings */
1080 int32_t loc
=ucase_getCaseLocale(locale
, locCache
);
1083 * Test for conditional mappings first
1084 * (otherwise the unconditional default mappings are always taken),
1085 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1086 * then get the UnicodeData.txt mappings.
1088 if( loc
==UCASE_LOC_LITHUANIAN
&&
1089 /* base characters, find accents above */
1090 (((c
==0x49 || c
==0x4a || c
==0x12e) &&
1091 isFollowedByMoreAbove(csp
, iter
, context
)) ||
1092 /* precomposed with accent above, no need to find one */
1093 (c
==0xcc || c
==0xcd || c
==0x128))
1098 # Lithuanian retains the dot in a lowercase i when followed by accents.
1100 # Introduce an explicit dot above when lowercasing capital I's and J's
1101 # whenever there are more accents above.
1102 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1104 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1105 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1106 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1107 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1108 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1109 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1112 case 0x49: /* LATIN CAPITAL LETTER I */
1115 case 0x4a: /* LATIN CAPITAL LETTER J */
1118 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1119 *pString
=iOgonekDot
;
1121 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1124 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1127 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1131 return 0; /* will not occur */
1133 /* # Turkish and Azeri */
1134 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x130) {
1136 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1137 # The following rules handle those cases.
1139 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1140 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1143 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x307 && isPrecededBy_I(csp
, iter
, context
)) {
1145 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1146 # This matches the behavior of the canonically equivalent I-dot_above
1148 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1149 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1151 return 0; /* remove the dot (continue without output) */
1152 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x49 && !isFollowedByDotAbove(csp
, iter
, context
)) {
1154 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1156 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1157 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1160 } else if(c
==0x130) {
1162 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1164 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1168 } else if( c
==0x3a3 &&
1169 !isFollowedByCasedLetter(csp
, iter
, context
, 1) &&
1170 isFollowedByCasedLetter(csp
, iter
, context
, -1) /* -1=preceded */
1172 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1174 # Special case for final form of sigma
1176 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1178 return 0x3c2; /* greek small final sigma */
1180 /* no known conditional special case mapping, use a normal mapping */
1182 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1183 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1184 full
&=UCASE_FULL_LOWER
;
1186 /* set the output pointer to the lowercase mapping */
1189 /* return the string length */
1194 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1195 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe2
, result
);
1199 return (result
==c
) ? ~result
: result
;
/*
 * Shared implementation of the full uppercase and full titlecase mappings.
 * ucase_toFullUpper() calls this with upperNotTitle=TRUE,
 * ucase_toFullTitle() calls it with upperNotTitle=FALSE.
 *
 * csp:            case properties data read via GET_PROPS()/GET_EXCEPTIONS()
 * c:              code point to map
 * iter, context:  handed to isPrecededBySoftDotted() for the Lithuanian condition
 * pString:        presumably receives the result string of a full mapping
 *                 (see the "set the output pointer" comment below);
 *                 the assignment itself is not visible here - TODO confirm
 * locale, locCache: handed to ucase_getCaseLocale() to determine the case locale
 * upperNotTitle:  when FALSE, a titlecase slot is preferred over the uppercase slot
 *
 * Visible return values: 0 when a Lithuanian COMBINING DOT ABOVE is removed,
 * and at the end ~result when the mapping leaves c unchanged, else result.
 */
1204 toUpperOrTitle(const UCaseProps
*csp
, UChar32 c
,
1205 UCaseContextIterator
*iter
, void *context
,
1206 const UChar
**pString
,
1207 const char *locale
, int32_t *locCache
,
1208 UBool upperNotTitle
) {
1213 GET_PROPS(csp
, c
, props
);
/* no exception entry: a lowercase-type code point is mapped by adding the delta from props */
1214 if(!PROPS_HAS_EXCEPTION(props
)) {
1215 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
1216 result
=c
+UCASE_GET_DELTA(props
);
/* exception entry: its first 16-bit word carries the flag and slot bits tested below */
1219 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1220 uint16_t excWord
=*pe
++;
1221 int32_t full
, index
;
1225 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1226 /* use hardcoded conditions and mappings */
1227 int32_t loc
=ucase_getCaseLocale(locale
, locCache
);
/* NOTE(review): the result of this branch is not visible here; presumably U+0130 per the rule quoted below - confirm */
1229 if(loc
==UCASE_LOC_TURKISH
&& c
==0x69) {
1233 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1234 # The following rules handle those cases.
1236 # When uppercasing, i turns into a dotted capital I
1238 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1239 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1242 } else if(loc
==UCASE_LOC_LITHUANIAN
&& c
==0x307 && isPrecededBySoftDotted(csp
, iter
, context
)) {
1246 # Lithuanian retains the dot in a lowercase i when followed by accents.
1248 # Remove DOT ABOVE after "i" with upper or titlecase
1250 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1252 return 0; /* remove the dot (continue without output) */
1254 /* no known conditional special case mapping, use a normal mapping */
1256 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1257 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1259 /* start of full case mapping strings */
1262 /* skip the lowercase and case-folding result strings */
1263 pe
+=full
&UCASE_FULL_LOWER
;
1271 /* skip the uppercase result string */
1277 /* set the output pointer to the result string */
1280 /* return the string length */
/* simple mapping: titlecasing prefers the TITLE slot, otherwise the UPPER slot is used */
1285 if(!upperNotTitle
&& HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
1286 index
=UCASE_EXC_TITLE
;
1287 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
1288 /* here, titlecase is same as uppercase */
1289 index
=UCASE_EXC_UPPER
;
1293 GET_SLOT_VALUE(excWord
, index
, pe2
, result
);
/* an unchanged code point is returned as its bitwise complement */
1296 return (result
==c
) ? ~result
: result
;
1299 U_CAPI
int32_t U_EXPORT2
1300 ucase_toFullUpper(const UCaseProps
*csp
, UChar32 c
,
1301 UCaseContextIterator
*iter
, void *context
,
1302 const UChar
**pString
,
1303 const char *locale
, int32_t *locCache
) {
1304 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, TRUE
);
1307 U_CAPI
int32_t U_EXPORT2
1308 ucase_toFullTitle(const UCaseProps
*csp
, UChar32 c
,
1309 UCaseContextIterator
*iter
, void *context
,
1310 const UChar
**pString
,
1311 const char *locale
, int32_t *locCache
) {
1312 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, FALSE
);
1315 /* case folding ------------------------------------------------------------- */
1318 * Case folding is similar to lowercasing.
1319 * The result may be a simple mapping, i.e., a single code point, or
1320 * a full mapping, i.e., a string.
1321 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1322 * then only the lowercase mapping is stored.
1324 * Some special cases are hardcoded because their conditions cannot be
1325 * parsed and processed from CaseFolding.txt.
1327 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1329 # C: common case folding, common mappings shared by both simple and full mappings.
1330 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1331 # S: simple case folding, mappings to single characters where different from F.
1332 # T: special case for uppercase I and dotted uppercase I
1333 # - For non-Turkic languages, this mapping is normally not used.
1334 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1337 # A. To do a simple case folding, use the mappings with status C + S.
1338 # B. To do a full case folding, use the mappings with status C + F.
1340 # The mappings with status T can be used or omitted depending on the desired case-folding
1341 # behavior. (The default option is to exclude them.)
1343 * Unicode 3.2 has 'T' mappings as follows:
1345 0049; T; 0131; # LATIN CAPITAL LETTER I
1346 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1348 * while the default mappings for these code points are:
1350 0049; C; 0069; # LATIN CAPITAL LETTER I
1351 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1353 * U+0130 has no simple case folding (simple-case-folds to itself).
/*
 * Simple (single code point) case folding.
 *
 * csp:     case properties data read via GET_PROPS()/GET_EXCEPTIONS()
 * c:       code point to fold; the mapped value is written back into c
 * options: masked with _FOLD_CASE_OPTIONS_MASK; the comparison with
 *          U_FOLD_CASE_DEFAULT chooses between the default and the Turkic
 *          hardcoded mappings for code points flagged UCASE_EXC_CONDITIONAL_FOLD
 *
 * NOTE(review): the statements producing the hardcoded results and the final
 * return are not visible here - confirm against the complete function.
 */
1356 /* return the simple case folding mapping for c */
1357 U_CAPI UChar32 U_EXPORT2
1358 ucase_fold(const UCaseProps
*csp
, UChar32 c
, uint32_t options
) {
1360 GET_PROPS(csp
, c
, props
);
/* no exception entry: types at or above UCASE_UPPER are mapped by adding the delta from props */
1361 if(!PROPS_HAS_EXCEPTION(props
)) {
1362 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1363 c
+=UCASE_GET_DELTA(props
);
/* exception entry: its first 16-bit word carries the flag and slot bits tested below */
1366 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
1367 uint16_t excWord
=*pe
++;
1369 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1370 /* special case folding mappings, hardcoded */
1371 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1372 /* default mappings */
1374 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1376 } else if(c
==0x130) {
1377 /* no simple case folding for U+0130 */
1381 /* Turkic mappings */
1383 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1385 } else if(c
==0x130) {
1386 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
/* a dedicated FOLD slot takes precedence; otherwise the LOWER slot supplies the folding */
1391 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1392 index
=UCASE_EXC_FOLD
;
1393 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1394 index
=UCASE_EXC_LOWER
;
/* read the selected slot value into c */
1398 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
1404 * Issue for canonical caseless match (UAX #21):
1405 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1406 * canonical equivalence, unlike default-option casefolding.
1407 * For example, I-grave and I + grave fold to strings that are not canonically equivalent.
1409 * For more details, see the comment in unorm_compare() in unorm.cpp
1410 * and the intermediate prototype changes for Jitterbug 2021.
1411 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1413 * This did not get fixed because it appears that it is not possible to fix
1414 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1415 * together in a way that they still fold to common result strings.
/*
 * Full case folding of c.
 *
 * csp:     case properties data read via GET_PROPS()/GET_EXCEPTIONS()
 * c:       code point to fold
 * pString: presumably receives the result string of a full mapping
 *          (see the "set the output pointer" comment below);
 *          the assignment itself is not visible here - TODO confirm
 * options: tested below against U_FOLD_CASE_DEFAULT after masking with
 *          _FOLD_CASE_OPTIONS_MASK; its parameter declaration is not visible
 *          here - TODO confirm
 *
 * The final statement returns ~result when the folding leaves c unchanged,
 * otherwise result.
 */
1418 U_CAPI
int32_t U_EXPORT2
1419 ucase_toFullFolding(const UCaseProps
*csp
, UChar32 c
,
1420 const UChar
**pString
,
1427 GET_PROPS(csp
, c
, props
);
/* no exception entry: types at or above UCASE_UPPER are mapped by adding the delta from props */
1428 if(!PROPS_HAS_EXCEPTION(props
)) {
1429 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1430 result
=c
+UCASE_GET_DELTA(props
);
/* exception entry: its first 16-bit word carries the flag and slot bits tested below */
1433 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1434 uint16_t excWord
=*pe
++;
1435 int32_t full
, index
;
1439 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1440 /* use hardcoded conditions and mappings */
1441 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1442 /* default mappings */
1444 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1446 } else if(c
==0x130) {
1447 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1452 /* Turkic mappings */
1454 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1456 } else if(c
==0x130) {
1457 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1461 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1462 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1464 /* start of full case mapping strings */
1467 /* skip the lowercase result string */
1468 pe
+=full
&UCASE_FULL_LOWER
;
1472 /* set the output pointer to the result string */
1475 /* return the string length */
/* simple mapping: a dedicated FOLD slot takes precedence; otherwise the LOWER slot is used */
1480 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1481 index
=UCASE_EXC_FOLD
;
1482 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1483 index
=UCASE_EXC_LOWER
;
1487 GET_SLOT_VALUE(excWord
, index
, pe2
, result
);
/* an unchanged code point is returned as its bitwise complement */
1490 return (result
==c
) ? ~result
: result
;
1493 /* case mapping properties API ---------------------------------------------- */
1495 /* get the UCaseProps singleton, or else its dummy, once and for all */
1496 #if !UCASE_HARDCODE_DATA
1497 static const UCaseProps
*
1500 * This lazy initialization with double-checked locking (without mutex protection for
1501 * the initial check) is transiently unsafe under certain circumstances.
1502 * Check the readme and use u_init() if necessary.
/*
 * Body of the helper that the GET_CASE_PROPS() macro falls back to:
 * first ask ucase_getSingleton() for the properties object; if that reports
 * a failure, reset the error code and ask ucase_getDummy() instead.
 * NOTE(review): what happens when the dummy also fails, and the final
 * return, are not visible here - confirm against the complete function.
 */
1505 /* the initial check is performed by the GET_CASE_PROPS() macro */
1506 const UCaseProps
*csp
;
1507 UErrorCode errorCode
=U_ZERO_ERROR
;
/* first choice: the real singleton */
1509 csp
=ucase_getSingleton(&errorCode
);
1510 if(U_FAILURE(errorCode
)) {
/* clear the failure so that the fallback call starts with a clean error code */
1511 errorCode
=U_ZERO_ERROR
;
1512 csp
=ucase_getDummy(&errorCode
);
1513 if(U_FAILURE(errorCode
)) {
1523 * In ICU 3.0, most Unicode properties were loaded from uprops.icu.
1524 * ICU 3.2 adds ucase.icu for case mapping properties.
1525 * ICU 3.4 adds ubidi.icu for bidi/shaping properties and
1526 * removes case/bidi/shaping properties from uprops.icu.
1528 * Loading of uprops.icu was never mutex-protected and required u_init()
1529 * for thread safety.
1530 * In order to maintain performance for all such properties,
1531 * ucase.icu and ubidi.icu are loaded lazily, without mutexing.
1532 * u_init() will try to load them for thread safety,
1533 * but u_init() will not fail if they are missing.
1535 * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
1536 * and an error code for load failure.
1537 * Instead, here we try to load at most once.
1538 * If it works, we use the resulting singleton object.
1539 * If it fails, then we get a dummy object, which always works unless
1540 * we are seriously out of memory.
1541 * After the first try, we have a never-changing pointer to either the
1542 * real singleton or the dummy.
1544 * This method is used in Unicode properties APIs (uchar.h) that
1545 * do not have a service object and also do not have an error code parameter.
1546 * Other API implementations get the singleton themselves
1547 * (with mutexing), store it in the service object, and report errors.
/*
 * GET_CASE_PROPS() supplies the UCaseProps pointer used by the functions below
 * that have no service object of their own.
 */
1549 #if !UCASE_HARDCODE_DATA
/* use gCsp when it is non-NULL; otherwise call getCaseProps() */
1550 #define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
/* alternative definition: the address of ucase_props_singleton */
1552 #define GET_CASE_PROPS() &ucase_props_singleton
1555 /* public API (see uchar.h) */
1557 U_CAPI UBool U_EXPORT2
1558 u_isULowercase(UChar32 c
) {
1559 return (UBool
)(UCASE_LOWER
==ucase_getType(GET_CASE_PROPS(), c
));
1562 U_CAPI UBool U_EXPORT2
1563 u_isUUppercase(UChar32 c
) {
1564 return (UBool
)(UCASE_UPPER
==ucase_getType(GET_CASE_PROPS(), c
));
1567 /* Transforms the Unicode character to its lower case equivalent.*/
1568 U_CAPI UChar32 U_EXPORT2
1569 u_tolower(UChar32 c
) {
1570 return ucase_tolower(GET_CASE_PROPS(), c
);
1573 /* Transforms the Unicode character to its upper case equivalent.*/
1574 U_CAPI UChar32 U_EXPORT2
1575 u_toupper(UChar32 c
) {
1576 return ucase_toupper(GET_CASE_PROPS(), c
);
1579 /* Transforms the Unicode character to its title case equivalent.*/
1580 U_CAPI UChar32 U_EXPORT2
1581 u_totitle(UChar32 c
) {
1582 return ucase_totitle(GET_CASE_PROPS(), c
);
1585 /* return the simple case folding mapping for c */
1586 U_CAPI UChar32 U_EXPORT2
1587 u_foldCase(UChar32 c
, uint32_t options
) {
1588 return ucase_fold(GET_CASE_PROPS(), c
, options
);
1591 U_CFUNC
int32_t U_EXPORT2
1592 ucase_hasBinaryProperty(UChar32 c
, UProperty which
) {
1593 /* case mapping properties */
1594 const UCaseProps
*csp
=GET_CASE_PROPS();
1599 case UCHAR_LOWERCASE
:
1600 return (UBool
)(UCASE_LOWER
==ucase_getType(csp
, c
));
1601 case UCHAR_UPPERCASE
:
1602 return (UBool
)(UCASE_UPPER
==ucase_getType(csp
, c
));
1603 case UCHAR_SOFT_DOTTED
:
1604 return ucase_isSoftDotted(csp
, c
);
1605 case UCHAR_CASE_SENSITIVE
:
1606 return ucase_isCaseSensitive(csp
, c
);