2 *******************************************************************************
4 * Copyright (C) 2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug30
14 * created by: Markus W. Scherer
16 * Low-level Unicode character/string case mapping code.
17 * Much code moved here (and modified) from uchar.c.
20 #include "unicode/utypes.h"
21 #include "unicode/uset.h"
22 #include "unicode/udata.h" /* UDataInfo */
23 #include "ucmndata.h" /* DataHeader */
34 const int32_t *indexes
;
35 const uint16_t *exceptions
;
38 uint8_t formatVersion
[4];
41 /* data loading etc. -------------------------------------------------------- */
43 static UBool U_CALLCONV
44 isAcceptable(void *context
,
45 const char *type
, const char *name
,
46 const UDataInfo
*pInfo
) {
49 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
50 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
51 pInfo
->dataFormat
[0]==UCASE_FMT_0
&& /* dataFormat="cAsE" */
52 pInfo
->dataFormat
[1]==UCASE_FMT_1
&&
53 pInfo
->dataFormat
[2]==UCASE_FMT_2
&&
54 pInfo
->dataFormat
[3]==UCASE_FMT_3
&&
55 pInfo
->formatVersion
[0]==1 &&
56 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
57 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
59 UCaseProps
*csp
=(UCaseProps
*)context
;
60 uprv_memcpy(csp
->formatVersion
, pInfo
->formatVersion
, 4);
68 ucase_openData(UCaseProps
*cspProto
,
69 const uint8_t *bin
, int32_t length
, UErrorCode
*pErrorCode
) {
71 int32_t size
, trieSize
;
73 cspProto
->indexes
=(const int32_t *)bin
;
74 if( cspProto
->indexes
[UCASE_IX_INDEX_TOP
]<16 ||
75 (length
>=0 && length
<cspProto
->indexes
[UCASE_IX_LENGTH
])
77 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
81 /* get the trie address, after indexes[] */
82 size
=cspProto
->indexes
[UCASE_IX_INDEX_TOP
]*4;
84 if(length
>=0 && (length
-=size
)<16) {
85 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
89 /* unserialize the trie */
90 trieSize
=cspProto
->indexes
[UCASE_IX_TRIE_SIZE
];
91 trieSize
=utrie_unserialize(&cspProto
->trie
, bin
, length
>=0 ? length
: trieSize
, pErrorCode
);
92 if(U_FAILURE(*pErrorCode
)) {
96 /* get exceptions[] */
98 if(length
>=0 && (length
-=trieSize
)<2*cspProto
->indexes
[UCASE_IX_EXC_LENGTH
]) {
99 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
102 cspProto
->exceptions
=(const uint16_t *)bin
;
104 /* allocate, copy, and return the new UCaseProps */
105 csp
=(UCaseProps
*)uprv_malloc(sizeof(UCaseProps
));
107 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
110 uprv_memcpy(csp
, cspProto
, sizeof(UCaseProps
));
115 U_CAPI UCaseProps
* U_EXPORT2
116 ucase_open(UErrorCode
*pErrorCode
) {
117 UCaseProps cspProto
={ NULL
}, *csp
;
119 cspProto
.mem
=udata_openChoice(NULL
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, isAcceptable
, &cspProto
, pErrorCode
);
120 if(U_FAILURE(*pErrorCode
)) {
126 udata_getMemory(cspProto
.mem
),
127 udata_getLength(cspProto
.mem
),
129 if(U_FAILURE(*pErrorCode
)) {
130 udata_close(cspProto
.mem
);
137 U_CAPI UCaseProps
* U_EXPORT2
138 ucase_openBinary(const uint8_t *bin
, int32_t length
, UErrorCode
*pErrorCode
) {
139 UCaseProps cspProto
={ NULL
};
140 const DataHeader
*hdr
;
142 if(U_FAILURE(*pErrorCode
)) {
146 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
150 /* check the header */
151 if(length
>=0 && length
<20) {
152 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
155 hdr
=(const DataHeader
*)bin
;
157 !(hdr
->dataHeader
.magic1
==0xda && hdr
->dataHeader
.magic2
==0x27 &&
158 hdr
->info
.isBigEndian
==U_IS_BIG_ENDIAN
&&
159 isAcceptable(&cspProto
, UCASE_DATA_TYPE
, UCASE_DATA_NAME
, &hdr
->info
))
161 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
165 bin
+=hdr
->dataHeader
.headerSize
;
167 length
-=hdr
->dataHeader
.headerSize
;
169 return ucase_openData(&cspProto
, bin
, length
, pErrorCode
);
172 U_CAPI
void U_EXPORT2
173 ucase_close(UCaseProps
*csp
) {
175 udata_close(csp
->mem
);
180 /* UCaseProps singleton ----------------------------------------------------- */
182 static UCaseProps
*gCsp
=NULL
;
183 static UErrorCode gErrorCode
=U_ZERO_ERROR
;
184 static int8_t gHaveData
=0;
186 static UBool U_CALLCONV
ucase_cleanup(void) {
189 gErrorCode
=U_ZERO_ERROR
;
194 U_CAPI UCaseProps
* U_EXPORT2
195 ucase_getSingleton(UErrorCode
*pErrorCode
) {
198 if(U_FAILURE(*pErrorCode
)) {
202 UMTX_CHECK(NULL
, gHaveData
, haveData
);
205 /* data was loaded */
207 } else if(haveData
<0) {
208 /* data loading failed */
209 *pErrorCode
=gErrorCode
;
211 } else /* haveData==0 */ {
213 UCaseProps
*csp
=ucase_open(pErrorCode
);
214 if(U_FAILURE(*pErrorCode
)) {
216 gErrorCode
=*pErrorCode
;
220 /* set the static variables */
226 ucln_common_registerCleanup(UCLN_COMMON_UCASE
, ucase_cleanup
);
235 /* Unicode case mapping data swapping --------------------------------------- */
237 U_CAPI
int32_t U_EXPORT2
238 ucase_swap(const UDataSwapper
*ds
,
239 const void *inData
, int32_t length
, void *outData
,
240 UErrorCode
*pErrorCode
) {
241 const UDataInfo
*pInfo
;
244 const uint8_t *inBytes
;
247 const int32_t *inIndexes
;
250 int32_t i
, offset
, count
, size
;
252 /* udata_swapDataHeader checks the arguments */
253 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
254 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
258 /* check data format and format version */
259 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
261 pInfo
->dataFormat
[0]==UCASE_FMT_0
&& /* dataFormat="cAsE" */
262 pInfo
->dataFormat
[1]==UCASE_FMT_1
&&
263 pInfo
->dataFormat
[2]==UCASE_FMT_2
&&
264 pInfo
->dataFormat
[3]==UCASE_FMT_3
&&
265 pInfo
->formatVersion
[0]==1 &&
266 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
267 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
269 udata_printError(ds
, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n",
270 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
271 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
272 pInfo
->formatVersion
[0]);
273 *pErrorCode
=U_UNSUPPORTED_ERROR
;
277 inBytes
=(const uint8_t *)inData
+headerSize
;
278 outBytes
=(uint8_t *)outData
+headerSize
;
280 inIndexes
=(const int32_t *)inBytes
;
285 udata_printError(ds
, "ucase_swap(): too few bytes (%d after header) for case mapping data\n",
287 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
292 /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */
293 for(i
=0; i
<16; ++i
) {
294 indexes
[i
]=udata_readInt32(ds
, inIndexes
[i
]);
297 /* get the total length of the data */
298 size
=indexes
[UCASE_IX_LENGTH
];
302 udata_printError(ds
, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n",
304 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
308 /* copy the data for inaccessible bytes */
309 if(inBytes
!=outBytes
) {
310 uprv_memcpy(outBytes
, inBytes
, size
);
315 /* swap the int32_t indexes[] */
316 count
=indexes
[UCASE_IX_INDEX_TOP
]*4;
317 ds
->swapArray32(ds
, inBytes
, count
, outBytes
, pErrorCode
);
321 count
=indexes
[UCASE_IX_TRIE_SIZE
];
322 utrie_swap(ds
, inBytes
+offset
, count
, outBytes
+offset
, pErrorCode
);
325 /* swap the uint16_t exceptions[] */
326 count
=indexes
[UCASE_IX_EXC_LENGTH
]*2;
327 ds
->swapArray16(ds
, inBytes
+offset
, count
, outBytes
+offset
, pErrorCode
);
330 U_ASSERT(offset
==size
);
333 return headerSize
+size
;
336 /* set of property starts for UnicodeSet ------------------------------------ */
338 static UBool U_CALLCONV
339 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
340 /* add the start code point to the USet */
341 USetAdder
*sa
=(USetAdder
*)context
;
342 sa
->add(sa
->set
, start
);
346 U_CAPI
void U_EXPORT2
347 ucase_addPropertyStarts(const UCaseProps
*csp
, USetAdder
*sa
, UErrorCode
*pErrorCode
) {
348 if(U_FAILURE(*pErrorCode
)) {
352 /* add the start code point of each same-value range of the trie */
353 utrie_enum(&csp
->trie
, NULL
, _enumPropertyStartsRange
, sa
);
355 /* add code points with hardcoded properties, plus the ones following them */
357 /* (none right now, see comment below) */
360 * Omit code points with hardcoded specialcasing properties
361 * because we do not build property UnicodeSets for them right now.
365 /* data access primitives --------------------------------------------------- */
/*
 * Fetch the 16-bit properties word for code point c from (csp)->trie
 * into result.
 * UTRIE_GET16() itself validates c.
 *
 * Wrapped in do { } while(0) and written without a trailing semicolon so
 * that "GET_PROPS(csp, c, props);" is exactly one statement and can be
 * used as the body of an if/else.
 */
#define GET_PROPS(csp, c, result) \
    do { \
        UTRIE_GET16(&(csp)->trie, c, result); \
    } while(0)

/* case type bits of a properties word */
#define GET_CASE_TYPE(props) ((props)&UCASE_TYPE_MASK)
/* signed delta stored in the upper bits of a properties word; the int16_t cast makes the shift sign-propagating */
#define GET_SIGNED_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
/* pointer into (csp)->exceptions[] at the index stored in a properties word */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* non-zero if the properties word refers to an exceptions[] entry */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
/*
 * Number of set bits (population count) in an 8-bit integer value.
 * Used to turn "which optional slots are present below this one"
 * into a slot offset, see SLOT_OFFSET().
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/* non-zero if bit number index is set in flags, i.e. the slot is present */
#define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
/* number of present slots below slot number index = count of set flag bits below bit index */
#define SLOT_OFFSET(flags, index) (flagsOffset[(flags)&((1<<(index))-1)])
401 * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
403 * @param excWord (in) initial exceptions word
404 * @param index (in) desired slot index
405 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
406 * moved to the last uint16_t of the value, use +1 for beginning of next slot
407 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
409 #define GET_SLOT_VALUE(excWord, index, pExc16, value) \
410 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
411 (pExc16)+=SLOT_OFFSET(excWord, index); \
414 (pExc16)+=2*SLOT_OFFSET(excWord, index); \
416 (value)=((value)<<16)|*pExc16; \
419 /* simple case mappings ----------------------------------------------------- */
421 U_CAPI UChar32 U_EXPORT2
422 ucase_tolower(const UCaseProps
*csp
, UChar32 c
) {
424 GET_PROPS(csp
, c
, props
);
425 if(!PROPS_HAS_EXCEPTION(props
)) {
426 if(GET_CASE_TYPE(props
)>=UCASE_UPPER
) {
427 c
+=GET_SIGNED_DELTA(props
);
430 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
431 uint16_t excWord
=*pe
++;
432 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
433 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe
, c
);
439 U_CAPI UChar32 U_EXPORT2
440 ucase_toupper(const UCaseProps
*csp
, UChar32 c
) {
442 GET_PROPS(csp
, c
, props
);
443 if(!PROPS_HAS_EXCEPTION(props
)) {
444 if(GET_CASE_TYPE(props
)==UCASE_LOWER
) {
445 c
+=GET_SIGNED_DELTA(props
);
448 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
449 uint16_t excWord
=*pe
++;
450 if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
451 GET_SLOT_VALUE(excWord
, UCASE_EXC_UPPER
, pe
, c
);
457 U_CAPI UChar32 U_EXPORT2
458 ucase_totitle(const UCaseProps
*csp
, UChar32 c
) {
460 GET_PROPS(csp
, c
, props
);
461 if(!PROPS_HAS_EXCEPTION(props
)) {
462 if(GET_CASE_TYPE(props
)==UCASE_LOWER
) {
463 c
+=GET_SIGNED_DELTA(props
);
466 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
467 uint16_t excWord
=*pe
++;
469 if(HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
470 index
=UCASE_EXC_TITLE
;
471 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
472 index
=UCASE_EXC_UPPER
;
476 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
481 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
482 U_CAPI
int32_t U_EXPORT2
483 ucase_getType(const UCaseProps
*csp
, UChar32 c
) {
485 GET_PROPS(csp
, c
, props
);
486 return GET_CASE_TYPE(props
);
489 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
490 U_CAPI
int32_t U_EXPORT2
491 ucase_getTypeOrIgnorable(const UCaseProps
*csp
, UChar32 c
) {
494 GET_PROPS(csp
, c
, props
);
495 type
=GET_CASE_TYPE(props
);
496 if(type
!=UCASE_NONE
) {
500 (props
&(UCASE_EXCEPTION
|UCASE_CASE_IGNORABLE
))==UCASE_CASE_IGNORABLE
502 return -1; /* case-ignorable */
504 return 0; /* c is neither cased nor case-ignorable */
508 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
509 static U_INLINE
int32_t
510 getDotType(const UCaseProps
*csp
, UChar32 c
) {
512 GET_PROPS(csp
, c
, props
);
513 if(!PROPS_HAS_EXCEPTION(props
)) {
514 return props
&UCASE_DOT_MASK
;
516 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
517 return (*pe
>>UCASE_EXC_DOT_SHIFT
)&UCASE_DOT_MASK
;
521 U_CAPI UBool U_EXPORT2
522 ucase_isSoftDotted(const UCaseProps
*csp
, UChar32 c
) {
523 return (UBool
)(getDotType(csp
, c
)==UCASE_SOFT_DOTTED
);
526 U_CAPI UBool U_EXPORT2
527 ucase_isCaseSensitive(const UCaseProps
*csp
, UChar32 c
) {
529 GET_PROPS(csp
, c
, props
);
530 return (UBool
)((props
&UCASE_SENSITIVE
)!=0);
533 /* public API (see uchar.h) ------------------------------------------------- */
535 U_CAPI UBool U_EXPORT2
536 u_isULowercase(UChar32 c
) {
537 UErrorCode errorCode
=U_ZERO_ERROR
;
538 UCaseProps
*csp
=ucase_getSingleton(&errorCode
);
539 return (UBool
)(csp
!=NULL
&& UCASE_LOWER
==ucase_getType(csp
, c
));
542 U_CAPI UBool U_EXPORT2
543 u_isUUppercase(UChar32 c
) {
544 UErrorCode errorCode
=U_ZERO_ERROR
;
545 UCaseProps
*csp
=ucase_getSingleton(&errorCode
);
546 return (UBool
)(csp
!=NULL
&& UCASE_UPPER
==ucase_getType(csp
, c
));
549 /* Transforms the Unicode character to its lower case equivalent.*/
550 U_CAPI UChar32 U_EXPORT2
551 u_tolower(UChar32 c
) {
552 UErrorCode errorCode
=U_ZERO_ERROR
;
553 UCaseProps
*csp
=ucase_getSingleton(&errorCode
);
555 return ucase_tolower(csp
, c
);
561 /* Transforms the Unicode character to its upper case equivalent.*/
562 U_CAPI UChar32 U_EXPORT2
563 u_toupper(UChar32 c
) {
564 UErrorCode errorCode
=U_ZERO_ERROR
;
565 UCaseProps
*csp
=ucase_getSingleton(&errorCode
);
567 return ucase_toupper(csp
, c
);
573 /* Transforms the Unicode character to its title case equivalent.*/
574 U_CAPI UChar32 U_EXPORT2
575 u_totitle(UChar32 c
) {
576 UErrorCode errorCode
=U_ZERO_ERROR
;
577 UCaseProps
*csp
=ucase_getSingleton(&errorCode
);
579 return ucase_totitle(csp
, c
);
585 /* return the simple case folding mapping for c */
586 U_CAPI UChar32 U_EXPORT2
587 u_foldCase(UChar32 c
, uint32_t options
) {
588 UErrorCode errorCode
=U_ZERO_ERROR
;
589 UCaseProps
*csp
=ucase_getSingleton(&errorCode
);
591 return ucase_fold(csp
, c
, options
);
597 /* string casing ------------------------------------------------------------ */
600 * These internal functions form the core of string case mappings.
601 * They map single code points to result code points or strings and take
602 * all necessary conditions (context, locale ID, options) into account.
604 * They do not iterate over the source or write to the destination
605 * so that the same functions are useful for non-standard string storage,
606 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
607 * For the same reason, the "surrounding text" context is passed in as a
608 * UCaseContextIterator which does not make any assumptions about
609 * the underlying storage.
611 * This section contains helper functions that check for conditions
612 * in the input text surrounding the current code point
613 * according to SpecialCasing.txt.
615 * Each helper function gets the index
616 * - after the current code point if it looks at following text
617 * - before the current code point if it looks at preceding text
619 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
622 * C is preceded by a sequence consisting of
623 * a cased letter and a case-ignorable sequence,
624 * and C is not followed by a sequence consisting of
625 * an ignorable sequence and then a cased letter.
628 * C is followed by one or more characters of combining class 230 (ABOVE)
629 * in the combining character sequence.
632 * The last preceding character with combining class of zero before C
634 * and there is no intervening combining character class 230 (ABOVE).
637 * C is followed by combining dot above (U+0307).
638 * Any sequence of characters with a combining class that is neither 0 nor 230
639 * may intervene between the current character and the combining dot above.
641 * The erratum from 2002-10-31 adds the condition
644 * The last preceding base character was an uppercase I, and there is no
645 * intervening combining character class 230 (ABOVE).
647 * (See Jitterbug 2344 and the comments on After_I below.)
649 * Helper definitions in Unicode 3.2 UAX 21:
651 * D1. A character C is defined to be cased
652 * if it meets any of the following criteria:
654 * - The general category of C is Titlecase Letter (Lt)
655 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
656 * - Given D = NFD(C), then it is not the case that:
657 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
658 * (This third criterion does not add any characters to the list
659 * for Unicode 3.2. Ignored.)
661 * D2. A character C is defined to be case-ignorable
662 * if it meets either of the following criteria:
664 * - The general category of C is
665 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
666 * Letter Modifier (Lm), or Symbol Modifier (Sk)
667 * - C is one of the following characters
669 * U+00AD SOFT HYPHEN (SHY)
670 * U+2019 RIGHT SINGLE QUOTATION MARK
671 * (the preferred character for apostrophe)
673 * D3. A case-ignorable sequence is a sequence of
674 * zero or more case-ignorable characters.
/*
 * Case-insensitive tests for single letters, used when examining a locale ID.
 * Each macro names its argument more than once: pass only expressions
 * without side effects (not *p++).
 */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* true for '_', '-' and the terminating NUL */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
697 * Requires non-NULL locale ID but otherwise does the equivalent of
698 * checking for language codes as if uloc_getLanguage() were called:
699 * Accepts both 2- and 3-letter codes and accepts case variants.
702 getCaseLocale(const char *locale
, int32_t *locCache
) {
706 if(locCache
!=NULL
&& (result
=*locCache
)!=LOC_UNKNOWN
) {
713 * This function used to use uloc_getLanguage(), but the current code
714 * removes the dependency of this low-level code on uloc implementation code
715 * and is faster because the whole locale ID does not have to be
716 * examined and copied/transformed.
718 * Because this code does not want to depend on uloc, the caller must
719 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
755 result
=LOC_LITHUANIAN
;
766 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
768 isFollowedByCasedLetter(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
, int8_t dir
) {
776 for(/* dir!=0 sets direction */; (c
=iter(context
, dir
))>=0; dir
=0) {
777 GET_PROPS(csp
, c
, props
);
778 if(GET_CASE_TYPE(props
)!=UCASE_NONE
) {
779 return TRUE
; /* followed by cased letter */
780 } else if(c
==0x307 || (props
&(UCASE_EXCEPTION
|UCASE_CASE_IGNORABLE
))==UCASE_CASE_IGNORABLE
) {
781 /* case-ignorable, continue with the loop */
783 return FALSE
; /* not ignorable */
787 return FALSE
; /* not followed by cased letter */
790 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
792 isPrecededBySoftDotted(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
801 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
802 dotType
=getDotType(csp
, c
);
803 if(dotType
==UCASE_SOFT_DOTTED
) {
804 return TRUE
; /* preceded by TYPE_i */
805 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
806 return FALSE
; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
810 return FALSE
; /* not preceded by TYPE_i */
814 * See Jitterbug 2344:
815 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
816 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
817 * we made those releases compatible with Unicode 3.2 which had not fixed
818 * a related bug in SpecialCasing.txt.
820 * From the Jitterbug 2344 text:
821 * ... this bug is listed as a Unicode erratum
822 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
824 * There are two errors in SpecialCasing.txt.
825 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
826 * 2. An incorrect context definition. Correct as follows:
827 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
828 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
830 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
831 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
832 * where the context After_I is defined as:
833 * The last preceding base character was an uppercase I, and there is no
834 * intervening combining character class 230 (ABOVE).
837 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
839 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
840 * # This matches the behavior of the canonically equivalent I-dot_above
842 * See also the description in this place in older versions of uchar.c (revision 1.100).
844 * Markus W. Scherer 2003-feb-15
847 /* Is preceded by base character 'I' with no intervening cc=230 ? */
849 isPrecededBy_I(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
858 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
860 return TRUE
; /* preceded by I */
862 dotType
=getDotType(csp
, c
);
863 if(dotType
!=UCASE_OTHER_ACCENT
) {
864 return FALSE
; /* preceded by different base character (not I), or intervening cc==230 */
868 return FALSE
; /* not preceded by I */
871 /* Is followed by one or more cc==230 ? */
873 isFollowedByMoreAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
882 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
883 dotType
=getDotType(csp
, c
);
884 if(dotType
==UCASE_ABOVE
) {
885 return TRUE
; /* at least one cc==230 following */
886 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
887 return FALSE
; /* next base character, no more cc==230 following */
891 return FALSE
; /* no more cc==230 following */
894 /* Is followed by a dot above (without cc==230 in between) ? */
896 isFollowedByDotAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
905 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
909 dotType
=getDotType(csp
, c
);
910 if(dotType
!=UCASE_OTHER_ACCENT
) {
911 return FALSE
; /* next base character or cc==230 in between */
915 return FALSE
; /* no dot above following */
918 U_CAPI
int32_t U_EXPORT2
919 ucase_toFullLower(const UCaseProps
*csp
, UChar32 c
,
920 UCaseContextIterator
*iter
, void *context
,
921 const UChar
**pString
,
922 const char *locale
, int32_t *locCache
) {
924 iDot
[2]= { 0x69, 0x307 },
925 jDot
[2]= { 0x6a, 0x307 },
926 iOgonekDot
[3]= { 0x12f, 0x307 },
927 iDotGrave
[3]= { 0x69, 0x307, 0x300 },
928 iDotAcute
[3]= { 0x69, 0x307, 0x301 },
929 iDotTilde
[3]= { 0x69, 0x307, 0x303 };
935 GET_PROPS(csp
, c
, props
);
936 if(!PROPS_HAS_EXCEPTION(props
)) {
937 if(GET_CASE_TYPE(props
)>=UCASE_UPPER
) {
938 result
=c
+GET_SIGNED_DELTA(props
);
941 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
942 uint16_t excWord
=*pe
++;
947 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
948 /* use hardcoded conditions and mappings */
949 int32_t loc
=getCaseLocale(locale
, locCache
);
952 * Test for conditional mappings first
953 * (otherwise the unconditional default mappings are always taken),
954 * then test for characters that have unconditional mappings in SpecialCasing.txt,
955 * then get the UnicodeData.txt mappings.
957 if( loc
==LOC_LITHUANIAN
&&
958 /* base characters, find accents above */
959 (((c
==0x49 || c
==0x4a || c
==0x12e) &&
960 isFollowedByMoreAbove(csp
, iter
, context
)) ||
961 /* precomposed with accent above, no need to find one */
962 (c
==0xcc || c
==0xcd || c
==0x128))
967 # Lithuanian retains the dot in a lowercase i when followed by accents.
969 # Introduce an explicit dot above when lowercasing capital I's and J's
970 # whenever there are more accents above.
971 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
973 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
974 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
975 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
976 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
977 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
978 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
981 case 0x49: /* LATIN CAPITAL LETTER I */
984 case 0x4a: /* LATIN CAPITAL LETTER J */
987 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
990 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
993 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
996 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1000 return 0; /* will not occur */
1002 /* # Turkish and Azeri */
1003 } else if(loc
==LOC_TURKISH
&& c
==0x130) {
1005 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1006 # The following rules handle those cases.
1008 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1009 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1012 } else if(loc
==LOC_TURKISH
&& c
==0x307 && isPrecededBy_I(csp
, iter
, context
)) {
1014 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1015 # This matches the behavior of the canonically equivalent I-dot_above
1017 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1018 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1020 return 0; /* remove the dot (continue without output) */
1021 } else if(loc
==LOC_TURKISH
&& c
==0x49 && !isFollowedByDotAbove(csp
, iter
, context
)) {
1023 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1025 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1026 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1029 } else if(c
==0x130) {
1031 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1033 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1037 } else if( c
==0x3a3 &&
1038 !isFollowedByCasedLetter(csp
, iter
, context
, 1) &&
1039 isFollowedByCasedLetter(csp
, iter
, context
, -1) /* -1=preceded */
1041 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1043 # Special case for final form of sigma
1045 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1047 return 0x3c2; /* greek small final sigma */
1049 /* no known conditional special case mapping, use a normal mapping */
1051 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1052 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1053 full
&=UCASE_FULL_LOWER
;
1055 /* set the output pointer to the lowercase mapping */
1058 /* return the string length */
1063 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1064 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe2
, result
);
1068 return (result
==c
) ? ~result
: result
;
1073 toUpperOrTitle(const UCaseProps
*csp
, UChar32 c
,
1074 UCaseContextIterator
*iter
, void *context
,
1075 const UChar
**pString
,
1076 const char *locale
, int32_t *locCache
,
1077 UBool upperNotTitle
) {
1082 GET_PROPS(csp
, c
, props
);
1083 if(!PROPS_HAS_EXCEPTION(props
)) {
1084 if(GET_CASE_TYPE(props
)==UCASE_LOWER
) {
1085 result
=c
+GET_SIGNED_DELTA(props
);
1088 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1089 uint16_t excWord
=*pe
++;
1090 int32_t full
, index
;
1094 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1095 /* use hardcoded conditions and mappings */
1096 int32_t loc
=getCaseLocale(locale
, locCache
);
1098 if(loc
==LOC_TURKISH
&& c
==0x69) {
1102 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1103 # The following rules handle those cases.
1105 # When uppercasing, i turns into a dotted capital I
1107 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1108 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1111 } else if(loc
==LOC_LITHUANIAN
&& c
==0x307 && isPrecededBySoftDotted(csp
, iter
, context
)) {
1115 # Lithuanian retains the dot in a lowercase i when followed by accents.
1117 # Remove DOT ABOVE after "i" with upper or titlecase
1119 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1121 return 0; /* remove the dot (continue without output) */
1123 /* no known conditional special case mapping, use a normal mapping */
1125 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1126 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1128 /* start of full case mapping strings */
1131 /* skip the lowercase and case-folding result strings */
1132 pe
+=full
&UCASE_FULL_LOWER
;
1140 /* skip the uppercase result string */
1146 /* set the output pointer to the result string */
1149 /* return the string length */
1154 if(!upperNotTitle
&& HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
1155 index
=UCASE_EXC_TITLE
;
1156 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
1157 /* here, titlecase is same as uppercase */
1158 index
=UCASE_EXC_UPPER
;
1162 GET_SLOT_VALUE(excWord
, index
, pe2
, result
);
1165 return (result
==c
) ? ~result
: result
;
1168 U_CAPI
int32_t U_EXPORT2
1169 ucase_toFullUpper(const UCaseProps
*csp
, UChar32 c
,
1170 UCaseContextIterator
*iter
, void *context
,
1171 const UChar
**pString
,
1172 const char *locale
, int32_t *locCache
) {
1173 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, TRUE
);
1176 U_CAPI
int32_t U_EXPORT2
1177 ucase_toFullTitle(const UCaseProps
*csp
, UChar32 c
,
1178 UCaseContextIterator
*iter
, void *context
,
1179 const UChar
**pString
,
1180 const char *locale
, int32_t *locCache
) {
1181 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, FALSE
);
1184 /* case folding ------------------------------------------------------------- */
1187 * Case folding is similar to lowercasing.
1188 * The result may be a simple mapping, i.e., a single code point, or
1189 * a full mapping, i.e., a string.
1190 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1191 * then only the lowercase mapping is stored.
1193 * Some special cases are hardcoded because their conditions cannot be
1194 * parsed and processed from CaseFolding.txt.
1196 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1198 # C: common case folding, common mappings shared by both simple and full mappings.
1199 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1200 # S: simple case folding, mappings to single characters where different from F.
1201 # T: special case for uppercase I and dotted uppercase I
1202 # - For non-Turkic languages, this mapping is normally not used.
1203 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1206 # A. To do a simple case folding, use the mappings with status C + S.
1207 # B. To do a full case folding, use the mappings with status C + F.
1209 # The mappings with status T can be used or omitted depending on the desired case-folding
1210 # behavior. (The default option is to exclude them.)
1212 * Unicode 3.2 has 'T' mappings as follows:
1214 0049; T; 0131; # LATIN CAPITAL LETTER I
1215 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1217 * while the default mappings for these code points are:
1219 0049; C; 0069; # LATIN CAPITAL LETTER I
1220 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1222 * U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt).
1224 * In case this code is used with CaseFolding.txt from an older version of Unicode
1225 * where CaseFolding.txt contains mappings with a status of 'I' that
1226 * have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
1227 * we must also hardcode the Unicode 3.2 mappings for the code points
1228 * with 'I' mappings.
1229 * Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
1230 * Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
1233 /* return the simple case folding mapping for c */
/*
 * Simple (single code point) case folding of c.
 *
 * csp:     case properties data; GET_PROPS and GET_EXCEPTIONS read from it
 * c:       the code point to fold; it is updated in place with the mapping
 * options: masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects
 *          the default mappings for the conditional-fold code points,
 *          otherwise the Turkic mappings apply
 *
 * Returns the simple case folding of c.
 */
1234 U_CAPI UChar32 U_EXPORT2
1235 ucase_fold(UCaseProps
*csp
, UChar32 c
, uint32_t options
) {
/* look up the properties word for c */
1237 GET_PROPS(csp
, c
, props
);
/* no exception data: the mapping is c plus a signed delta kept in props */
1238 if(!PROPS_HAS_EXCEPTION(props
)) {
/* only case types at or above UCASE_UPPER get the delta applied */
1239 if(GET_CASE_TYPE(props
)>=UCASE_UPPER
) {
1240 c
+=GET_SIGNED_DELTA(props
);
/* exception data: read the leading exception word and step pe past it */
1243 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
1244 uint16_t excWord
=*pe
++;
/*
 * Code points flagged UCASE_EXC_CONDITIONAL_FOLD are handled here by code;
 * the comment on each branch names the CaseFolding.txt mapping it implements.
 */
1246 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1247 /* special case folding mappings, hardcoded */
1248 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1249 /* default mappings */
1251 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1253 } else if(c
==0x130) {
1254 /* no simple default mapping for U+0130, use UnicodeData.txt */
1258 /* Turkic mappings */
1260 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1262 } else if(c
==0x130) {
1263 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
/*
 * General exception case: prefer a dedicated folding slot and fall back to
 * the lowercase slot when there is no folding slot.
 */
1268 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1269 index
=UCASE_EXC_FOLD
;
1270 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1271 index
=UCASE_EXC_LOWER
;
/* fetch the value of the chosen slot into c */
1275 GET_SLOT_VALUE(excWord
, index
, pe
, c
);
1281 * Issue for canonical caseless match (UAX #21):
1282 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1283 * canonical equivalence, unlike default-option casefolding.
1284 * For example, I-grave and I + grave fold to strings that are not canonically equivalent.
1286 * For more details, see the comment in unorm_compare() in unorm.cpp
1287 * and the intermediate prototype changes for Jitterbug 2021.
1288 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1290 * This did not get fixed because it appears that it is not possible to fix
1291 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1292 * together in a way that they still fold to common result strings.
/*
 * Full case folding of c, which may yield a string rather than a single
 * code point.
 *
 * csp:     case properties data read through GET_PROPS/GET_EXCEPTIONS
 * c:       the code point to fold
 * pString: output; set to the result string when the folding is a string
 *
 * The final return yields ~c when the folding found equals c, and the folded
 * code point otherwise. The string paths return the string length instead
 * (see the "return the string length" step in the body).
 */
1295 U_CAPI
int32_t U_EXPORT2
1296 ucase_toFullFolding(const UCaseProps
*csp
, UChar32 c
,
1297 const UChar
**pString
,
/* U+0069 U+0307: the default full folding of U+0130 (see the 'F' mapping below) */
1300 iDot
[2]= { 0x69, 0x307 };
/* look up the properties word for c */
1306 GET_PROPS(csp
, c
, props
);
/* no exception data: the folding is c plus a signed delta kept in props */
1307 if(!PROPS_HAS_EXCEPTION(props
)) {
1308 if(GET_CASE_TYPE(props
)>=UCASE_UPPER
) {
1309 result
=c
+GET_SIGNED_DELTA(props
);
/* exception data: read the leading exception word and step pe past it */
1312 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1313 uint16_t excWord
=*pe
++;
1314 int32_t full
, index
;
/*
 * options is masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects
 * the default mappings, otherwise the Turkic mappings apply.
 */
1318 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1319 /* use hardcoded conditions and mappings */
1320 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1321 /* default mappings */
1323 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1325 } else if(c
==0x130) {
1326 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1331 /* Turkic mappings */
1333 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1335 } else if(c
==0x130) {
1336 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
/* not conditional: check for a full-mappings slot and load its value into full */
1340 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1341 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1343 /* start of full case mapping strings */
1346 /* skip the lowercase result string */
/* the bits of full selected by UCASE_FULL_LOWER give the amount to skip */
1347 pe
+=full
&UCASE_FULL_LOWER
;
1351 /* set the output pointer to the result string */
1354 /* return the string length */
/*
 * No string result: fall back to a single code point, preferring the folding
 * slot over the lowercase slot.
 */
1359 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1360 index
=UCASE_EXC_FOLD
;
1361 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1362 index
=UCASE_EXC_LOWER
;
/*
 * The slot is read through pe2, not pe, which may have been advanced above.
 * NOTE(review): presumably pe2 holds the position right after excWord -- confirm.
 */
1366 GET_SLOT_VALUE(excWord
, index
, pe2
, result
);
/* an unchanged code point is reported as its bitwise complement */
1369 return (result
==c
) ? ~result
: result
;