2 *******************************************************************************
4 * Copyright (C) 2004-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2004aug30
14 * created by: Markus W. Scherer
16 * Low-level Unicode character/string case mapping code.
17 * Much code moved here (and modified) from uchar.c.
20 #include "unicode/utypes.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uset.h"
23 #include "unicode/udata.h" /* UDataInfo */
24 #include "unicode/utf16.h"
25 #include "ucmndata.h" /* DataHeader */
35 const int32_t *indexes
;
36 const uint16_t *exceptions
;
37 const uint16_t *unfold
;
40 uint8_t formatVersion
[4];
43 /* ucase_props_data.h is machine-generated by gencase --csource */
44 #define INCLUDED_FROM_UCASE_CPP
45 #include "ucase_props_data.h"
47 /* UCaseProps singleton ----------------------------------------------------- */
49 U_CAPI
const UCaseProps
* U_EXPORT2
50 ucase_getSingleton() {
51 return &ucase_props_singleton
;
54 /* set of property starts for UnicodeSet ------------------------------------ */
56 static UBool U_CALLCONV
57 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*end*/, uint32_t /*value*/) {
58 /* add the start code point to the USet */
59 const USetAdder
*sa
=(const USetAdder
*)context
;
60 sa
->add(sa
->set
, start
);
64 U_CFUNC
void U_EXPORT2
65 ucase_addPropertyStarts(const UCaseProps
*csp
, const USetAdder
*sa
, UErrorCode
*pErrorCode
) {
66 if(U_FAILURE(*pErrorCode
)) {
70 /* add the start code point of each same-value range of the trie */
71 utrie2_enum(&csp
->trie
, NULL
, _enumPropertyStartsRange
, sa
);
73 /* add code points with hardcoded properties, plus the ones following them */
75 /* (none right now, see comment below) */
78 * Omit code points with hardcoded specialcasing properties
79 * because we do not build property UnicodeSets for them right now.
83 /* data access primitives --------------------------------------------------- */
/*
 * Pointer to the exceptions data for a trie value:
 * csp->exceptions advanced by (props>>UCASE_EXC_SHIFT) uint16_t units.
 */
85 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
/*
 * Nonzero if the UCASE_EXCEPTION bit is set in props; callers in this file
 * then read the mapping data through GET_EXCEPTIONS() instead of using the
 * delta stored in props.
 */
87 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
89 /* number of set bits (population count) of each 8-bit value, indexed by that value */
90 static const uint8_t flagsOffset
[256]={
91 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
92 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
93 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
96 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
97 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
98 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
99 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
100 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
101 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
104 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
105 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
106 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
/* Nonzero if bit idx is set in flags, i.e. the optional-value slot idx is present. */
109 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * Number of set bits in flags below bit idx, looked up in flagsOffset[].
 * GET_SLOT_VALUE advances the exceptions pointer by this count
 * (doubled when UCASE_EXC_DOUBLE_SLOTS is set).
 * NOTE(review): assumes the masked value is below 256, the size of flagsOffset[] -- confirm.
 */
110 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
113 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
115 * @param excWord (in) initial exceptions word
116 * @param idx (in) desired slot index
117 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
118 * moved to the last uint16_t of the value, use +1 for beginning of next slot
119 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
121 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
122 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
123 (pExc16)+=SLOT_OFFSET(excWord, idx); \
126 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
128 (value)=((value)<<16)|*pExc16; \
131 /* simple case mappings ----------------------------------------------------- */
133 U_CAPI UChar32 U_EXPORT2
134 ucase_tolower(const UCaseProps
*csp
, UChar32 c
) {
135 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
136 if(!PROPS_HAS_EXCEPTION(props
)) {
137 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
138 c
+=UCASE_GET_DELTA(props
);
141 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
142 uint16_t excWord
=*pe
++;
143 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
144 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe
, c
);
150 U_CAPI UChar32 U_EXPORT2
151 ucase_toupper(const UCaseProps
*csp
, UChar32 c
) {
152 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
153 if(!PROPS_HAS_EXCEPTION(props
)) {
154 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
155 c
+=UCASE_GET_DELTA(props
);
158 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
159 uint16_t excWord
=*pe
++;
160 if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
161 GET_SLOT_VALUE(excWord
, UCASE_EXC_UPPER
, pe
, c
);
167 U_CAPI UChar32 U_EXPORT2
168 ucase_totitle(const UCaseProps
*csp
, UChar32 c
) {
169 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
170 if(!PROPS_HAS_EXCEPTION(props
)) {
171 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
172 c
+=UCASE_GET_DELTA(props
);
175 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
176 uint16_t excWord
=*pe
++;
178 if(HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
180 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
185 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
190 static const UChar iDot
[2] = { 0x69, 0x307 };
191 static const UChar jDot
[2] = { 0x6a, 0x307 };
192 static const UChar iOgonekDot
[3] = { 0x12f, 0x307 };
193 static const UChar iDotGrave
[3] = { 0x69, 0x307, 0x300 };
194 static const UChar iDotAcute
[3] = { 0x69, 0x307, 0x301 };
195 static const UChar iDotTilde
[3] = { 0x69, 0x307, 0x303 };
198 U_CFUNC
void U_EXPORT2
199 ucase_addCaseClosure(const UCaseProps
*csp
, UChar32 c
, const USetAdder
*sa
) {
203 * Hardcode the case closure of i and its relatives and ignore the
204 * data file data for these characters.
205 * The Turkic dotless i and dotted I with their case mapping conditions
206 * and case folding option make the related characters behave specially.
207 * This code matches their closure behavior to their case folding behavior.
212 /* regular i and I are in one equivalence class */
213 sa
->add(sa
->set
, 0x69);
216 sa
->add(sa
->set
, 0x49);
219 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
220 sa
->addString(sa
->set
, iDot
, 2);
223 /* dotless i is in a class by itself */
226 /* otherwise use the data file data */
230 props
=UTRIE2_GET16(&csp
->trie
, c
);
231 if(!PROPS_HAS_EXCEPTION(props
)) {
232 if(UCASE_GET_TYPE(props
)!=UCASE_NONE
) {
233 /* add the one simple case mapping, no matter what type it is */
234 int32_t delta
=UCASE_GET_DELTA(props
);
236 sa
->add(sa
->set
, c
+delta
);
241 * c has exceptions, so there may be multiple simple and/or
242 * full case mappings. Add them all.
244 const uint16_t *pe0
, *pe
=GET_EXCEPTIONS(csp
, props
);
245 const UChar
*closure
;
246 uint16_t excWord
=*pe
++;
247 int32_t idx
, closureLength
, fullLength
, length
;
251 /* add all simple case mappings */
252 for(idx
=UCASE_EXC_LOWER
; idx
<=UCASE_EXC_TITLE
; ++idx
) {
253 if(HAS_SLOT(excWord
, idx
)) {
255 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
260 /* get the closure string pointer & length */
261 if(HAS_SLOT(excWord
, UCASE_EXC_CLOSURE
)) {
263 GET_SLOT_VALUE(excWord
, UCASE_EXC_CLOSURE
, pe
, closureLength
);
264 closureLength
&=UCASE_CLOSURE_MAX_LENGTH
; /* higher bits are reserved */
265 closure
=(const UChar
*)pe
+1; /* behind this slot, unless there are full case mappings */
271 /* add the full case folding */
272 if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
274 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, fullLength
);
276 /* start of full case mapping strings */
279 fullLength
&=0xffff; /* bits 16 and higher are reserved */
281 /* skip the lowercase result string */
282 pe
+=fullLength
&UCASE_FULL_LOWER
;
285 /* add the full case folding string */
286 length
=fullLength
&0xf;
288 sa
->addString(sa
->set
, (const UChar
*)pe
, length
);
292 /* skip the uppercase and titlecase strings */
298 closure
=(const UChar
*)pe
; /* behind full case mappings */
301 /* add each code point in the closure string */
302 for(idx
=0; idx
<closureLength
;) {
303 U16_NEXT_UNSAFE(closure
, idx
, c
);
310 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
311 * The caller must ensure that length>0, max>0 and length<=max.
313 static inline int32_t
314 strcmpMax(const UChar
*s
, int32_t length
, const UChar
*t
, int32_t max
) {
317 max
-=length
; /* we require length<=max, so no need to decrement max in the loop */
322 return 1; /* reached the end of t but not of s */
326 return c1
; /* return difference result */
329 /* ends with length==0 */
331 if(max
==0 || *t
==0) {
332 return 0; /* equal to length of both strings */
334 return -max
; /* return length difference */
338 U_CFUNC UBool U_EXPORT2
339 ucase_addStringCaseClosure(const UCaseProps
*csp
, const UChar
*s
, int32_t length
, const USetAdder
*sa
) {
340 int32_t i
, start
, limit
, result
, unfoldRows
, unfoldRowWidth
, unfoldStringWidth
;
342 if(csp
->unfold
==NULL
|| s
==NULL
) {
343 return FALSE
; /* no reverse case folding data, or no string */
346 /* the string is too short to find any match */
348 * more precise would be:
349 * if(!u_strHasMoreChar32Than(s, length, 1))
350 * but this does not make much practical difference because
351 * a single supplementary code point would just not be found
356 const uint16_t *unfold
=csp
->unfold
;
357 unfoldRows
=unfold
[UCASE_UNFOLD_ROWS
];
358 unfoldRowWidth
=unfold
[UCASE_UNFOLD_ROW_WIDTH
];
359 unfoldStringWidth
=unfold
[UCASE_UNFOLD_STRING_WIDTH
];
360 unfold
+=unfoldRowWidth
;
362 if(length
>unfoldStringWidth
) {
363 /* the string is too long to find any match */
367 /* do a binary search for the string */
372 const UChar
*p
=reinterpret_cast<const UChar
*>(unfold
+(i
*unfoldRowWidth
));
373 result
=strcmpMax(s
, length
, p
, unfoldStringWidth
);
376 /* found the string: add each code point, and its case closure */
379 for(i
=unfoldStringWidth
; i
<unfoldRowWidth
&& p
[i
]!=0;) {
380 U16_NEXT_UNSAFE(p
, i
, c
);
382 ucase_addCaseClosure(csp
, c
, sa
);
385 } else if(result
<0) {
387 } else /* result>0 */ {
392 return FALSE
; /* string not found */
397 FullCaseFoldingIterator::FullCaseFoldingIterator()
398 : unfold(reinterpret_cast<const UChar
*>(ucase_props_singleton
.unfold
)),
399 unfoldRows(unfold
[UCASE_UNFOLD_ROWS
]),
400 unfoldRowWidth(unfold
[UCASE_UNFOLD_ROW_WIDTH
]),
401 unfoldStringWidth(unfold
[UCASE_UNFOLD_STRING_WIDTH
]),
403 rowCpIndex(unfoldStringWidth
) {
404 unfold
+=unfoldRowWidth
;
408 FullCaseFoldingIterator::next(UnicodeString
&full
) {
409 // Advance past the last-delivered code point.
410 const UChar
*p
=unfold
+(currentRow
*unfoldRowWidth
);
411 if(rowCpIndex
>=unfoldRowWidth
|| p
[rowCpIndex
]==0) {
414 rowCpIndex
=unfoldStringWidth
;
416 if(currentRow
>=unfoldRows
) { return U_SENTINEL
; }
417 // Set "full" to the NUL-terminated string in the first unfold column.
418 int32_t length
=unfoldStringWidth
;
419 while(length
>0 && p
[length
-1]==0) { --length
; }
420 full
.setTo(FALSE
, p
, length
);
421 // Return the code point.
423 U16_NEXT_UNSAFE(p
, rowCpIndex
, c
);
429 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
430 U_CAPI
int32_t U_EXPORT2
431 ucase_getType(const UCaseProps
*csp
, UChar32 c
) {
432 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
433 return UCASE_GET_TYPE(props
);
436 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
437 U_CAPI
int32_t U_EXPORT2
438 ucase_getTypeOrIgnorable(const UCaseProps
*csp
, UChar32 c
) {
439 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
440 return UCASE_GET_TYPE_AND_IGNORABLE(props
);
443 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
444 static inline int32_t
445 getDotType(const UCaseProps
*csp
, UChar32 c
) {
446 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
447 if(!PROPS_HAS_EXCEPTION(props
)) {
448 return props
&UCASE_DOT_MASK
;
450 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
451 return (*pe
>>UCASE_EXC_DOT_SHIFT
)&UCASE_DOT_MASK
;
455 U_CAPI UBool U_EXPORT2
456 ucase_isSoftDotted(const UCaseProps
*csp
, UChar32 c
) {
457 return (UBool
)(getDotType(csp
, c
)==UCASE_SOFT_DOTTED
);
460 U_CAPI UBool U_EXPORT2
461 ucase_isCaseSensitive(const UCaseProps
*csp
, UChar32 c
) {
462 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
463 return (UBool
)((props
&UCASE_SENSITIVE
)!=0);
466 /* string casing ------------------------------------------------------------ */
469 * These internal functions form the core of string case mappings.
470 * They map single code points to result code points or strings and take
471 * all necessary conditions (context, locale ID, options) into account.
473 * They do not iterate over the source or write to the destination
474 * so that the same functions are useful for non-standard string storage,
475 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
476 * For the same reason, the "surrounding text" context is passed in as a
477 * UCaseContextIterator which does not make any assumptions about
478 * the underlying storage.
480 * This section contains helper functions that check for conditions
481 * in the input text surrounding the current code point
482 * according to SpecialCasing.txt.
484 * Each helper function gets the index
485 * - after the current code point if it looks at following text
486 * - before the current code point if it looks at preceding text
488 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
491 * C is preceded by a sequence consisting of
492 * a cased letter and a case-ignorable sequence,
493 * and C is not followed by a sequence consisting of
494 * an ignorable sequence and then a cased letter.
497 * C is followed by one or more characters of combining class 230 (ABOVE)
498 * in the combining character sequence.
501 * The last preceding character with combining class of zero before C
503 * and there is no intervening combining character class 230 (ABOVE).
506 * C is followed by combining dot above (U+0307).
507 * Any sequence of characters with a combining class that is neither 0 nor 230
508 * may intervene between the current character and the combining dot above.
510 * The erratum from 2002-10-31 adds the condition
513 * The last preceding base character was an uppercase I, and there is no
514 * intervening combining character class 230 (ABOVE).
516 * (See Jitterbug 2344 and the comments on After_I below.)
518 * Helper definitions in Unicode 3.2 UAX 21:
520 * D1. A character C is defined to be cased
521 * if it meets any of the following criteria:
523 * - The general category of C is Titlecase Letter (Lt)
524 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
525 * - Given D = NFD(C), then it is not the case that:
526 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
527 * (This third criterion does not add any characters to the list
528 * for Unicode 3.2. Ignored.)
530 * D2. A character C is defined to be case-ignorable
531 * if it meets either of the following criteria:
533 * - The general category of C is
534 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
535 * Letter Modifier (Lm), or Symbol Modifier (Sk)
536 * - C is one of the following characters
538 * U+00AD SOFT HYPHEN (SHY)
539 * U+2019 RIGHT SINGLE QUOTATION MARK
540 * (the preferred character for apostrophe)
542 * D3. A case-ignorable sequence is a sequence of
543 * zero or more case-ignorable characters.
/*
 * ASCII letter tests; each helper accepts the lowercase or the uppercase
 * form of one letter. Presumably used by ucase_getCaseLocale() to match
 * language codes regardless of case -- confirm against the callers.
 *
 * These are inline functions rather than function-like macros so that the
 * argument is evaluated exactly once: an argument with side effects, such
 * as is_t(*locale++), would otherwise be evaluated two or three times.
 * Each returns nonzero on a match and 0 otherwise.
 */
static inline int is_a(int c) { return c=='a' || c=='A'; }
static inline int is_d(int c) { return c=='d' || c=='D'; }
static inline int is_e(int c) { return c=='e' || c=='E'; }
static inline int is_i(int c) { return c=='i' || c=='I'; }
static inline int is_l(int c) { return c=='l' || c=='L'; }
static inline int is_n(int c) { return c=='n' || c=='N'; }
static inline int is_r(int c) { return c=='r' || c=='R'; }
static inline int is_t(int c) { return c=='t' || c=='T'; }
static inline int is_u(int c) { return c=='u' || c=='U'; }
static inline int is_z(int c) { return c=='z' || c=='Z'; }

/* Separator test: underscore, hyphen, or the terminating NUL. */
static inline int is_sep(int c) { return c=='_' || c=='-' || c==0; }
561 * Requires non-NULL locale ID but otherwise does the equivalent of
562 * checking for language codes as if uloc_getLanguage() were called:
563 * Accepts both 2- and 3-letter codes and accepts case variants.
566 ucase_getCaseLocale(const char *locale
, int32_t *locCache
) {
570 if(locCache
!=NULL
&& (result
=*locCache
)!=UCASE_LOC_UNKNOWN
) {
574 result
=UCASE_LOC_ROOT
;
577 * This function used to use uloc_getLanguage(), but the current code
578 * removes the dependency of this low-level code on uloc implementation code
579 * and is faster because not the whole locale ID has to be
580 * examined and copied/transformed.
582 * Because this code does not want to depend on uloc, the caller must
583 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
595 result
=UCASE_LOC_TURKISH
;
607 result
=UCASE_LOC_TURKISH
;
619 result
=UCASE_LOC_LITHUANIAN
;
631 result
=UCASE_LOC_DUTCH
;
644 * {case-ignorable}* cased
646 * (dir determines looking forward/backward)
647 * If a character is case-ignorable, it is skipped regardless of whether
648 * it is also cased or not.
651 isFollowedByCasedLetter(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
, int8_t dir
) {
658 for(/* dir!=0 sets direction */; (c
=iter(context
, dir
))>=0; dir
=0) {
659 int32_t type
=ucase_getTypeOrIgnorable(csp
, c
);
661 /* case-ignorable, continue with the loop */
662 } else if(type
!=UCASE_NONE
) {
663 return TRUE
; /* followed by cased letter */
665 return FALSE
; /* uncased and not case-ignorable */
669 return FALSE
; /* not followed by cased letter */
672 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
674 isPrecededBySoftDotted(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
683 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
684 dotType
=getDotType(csp
, c
);
685 if(dotType
==UCASE_SOFT_DOTTED
) {
686 return TRUE
; /* preceded by TYPE_i */
687 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
688 return FALSE
; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
692 return FALSE
; /* not preceded by TYPE_i */
696 * See Jitterbug 2344:
697 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
698 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
699 * we made those releases compatible with Unicode 3.2 which had not fixed
700 * a related bug in SpecialCasing.txt.
702 * From the Jitterbug 2344 text:
703 * ... this bug is listed as a Unicode erratum
704 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
706 * There are two errors in SpecialCasing.txt.
707 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
708 * 2. An incorrect context definition. Correct as follows:
709 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
710 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
712 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
713 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
714 * where the context After_I is defined as:
715 * The last preceding base character was an uppercase I, and there is no
716 * intervening combining character class 230 (ABOVE).
719 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
721 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
722 * # This matches the behavior of the canonically equivalent I-dot_above
724 * See also the description in this place in older versions of uchar.c (revision 1.100).
726 * Markus W. Scherer 2003-feb-15
729 /* Is preceded by base character 'I' with no intervening cc=230 ? */
731 isPrecededBy_I(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
740 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
742 return TRUE
; /* preceded by I */
744 dotType
=getDotType(csp
, c
);
745 if(dotType
!=UCASE_OTHER_ACCENT
) {
746 return FALSE
; /* preceded by different base character (not I), or intervening cc==230 */
750 return FALSE
; /* not preceded by I */
753 /* Is followed by one or more cc==230 ? */
755 isFollowedByMoreAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
764 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
765 dotType
=getDotType(csp
, c
);
766 if(dotType
==UCASE_ABOVE
) {
767 return TRUE
; /* at least one cc==230 following */
768 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
769 return FALSE
; /* next base character, no more cc==230 following */
773 return FALSE
; /* no more cc==230 following */
776 /* Is followed by a dot above (without cc==230 in between) ? */
778 isFollowedByDotAbove(const UCaseProps
*csp
, UCaseContextIterator
*iter
, void *context
) {
787 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
791 dotType
=getDotType(csp
, c
);
792 if(dotType
!=UCASE_OTHER_ACCENT
) {
793 return FALSE
; /* next base character or cc==230 in between */
797 return FALSE
; /* no dot above following */
800 U_CAPI
int32_t U_EXPORT2
801 ucase_toFullLower(const UCaseProps
*csp
, UChar32 c
,
802 UCaseContextIterator
*iter
, void *context
,
803 const UChar
**pString
,
804 const char *locale
, int32_t *locCache
)
807 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
808 if(!PROPS_HAS_EXCEPTION(props
)) {
809 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
810 result
=c
+UCASE_GET_DELTA(props
);
813 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
814 uint16_t excWord
=*pe
++;
819 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
820 /* use hardcoded conditions and mappings */
821 int32_t loc
=ucase_getCaseLocale(locale
, locCache
);
824 * Test for conditional mappings first
825 * (otherwise the unconditional default mappings are always taken),
826 * then test for characters that have unconditional mappings in SpecialCasing.txt,
827 * then get the UnicodeData.txt mappings.
829 if( loc
==UCASE_LOC_LITHUANIAN
&&
830 /* base characters, find accents above */
831 (((c
==0x49 || c
==0x4a || c
==0x12e) &&
832 isFollowedByMoreAbove(csp
, iter
, context
)) ||
833 /* precomposed with accent above, no need to find one */
834 (c
==0xcc || c
==0xcd || c
==0x128))
839 # Lithuanian retains the dot in a lowercase i when followed by accents.
841 # Introduce an explicit dot above when lowercasing capital I's and J's
842 # whenever there are more accents above.
843 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
845 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
846 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
847 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
848 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
849 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
850 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
853 case 0x49: /* LATIN CAPITAL LETTER I */
856 case 0x4a: /* LATIN CAPITAL LETTER J */
859 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
862 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
865 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
868 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
872 return 0; /* will not occur */
874 /* # Turkish and Azeri */
875 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x130) {
877 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
878 # The following rules handle those cases.
880 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
881 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
884 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x307 && isPrecededBy_I(csp
, iter
, context
)) {
886 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
887 # This matches the behavior of the canonically equivalent I-dot_above
889 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
890 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
892 return 0; /* remove the dot (continue without output) */
893 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x49 && !isFollowedByDotAbove(csp
, iter
, context
)) {
895 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
897 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
898 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
901 } else if(c
==0x130) {
903 # Preserve canonical equivalence for I with dot. Turkic is handled below.
905 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
909 } else if( c
==0x3a3 &&
910 !isFollowedByCasedLetter(csp
, iter
, context
, 1) &&
911 isFollowedByCasedLetter(csp
, iter
, context
, -1) /* -1=preceded */
913 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
915 # Special case for final form of sigma
917 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
919 return 0x3c2; /* greek small final sigma */
921 /* no known conditional special case mapping, use a normal mapping */
923 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
924 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
925 full
&=UCASE_FULL_LOWER
;
927 /* set the output pointer to the lowercase mapping */
928 *pString
=reinterpret_cast<const UChar
*>(pe
+1);
930 /* return the string length */
935 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
936 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe2
, result
);
940 return (result
==c
) ? ~result
: result
;
945 toUpperOrTitle(const UCaseProps
*csp
, UChar32 c
,
946 UCaseContextIterator
*iter
, void *context
,
947 const UChar
**pString
,
948 const char *locale
, int32_t *locCache
,
949 UBool upperNotTitle
) {
951 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
952 if(!PROPS_HAS_EXCEPTION(props
)) {
953 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
954 result
=c
+UCASE_GET_DELTA(props
);
957 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
958 uint16_t excWord
=*pe
++;
963 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
964 /* use hardcoded conditions and mappings */
965 int32_t loc
=ucase_getCaseLocale(locale
, locCache
);
967 if(loc
==UCASE_LOC_TURKISH
&& c
==0x69) {
971 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
972 # The following rules handle those cases.
974 # When uppercasing, i turns into a dotted capital I
976 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
977 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
980 } else if(loc
==UCASE_LOC_LITHUANIAN
&& c
==0x307 && isPrecededBySoftDotted(csp
, iter
, context
)) {
984 # Lithuanian retains the dot in a lowercase i when followed by accents.
986 # Remove DOT ABOVE after "i" with upper or titlecase
988 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
990 return 0; /* remove the dot (continue without output) */
992 /* no known conditional special case mapping, use a normal mapping */
994 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
995 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
997 /* start of full case mapping strings */
1000 /* skip the lowercase and case-folding result strings */
1001 pe
+=full
&UCASE_FULL_LOWER
;
1009 /* skip the uppercase result string */
1015 /* set the output pointer to the result string */
1016 *pString
=reinterpret_cast<const UChar
*>(pe
);
1018 /* return the string length */
1023 if(!upperNotTitle
&& HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
1024 idx
=UCASE_EXC_TITLE
;
1025 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
1026 /* here, titlecase is same as uppercase */
1027 idx
=UCASE_EXC_UPPER
;
1031 GET_SLOT_VALUE(excWord
, idx
, pe2
, result
);
1034 return (result
==c
) ? ~result
: result
;
1037 U_CAPI
int32_t U_EXPORT2
1038 ucase_toFullUpper(const UCaseProps
*csp
, UChar32 c
,
1039 UCaseContextIterator
*iter
, void *context
,
1040 const UChar
**pString
,
1041 const char *locale
, int32_t *locCache
) {
1042 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, TRUE
);
1045 U_CAPI
int32_t U_EXPORT2
1046 ucase_toFullTitle(const UCaseProps
*csp
, UChar32 c
,
1047 UCaseContextIterator
*iter
, void *context
,
1048 const UChar
**pString
,
1049 const char *locale
, int32_t *locCache
) {
1050 return toUpperOrTitle(csp
, c
, iter
, context
, pString
, locale
, locCache
, FALSE
);
1053 /* case folding ------------------------------------------------------------- */
1056 * Case folding is similar to lowercasing.
1057 * The result may be a simple mapping, i.e., a single code point, or
1058 * a full mapping, i.e., a string.
1059 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1060 * then only the lowercase mapping is stored.
1062 * Some special cases are hardcoded because their conditions cannot be
1063 * parsed and processed from CaseFolding.txt.
1065 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1067 # C: common case folding, common mappings shared by both simple and full mappings.
1068 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1069 # S: simple case folding, mappings to single characters where different from F.
1070 # T: special case for uppercase I and dotted uppercase I
1071 # - For non-Turkic languages, this mapping is normally not used.
1072 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1075 # A. To do a simple case folding, use the mappings with status C + S.
1076 # B. To do a full case folding, use the mappings with status C + F.
1078 # The mappings with status T can be used or omitted depending on the desired case-folding
1079 # behavior. (The default option is to exclude them.)
1081 * Unicode 3.2 has 'T' mappings as follows:
1083 0049; T; 0131; # LATIN CAPITAL LETTER I
1084 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1086 * while the default mappings for these code points are:
1088 0049; C; 0069; # LATIN CAPITAL LETTER I
1089 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1091 * U+0130 has no simple case folding (simple-case-folds to itself).
1094 /* return the simple case folding mapping for c */
1095 U_CAPI UChar32 U_EXPORT2
1096 ucase_fold(const UCaseProps
*csp
, UChar32 c
, uint32_t options
) {
1097 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
1098 if(!PROPS_HAS_EXCEPTION(props
)) {
1099 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1100 c
+=UCASE_GET_DELTA(props
);
1103 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
);
1104 uint16_t excWord
=*pe
++;
1106 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1107 /* special case folding mappings, hardcoded */
1108 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1109 /* default mappings */
1111 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1113 } else if(c
==0x130) {
1114 /* no simple case folding for U+0130 */
1118 /* Turkic mappings */
1120 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1122 } else if(c
==0x130) {
1123 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1128 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1130 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1131 idx
=UCASE_EXC_LOWER
;
1135 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
/*
 * Issue for canonical caseless match (UAX #21):
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
 * canonical equivalence, unlike default-option casefolding.
 * For example, I-grave and I + grave fold to strings that are not canonically
 * equivalent.
 * For more details, see the comment in unorm_compare() in unorm.cpp
 * and the intermediate prototype changes for Jitterbug 2021.
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
 *
 * This did not get fixed because it appears that it is not possible to fix
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
 * together in a way that they still fold to common result strings.
 */
1155 U_CAPI
int32_t U_EXPORT2
1156 ucase_toFullFolding(const UCaseProps
*csp
, UChar32 c
,
1157 const UChar
**pString
,
1161 uint16_t props
=UTRIE2_GET16(&csp
->trie
, c
);
1162 if(!PROPS_HAS_EXCEPTION(props
)) {
1163 if(UCASE_GET_TYPE(props
)>=UCASE_UPPER
) {
1164 result
=c
+UCASE_GET_DELTA(props
);
1167 const uint16_t *pe
=GET_EXCEPTIONS(csp
, props
), *pe2
;
1168 uint16_t excWord
=*pe
++;
1173 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1174 /* use hardcoded conditions and mappings */
1175 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1176 /* default mappings */
1178 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1180 } else if(c
==0x130) {
1181 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1186 /* Turkic mappings */
1188 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1190 } else if(c
==0x130) {
1191 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1195 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1196 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1198 /* start of full case mapping strings */
1201 /* skip the lowercase result string */
1202 pe
+=full
&UCASE_FULL_LOWER
;
1206 /* set the output pointer to the result string */
1207 *pString
=reinterpret_cast<const UChar
*>(pe
);
1209 /* return the string length */
1214 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1216 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1217 idx
=UCASE_EXC_LOWER
;
1221 GET_SLOT_VALUE(excWord
, idx
, pe2
, result
);
1224 return (result
==c
) ? ~result
: result
;
1227 /* case mapping properties API ---------------------------------------------- */
/*
 * Expands to a pointer to the built-in case properties singleton.
 * The replacement list is parenthesized so that the expansion stays a single
 * operand wherever it is used (for example when followed by a postfix
 * operator).
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
1231 /* public API (see uchar.h) */
1233 U_CAPI UBool U_EXPORT2
1234 u_isULowercase(UChar32 c
) {
1235 return (UBool
)(UCASE_LOWER
==ucase_getType(GET_CASE_PROPS(), c
));
1238 U_CAPI UBool U_EXPORT2
1239 u_isUUppercase(UChar32 c
) {
1240 return (UBool
)(UCASE_UPPER
==ucase_getType(GET_CASE_PROPS(), c
));
1243 /* Transforms the Unicode character to its lower case equivalent.*/
1244 U_CAPI UChar32 U_EXPORT2
1245 u_tolower(UChar32 c
) {
1246 return ucase_tolower(GET_CASE_PROPS(), c
);
1249 /* Transforms the Unicode character to its upper case equivalent.*/
1250 U_CAPI UChar32 U_EXPORT2
1251 u_toupper(UChar32 c
) {
1252 return ucase_toupper(GET_CASE_PROPS(), c
);
1255 /* Transforms the Unicode character to its title case equivalent.*/
1256 U_CAPI UChar32 U_EXPORT2
1257 u_totitle(UChar32 c
) {
1258 return ucase_totitle(GET_CASE_PROPS(), c
);
1261 /* return the simple case folding mapping for c */
1262 U_CAPI UChar32 U_EXPORT2
1263 u_foldCase(UChar32 c
, uint32_t options
) {
1264 return ucase_fold(GET_CASE_PROPS(), c
, options
);
1267 U_CFUNC
int32_t U_EXPORT2
1268 ucase_hasBinaryProperty(UChar32 c
, UProperty which
) {
1269 /* case mapping properties */
1270 const UChar
*resultString
;
1272 const UCaseProps
*csp
=GET_CASE_PROPS();
1277 case UCHAR_LOWERCASE
:
1278 return (UBool
)(UCASE_LOWER
==ucase_getType(csp
, c
));
1279 case UCHAR_UPPERCASE
:
1280 return (UBool
)(UCASE_UPPER
==ucase_getType(csp
, c
));
1281 case UCHAR_SOFT_DOTTED
:
1282 return ucase_isSoftDotted(csp
, c
);
1283 case UCHAR_CASE_SENSITIVE
:
1284 return ucase_isCaseSensitive(csp
, c
);
1286 return (UBool
)(UCASE_NONE
!=ucase_getType(csp
, c
));
1287 case UCHAR_CASE_IGNORABLE
:
1288 return (UBool
)(ucase_getTypeOrIgnorable(csp
, c
)>>2);
1290 * Note: The following Changes_When_Xyz are defined as testing whether
1291 * the NFD form of the input changes when Xyz-case-mapped.
1292 * However, this simpler implementation of these properties,
1293 * ignoring NFD, passes the tests.
1294 * The implementation needs to be changed if the tests start failing.
1295 * When that happens, optimizations should be used to work with the
1296 * per-single-code point ucase_toFullXyz() functions unless
1297 * the NFD form has more than one code point,
1298 * and the property starts set needs to be the union of the
1299 * start sets for normalization and case mappings.
1301 case UCHAR_CHANGES_WHEN_LOWERCASED
:
1302 locCache
=UCASE_LOC_ROOT
;
1303 return (UBool
)(ucase_toFullLower(csp
, c
, NULL
, NULL
, &resultString
, "", &locCache
)>=0);
1304 case UCHAR_CHANGES_WHEN_UPPERCASED
:
1305 locCache
=UCASE_LOC_ROOT
;
1306 return (UBool
)(ucase_toFullUpper(csp
, c
, NULL
, NULL
, &resultString
, "", &locCache
)>=0);
1307 case UCHAR_CHANGES_WHEN_TITLECASED
:
1308 locCache
=UCASE_LOC_ROOT
;
1309 return (UBool
)(ucase_toFullTitle(csp
, c
, NULL
, NULL
, &resultString
, "", &locCache
)>=0);
1310 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1311 case UCHAR_CHANGES_WHEN_CASEMAPPED
:
1312 locCache
=UCASE_LOC_ROOT
;
1314 ucase_toFullLower(csp
, c
, NULL
, NULL
, &resultString
, "", &locCache
)>=0 ||
1315 ucase_toFullUpper(csp
, c
, NULL
, NULL
, &resultString
, "", &locCache
)>=0 ||
1316 ucase_toFullTitle(csp
, c
, NULL
, NULL
, &resultString
, "", &locCache
)>=0);