1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2004-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ucase.cpp
12 * tab size: 8 (not used)
15 * created on: 2004aug30
16 * created by: Markus W. Scherer
18 * Low-level Unicode character/string case mapping code.
19 * Much code moved here (and modified) from uchar.c.
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/udata.h" /* UDataInfo */
26 #include "unicode/utf16.h"
27 #include "ucmndata.h" /* DataHeader */
37 const int32_t *indexes
;
38 const uint16_t *exceptions
;
39 const uint16_t *unfold
;
42 uint8_t formatVersion
[4];
45 /* ucase_props_data.h is machine-generated by gencase --csource */
46 #define INCLUDED_FROM_UCASE_CPP
47 #include "ucase_props_data.h"
49 /* set of property starts for UnicodeSet ------------------------------------ */
51 static UBool U_CALLCONV
52 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*end*/, uint32_t /*value*/) {
53 /* add the start code point to the USet */
54 const USetAdder
*sa
=(const USetAdder
*)context
;
55 sa
->add(sa
->set
, start
);
59 U_CFUNC
void U_EXPORT2
60 ucase_addPropertyStarts(const USetAdder
*sa
, UErrorCode
*pErrorCode
) {
61 if(U_FAILURE(*pErrorCode
)) {
65 /* add the start code point of each same-value range of the trie */
66 utrie2_enum(&ucase_props_singleton
.trie
, NULL
, _enumPropertyStartsRange
, sa
);
68 /* add code points with hardcoded properties, plus the ones following them */
70 /* (none right now, see comment below) */
73 * Omit code points with hardcoded specialcasing properties
74 * because we do not build property UnicodeSets for them right now.
78 /* data access primitives --------------------------------------------------- */
80 U_CFUNC
const UTrie2
* U_EXPORT2
82 return &ucase_props_singleton
.trie
;
85 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
87 /* flagsOffset[b] is the number of 1-bits set in the 8-bit value b */
88 static const uint8_t flagsOffset
[256]={
89 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
90 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
91 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
93 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
97 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
98 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
99 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
100 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
101 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
104 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
107 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
108 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
111 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
113 * @param excWord (in) initial exceptions word
114 * @param idx (in) desired slot index
115 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
116 * moved to the last uint16_t of the value, use +1 for beginning of next slot
117 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
119 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
120 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
121 (pExc16)+=SLOT_OFFSET(excWord, idx); \
124 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
126 (value)=((value)<<16)|*pExc16; \
128 } UPRV_BLOCK_MACRO_END
130 /* simple case mappings ----------------------------------------------------- */
132 U_CAPI UChar32 U_EXPORT2
133 ucase_tolower(UChar32 c
) {
134 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
135 if(!UCASE_HAS_EXCEPTION(props
)) {
136 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
137 c
+=UCASE_GET_DELTA(props
);
140 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
141 uint16_t excWord
=*pe
++;
142 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
144 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
145 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
147 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
148 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe
, c
);
154 U_CAPI UChar32 U_EXPORT2
155 ucase_toupper(UChar32 c
) {
156 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
157 if(!UCASE_HAS_EXCEPTION(props
)) {
158 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
159 c
+=UCASE_GET_DELTA(props
);
162 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
163 uint16_t excWord
=*pe
++;
164 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
166 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
167 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
169 if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
170 GET_SLOT_VALUE(excWord
, UCASE_EXC_UPPER
, pe
, c
);
176 U_CAPI UChar32 U_EXPORT2
177 ucase_totitle(UChar32 c
) {
178 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
179 if(!UCASE_HAS_EXCEPTION(props
)) {
180 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
181 c
+=UCASE_GET_DELTA(props
);
184 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
185 uint16_t excWord
=*pe
++;
186 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
188 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
189 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
192 if(HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
194 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
199 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
204 static const UChar iDot
[2] = { 0x69, 0x307 };
205 static const UChar jDot
[2] = { 0x6a, 0x307 };
206 static const UChar iOgonekDot
[3] = { 0x12f, 0x307 };
207 static const UChar iDotGrave
[3] = { 0x69, 0x307, 0x300 };
208 static const UChar iDotAcute
[3] = { 0x69, 0x307, 0x301 };
209 static const UChar iDotTilde
[3] = { 0x69, 0x307, 0x303 };
212 U_CFUNC
void U_EXPORT2
213 ucase_addCaseClosure(UChar32 c
, const USetAdder
*sa
) {
217 * Hardcode the case closure of i and its relatives and ignore the
218 * data file data for these characters.
219 * The Turkic dotless i and dotted I with their case mapping conditions
220 * and case folding option make the related characters behave specially.
221 * This code matches their closure behavior to their case folding behavior.
226 /* regular i and I are in one equivalence class */
227 sa
->add(sa
->set
, 0x69);
230 sa
->add(sa
->set
, 0x49);
233 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
234 sa
->addString(sa
->set
, iDot
, 2);
237 /* dotless i is in a class by itself */
240 /* otherwise use the data file data */
244 props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
245 if(!UCASE_HAS_EXCEPTION(props
)) {
246 if(UCASE_GET_TYPE(props
)!=UCASE_NONE
) {
247 /* add the one simple case mapping, no matter what type it is */
248 int32_t delta
=UCASE_GET_DELTA(props
);
250 sa
->add(sa
->set
, c
+delta
);
255 * c has exceptions, so there may be multiple simple and/or
256 * full case mappings. Add them all.
258 const uint16_t *pe0
, *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
259 const UChar
*closure
;
260 uint16_t excWord
=*pe
++;
261 int32_t idx
, closureLength
, fullLength
, length
;
265 /* add all simple case mappings */
266 for(idx
=UCASE_EXC_LOWER
; idx
<=UCASE_EXC_TITLE
; ++idx
) {
267 if(HAS_SLOT(excWord
, idx
)) {
269 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
273 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
)) {
276 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
277 sa
->add(sa
->set
, (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
);
280 /* get the closure string pointer & length */
281 if(HAS_SLOT(excWord
, UCASE_EXC_CLOSURE
)) {
283 GET_SLOT_VALUE(excWord
, UCASE_EXC_CLOSURE
, pe
, closureLength
);
284 closureLength
&=UCASE_CLOSURE_MAX_LENGTH
; /* higher bits are reserved */
285 closure
=(const UChar
*)pe
+1; /* behind this slot, unless there are full case mappings */
291 /* add the full case folding */
292 if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
294 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, fullLength
);
296 /* start of full case mapping strings */
299 fullLength
&=0xffff; /* bits 16 and higher are reserved */
301 /* skip the lowercase result string */
302 pe
+=fullLength
&UCASE_FULL_LOWER
;
305 /* add the full case folding string */
306 length
=fullLength
&0xf;
308 sa
->addString(sa
->set
, (const UChar
*)pe
, length
);
312 /* skip the uppercase and titlecase strings */
318 closure
=(const UChar
*)pe
; /* behind full case mappings */
321 /* add each code point in the closure string */
322 for(idx
=0; idx
<closureLength
;) {
323 U16_NEXT_UNSAFE(closure
, idx
, c
);
330 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
331 * must be length>0 and max>0 and length<=max
333 static inline int32_t
334 strcmpMax(const UChar
*s
, int32_t length
, const UChar
*t
, int32_t max
) {
337 max
-=length
; /* we require length<=max, so no need to decrement max in the loop */
342 return 1; /* reached the end of t but not of s */
346 return c1
; /* return difference result */
349 /* ends with length==0 */
351 if(max
==0 || *t
==0) {
352 return 0; /* equal to length of both strings */
354 return -max
; /* return length difference */
358 U_CFUNC UBool U_EXPORT2
359 ucase_addStringCaseClosure(const UChar
*s
, int32_t length
, const USetAdder
*sa
) {
360 int32_t i
, start
, limit
, result
, unfoldRows
, unfoldRowWidth
, unfoldStringWidth
;
362 if(ucase_props_singleton
.unfold
==NULL
|| s
==NULL
) {
363 return FALSE
; /* no reverse case folding data, or no string */
366 /* the string is too short to find any match */
368 * more precise would be:
369 * if(!u_strHasMoreChar32Than(s, length, 1))
370 * but this does not make much practical difference because
371 * a single supplementary code point would just not be found
376 const uint16_t *unfold
=ucase_props_singleton
.unfold
;
377 unfoldRows
=unfold
[UCASE_UNFOLD_ROWS
];
378 unfoldRowWidth
=unfold
[UCASE_UNFOLD_ROW_WIDTH
];
379 unfoldStringWidth
=unfold
[UCASE_UNFOLD_STRING_WIDTH
];
380 unfold
+=unfoldRowWidth
;
382 if(length
>unfoldStringWidth
) {
383 /* the string is too long to find any match */
387 /* do a binary search for the string */
392 const UChar
*p
=reinterpret_cast<const UChar
*>(unfold
+(i
*unfoldRowWidth
));
393 result
=strcmpMax(s
, length
, p
, unfoldStringWidth
);
396 /* found the string: add each code point, and its case closure */
399 for(i
=unfoldStringWidth
; i
<unfoldRowWidth
&& p
[i
]!=0;) {
400 U16_NEXT_UNSAFE(p
, i
, c
);
402 ucase_addCaseClosure(c
, sa
);
405 } else if(result
<0) {
407 } else /* result>0 */ {
412 return FALSE
; /* string not found */
417 FullCaseFoldingIterator::FullCaseFoldingIterator()
418 : unfold(reinterpret_cast<const UChar
*>(ucase_props_singleton
.unfold
)),
419 unfoldRows(unfold
[UCASE_UNFOLD_ROWS
]),
420 unfoldRowWidth(unfold
[UCASE_UNFOLD_ROW_WIDTH
]),
421 unfoldStringWidth(unfold
[UCASE_UNFOLD_STRING_WIDTH
]),
423 rowCpIndex(unfoldStringWidth
) {
424 unfold
+=unfoldRowWidth
;
428 FullCaseFoldingIterator::next(UnicodeString
&full
) {
429 // Advance past the last-delivered code point.
430 const UChar
*p
=unfold
+(currentRow
*unfoldRowWidth
);
431 if(rowCpIndex
>=unfoldRowWidth
|| p
[rowCpIndex
]==0) {
434 rowCpIndex
=unfoldStringWidth
;
436 if(currentRow
>=unfoldRows
) { return U_SENTINEL
; }
437 // Set "full" to the NUL-terminated string in the first unfold column.
438 int32_t length
=unfoldStringWidth
;
439 while(length
>0 && p
[length
-1]==0) { --length
; }
440 full
.setTo(FALSE
, p
, length
);
441 // Return the code point.
443 U16_NEXT_UNSAFE(p
, rowCpIndex
, c
);
447 namespace LatinCase
{
449 const int8_t TO_LOWER_NORMAL
[LIMIT
] = {
450 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
451 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
455 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
456 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
457 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
460 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
461 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
463 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
465 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
466 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC
,
467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
470 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
471 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
472 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
473 EXC
, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
475 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC
, 1, 0, 1, 0, 1, 0,
476 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
477 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
478 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
481 const int8_t TO_LOWER_TR_LT
[LIMIT
] = {
482 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
483 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
487 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC
, EXC
, 32, 32, 32, 32, 32,
488 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
495 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
497 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC
, EXC
, 32, 32,
498 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC
,
499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
500 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
502 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
503 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
504 1, 0, 1, 0, 1, 0, 1, 0, EXC
, 0, 1, 0, 1, 0, EXC
, 0,
505 EXC
, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
507 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC
, 1, 0, 1, 0, 1, 0,
508 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
509 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
510 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
513 const int8_t TO_UPPER_NORMAL
[LIMIT
] = {
514 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
515 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
516 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
517 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
519 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
520 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
521 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
522 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
524 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
526 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
527 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
529 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
530 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC
,
531 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
532 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
534 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
535 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
536 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
537 0, EXC
, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
539 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC
, 0, -1, 0, -1, 0, -1,
540 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
541 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
542 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
545 const int8_t TO_UPPER_TR
[LIMIT
] = {
546 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
548 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC
, -32, -32, -32, -32, -32, -32,
554 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
556 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
557 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC
,
563 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
564 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
566 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
567 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
568 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
569 0, EXC
, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
571 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC
, 0, -1, 0, -1, 0, -1,
572 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
573 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
574 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
577 } // namespace LatinCase
581 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
582 U_CAPI
int32_t U_EXPORT2
583 ucase_getType(UChar32 c
) {
584 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
585 return UCASE_GET_TYPE(props
);
588 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
589 U_CAPI
int32_t U_EXPORT2
590 ucase_getTypeOrIgnorable(UChar32 c
) {
591 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
592 return UCASE_GET_TYPE_AND_IGNORABLE(props
);
595 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
596 static inline int32_t
597 getDotType(UChar32 c
) {
598 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
599 if(!UCASE_HAS_EXCEPTION(props
)) {
600 return props
&UCASE_DOT_MASK
;
602 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
603 return (*pe
>>UCASE_EXC_DOT_SHIFT
)&UCASE_DOT_MASK
;
607 U_CAPI UBool U_EXPORT2
608 ucase_isSoftDotted(UChar32 c
) {
609 return (UBool
)(getDotType(c
)==UCASE_SOFT_DOTTED
);
612 U_CAPI UBool U_EXPORT2
613 ucase_isCaseSensitive(UChar32 c
) {
614 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
615 if(!UCASE_HAS_EXCEPTION(props
)) {
616 return (UBool
)((props
&UCASE_SENSITIVE
)!=0);
618 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
619 return (UBool
)((*pe
&UCASE_EXC_SENSITIVE
)!=0);
623 /* string casing ------------------------------------------------------------ */
626 * These internal functions form the core of string case mappings.
627 * They map single code points to result code points or strings and take
628 * all necessary conditions (context, locale ID, options) into account.
630 * They do not iterate over the source or write to the destination
631 * so that the same functions are useful for non-standard string storage,
632 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
633 * For the same reason, the "surrounding text" context is passed in as a
634 * UCaseContextIterator which does not make any assumptions about
635 * the underlying storage.
637 * This section contains helper functions that check for conditions
638 * in the input text surrounding the current code point
639 * according to SpecialCasing.txt.
641 * Each helper function gets the index
642 * - after the current code point if it looks at following text
643 * - before the current code point if it looks at preceding text
645 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
648 * C is preceded by a sequence consisting of
649 * a cased letter and a case-ignorable sequence,
650 * and C is not followed by a sequence consisting of
651 * an ignorable sequence and then a cased letter.
654 * C is followed by one or more characters of combining class 230 (ABOVE)
655 * in the combining character sequence.
658 * The last preceding character with combining class of zero before C
660 * and there is no intervening combining character class 230 (ABOVE).
663 * C is followed by combining dot above (U+0307).
664 * Any sequence of characters with a combining class that is neither 0 nor 230
665 * may intervene between the current character and the combining dot above.
667 * The erratum from 2002-10-31 adds the condition
670 * The last preceding base character was an uppercase I, and there is no
671 * intervening combining character class 230 (ABOVE).
673 * (See Jitterbug 2344 and the comments on After_I below.)
675 * Helper definitions in Unicode 3.2 UAX 21:
677 * D1. A character C is defined to be cased
678 * if it meets any of the following criteria:
680 * - The general category of C is Titlecase Letter (Lt)
681 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
682 * - Given D = NFD(C), then it is not the case that:
683 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
684 * (This third criterion does not add any characters to the list
685 * for Unicode 3.2. Ignored.)
687 * D2. A character C is defined to be case-ignorable
688 * if it meets either of the following criteria:
690 * - The general category of C is
691 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
692 * Letter Modifier (Lm), or Symbol Modifier (Sk)
693 * - C is one of the following characters
695 * U+00AD SOFT HYPHEN (SHY)
696 * U+2019 RIGHT SINGLE QUOTATION MARK
697 * (the preferred character for apostrophe)
699 * D3. A case-ignorable sequence is a sequence of
700 * zero or more case-ignorable characters.
703 #define is_d(c) ((c)=='d' || (c)=='D')
704 #define is_e(c) ((c)=='e' || (c)=='E')
705 #define is_i(c) ((c)=='i' || (c)=='I')
706 #define is_l(c) ((c)=='l' || (c)=='L')
707 #define is_r(c) ((c)=='r' || (c)=='R')
708 #define is_t(c) ((c)=='t' || (c)=='T')
709 #define is_u(c) ((c)=='u' || (c)=='U')
710 #define is_z(c) ((c)=='z' || (c)=='Z')
713 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
716 * Requires non-NULL locale ID but otherwise does the equivalent of
717 * checking for language codes as if uloc_getLanguage() were called:
718 * Accepts both 2- and 3-letter codes and accepts case variants.
721 ucase_getCaseLocale(const char *locale
) {
723 * This function used to use uloc_getLanguage(), but the current code
724 * removes the dependency of this low-level code on uloc implementation code
725 * and is faster because the whole locale ID does not have to be
726 * examined and copied/transformed.
728 * Because this code does not want to depend on uloc, the caller must
729 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
732 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
733 // and for Chinese "zh": Very common but no special case mapping behavior.
734 // Then check lowercase vs. uppercase to reduce the number of comparisons
735 // for other locales without special behavior.
745 return UCASE_LOC_GREEK
;
748 // en, es, ... -> root
750 return UCASE_LOC_ROOT
;
751 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
752 } else if(c
>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
753 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
754 } else if(c
<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
756 # error Unknown charset family!
768 return UCASE_LOC_TURKISH
;
780 return UCASE_LOC_TURKISH
;
792 return UCASE_LOC_LITHUANIAN
;
804 return UCASE_LOC_DUTCH
;
810 // Same code as for lowercase c but also check for 'E'.
820 return UCASE_LOC_TURKISH
;
832 return UCASE_LOC_TURKISH
;
844 return UCASE_LOC_LITHUANIAN
;
856 return UCASE_LOC_GREEK
;
868 return UCASE_LOC_DUTCH
;
873 return UCASE_LOC_ROOT
;
878 * {case-ignorable}* cased
880 * (dir determines looking forward/backward)
881 * If a character is case-ignorable, it is skipped regardless of whether
882 * it is also cased or not.
885 isFollowedByCasedLetter(UCaseContextIterator
*iter
, void *context
, int8_t dir
) {
892 for(/* dir!=0 sets direction */; (c
=iter(context
, dir
))>=0; dir
=0) {
893 int32_t type
=ucase_getTypeOrIgnorable(c
);
895 /* case-ignorable, continue with the loop */
896 } else if(type
!=UCASE_NONE
) {
897 return TRUE
; /* followed by cased letter */
899 return FALSE
; /* uncased and not case-ignorable */
903 return FALSE
; /* not followed by cased letter */
906 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
908 isPrecededBySoftDotted(UCaseContextIterator
*iter
, void *context
) {
917 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
918 dotType
=getDotType(c
);
919 if(dotType
==UCASE_SOFT_DOTTED
) {
920 return TRUE
; /* preceded by TYPE_i */
921 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
922 return FALSE
; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
926 return FALSE
; /* not preceded by TYPE_i */
930 * See Jitterbug 2344:
931 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
932 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
933 * we made those releases compatible with Unicode 3.2 which had not fixed
934 * a related bug in SpecialCasing.txt.
936 * From the Jitterbug 2344 text:
937 * ... this bug is listed as a Unicode erratum
938 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
940 * There are two errors in SpecialCasing.txt.
941 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
942 * 2. An incorrect context definition. Correct as follows:
943 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
944 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
946 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
947 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
948 * where the context After_I is defined as:
949 * The last preceding base character was an uppercase I, and there is no
950 * intervening combining character class 230 (ABOVE).
953 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
955 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
956 * # This matches the behavior of the canonically equivalent I-dot_above
958 * See also the description in this place in older versions of uchar.c (revision 1.100).
960 * Markus W. Scherer 2003-feb-15
963 /* Is preceded by base character 'I' with no intervening cc=230 ? */
965 isPrecededBy_I(UCaseContextIterator
*iter
, void *context
) {
974 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
976 return TRUE
; /* preceded by I */
978 dotType
=getDotType(c
);
979 if(dotType
!=UCASE_OTHER_ACCENT
) {
980 return FALSE
; /* preceded by different base character (not I), or intervening cc==230 */
984 return FALSE
; /* not preceded by I */
987 /* Is followed by one or more cc==230 ? */
989 isFollowedByMoreAbove(UCaseContextIterator
*iter
, void *context
) {
998 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
999 dotType
=getDotType(c
);
1000 if(dotType
==UCASE_ABOVE
) {
1001 return TRUE
; /* at least one cc==230 following */
1002 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
1003 return FALSE
; /* next base character, no more cc==230 following */
1007 return FALSE
; /* no more cc==230 following */
1010 /* Is followed by a dot above (without cc==230 in between) ? */
1012 isFollowedByDotAbove(UCaseContextIterator
*iter
, void *context
) {
1021 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
1025 dotType
=getDotType(c
);
1026 if(dotType
!=UCASE_OTHER_ACCENT
) {
1027 return FALSE
; /* next base character or cc==230 in between */
1031 return FALSE
; /* no dot above following */
1034 U_CAPI
int32_t U_EXPORT2
1035 ucase_toFullLower(UChar32 c
,
1036 UCaseContextIterator
*iter
, void *context
,
1037 const UChar
**pString
,
1039 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1042 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
1043 if(!UCASE_HAS_EXCEPTION(props
)) {
1044 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
1045 result
=c
+UCASE_GET_DELTA(props
);
1048 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
), *pe2
;
1049 uint16_t excWord
=*pe
++;
1054 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1055 /* use hardcoded conditions and mappings */
1058 * Test for conditional mappings first
1059 * (otherwise the unconditional default mappings are always taken),
1060 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1061 * then get the UnicodeData.txt mappings.
1063 if( loc
==UCASE_LOC_LITHUANIAN
&&
1064 /* base characters, find accents above */
1065 (((c
==0x49 || c
==0x4a || c
==0x12e) &&
1066 isFollowedByMoreAbove(iter
, context
)) ||
1067 /* precomposed with accent above, no need to find one */
1068 (c
==0xcc || c
==0xcd || c
==0x128))
1073 # Lithuanian retains the dot in a lowercase i when followed by accents.
1075 # Introduce an explicit dot above when lowercasing capital I's and J's
1076 # whenever there are more accents above.
1077 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1079 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1080 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1081 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1082 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1083 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1084 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1087 case 0x49: /* LATIN CAPITAL LETTER I */
1090 case 0x4a: /* LATIN CAPITAL LETTER J */
1093 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1094 *pString
=iOgonekDot
;
1096 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1099 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1102 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1106 return 0; /* will not occur */
1108 /* # Turkish and Azeri */
1109 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x130) {
1111 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1112 # The following rules handle those cases.
1114 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1115 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1118 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x307 && isPrecededBy_I(iter
, context
)) {
1120 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1121 # This matches the behavior of the canonically equivalent I-dot_above
1123 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1124 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1127 return 0; /* remove the dot (continue without output) */
1128 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x49 && !isFollowedByDotAbove(iter
, context
)) {
1130 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1132 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1133 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1136 } else if(c
==0x130) {
1138 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1140 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1144 } else if( c
==0x3a3 &&
1145 !isFollowedByCasedLetter(iter
, context
, 1) &&
1146 isFollowedByCasedLetter(iter
, context
, -1) /* -1=preceded */
1148 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1150 # Special case for final form of sigma
1152 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1154 return 0x3c2; /* greek small final sigma */
1156 /* no known conditional special case mapping, use a normal mapping */
1158 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1159 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1160 full
&=UCASE_FULL_LOWER
;
1162 /* set the output pointer to the lowercase mapping */
1163 *pString
=reinterpret_cast<const UChar
*>(pe
+1);
1165 /* return the string length */
1170 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
1172 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe2
, delta
);
1173 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
1175 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1176 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe2
, result
);
1180 return (result
==c
) ? ~result
: result
;
1185 toUpperOrTitle(UChar32 c
,
1186 UCaseContextIterator
*iter
, void *context
,
1187 const UChar
**pString
,
1189 UBool upperNotTitle
) {
1190 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1193 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
1194 if(!UCASE_HAS_EXCEPTION(props
)) {
1195 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
1196 result
=c
+UCASE_GET_DELTA(props
);
1199 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
), *pe2
;
1200 uint16_t excWord
=*pe
++;
1205 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1206 /* use hardcoded conditions and mappings */
1207 if(loc
==UCASE_LOC_TURKISH
&& c
==0x69) {
1211 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1212 # The following rules handle those cases.
1214 # When uppercasing, i turns into a dotted capital I
1216 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1217 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1220 } else if(loc
==UCASE_LOC_LITHUANIAN
&& c
==0x307 && isPrecededBySoftDotted(iter
, context
)) {
1224 # Lithuanian retains the dot in a lowercase i when followed by accents.
1226 # Remove DOT ABOVE after "i" with upper or titlecase
1228 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1231 return 0; /* remove the dot (continue without output) */
1233 /* no known conditional special case mapping, use a normal mapping */
1235 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1236 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1238 /* start of full case mapping strings */
1241 /* skip the lowercase and case-folding result strings */
1242 pe
+=full
&UCASE_FULL_LOWER
;
1250 /* skip the uppercase result string */
1256 /* set the output pointer to the result string */
1257 *pString
=reinterpret_cast<const UChar
*>(pe
);
1259 /* return the string length */
1264 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
1266 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe2
, delta
);
1267 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
1269 if(!upperNotTitle
&& HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
1270 idx
=UCASE_EXC_TITLE
;
1271 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
1272 /* here, titlecase is same as uppercase */
1273 idx
=UCASE_EXC_UPPER
;
1277 GET_SLOT_VALUE(excWord
, idx
, pe2
, result
);
1280 return (result
==c
) ? ~result
: result
;
1283 U_CAPI
int32_t U_EXPORT2
1284 ucase_toFullUpper(UChar32 c
,
1285 UCaseContextIterator
*iter
, void *context
,
1286 const UChar
**pString
,
1287 int32_t caseLocale
) {
1288 return toUpperOrTitle(c
, iter
, context
, pString
, caseLocale
, TRUE
);
1291 U_CAPI
int32_t U_EXPORT2
1292 ucase_toFullTitle(UChar32 c
,
1293 UCaseContextIterator
*iter
, void *context
,
1294 const UChar
**pString
,
1295 int32_t caseLocale
) {
1296 return toUpperOrTitle(c
, iter
, context
, pString
, caseLocale
, FALSE
);
1299 /* case folding ------------------------------------------------------------- */
1302 * Case folding is similar to lowercasing.
1303 * The result may be a simple mapping, i.e., a single code point, or
1304 * a full mapping, i.e., a string.
1305 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1306 * then only the lowercase mapping is stored.
1308 * Some special cases are hardcoded because their conditions cannot be
1309 * parsed and processed from CaseFolding.txt.
1311 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1313 # C: common case folding, common mappings shared by both simple and full mappings.
1314 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1315 # S: simple case folding, mappings to single characters where different from F.
1316 # T: special case for uppercase I and dotted uppercase I
1317 # - For non-Turkic languages, this mapping is normally not used.
1318 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1321 # A. To do a simple case folding, use the mappings with status C + S.
1322 # B. To do a full case folding, use the mappings with status C + F.
1324 # The mappings with status T can be used or omitted depending on the desired case-folding
1325 # behavior. (The default option is to exclude them.)
1327 * Unicode 3.2 has 'T' mappings as follows:
1329 0049; T; 0131; # LATIN CAPITAL LETTER I
1330 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1332 * while the default mappings for these code points are:
1334 0049; C; 0069; # LATIN CAPITAL LETTER I
1335 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1337 * U+0130 has no simple case folding (simple-case-folds to itself).
1340 /* return the simple case folding mapping for c */
1341 U_CAPI UChar32 U_EXPORT2
1342 ucase_fold(UChar32 c
, uint32_t options
) {
1343 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
1344 if(!UCASE_HAS_EXCEPTION(props
)) {
1345 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
1346 c
+=UCASE_GET_DELTA(props
);
1349 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
1350 uint16_t excWord
=*pe
++;
1352 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1353 /* special case folding mappings, hardcoded */
1354 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1355 /* default mappings */
1357 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1359 } else if(c
==0x130) {
1360 /* no simple case folding for U+0130 */
1364 /* Turkic mappings */
1366 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1368 } else if(c
==0x130) {
1369 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1374 if((excWord
&UCASE_EXC_NO_SIMPLE_CASE_FOLDING
)!=0) {
1377 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
1379 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
1380 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
1382 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1384 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1385 idx
=UCASE_EXC_LOWER
;
1389 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
1395 * Issue for canonical caseless match (UAX #21):
1396 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1397 * canonical equivalence, unlike default-option casefolding.
 * For example, I-grave and I + grave fold to strings that are not canonically
 * equivalent.
1400 * For more details, see the comment in unorm_compare() in unorm.cpp
1401 * and the intermediate prototype changes for Jitterbug 2021.
1402 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1404 * This did not get fixed because it appears that it is not possible to fix
1405 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1406 * together in a way that they still fold to common result strings.
1409 U_CAPI
int32_t U_EXPORT2
1410 ucase_toFullFolding(UChar32 c
,
1411 const UChar
**pString
,
1413 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1416 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
1417 if(!UCASE_HAS_EXCEPTION(props
)) {
1418 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
1419 result
=c
+UCASE_GET_DELTA(props
);
1422 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
), *pe2
;
1423 uint16_t excWord
=*pe
++;
1428 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1429 /* use hardcoded conditions and mappings */
1430 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1431 /* default mappings */
1433 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1435 } else if(c
==0x130) {
1436 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1441 /* Turkic mappings */
1443 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1445 } else if(c
==0x130) {
1446 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1450 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1451 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1453 /* start of full case mapping strings */
1456 /* skip the lowercase result string */
1457 pe
+=full
&UCASE_FULL_LOWER
;
1461 /* set the output pointer to the result string */
1462 *pString
=reinterpret_cast<const UChar
*>(pe
);
1464 /* return the string length */
1469 if((excWord
&UCASE_EXC_NO_SIMPLE_CASE_FOLDING
)!=0) {
1472 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
1474 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe2
, delta
);
1475 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
1477 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1479 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1480 idx
=UCASE_EXC_LOWER
;
1484 GET_SLOT_VALUE(excWord
, idx
, pe2
, result
);
1487 return (result
==c
) ? ~result
: result
;
1490 /* case mapping properties API ---------------------------------------------- */
1492 /* public API (see uchar.h) */
1494 U_CAPI UBool U_EXPORT2
1495 u_isULowercase(UChar32 c
) {
1496 return (UBool
)(UCASE_LOWER
==ucase_getType(c
));
1499 U_CAPI UBool U_EXPORT2
1500 u_isUUppercase(UChar32 c
) {
1501 return (UBool
)(UCASE_UPPER
==ucase_getType(c
));
1504 /* Transforms the Unicode character to its lower case equivalent.*/
1505 U_CAPI UChar32 U_EXPORT2
1506 u_tolower(UChar32 c
) {
1507 return ucase_tolower(c
);
1510 /* Transforms the Unicode character to its upper case equivalent.*/
1511 U_CAPI UChar32 U_EXPORT2
1512 u_toupper(UChar32 c
) {
1513 return ucase_toupper(c
);
1516 /* Transforms the Unicode character to its title case equivalent.*/
1517 U_CAPI UChar32 U_EXPORT2
1518 u_totitle(UChar32 c
) {
1519 return ucase_totitle(c
);
1522 /* return the simple case folding mapping for c */
1523 U_CAPI UChar32 U_EXPORT2
1524 u_foldCase(UChar32 c
, uint32_t options
) {
1525 return ucase_fold(c
, options
);
1528 U_CFUNC
int32_t U_EXPORT2
1529 ucase_hasBinaryProperty(UChar32 c
, UProperty which
) {
1530 /* case mapping properties */
1531 const UChar
*resultString
;
1533 case UCHAR_LOWERCASE
:
1534 return (UBool
)(UCASE_LOWER
==ucase_getType(c
));
1535 case UCHAR_UPPERCASE
:
1536 return (UBool
)(UCASE_UPPER
==ucase_getType(c
));
1537 case UCHAR_SOFT_DOTTED
:
1538 return ucase_isSoftDotted(c
);
1539 case UCHAR_CASE_SENSITIVE
:
1540 return ucase_isCaseSensitive(c
);
1542 return (UBool
)(UCASE_NONE
!=ucase_getType(c
));
1543 case UCHAR_CASE_IGNORABLE
:
1544 return (UBool
)(ucase_getTypeOrIgnorable(c
)>>2);
1546 * Note: The following Changes_When_Xyz are defined as testing whether
1547 * the NFD form of the input changes when Xyz-case-mapped.
1548 * However, this simpler implementation of these properties,
1549 * ignoring NFD, passes the tests.
1550 * The implementation needs to be changed if the tests start failing.
1551 * When that happens, optimizations should be used to work with the
1552 * per-single-code point ucase_toFullXyz() functions unless
1553 * the NFD form has more than one code point,
1554 * and the property starts set needs to be the union of the
1555 * start sets for normalization and case mappings.
1557 case UCHAR_CHANGES_WHEN_LOWERCASED
:
1558 return (UBool
)(ucase_toFullLower(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);
1559 case UCHAR_CHANGES_WHEN_UPPERCASED
:
1560 return (UBool
)(ucase_toFullUpper(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);
1561 case UCHAR_CHANGES_WHEN_TITLECASED
:
1562 return (UBool
)(ucase_toFullTitle(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);
1563 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1564 case UCHAR_CHANGES_WHEN_CASEMAPPED
:
1566 ucase_toFullLower(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0 ||
1567 ucase_toFullUpper(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0 ||
1568 ucase_toFullTitle(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);