1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2004-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ucase.cpp
12 * tab size: 8 (not used)
15 * created on: 2004aug30
16 * created by: Markus W. Scherer
18 * Low-level Unicode character/string case mapping code.
19 * Much code moved here (and modified) from uchar.c.
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/udata.h" /* UDataInfo */
26 #include "unicode/utf16.h"
27 #include "ucmndata.h" /* DataHeader */
37 const int32_t *indexes
;
38 const uint16_t *exceptions
;
39 const uint16_t *unfold
;
42 uint8_t formatVersion
[4];
45 /* ucase_props_data.h is machine-generated by gencase --csource */
46 #define INCLUDED_FROM_UCASE_CPP
47 #include "ucase_props_data.h"
49 /* set of property starts for UnicodeSet ------------------------------------ */
51 static UBool U_CALLCONV
52 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*end*/, uint32_t /*value*/) {
53 /* add the start code point to the USet */
54 const USetAdder
*sa
=(const USetAdder
*)context
;
55 sa
->add(sa
->set
, start
);
59 U_CFUNC
void U_EXPORT2
60 ucase_addPropertyStarts(const USetAdder
*sa
, UErrorCode
*pErrorCode
) {
61 if(U_FAILURE(*pErrorCode
)) {
65 /* add the start code point of each same-value range of the trie */
66 utrie2_enum(&ucase_props_singleton
.trie
, NULL
, _enumPropertyStartsRange
, sa
);
68 /* add code points with hardcoded properties, plus the ones following them */
70 /* (none right now, see comment below) */
73 * Omit code points with hardcoded specialcasing properties
74 * because we do not build property UnicodeSets for them right now.
78 /* data access primitives --------------------------------------------------- */
80 U_CFUNC
const UTrie2
* U_EXPORT2
82 return &ucase_props_singleton
.trie
;
/*
 * Yields a pointer to the exception data for a properties word:
 * the element of (csp)->exceptions whose index is stored in the bits of
 * props from UCASE_EXC_SHIFT upward.
 */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])
/*
 * Bit-count lookup table: flagsOffset[b] is the number of 1 bits in the
 * 8-bit integer value b (0..255).
 * Used by SLOT_OFFSET() to count how many optional slots precede a given one.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/* Nonzero if bit idx is set in flags, i.e. the slot with that index is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))

/*
 * Number of set bits in flags below bit idx, i.e. how many present slots
 * come before slot idx.
 * The masked value indexes flagsOffset[], so idx must not exceed 8.
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
111 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
113 * @param excWord (in) initial exceptions word
114 * @param idx (in) desired slot index
115 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
116 * moved to the last uint16_t of the value, use +1 for beginning of next slot
117 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
119 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
120 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
121 (pExc16)+=SLOT_OFFSET(excWord, idx); \
124 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
126 (value)=((value)<<16)|*pExc16; \
129 /* simple case mappings ----------------------------------------------------- */
131 U_CAPI UChar32 U_EXPORT2
132 ucase_tolower(UChar32 c
) {
133 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
134 if(!UCASE_HAS_EXCEPTION(props
)) {
135 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
136 c
+=UCASE_GET_DELTA(props
);
139 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
140 uint16_t excWord
=*pe
++;
141 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
143 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
144 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
146 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
147 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe
, c
);
153 U_CAPI UChar32 U_EXPORT2
154 ucase_toupper(UChar32 c
) {
155 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
156 if(!UCASE_HAS_EXCEPTION(props
)) {
157 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
158 c
+=UCASE_GET_DELTA(props
);
161 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
162 uint16_t excWord
=*pe
++;
163 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
165 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
166 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
168 if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
169 GET_SLOT_VALUE(excWord
, UCASE_EXC_UPPER
, pe
, c
);
175 U_CAPI UChar32 U_EXPORT2
176 ucase_totitle(UChar32 c
) {
177 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
178 if(!UCASE_HAS_EXCEPTION(props
)) {
179 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
180 c
+=UCASE_GET_DELTA(props
);
183 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
184 uint16_t excWord
=*pe
++;
185 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
187 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
188 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
191 if(HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
193 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
198 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
203 static const UChar iDot
[2] = { 0x69, 0x307 };
204 static const UChar jDot
[2] = { 0x6a, 0x307 };
205 static const UChar iOgonekDot
[3] = { 0x12f, 0x307 };
206 static const UChar iDotGrave
[3] = { 0x69, 0x307, 0x300 };
207 static const UChar iDotAcute
[3] = { 0x69, 0x307, 0x301 };
208 static const UChar iDotTilde
[3] = { 0x69, 0x307, 0x303 };
211 U_CFUNC
void U_EXPORT2
212 ucase_addCaseClosure(UChar32 c
, const USetAdder
*sa
) {
216 * Hardcode the case closure of i and its relatives and ignore the
217 * data file data for these characters.
218 * The Turkic dotless i and dotted I with their case mapping conditions
219 * and case folding option make the related characters behave specially.
220 * This code matches their closure behavior to their case folding behavior.
225 /* regular i and I are in one equivalence class */
226 sa
->add(sa
->set
, 0x69);
229 sa
->add(sa
->set
, 0x49);
232 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
233 sa
->addString(sa
->set
, iDot
, 2);
236 /* dotless i is in a class by itself */
239 /* otherwise use the data file data */
243 props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
244 if(!UCASE_HAS_EXCEPTION(props
)) {
245 if(UCASE_GET_TYPE(props
)!=UCASE_NONE
) {
246 /* add the one simple case mapping, no matter what type it is */
247 int32_t delta
=UCASE_GET_DELTA(props
);
249 sa
->add(sa
->set
, c
+delta
);
254 * c has exceptions, so there may be multiple simple and/or
255 * full case mappings. Add them all.
257 const uint16_t *pe0
, *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
258 const UChar
*closure
;
259 uint16_t excWord
=*pe
++;
260 int32_t idx
, closureLength
, fullLength
, length
;
264 /* add all simple case mappings */
265 for(idx
=UCASE_EXC_LOWER
; idx
<=UCASE_EXC_TITLE
; ++idx
) {
266 if(HAS_SLOT(excWord
, idx
)) {
268 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
272 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
)) {
275 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
276 sa
->add(sa
->set
, (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
);
279 /* get the closure string pointer & length */
280 if(HAS_SLOT(excWord
, UCASE_EXC_CLOSURE
)) {
282 GET_SLOT_VALUE(excWord
, UCASE_EXC_CLOSURE
, pe
, closureLength
);
283 closureLength
&=UCASE_CLOSURE_MAX_LENGTH
; /* higher bits are reserved */
284 closure
=(const UChar
*)pe
+1; /* behind this slot, unless there are full case mappings */
290 /* add the full case folding */
291 if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
293 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, fullLength
);
295 /* start of full case mapping strings */
298 fullLength
&=0xffff; /* bits 16 and higher are reserved */
300 /* skip the lowercase result string */
301 pe
+=fullLength
&UCASE_FULL_LOWER
;
304 /* add the full case folding string */
305 length
=fullLength
&0xf;
307 sa
->addString(sa
->set
, (const UChar
*)pe
, length
);
311 /* skip the uppercase and titlecase strings */
317 closure
=(const UChar
*)pe
; /* behind full case mappings */
320 /* add each code point in the closure string */
321 for(idx
=0; idx
<closureLength
;) {
322 U16_NEXT_UNSAFE(closure
, idx
, c
);
329 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
330 * must be length>0 and max>0 and length<=max
332 static inline int32_t
333 strcmpMax(const UChar
*s
, int32_t length
, const UChar
*t
, int32_t max
) {
336 max
-=length
; /* we require length<=max, so no need to decrement max in the loop */
341 return 1; /* reached the end of t but not of s */
345 return c1
; /* return difference result */
348 /* ends with length==0 */
350 if(max
==0 || *t
==0) {
351 return 0; /* equal to length of both strings */
353 return -max
; /* return length difference */
357 U_CFUNC UBool U_EXPORT2
358 ucase_addStringCaseClosure(const UChar
*s
, int32_t length
, const USetAdder
*sa
) {
359 int32_t i
, start
, limit
, result
, unfoldRows
, unfoldRowWidth
, unfoldStringWidth
;
361 if(ucase_props_singleton
.unfold
==NULL
|| s
==NULL
) {
362 return FALSE
; /* no reverse case folding data, or no string */
365 /* the string is too short to find any match */
367 * more precise would be:
368 * if(!u_strHasMoreChar32Than(s, length, 1))
369 * but this does not make much practical difference because
370 * a single supplementary code point would just not be found
375 const uint16_t *unfold
=ucase_props_singleton
.unfold
;
376 unfoldRows
=unfold
[UCASE_UNFOLD_ROWS
];
377 unfoldRowWidth
=unfold
[UCASE_UNFOLD_ROW_WIDTH
];
378 unfoldStringWidth
=unfold
[UCASE_UNFOLD_STRING_WIDTH
];
379 unfold
+=unfoldRowWidth
;
381 if(length
>unfoldStringWidth
) {
382 /* the string is too long to find any match */
386 /* do a binary search for the string */
391 const UChar
*p
=reinterpret_cast<const UChar
*>(unfold
+(i
*unfoldRowWidth
));
392 result
=strcmpMax(s
, length
, p
, unfoldStringWidth
);
395 /* found the string: add each code point, and its case closure */
398 for(i
=unfoldStringWidth
; i
<unfoldRowWidth
&& p
[i
]!=0;) {
399 U16_NEXT_UNSAFE(p
, i
, c
);
401 ucase_addCaseClosure(c
, sa
);
404 } else if(result
<0) {
406 } else /* result>0 */ {
411 return FALSE
; /* string not found */
416 FullCaseFoldingIterator::FullCaseFoldingIterator()
417 : unfold(reinterpret_cast<const UChar
*>(ucase_props_singleton
.unfold
)),
418 unfoldRows(unfold
[UCASE_UNFOLD_ROWS
]),
419 unfoldRowWidth(unfold
[UCASE_UNFOLD_ROW_WIDTH
]),
420 unfoldStringWidth(unfold
[UCASE_UNFOLD_STRING_WIDTH
]),
422 rowCpIndex(unfoldStringWidth
) {
423 unfold
+=unfoldRowWidth
;
427 FullCaseFoldingIterator::next(UnicodeString
&full
) {
428 // Advance past the last-delivered code point.
429 const UChar
*p
=unfold
+(currentRow
*unfoldRowWidth
);
430 if(rowCpIndex
>=unfoldRowWidth
|| p
[rowCpIndex
]==0) {
433 rowCpIndex
=unfoldStringWidth
;
435 if(currentRow
>=unfoldRows
) { return U_SENTINEL
; }
436 // Set "full" to the NUL-terminated string in the first unfold column.
437 int32_t length
=unfoldStringWidth
;
438 while(length
>0 && p
[length
-1]==0) { --length
; }
439 full
.setTo(FALSE
, p
, length
);
440 // Return the code point.
442 U16_NEXT_UNSAFE(p
, rowCpIndex
, c
);
446 namespace LatinCase
{
448 const int8_t TO_LOWER_NORMAL
[LIMIT
] = {
449 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
450 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
451 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
454 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
455 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
456 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
459 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
460 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
461 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
464 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
465 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC
,
466 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
470 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
471 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
472 EXC
, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
474 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC
, 1, 0, 1, 0, 1, 0,
475 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
476 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
477 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
480 const int8_t TO_LOWER_TR_LT
[LIMIT
] = {
481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
482 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
483 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
486 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC
, EXC
, 32, 32, 32, 32, 32,
487 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
496 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC
, EXC
, 32, 32,
497 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC
,
498 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
501 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
502 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
503 1, 0, 1, 0, 1, 0, 1, 0, EXC
, 0, 1, 0, 1, 0, EXC
, 0,
504 EXC
, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
506 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC
, 1, 0, 1, 0, 1, 0,
507 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
508 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
509 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
512 const int8_t TO_UPPER_NORMAL
[LIMIT
] = {
513 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
514 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
515 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
516 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
518 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
519 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
520 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
521 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
523 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
524 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
526 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
528 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
529 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC
,
530 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
531 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
533 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
534 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
535 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
536 0, EXC
, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
538 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC
, 0, -1, 0, -1, 0, -1,
539 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
540 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
541 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
544 const int8_t TO_UPPER_TR
[LIMIT
] = {
545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
548 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC
, -32, -32, -32, -32, -32, -32,
553 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
555 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
556 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
557 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558 0, 0, 0, 0, 0, EXC
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC
,
562 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
563 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
565 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
566 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
567 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
568 0, EXC
, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
570 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC
, 0, -1, 0, -1, 0, -1,
571 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
572 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
573 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
576 } // namespace LatinCase
580 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
581 U_CAPI
int32_t U_EXPORT2
582 ucase_getType(UChar32 c
) {
583 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
584 return UCASE_GET_TYPE(props
);
587 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
588 U_CAPI
int32_t U_EXPORT2
589 ucase_getTypeOrIgnorable(UChar32 c
) {
590 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
591 return UCASE_GET_TYPE_AND_IGNORABLE(props
);
594 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
595 static inline int32_t
596 getDotType(UChar32 c
) {
597 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
598 if(!UCASE_HAS_EXCEPTION(props
)) {
599 return props
&UCASE_DOT_MASK
;
601 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
602 return (*pe
>>UCASE_EXC_DOT_SHIFT
)&UCASE_DOT_MASK
;
606 U_CAPI UBool U_EXPORT2
607 ucase_isSoftDotted(UChar32 c
) {
608 return (UBool
)(getDotType(c
)==UCASE_SOFT_DOTTED
);
611 U_CAPI UBool U_EXPORT2
612 ucase_isCaseSensitive(UChar32 c
) {
613 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
614 if(!UCASE_HAS_EXCEPTION(props
)) {
615 return (UBool
)((props
&UCASE_SENSITIVE
)!=0);
617 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
618 return (UBool
)((*pe
&UCASE_EXC_SENSITIVE
)!=0);
622 /* string casing ------------------------------------------------------------ */
625 * These internal functions form the core of string case mappings.
626 * They map single code points to result code points or strings and take
627 * all necessary conditions (context, locale ID, options) into account.
629 * They do not iterate over the source or write to the destination
630 * so that the same functions are useful for non-standard string storage,
631 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
632 * For the same reason, the "surrounding text" context is passed in as a
633 * UCaseContextIterator which does not make any assumptions about
634 * the underlying storage.
636 * This section contains helper functions that check for conditions
637 * in the input text surrounding the current code point
638 * according to SpecialCasing.txt.
640 * Each helper function gets the index
641 * - after the current code point if it looks at following text
642 * - before the current code point if it looks at preceding text
644 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
647 * C is preceded by a sequence consisting of
648 * a cased letter and a case-ignorable sequence,
649 * and C is not followed by a sequence consisting of
650 * an ignorable sequence and then a cased letter.
653 * C is followed by one or more characters of combining class 230 (ABOVE)
654 * in the combining character sequence.
657 * The last preceding character with combining class of zero before C
659 * and there is no intervening combining character class 230 (ABOVE).
662 * C is followed by combining dot above (U+0307).
663 * Any sequence of characters with a combining class that is neither 0 nor 230
664 * may intervene between the current character and the combining dot above.
666 * The erratum from 2002-10-31 adds the condition
669 * The last preceding base character was an uppercase I, and there is no
670 * intervening combining character class 230 (ABOVE).
672 * (See Jitterbug 2344 and the comments on After_I below.)
674 * Helper definitions in Unicode 3.2 UAX 21:
676 * D1. A character C is defined to be cased
677 * if it meets any of the following criteria:
679 * - The general category of C is Titlecase Letter (Lt)
680 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
681 * - Given D = NFD(C), then it is not the case that:
682 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
683 * (This third criterion does not add any characters to the list
684 * for Unicode 3.2. Ignored.)
686 * D2. A character C is defined to be case-ignorable
687 * if it meets either of the following criteria:
689 * - The general category of C is
690 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
691 * Letter Modifier (Lm), or Symbol Modifier (Sk)
692 * - C is one of the following characters
694 * U+00AD SOFT HYPHEN (SHY)
695 * U+2019 RIGHT SINGLE QUOTATION MARK
696 * (the preferred character for apostrophe)
698 * D3. A case-ignorable sequence is a sequence of
699 * zero or more case-ignorable characters.
/*
 * Character tests on a locale ID character.
 * Each is_<letter>() matches the lowercase or the uppercase form of that letter.
 */
#define is_either(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_d(c) is_either((c), 'd', 'D')
#define is_e(c) is_either((c), 'e', 'E')
#define is_i(c) is_either((c), 'i', 'I')
#define is_l(c) is_either((c), 'l', 'L')
#define is_r(c) is_either((c), 'r', 'R')
#define is_t(c) is_either((c), 't', 'T')
#define is_u(c) is_either((c), 'u', 'U')
#define is_z(c) is_either((c), 'z', 'Z')

/* Matches '_', '-' or the terminating NUL. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
715 * Requires non-NULL locale ID but otherwise does the equivalent of
716 * checking for language codes as if uloc_getLanguage() were called:
717 * Accepts both 2- and 3-letter codes and accepts case variants.
720 ucase_getCaseLocale(const char *locale
) {
722 * This function used to use uloc_getLanguage(), but the current code
723 * removes the dependency of this low-level code on uloc implementation code
724 * and is faster because not the whole locale ID has to be
725 * examined and copied/transformed.
727 * Because this code does not want to depend on uloc, the caller must
728 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
731 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
732 // and for Chinese "zh": Very common but no special case mapping behavior.
733 // Then check lowercase vs. uppercase to reduce the number of comparisons
734 // for other locales without special behavior.
744 return UCASE_LOC_GREEK
;
747 // en, es, ... -> root
749 return UCASE_LOC_ROOT
;
750 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
751 } else if(c
>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
752 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
753 } else if(c
<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
755 # error Unknown charset family!
767 return UCASE_LOC_TURKISH
;
779 return UCASE_LOC_TURKISH
;
791 return UCASE_LOC_LITHUANIAN
;
803 return UCASE_LOC_DUTCH
;
809 // Same code as for lowercase c but also check for 'E'.
819 return UCASE_LOC_TURKISH
;
831 return UCASE_LOC_TURKISH
;
843 return UCASE_LOC_LITHUANIAN
;
855 return UCASE_LOC_GREEK
;
867 return UCASE_LOC_DUTCH
;
872 return UCASE_LOC_ROOT
;
877 * {case-ignorable}* cased
879 * (dir determines looking forward/backward)
880 * If a character is case-ignorable, it is skipped regardless of whether
881 * it is also cased or not.
884 isFollowedByCasedLetter(UCaseContextIterator
*iter
, void *context
, int8_t dir
) {
891 for(/* dir!=0 sets direction */; (c
=iter(context
, dir
))>=0; dir
=0) {
892 int32_t type
=ucase_getTypeOrIgnorable(c
);
894 /* case-ignorable, continue with the loop */
895 } else if(type
!=UCASE_NONE
) {
896 return TRUE
; /* followed by cased letter */
898 return FALSE
; /* uncased and not case-ignorable */
902 return FALSE
; /* not followed by cased letter */
905 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
907 isPrecededBySoftDotted(UCaseContextIterator
*iter
, void *context
) {
916 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
917 dotType
=getDotType(c
);
918 if(dotType
==UCASE_SOFT_DOTTED
) {
919 return TRUE
; /* preceded by TYPE_i */
920 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
921 return FALSE
; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
925 return FALSE
; /* not preceded by TYPE_i */
929 * See Jitterbug 2344:
930 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
931 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
932 * we made those releases compatible with Unicode 3.2 which had not fixed
933 * a related bug in SpecialCasing.txt.
935 * From the Jitterbug 2344 text:
936 * ... this bug is listed as a Unicode erratum
937 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
939 * There are two errors in SpecialCasing.txt.
940 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
941 * 2. An incorrect context definition. Correct as follows:
942 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
943 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
945 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
946 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
947 * where the context After_I is defined as:
948 * The last preceding base character was an uppercase I, and there is no
949 * intervening combining character class 230 (ABOVE).
952 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
954 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
955 * # This matches the behavior of the canonically equivalent I-dot_above
957 * See also the description in this place in older versions of uchar.c (revision 1.100).
959 * Markus W. Scherer 2003-feb-15
962 /* Is preceded by base character 'I' with no intervening cc=230 ? */
964 isPrecededBy_I(UCaseContextIterator
*iter
, void *context
) {
973 for(dir
=-1; (c
=iter(context
, dir
))>=0; dir
=0) {
975 return TRUE
; /* preceded by I */
977 dotType
=getDotType(c
);
978 if(dotType
!=UCASE_OTHER_ACCENT
) {
979 return FALSE
; /* preceded by different base character (not I), or intervening cc==230 */
983 return FALSE
; /* not preceded by I */
986 /* Is followed by one or more cc==230 ? */
988 isFollowedByMoreAbove(UCaseContextIterator
*iter
, void *context
) {
997 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
998 dotType
=getDotType(c
);
999 if(dotType
==UCASE_ABOVE
) {
1000 return TRUE
; /* at least one cc==230 following */
1001 } else if(dotType
!=UCASE_OTHER_ACCENT
) {
1002 return FALSE
; /* next base character, no more cc==230 following */
1006 return FALSE
; /* no more cc==230 following */
1009 /* Is followed by a dot above (without cc==230 in between) ? */
1011 isFollowedByDotAbove(UCaseContextIterator
*iter
, void *context
) {
1020 for(dir
=1; (c
=iter(context
, dir
))>=0; dir
=0) {
1024 dotType
=getDotType(c
);
1025 if(dotType
!=UCASE_OTHER_ACCENT
) {
1026 return FALSE
; /* next base character or cc==230 in between */
1030 return FALSE
; /* no dot above following */
1033 U_CAPI
int32_t U_EXPORT2
1034 ucase_toFullLower(UChar32 c
,
1035 UCaseContextIterator
*iter
, void *context
,
1036 const UChar
**pString
,
1038 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1041 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
1042 if(!UCASE_HAS_EXCEPTION(props
)) {
1043 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
1044 result
=c
+UCASE_GET_DELTA(props
);
1047 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
), *pe2
;
1048 uint16_t excWord
=*pe
++;
1053 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1054 /* use hardcoded conditions and mappings */
1057 * Test for conditional mappings first
1058 * (otherwise the unconditional default mappings are always taken),
1059 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1060 * then get the UnicodeData.txt mappings.
1062 if( loc
==UCASE_LOC_LITHUANIAN
&&
1063 /* base characters, find accents above */
1064 (((c
==0x49 || c
==0x4a || c
==0x12e) &&
1065 isFollowedByMoreAbove(iter
, context
)) ||
1066 /* precomposed with accent above, no need to find one */
1067 (c
==0xcc || c
==0xcd || c
==0x128))
1072 # Lithuanian retains the dot in a lowercase i when followed by accents.
1074 # Introduce an explicit dot above when lowercasing capital I's and J's
1075 # whenever there are more accents above.
1076 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1078 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1079 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1080 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1081 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1082 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1083 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1086 case 0x49: /* LATIN CAPITAL LETTER I */
1089 case 0x4a: /* LATIN CAPITAL LETTER J */
1092 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1093 *pString
=iOgonekDot
;
1095 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1098 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1101 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1105 return 0; /* will not occur */
1107 /* # Turkish and Azeri */
1108 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x130) {
1110 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1111 # The following rules handle those cases.
1113 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1114 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1117 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x307 && isPrecededBy_I(iter
, context
)) {
1119 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1120 # This matches the behavior of the canonically equivalent I-dot_above
1122 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1123 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1126 return 0; /* remove the dot (continue without output) */
1127 } else if(loc
==UCASE_LOC_TURKISH
&& c
==0x49 && !isFollowedByDotAbove(iter
, context
)) {
1129 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1131 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1132 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1135 } else if(c
==0x130) {
1137 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1139 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1143 } else if( c
==0x3a3 &&
1144 !isFollowedByCasedLetter(iter
, context
, 1) &&
1145 isFollowedByCasedLetter(iter
, context
, -1) /* -1=preceded */
1147 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1149 # Special case for final form of sigma
1151 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1153 return 0x3c2; /* greek small final sigma */
1155 /* no known conditional special case mapping, use a normal mapping */
1157 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1158 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1159 full
&=UCASE_FULL_LOWER
;
1161 /* set the output pointer to the lowercase mapping */
1162 *pString
=reinterpret_cast<const UChar
*>(pe
+1);
1164 /* return the string length */
1169 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
1171 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe2
, delta
);
1172 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
1174 if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1175 GET_SLOT_VALUE(excWord
, UCASE_EXC_LOWER
, pe2
, result
);
1179 return (result
==c
) ? ~result
: result
;
/*
 * Shared worker for ucase_toFullUpper() and ucase_toFullTitle(), which call it
 * with upperNotTitle set to TRUE and FALSE respectively.
 *
 * c             code point to map
 * iter, context handed to isPrecededBySoftDotted() for the context-sensitive rule
 * pString       receives a pointer to the mapping string when a full mapping is used
 * loc           case locale; UCASE_LOC_TURKISH and UCASE_LOC_LITHUANIAN select the
 *               hardcoded conditional mappings below
 * upperNotTitle when false, the titlecase slot is preferred over the uppercase slot
 *
 * Results visible in this function:
 * - a string length, with *pString set, for a full (string) mapping
 * - 0 when the Lithuanian rule removes U+0307
 * - c plus or minus the delta slot value, or the value of the title/upper slot
 * - ~result when the computed result equals c
 */
1184 toUpperOrTitle(UChar32 c
,
1185 UCaseContextIterator
*iter
, void *context
,
1186 const UChar
**pString
,
1188 UBool upperNotTitle
) {
1189 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1192 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
/* No exception word: only lowercase letters change, by the delta stored in props. */
1193 if(!UCASE_HAS_EXCEPTION(props
)) {
1194 if(UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
1195 result
=c
+UCASE_GET_DELTA(props
);
/* Exception path: the first 16-bit unit is the exception word, pe then points past it. */
1198 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
), *pe2
;
1199 uint16_t excWord
=*pe
++;
1204 if(excWord
&UCASE_EXC_CONDITIONAL_SPECIAL
) {
1205 /* use hardcoded conditions and mappings */
1206 if(loc
==UCASE_LOC_TURKISH
&& c
==0x69) {
1210 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1211 # The following rules handle those cases.
1213 # When uppercasing, i turns into a dotted capital I
1215 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1216 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1219 } else if(loc
==UCASE_LOC_LITHUANIAN
&& c
==0x307 && isPrecededBySoftDotted(iter
, context
)) {
1223 # Lithuanian retains the dot in a lowercase i when followed by accents.
1225 # Remove DOT ABOVE after "i" with upper or titlecase
1227 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1230 return 0; /* remove the dot (continue without output) */
1232 /* no known conditional special case mapping, use a normal mapping */
1234 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1235 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1237 /* start of full case mapping strings */
1240 /* skip the lowercase and case-folding result strings */
1241 pe
+=full
&UCASE_FULL_LOWER
;
1249 /* skip the uppercase result string */
1255 /* set the output pointer to the result string */
1256 *pString
=reinterpret_cast<const UChar
*>(pe
);
1258 /* return the string length */
/* Simple mapping via the delta slot: applied to lowercase letters only; */
/* UCASE_EXC_DELTA_IS_NEGATIVE selects subtraction instead of addition. */
1263 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_GET_TYPE(props
)==UCASE_LOWER
) {
1265 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe2
, delta
);
1266 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
/* Slot choice: the title slot is used only for titlecasing and only if present; */
/* otherwise the upper slot serves both uppercase and titlecase. */
1268 if(!upperNotTitle
&& HAS_SLOT(excWord
, UCASE_EXC_TITLE
)) {
1269 idx
=UCASE_EXC_TITLE
;
1270 } else if(HAS_SLOT(excWord
, UCASE_EXC_UPPER
)) {
1271 /* here, titlecase is same as uppercase */
1272 idx
=UCASE_EXC_UPPER
;
1276 GET_SLOT_VALUE(excWord
, idx
, pe2
, result
);
/* An unchanged code point is reported as its bitwise complement (a negative value). */
1279 return (result
==c
) ? ~result
: result
;
1282 U_CAPI
int32_t U_EXPORT2
1283 ucase_toFullUpper(UChar32 c
,
1284 UCaseContextIterator
*iter
, void *context
,
1285 const UChar
**pString
,
1286 int32_t caseLocale
) {
1287 return toUpperOrTitle(c
, iter
, context
, pString
, caseLocale
, TRUE
);
1290 U_CAPI
int32_t U_EXPORT2
1291 ucase_toFullTitle(UChar32 c
,
1292 UCaseContextIterator
*iter
, void *context
,
1293 const UChar
**pString
,
1294 int32_t caseLocale
) {
1295 return toUpperOrTitle(c
, iter
, context
, pString
, caseLocale
, FALSE
);
1298 /* case folding ------------------------------------------------------------- */
1301 * Case folding is similar to lowercasing.
1302 * The result may be a simple mapping, i.e., a single code point, or
1303 * a full mapping, i.e., a string.
1304 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1305 * then only the lowercase mapping is stored.
1307 * Some special cases are hardcoded because their conditions cannot be
1308 * parsed and processed from CaseFolding.txt.
1310 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1312 # C: common case folding, common mappings shared by both simple and full mappings.
1313 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1314 # S: simple case folding, mappings to single characters where different from F.
1315 # T: special case for uppercase I and dotted uppercase I
1316 # - For non-Turkic languages, this mapping is normally not used.
1317 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1320 # A. To do a simple case folding, use the mappings with status C + S.
1321 # B. To do a full case folding, use the mappings with status C + F.
1323 # The mappings with status T can be used or omitted depending on the desired case-folding
1324 # behavior. (The default option is to exclude them.)
1326 * Unicode 3.2 has 'T' mappings as follows:
1328 0049; T; 0131; # LATIN CAPITAL LETTER I
1329 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1331 * while the default mappings for these code points are:
1333 0049; C; 0069; # LATIN CAPITAL LETTER I
1334 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1336 * U+0130 has no simple case folding (simple-case-folds to itself).
1339 /* return the simple case folding mapping for c */
/*
 * Simple (one code point to one code point) case folding.
 *
 * c        code point to fold
 * options  masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects the
 *          default mappings, any other masked value selects the Turkic mappings
 *          for the hardcoded special cases (U+0049 and U+0130, per the comments below)
 *
 * Without an exception word, an uppercase or titlecase code point is folded by
 * adding the delta stored in its properties. With an exception word, the
 * hardcoded conditional folds are tried first, then the delta slot, then the
 * fold slot, then the lowercase slot.
 */
1340 U_CAPI UChar32 U_EXPORT2
1341 ucase_fold(UChar32 c
, uint32_t options
) {
1342 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
1343 if(!UCASE_HAS_EXCEPTION(props
)) {
1344 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
1345 c
+=UCASE_GET_DELTA(props
);
/* Exception path: the first 16-bit unit is the exception word, pe then points past it. */
1348 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
);
1349 uint16_t excWord
=*pe
++;
1351 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1352 /* special case folding mappings, hardcoded */
1353 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1354 /* default mappings */
1356 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1358 } else if(c
==0x130) {
1359 /* no simple case folding for U+0130 */
1363 /* Turkic mappings */
1365 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1367 } else if(c
==0x130) {
1368 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1373 if((excWord
&UCASE_EXC_NO_SIMPLE_CASE_FOLDING
)!=0) {
/* Delta slot: applied to uppercase and titlecase letters only; */
/* UCASE_EXC_DELTA_IS_NEGATIVE selects subtraction instead of addition. */
1376 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
1378 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe
, delta
);
1379 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
/* The fold slot is checked before the lowercase slot. */
1381 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1383 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1384 idx
=UCASE_EXC_LOWER
;
/* The value of the chosen slot replaces c. */
1388 GET_SLOT_VALUE(excWord
, idx
, pe
, c
);
1394 * Issue for canonical caseless match (UAX #21):
1395 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1396 * canonical equivalence, unlike default-option casefolding.
1397 * For example, I-grave and I + grave fold to strings that are not canonically
1399 * For more details, see the comment in unorm_compare() in unorm.cpp
1400 * and the intermediate prototype changes for Jitterbug 2021.
1401 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1403 * This did not get fixed because it appears that it is not possible to fix
1404 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1405 * together in a way that they still fold to common result strings.
/*
 * Full case folding: the result may be a single code point or a string.
 *
 * c        code point to fold
 * pString  receives a pointer to the folding string when a full mapping is used
 * options  masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects the
 *          default mappings, any other masked value selects the Turkic mappings
 *          for the hardcoded special cases (U+0049 and U+0130, per the comments below)
 *
 * Results visible in this function:
 * - a string length, with *pString set, for a full (string) folding
 * - c plus or minus the delta slot value, or the value of the fold/lower slot
 * - ~result when the computed result equals c
 */
1408 U_CAPI
int32_t U_EXPORT2
1409 ucase_toFullFolding(UChar32 c
,
1410 const UChar
**pString
,
1412 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1415 uint16_t props
=UTRIE2_GET16(&ucase_props_singleton
.trie
, c
);
/* No exception word: uppercase and titlecase letters fold by the delta stored in props. */
1416 if(!UCASE_HAS_EXCEPTION(props
)) {
1417 if(UCASE_IS_UPPER_OR_TITLE(props
)) {
1418 result
=c
+UCASE_GET_DELTA(props
);
/* Exception path: the first 16-bit unit is the exception word, pe then points past it. */
1421 const uint16_t *pe
=GET_EXCEPTIONS(&ucase_props_singleton
, props
), *pe2
;
1422 uint16_t excWord
=*pe
++;
1427 if(excWord
&UCASE_EXC_CONDITIONAL_FOLD
) {
1428 /* use hardcoded conditions and mappings */
1429 if((options
&_FOLD_CASE_OPTIONS_MASK
)==U_FOLD_CASE_DEFAULT
) {
1430 /* default mappings */
1432 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1434 } else if(c
==0x130) {
1435 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1440 /* Turkic mappings */
1442 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1444 } else if(c
==0x130) {
1445 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1449 } else if(HAS_SLOT(excWord
, UCASE_EXC_FULL_MAPPINGS
)) {
1450 GET_SLOT_VALUE(excWord
, UCASE_EXC_FULL_MAPPINGS
, pe
, full
);
1452 /* start of full case mapping strings */
1455 /* skip the lowercase result string */
1456 pe
+=full
&UCASE_FULL_LOWER
;
1460 /* set the output pointer to the result string */
1461 *pString
=reinterpret_cast<const UChar
*>(pe
);
1463 /* return the string length */
1468 if((excWord
&UCASE_EXC_NO_SIMPLE_CASE_FOLDING
)!=0) {
/* Delta slot: applied to uppercase and titlecase letters only; */
/* UCASE_EXC_DELTA_IS_NEGATIVE selects subtraction instead of addition. */
1471 if(HAS_SLOT(excWord
, UCASE_EXC_DELTA
) && UCASE_IS_UPPER_OR_TITLE(props
)) {
1473 GET_SLOT_VALUE(excWord
, UCASE_EXC_DELTA
, pe2
, delta
);
1474 return (excWord
&UCASE_EXC_DELTA_IS_NEGATIVE
)==0 ? c
+delta
: c
-delta
;
/* The fold slot is checked before the lowercase slot. */
1476 if(HAS_SLOT(excWord
, UCASE_EXC_FOLD
)) {
1478 } else if(HAS_SLOT(excWord
, UCASE_EXC_LOWER
)) {
1479 idx
=UCASE_EXC_LOWER
;
1483 GET_SLOT_VALUE(excWord
, idx
, pe2
, result
);
/* An unchanged code point is reported as its bitwise complement (a negative value). */
1486 return (result
==c
) ? ~result
: result
;
1489 /* case mapping properties API ---------------------------------------------- */
1491 /* public API (see uchar.h) */
1493 U_CAPI UBool U_EXPORT2
1494 u_isULowercase(UChar32 c
) {
1495 return (UBool
)(UCASE_LOWER
==ucase_getType(c
));
1498 U_CAPI UBool U_EXPORT2
1499 u_isUUppercase(UChar32 c
) {
1500 return (UBool
)(UCASE_UPPER
==ucase_getType(c
));
1503 /* Transforms the Unicode character to its lower case equivalent.*/
1504 U_CAPI UChar32 U_EXPORT2
1505 u_tolower(UChar32 c
) {
1506 return ucase_tolower(c
);
1509 /* Transforms the Unicode character to its upper case equivalent.*/
1510 U_CAPI UChar32 U_EXPORT2
1511 u_toupper(UChar32 c
) {
1512 return ucase_toupper(c
);
1515 /* Transforms the Unicode character to its title case equivalent.*/
1516 U_CAPI UChar32 U_EXPORT2
1517 u_totitle(UChar32 c
) {
1518 return ucase_totitle(c
);
1521 /* return the simple case folding mapping for c */
1522 U_CAPI UChar32 U_EXPORT2
1523 u_foldCase(UChar32 c
, uint32_t options
) {
1524 return ucase_fold(c
, options
);
1527 U_CFUNC
int32_t U_EXPORT2
1528 ucase_hasBinaryProperty(UChar32 c
, UProperty which
) {
1529 /* case mapping properties */
1530 const UChar
*resultString
;
1532 case UCHAR_LOWERCASE
:
1533 return (UBool
)(UCASE_LOWER
==ucase_getType(c
));
1534 case UCHAR_UPPERCASE
:
1535 return (UBool
)(UCASE_UPPER
==ucase_getType(c
));
1536 case UCHAR_SOFT_DOTTED
:
1537 return ucase_isSoftDotted(c
);
1538 case UCHAR_CASE_SENSITIVE
:
1539 return ucase_isCaseSensitive(c
);
1541 return (UBool
)(UCASE_NONE
!=ucase_getType(c
));
1542 case UCHAR_CASE_IGNORABLE
:
1543 return (UBool
)(ucase_getTypeOrIgnorable(c
)>>2);
1545 * Note: The following Changes_When_Xyz are defined as testing whether
1546 * the NFD form of the input changes when Xyz-case-mapped.
1547 * However, this simpler implementation of these properties,
1548 * ignoring NFD, passes the tests.
1549 * The implementation needs to be changed if the tests start failing.
1550 * When that happens, optimizations should be used to work with the
1551 * per-single-code point ucase_toFullXyz() functions unless
1552 * the NFD form has more than one code point,
1553 * and the property starts set needs to be the union of the
1554 * start sets for normalization and case mappings.
1556 case UCHAR_CHANGES_WHEN_LOWERCASED
:
1557 return (UBool
)(ucase_toFullLower(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);
1558 case UCHAR_CHANGES_WHEN_UPPERCASED
:
1559 return (UBool
)(ucase_toFullUpper(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);
1560 case UCHAR_CHANGES_WHEN_TITLECASED
:
1561 return (UBool
)(ucase_toFullTitle(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);
1562 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1563 case UCHAR_CHANGES_WHEN_CASEMAPPED
:
1565 ucase_toFullLower(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0 ||
1566 ucase_toFullUpper(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0 ||
1567 ucase_toFullTitle(c
, NULL
, NULL
, &resultString
, UCASE_LOC_ROOT
)>=0);