]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucase.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucase.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
374ca955
A
3/*
4*******************************************************************************
5*
b331163b 6* Copyright (C) 2004-2014, International Business Machines
374ca955
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
4388f060 10* file name: ucase.cpp
f3c0d7a5 11* encoding: UTF-8
374ca955
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug30
16* created by: Markus W. Scherer
17*
18* Low-level Unicode character/string case mapping code.
19* Much code moved here (and modified) from uchar.c.
20*/
21
22#include "unicode/utypes.h"
4388f060 23#include "unicode/unistr.h"
374ca955
A
24#include "unicode/uset.h"
25#include "unicode/udata.h" /* UDataInfo */
4388f060 26#include "unicode/utf16.h"
374ca955
A
27#include "ucmndata.h" /* DataHeader */
28#include "udatamem.h"
29#include "umutex.h"
30#include "uassert.h"
31#include "cmemory.h"
729e4ab9 32#include "utrie2.h"
374ca955 33#include "ucase.h"
374ca955
A
34
35struct UCaseProps {
36 UDataMemory *mem;
37 const int32_t *indexes;
38 const uint16_t *exceptions;
4388f060 39 const uint16_t *unfold;
374ca955 40
729e4ab9 41 UTrie2 trie;
374ca955
A
42 uint8_t formatVersion[4];
43};
44
4388f060
A
45/* ucase_props_data.h is machine-generated by gencase --csource */
46#define INCLUDED_FROM_UCASE_CPP
47#include "ucase_props_data.h"
73c04bcf 48
374ca955
A
49/* set of property starts for UnicodeSet ------------------------------------ */
50
51static UBool U_CALLCONV
4388f060 52_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
374ca955 53 /* add the start code point to the USet */
73c04bcf 54 const USetAdder *sa=(const USetAdder *)context;
374ca955
A
55 sa->add(sa->set, start);
56 return TRUE;
57}
58
46f4442e 59U_CFUNC void U_EXPORT2
f3c0d7a5 60ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
374ca955
A
61 if(U_FAILURE(*pErrorCode)) {
62 return;
63 }
64
65 /* add the start code point of each same-value range of the trie */
f3c0d7a5 66 utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
374ca955
A
67
68 /* add code points with hardcoded properties, plus the ones following them */
69
70 /* (none right now, see comment below) */
71
72 /*
73 * Omit code points with hardcoded specialcasing properties
74 * because we do not build property UnicodeSets for them right now.
75 */
76}
77
78/* data access primitives --------------------------------------------------- */
79
374ca955
A
/* pointer to the exceptions word that the (exception-type) properties word refers to */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])

/* non-zero if the properties word points into the exceptions data instead of carrying a delta */
#define PROPS_HAS_EXCEPTION(props) (UCASE_EXCEPTION&(props))
83
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 * The table is generated by macros instead of being spelled out:
 * each level appends two more bits, adding 0, 1, 1, 2 to the count.
 */
#define FLAGS_OFFSET_B2(n) (n), (n)+1, (n)+1, (n)+2
#define FLAGS_OFFSET_B4(n) FLAGS_OFFSET_B2(n), FLAGS_OFFSET_B2((n)+1), FLAGS_OFFSET_B2((n)+1), FLAGS_OFFSET_B2((n)+2)
#define FLAGS_OFFSET_B6(n) FLAGS_OFFSET_B4(n), FLAGS_OFFSET_B4((n)+1), FLAGS_OFFSET_B4((n)+1), FLAGS_OFFSET_B4((n)+2)

static const uint8_t flagsOffset[256]={
    FLAGS_OFFSET_B6(0), FLAGS_OFFSET_B6(1), FLAGS_OFFSET_B6(1), FLAGS_OFFSET_B6(2)
};

#undef FLAGS_OFFSET_B6
#undef FLAGS_OFFSET_B4
#undef FLAGS_OFFSET_B2
103
729e4ab9
A
/* non-zero if bit idx is set in flags, i.e. the optional slot idx is present */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* number of slots present below slot idx: the count of set flag bits below bit idx */
#define SLOT_OFFSET(flags, idx) (flagsOffset[(flags)&((1<<(idx))-1)])

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Expands to a single statement (do { } while(0)) so that it can be used
 * safely as the body of an unbraced if/else; follow it with a semicolon.
 * The macro arguments are evaluated more than once.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            /* single-width slots: one uint16_t per slot */ \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            /* double-width slots: high half first, then low half */ \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            /* shift as unsigned so that a high half >=0x8000 does not overflow a signed value */ \
            (value)=((uint32_t)(value)<<16)|*(pExc16); \
        } \
    } while(0)
125
126/* simple case mappings ----------------------------------------------------- */
127
128U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
129ucase_tolower(UChar32 c) {
130 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955 131 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
132 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
133 c+=UCASE_GET_DELTA(props);
374ca955
A
134 }
135 } else {
f3c0d7a5 136 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955
A
137 uint16_t excWord=*pe++;
138 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
139 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
140 }
141 }
142 return c;
143}
144
145U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
146ucase_toupper(UChar32 c) {
147 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955 148 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
149 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
150 c+=UCASE_GET_DELTA(props);
374ca955
A
151 }
152 } else {
f3c0d7a5 153 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955
A
154 uint16_t excWord=*pe++;
155 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
156 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
157 }
158 }
159 return c;
160}
161
162U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
163ucase_totitle(UChar32 c) {
164 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955 165 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
166 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
167 c+=UCASE_GET_DELTA(props);
374ca955
A
168 }
169 } else {
f3c0d7a5 170 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955 171 uint16_t excWord=*pe++;
729e4ab9 172 int32_t idx;
374ca955 173 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 174 idx=UCASE_EXC_TITLE;
374ca955 175 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
729e4ab9 176 idx=UCASE_EXC_UPPER;
374ca955
A
177 } else {
178 return c;
179 }
729e4ab9 180 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
181 }
182 return c;
183}
184
46f4442e
A
185static const UChar iDot[2] = { 0x69, 0x307 };
186static const UChar jDot[2] = { 0x6a, 0x307 };
187static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
188static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
189static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
190static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
191
192
193U_CFUNC void U_EXPORT2
f3c0d7a5 194ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
73c04bcf
A
195 uint16_t props;
196
197 /*
198 * Hardcode the case closure of i and its relatives and ignore the
199 * data file data for these characters.
200 * The Turkic dotless i and dotted I with their case mapping conditions
201 * and case folding option make the related characters behave specially.
202 * This code matches their closure behavior to their case folding behavior.
203 */
73c04bcf
A
204
205 switch(c) {
206 case 0x49:
207 /* regular i and I are in one equivalence class */
208 sa->add(sa->set, 0x69);
209 return;
210 case 0x69:
211 sa->add(sa->set, 0x49);
212 return;
213 case 0x130:
214 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
215 sa->addString(sa->set, iDot, 2);
216 return;
217 case 0x131:
218 /* dotless i is in a class by itself */
219 return;
220 default:
221 /* otherwise use the data file data */
222 break;
223 }
224
f3c0d7a5 225 props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
73c04bcf
A
226 if(!PROPS_HAS_EXCEPTION(props)) {
227 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
228 /* add the one simple case mapping, no matter what type it is */
229 int32_t delta=UCASE_GET_DELTA(props);
230 if(delta!=0) {
231 sa->add(sa->set, c+delta);
232 }
233 }
234 } else {
235 /*
236 * c has exceptions, so there may be multiple simple and/or
237 * full case mappings. Add them all.
238 */
f3c0d7a5 239 const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
73c04bcf
A
240 const UChar *closure;
241 uint16_t excWord=*pe++;
729e4ab9 242 int32_t idx, closureLength, fullLength, length;
73c04bcf
A
243
244 pe0=pe;
245
246 /* add all simple case mappings */
729e4ab9
A
247 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
248 if(HAS_SLOT(excWord, idx)) {
73c04bcf 249 pe=pe0;
729e4ab9 250 GET_SLOT_VALUE(excWord, idx, pe, c);
73c04bcf
A
251 sa->add(sa->set, c);
252 }
253 }
254
255 /* get the closure string pointer & length */
256 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
257 pe=pe0;
258 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
259 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
260 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
261 } else {
262 closureLength=0;
263 closure=NULL;
264 }
265
266 /* add the full case folding */
267 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
268 pe=pe0;
269 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
270
271 /* start of full case mapping strings */
272 ++pe;
273
274 fullLength&=0xffff; /* bits 16 and higher are reserved */
275
276 /* skip the lowercase result string */
277 pe+=fullLength&UCASE_FULL_LOWER;
278 fullLength>>=4;
279
280 /* add the full case folding string */
281 length=fullLength&0xf;
282 if(length!=0) {
283 sa->addString(sa->set, (const UChar *)pe, length);
284 pe+=length;
285 }
286
287 /* skip the uppercase and titlecase strings */
288 fullLength>>=4;
289 pe+=fullLength&0xf;
290 fullLength>>=4;
291 pe+=fullLength;
292
293 closure=(const UChar *)pe; /* behind full case mappings */
294 }
295
296 /* add each code point in the closure string */
729e4ab9
A
297 for(idx=0; idx<closureLength;) {
298 U16_NEXT_UNSAFE(closure, idx, c);
73c04bcf
A
299 sa->add(sa->set, c);
300 }
301 }
302}
303
304/*
305 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
306 * must be length>0 and max>0 and length<=max
307 */
4388f060 308static inline int32_t
73c04bcf
A
309strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
310 int32_t c1, c2;
311
312 max-=length; /* we require length<=max, so no need to decrement max in the loop */
313 do {
314 c1=*s++;
315 c2=*t++;
316 if(c2==0) {
317 return 1; /* reached the end of t but not of s */
318 }
319 c1-=c2;
320 if(c1!=0) {
321 return c1; /* return difference result */
322 }
323 } while(--length>0);
324 /* ends with length==0 */
325
326 if(max==0 || *t==0) {
327 return 0; /* equal to length of both strings */
328 } else {
329 return -max; /* return lengh difference */
330 }
331}
332
46f4442e 333U_CFUNC UBool U_EXPORT2
f3c0d7a5 334ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
73c04bcf
A
335 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
336
f3c0d7a5 337 if(ucase_props_singleton.unfold==NULL || s==NULL) {
73c04bcf
A
338 return FALSE; /* no reverse case folding data, or no string */
339 }
340 if(length<=1) {
341 /* the string is too short to find any match */
342 /*
343 * more precise would be:
344 * if(!u_strHasMoreChar32Than(s, length, 1))
345 * but this does not make much practical difference because
346 * a single supplementary code point would just not be found
347 */
348 return FALSE;
349 }
350
f3c0d7a5 351 const uint16_t *unfold=ucase_props_singleton.unfold;
73c04bcf
A
352 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
353 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
354 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
355 unfold+=unfoldRowWidth;
356
357 if(length>unfoldStringWidth) {
358 /* the string is too long to find any match */
359 return FALSE;
360 }
361
362 /* do a binary search for the string */
363 start=0;
364 limit=unfoldRows;
365 while(start<limit) {
366 i=(start+limit)/2;
4388f060 367 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
73c04bcf
A
368 result=strcmpMax(s, length, p, unfoldStringWidth);
369
370 if(result==0) {
371 /* found the string: add each code point, and its case closure */
372 UChar32 c;
373
374 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
375 U16_NEXT_UNSAFE(p, i, c);
376 sa->add(sa->set, c);
f3c0d7a5 377 ucase_addCaseClosure(c, sa);
73c04bcf
A
378 }
379 return TRUE;
380 } else if(result<0) {
381 limit=i;
382 } else /* result>0 */ {
383 start=i+1;
384 }
385 }
386
387 return FALSE; /* string not found */
388}
389
4388f060
A
390U_NAMESPACE_BEGIN
391
392FullCaseFoldingIterator::FullCaseFoldingIterator()
393 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
394 unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
395 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
396 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
397 currentRow(0),
398 rowCpIndex(unfoldStringWidth) {
399 unfold+=unfoldRowWidth;
400}
401
402UChar32
403FullCaseFoldingIterator::next(UnicodeString &full) {
404 // Advance past the last-delivered code point.
405 const UChar *p=unfold+(currentRow*unfoldRowWidth);
406 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
407 ++currentRow;
408 p+=unfoldRowWidth;
409 rowCpIndex=unfoldStringWidth;
410 }
411 if(currentRow>=unfoldRows) { return U_SENTINEL; }
412 // Set "full" to the NUL-terminated string in the first unfold column.
413 int32_t length=unfoldStringWidth;
414 while(length>0 && p[length-1]==0) { --length; }
415 full.setTo(FALSE, p, length);
416 // Return the code point.
417 UChar32 c;
418 U16_NEXT_UNSAFE(p, rowCpIndex, c);
419 return c;
420}
421
422U_NAMESPACE_END
423
374ca955
A
424/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
425U_CAPI int32_t U_EXPORT2
f3c0d7a5
A
426ucase_getType(UChar32 c) {
427 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
73c04bcf 428 return UCASE_GET_TYPE(props);
374ca955
A
429}
430
729e4ab9 431/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
374ca955 432U_CAPI int32_t U_EXPORT2
f3c0d7a5
A
433ucase_getTypeOrIgnorable(UChar32 c) {
434 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
4388f060 435 return UCASE_GET_TYPE_AND_IGNORABLE(props);
374ca955
A
436}
437
438/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
4388f060 439static inline int32_t
f3c0d7a5
A
440getDotType(UChar32 c) {
441 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955
A
442 if(!PROPS_HAS_EXCEPTION(props)) {
443 return props&UCASE_DOT_MASK;
444 } else {
f3c0d7a5 445 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955
A
446 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
447 }
448}
449
450U_CAPI UBool U_EXPORT2
f3c0d7a5
A
451ucase_isSoftDotted(UChar32 c) {
452 return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
374ca955
A
453}
454
455U_CAPI UBool U_EXPORT2
f3c0d7a5
A
456ucase_isCaseSensitive(UChar32 c) {
457 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955
A
458 return (UBool)((props&UCASE_SENSITIVE)!=0);
459}
460
374ca955
A
461/* string casing ------------------------------------------------------------ */
462
463/*
464 * These internal functions form the core of string case mappings.
465 * They map single code points to result code points or strings and take
466 * all necessary conditions (context, locale ID, options) into account.
467 *
468 * They do not iterate over the source or write to the destination
469 * so that the same functions are useful for non-standard string storage,
470 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
471 * For the same reason, the "surrounding text" context is passed in as a
472 * UCaseContextIterator which does not make any assumptions about
473 * the underlying storage.
474 *
475 * This section contains helper functions that check for conditions
476 * in the input text surrounding the current code point
477 * according to SpecialCasing.txt.
478 *
479 * Each helper function gets the index
480 * - after the current code point if it looks at following text
481 * - before the current code point if it looks at preceding text
482 *
483 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
484 *
485 * Final_Sigma
486 * C is preceded by a sequence consisting of
487 * a cased letter and a case-ignorable sequence,
488 * and C is not followed by a sequence consisting of
489 * an ignorable sequence and then a cased letter.
490 *
491 * More_Above
492 * C is followed by one or more characters of combining class 230 (ABOVE)
493 * in the combining character sequence.
494 *
495 * After_Soft_Dotted
496 * The last preceding character with combining class of zero before C
497 * was Soft_Dotted,
498 * and there is no intervening combining character class 230 (ABOVE).
499 *
500 * Before_Dot
501 * C is followed by combining dot above (U+0307).
502 * Any sequence of characters with a combining class that is neither 0 nor 230
503 * may intervene between the current character and the combining dot above.
504 *
505 * The erratum from 2002-10-31 adds the condition
506 *
507 * After_I
508 * The last preceding base character was an uppercase I, and there is no
509 * intervening combining character class 230 (ABOVE).
510 *
511 * (See Jitterbug 2344 and the comments on After_I below.)
512 *
513 * Helper definitions in Unicode 3.2 UAX 21:
514 *
515 * D1. A character C is defined to be cased
516 * if it meets any of the following criteria:
517 *
518 * - The general category of C is Titlecase Letter (Lt)
519 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
520 * - Given D = NFD(C), then it is not the case that:
521 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
522 * (This third criterion does not add any characters to the list
523 * for Unicode 3.2. Ignored.)
524 *
525 * D2. A character C is defined to be case-ignorable
526 * if it meets either of the following criteria:
527 *
528 * - The general category of C is
529 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
530 * Letter Modifier (Lm), or Symbol Modifier (Sk)
531 * - C is one of the following characters
532 * U+0027 APOSTROPHE
533 * U+00AD SOFT HYPHEN (SHY)
534 * U+2019 RIGHT SINGLE QUOTATION MARK
535 * (the preferred character for apostrophe)
536 *
537 * D3. A case-ignorable sequence is a sequence of
538 * zero or more case-ignorable characters.
539 */
540
/* is c one of the two given characters? (used for case-insensitive letter tests; evaluates c twice) */
#define IS_EITHER_CHAR(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_d(c) IS_EITHER_CHAR(c, 'd', 'D')
#define is_e(c) IS_EITHER_CHAR(c, 'e', 'E')
#define is_i(c) IS_EITHER_CHAR(c, 'i', 'I')
#define is_l(c) IS_EITHER_CHAR(c, 'l', 'L')
#define is_r(c) IS_EITHER_CHAR(c, 'r', 'R')
#define is_t(c) IS_EITHER_CHAR(c, 't', 'T')
#define is_u(c) IS_EITHER_CHAR(c, 'u', 'U')
#define is_z(c) IS_EITHER_CHAR(c, 'z', 'Z')

/* does c end the language subtag? (underscore, hyphen, or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
552
73c04bcf 553/**
374ca955
A
554 * Requires non-NULL locale ID but otherwise does the equivalent of
555 * checking for language codes as if uloc_getLanguage() were called:
556 * Accepts both 2- and 3-letter codes and accepts case variants.
557 */
73c04bcf 558U_CFUNC int32_t
f3c0d7a5 559ucase_getCaseLocale(const char *locale) {
374ca955
A
560 /*
561 * This function used to use uloc_getLanguage(), but the current code
562 * removes the dependency of this low-level code on uloc implementation code
563 * and is faster because not the whole locale ID has to be
564 * examined and copied/transformed.
565 *
566 * Because this code does not want to depend on uloc, the caller must
567 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
568 */
f3c0d7a5
A
569 char c=*locale++;
570 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
571 // and for Chinese "zh": Very common but no special case mapping behavior.
572 // Then check lowercase vs. uppercase to reduce the number of comparisons
573 // for other locales without special behavior.
574 if(c=='e') {
575 /* el or ell? */
374ca955 576 c=*locale++;
f3c0d7a5 577 if(is_l(c)) {
374ca955 578 c=*locale++;
f3c0d7a5
A
579 if(is_l(c)) {
580 c=*locale;
581 }
374ca955 582 if(is_sep(c)) {
f3c0d7a5 583 return UCASE_LOC_GREEK;
374ca955
A
584 }
585 }
f3c0d7a5
A
586 // en, es, ... -> root
587 } else if(c=='z') {
588 return UCASE_LOC_ROOT;
589#if U_CHARSET_FAMILY==U_ASCII_FAMILY
590 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
591#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
592 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
593#else
594# error Unknown charset family!
595#endif
596 // lowercase c
597 if(c=='t') {
598 /* tr or tur? */
374ca955 599 c=*locale++;
f3c0d7a5
A
600 if(is_u(c)) {
601 c=*locale++;
602 }
603 if(is_r(c)) {
374ca955 604 c=*locale;
f3c0d7a5
A
605 if(is_sep(c)) {
606 return UCASE_LOC_TURKISH;
607 }
374ca955 608 }
f3c0d7a5
A
609 } else if(c=='a') {
610 /* az or aze? */
611 c=*locale++;
612 if(is_z(c)) {
613 c=*locale++;
614 if(is_e(c)) {
615 c=*locale;
616 }
617 if(is_sep(c)) {
618 return UCASE_LOC_TURKISH;
619 }
374ca955 620 }
f3c0d7a5
A
621 } else if(c=='l') {
622 /* lt or lit? */
374ca955 623 c=*locale++;
f3c0d7a5
A
624 if(is_i(c)) {
625 c=*locale++;
626 }
627 if(is_t(c)) {
628 c=*locale;
629 if(is_sep(c)) {
630 return UCASE_LOC_LITHUANIAN;
631 }
632 }
633 } else if(c=='n') {
634 /* nl or nld? */
635 c=*locale++;
636 if(is_l(c)) {
637 c=*locale++;
638 if(is_d(c)) {
639 c=*locale;
640 }
641 if(is_sep(c)) {
642 return UCASE_LOC_DUTCH;
643 }
46f4442e
A
644 }
645 }
f3c0d7a5
A
646 } else {
647 // uppercase c
648 // Same code as for lowercase c but also check for 'E'.
649 if(c=='T') {
650 /* tr or tur? */
46f4442e 651 c=*locale++;
f3c0d7a5
A
652 if(is_u(c)) {
653 c=*locale++;
654 }
655 if(is_r(c)) {
46f4442e 656 c=*locale;
f3c0d7a5
A
657 if(is_sep(c)) {
658 return UCASE_LOC_TURKISH;
659 }
46f4442e 660 }
f3c0d7a5
A
661 } else if(c=='A') {
662 /* az or aze? */
663 c=*locale++;
664 if(is_z(c)) {
665 c=*locale++;
666 if(is_e(c)) {
667 c=*locale;
668 }
669 if(is_sep(c)) {
670 return UCASE_LOC_TURKISH;
671 }
672 }
673 } else if(c=='L') {
674 /* lt or lit? */
675 c=*locale++;
676 if(is_i(c)) {
677 c=*locale++;
678 }
679 if(is_t(c)) {
680 c=*locale;
681 if(is_sep(c)) {
682 return UCASE_LOC_LITHUANIAN;
683 }
684 }
685 } else if(c=='E') {
686 /* el or ell? */
687 c=*locale++;
688 if(is_l(c)) {
689 c=*locale++;
690 if(is_l(c)) {
691 c=*locale;
692 }
693 if(is_sep(c)) {
694 return UCASE_LOC_GREEK;
695 }
696 }
697 } else if(c=='N') {
698 /* nl or nld? */
699 c=*locale++;
700 if(is_l(c)) {
701 c=*locale++;
702 if(is_d(c)) {
703 c=*locale;
704 }
705 if(is_sep(c)) {
706 return UCASE_LOC_DUTCH;
707 }
374ca955
A
708 }
709 }
710 }
f3c0d7a5 711 return UCASE_LOC_ROOT;
374ca955
A
712}
713
729e4ab9
A
714/*
715 * Is followed by
716 * {case-ignorable}* cased
717 * ?
718 * (dir determines looking forward/backward)
719 * If a character is case-ignorable, it is skipped regardless of whether
720 * it is also cased or not.
721 */
374ca955 722static UBool
f3c0d7a5 723isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
374ca955 724 UChar32 c;
374ca955
A
725
726 if(iter==NULL) {
727 return FALSE;
728 }
729
730 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
f3c0d7a5 731 int32_t type=ucase_getTypeOrIgnorable(c);
729e4ab9 732 if(type&4) {
374ca955 733 /* case-ignorable, continue with the loop */
729e4ab9
A
734 } else if(type!=UCASE_NONE) {
735 return TRUE; /* followed by cased letter */
374ca955 736 } else {
729e4ab9 737 return FALSE; /* uncased and not case-ignorable */
374ca955
A
738 }
739 }
740
741 return FALSE; /* not followed by cased letter */
742}
743
744/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
745static UBool
f3c0d7a5 746isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
374ca955
A
747 UChar32 c;
748 int32_t dotType;
749 int8_t dir;
750
751 if(iter==NULL) {
752 return FALSE;
753 }
754
755 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
f3c0d7a5 756 dotType=getDotType(c);
374ca955
A
757 if(dotType==UCASE_SOFT_DOTTED) {
758 return TRUE; /* preceded by TYPE_i */
759 } else if(dotType!=UCASE_OTHER_ACCENT) {
760 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
761 }
762 }
763
764 return FALSE; /* not preceded by TYPE_i */
765}
766
767/*
768 * See Jitterbug 2344:
769 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
770 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
771 * we made those releases compatible with Unicode 3.2 which had not fixed
772 * a related bug in SpecialCasing.txt.
773 *
774 * From the Jitterbug 2344 text:
775 * ... this bug is listed as a Unicode erratum
776 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
777 * <quote>
778 * There are two errors in SpecialCasing.txt.
779 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
780 * 2. An incorrect context definition. Correct as follows:
781 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
782 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
783 * ---
784 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
785 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
786 * where the context After_I is defined as:
787 * The last preceding base character was an uppercase I, and there is no
788 * intervening combining character class 230 (ABOVE).
789 * </quote>
790 *
791 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
792 *
793 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
794 * # This matches the behavior of the canonically equivalent I-dot_above
795 *
796 * See also the description in this place in older versions of uchar.c (revision 1.100).
797 *
798 * Markus W. Scherer 2003-feb-15
799 */
800
801/* Is preceded by base character 'I' with no intervening cc=230 ? */
802static UBool
f3c0d7a5 803isPrecededBy_I(UCaseContextIterator *iter, void *context) {
374ca955
A
804 UChar32 c;
805 int32_t dotType;
806 int8_t dir;
807
808 if(iter==NULL) {
809 return FALSE;
810 }
811
812 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
813 if(c==0x49) {
814 return TRUE; /* preceded by I */
815 }
f3c0d7a5 816 dotType=getDotType(c);
374ca955
A
817 if(dotType!=UCASE_OTHER_ACCENT) {
818 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
819 }
820 }
821
822 return FALSE; /* not preceded by I */
823}
824
825/* Is followed by one or more cc==230 ? */
826static UBool
f3c0d7a5 827isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
374ca955
A
828 UChar32 c;
829 int32_t dotType;
830 int8_t dir;
831
832 if(iter==NULL) {
833 return FALSE;
834 }
835
836 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
f3c0d7a5 837 dotType=getDotType(c);
374ca955
A
838 if(dotType==UCASE_ABOVE) {
839 return TRUE; /* at least one cc==230 following */
840 } else if(dotType!=UCASE_OTHER_ACCENT) {
841 return FALSE; /* next base character, no more cc==230 following */
842 }
843 }
844
845 return FALSE; /* no more cc==230 following */
846}
847
848/* Is followed by a dot above (without cc==230 in between) ? */
849static UBool
f3c0d7a5 850isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
374ca955
A
851 UChar32 c;
852 int32_t dotType;
853 int8_t dir;
854
855 if(iter==NULL) {
856 return FALSE;
857 }
858
859 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
860 if(c==0x307) {
861 return TRUE;
862 }
f3c0d7a5 863 dotType=getDotType(c);
374ca955
A
864 if(dotType!=UCASE_OTHER_ACCENT) {
865 return FALSE; /* next base character or cc==230 in between */
866 }
867 }
868
869 return FALSE; /* no dot above following */
870}
871
872U_CAPI int32_t U_EXPORT2
f3c0d7a5 873ucase_toFullLower(UChar32 c,
374ca955
A
874 UCaseContextIterator *iter, void *context,
875 const UChar **pString,
f3c0d7a5
A
876 int32_t loc) {
877 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
878 U_ASSERT(c >= 0);
729e4ab9 879 UChar32 result=c;
f3c0d7a5 880 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955 881 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
882 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
883 result=c+UCASE_GET_DELTA(props);
374ca955
A
884 }
885 } else {
f3c0d7a5 886 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
374ca955
A
887 uint16_t excWord=*pe++;
888 int32_t full;
889
890 pe2=pe;
891
892 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
893 /* use hardcoded conditions and mappings */
374ca955
A
894
895 /*
896 * Test for conditional mappings first
897 * (otherwise the unconditional default mappings are always taken),
898 * then test for characters that have unconditional mappings in SpecialCasing.txt,
899 * then get the UnicodeData.txt mappings.
900 */
46f4442e 901 if( loc==UCASE_LOC_LITHUANIAN &&
374ca955
A
902 /* base characters, find accents above */
903 (((c==0x49 || c==0x4a || c==0x12e) &&
f3c0d7a5 904 isFollowedByMoreAbove(iter, context)) ||
374ca955
A
905 /* precomposed with accent above, no need to find one */
906 (c==0xcc || c==0xcd || c==0x128))
907 ) {
908 /*
909 # Lithuanian
910
911 # Lithuanian retains the dot in a lowercase i when followed by accents.
912
913 # Introduce an explicit dot above when lowercasing capital I's and J's
914 # whenever there are more accents above.
915 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
916
917 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
918 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
919 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
920 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
921 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
922 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
923 */
924 switch(c) {
925 case 0x49: /* LATIN CAPITAL LETTER I */
926 *pString=iDot;
927 return 2;
928 case 0x4a: /* LATIN CAPITAL LETTER J */
929 *pString=jDot;
930 return 2;
931 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
932 *pString=iOgonekDot;
933 return 2;
934 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
935 *pString=iDotGrave;
936 return 3;
937 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
938 *pString=iDotAcute;
939 return 3;
940 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
941 *pString=iDotTilde;
942 return 3;
943 default:
944 return 0; /* will not occur */
945 }
946 /* # Turkish and Azeri */
46f4442e 947 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
374ca955
A
948 /*
949 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
950 # The following rules handle those cases.
951
952 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
953 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
954 */
955 return 0x69;
f3c0d7a5 956 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
374ca955
A
957 /*
958 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
959 # This matches the behavior of the canonically equivalent I-dot_above
960
961 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
962 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
963 */
964 return 0; /* remove the dot (continue without output) */
f3c0d7a5 965 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
374ca955
A
966 /*
967 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
968
969 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
970 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
971 */
972 return 0x131;
973 } else if(c==0x130) {
974 /*
975 # Preserve canonical equivalence for I with dot. Turkic is handled below.
976
977 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
978 */
979 *pString=iDot;
980 return 2;
981 } else if( c==0x3a3 &&
f3c0d7a5
A
982 !isFollowedByCasedLetter(iter, context, 1) &&
983 isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
374ca955
A
984 ) {
985 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
986 /*
987 # Special case for final form of sigma
988
989 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
990 */
991 return 0x3c2; /* greek small final sigma */
992 } else {
993 /* no known conditional special case mapping, use a normal mapping */
994 }
995 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
996 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
997 full&=UCASE_FULL_LOWER;
998 if(full!=0) {
999 /* set the output pointer to the lowercase mapping */
4388f060 1000 *pString=reinterpret_cast<const UChar *>(pe+1);
374ca955
A
1001
1002 /* return the string length */
1003 return full;
1004 }
1005 }
1006
1007 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1008 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1009 }
1010 }
1011
1012 return (result==c) ? ~result : result;
1013}
1014
1015/* internal */
1016static int32_t
f3c0d7a5 1017toUpperOrTitle(UChar32 c,
374ca955
A
1018 UCaseContextIterator *iter, void *context,
1019 const UChar **pString,
f3c0d7a5 1020 int32_t loc,
374ca955 1021 UBool upperNotTitle) {
f3c0d7a5
A
1022 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1023 U_ASSERT(c >= 0);
729e4ab9 1024 UChar32 result=c;
f3c0d7a5 1025 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955 1026 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
1027 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1028 result=c+UCASE_GET_DELTA(props);
374ca955
A
1029 }
1030 } else {
f3c0d7a5 1031 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
374ca955 1032 uint16_t excWord=*pe++;
729e4ab9 1033 int32_t full, idx;
374ca955
A
1034
1035 pe2=pe;
1036
1037 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1038 /* use hardcoded conditions and mappings */
46f4442e 1039 if(loc==UCASE_LOC_TURKISH && c==0x69) {
374ca955
A
1040 /*
1041 # Turkish and Azeri
1042
1043 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1044 # The following rules handle those cases.
1045
1046 # When uppercasing, i turns into a dotted capital I
1047
1048 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1049 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1050 */
1051 return 0x130;
f3c0d7a5 1052 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
374ca955
A
1053 /*
1054 # Lithuanian
1055
1056 # Lithuanian retains the dot in a lowercase i when followed by accents.
1057
1058 # Remove DOT ABOVE after "i" with upper or titlecase
1059
1060 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1061 */
1062 return 0; /* remove the dot (continue without output) */
1063 } else {
1064 /* no known conditional special case mapping, use a normal mapping */
1065 }
1066 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1067 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1068
1069 /* start of full case mapping strings */
1070 ++pe;
1071
1072 /* skip the lowercase and case-folding result strings */
1073 pe+=full&UCASE_FULL_LOWER;
1074 full>>=4;
1075 pe+=full&0xf;
1076 full>>=4;
1077
1078 if(upperNotTitle) {
1079 full&=0xf;
1080 } else {
1081 /* skip the uppercase result string */
1082 pe+=full&0xf;
1083 full=(full>>4)&0xf;
1084 }
1085
1086 if(full!=0) {
1087 /* set the output pointer to the result string */
4388f060 1088 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1089
1090 /* return the string length */
1091 return full;
1092 }
1093 }
1094
1095 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 1096 idx=UCASE_EXC_TITLE;
374ca955
A
1097 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1098 /* here, titlecase is same as uppercase */
729e4ab9 1099 idx=UCASE_EXC_UPPER;
374ca955
A
1100 } else {
1101 return ~c;
1102 }
729e4ab9 1103 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1104 }
1105
1106 return (result==c) ? ~result : result;
1107}
1108
1109U_CAPI int32_t U_EXPORT2
f3c0d7a5 1110ucase_toFullUpper(UChar32 c,
374ca955
A
1111 UCaseContextIterator *iter, void *context,
1112 const UChar **pString,
f3c0d7a5
A
1113 int32_t caseLocale) {
1114 return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
374ca955
A
1115}
1116
1117U_CAPI int32_t U_EXPORT2
f3c0d7a5 1118ucase_toFullTitle(UChar32 c,
374ca955
A
1119 UCaseContextIterator *iter, void *context,
1120 const UChar **pString,
f3c0d7a5
A
1121 int32_t caseLocale) {
1122 return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
374ca955
A
1123}
1124
1125/* case folding ------------------------------------------------------------- */
1126
1127/*
1128 * Case folding is similar to lowercasing.
1129 * The result may be a simple mapping, i.e., a single code point, or
1130 * a full mapping, i.e., a string.
1131 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1132 * then only the lowercase mapping is stored.
1133 *
1134 * Some special cases are hardcoded because their conditions cannot be
1135 * parsed and processed from CaseFolding.txt.
1136 *
1137 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1138
1139# C: common case folding, common mappings shared by both simple and full mappings.
1140# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1141# S: simple case folding, mappings to single characters where different from F.
1142# T: special case for uppercase I and dotted uppercase I
1143# - For non-Turkic languages, this mapping is normally not used.
1144# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1145#
1146# Usage:
1147# A. To do a simple case folding, use the mappings with status C + S.
1148# B. To do a full case folding, use the mappings with status C + F.
1149#
1150# The mappings with status T can be used or omitted depending on the desired case-folding
1151# behavior. (The default option is to exclude them.)
1152
1153 * Unicode 3.2 has 'T' mappings as follows:
1154
0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1157
1158 * while the default mappings for these code points are:
1159
0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1162
73c04bcf 1163 * U+0130 has no simple case folding (simple-case-folds to itself).
374ca955
A
1164 */
1165
1166/* return the simple case folding mapping for c */
1167U_CAPI UChar32 U_EXPORT2
f3c0d7a5
A
1168ucase_fold(UChar32 c, uint32_t options) {
1169 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955 1170 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
1171 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1172 c+=UCASE_GET_DELTA(props);
374ca955
A
1173 }
1174 } else {
f3c0d7a5 1175 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
374ca955 1176 uint16_t excWord=*pe++;
729e4ab9 1177 int32_t idx;
374ca955
A
1178 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1179 /* special case folding mappings, hardcoded */
1180 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1181 /* default mappings */
1182 if(c==0x49) {
1183 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1184 return 0x69;
1185 } else if(c==0x130) {
73c04bcf
A
1186 /* no simple case folding for U+0130 */
1187 return c;
374ca955
A
1188 }
1189 } else {
1190 /* Turkic mappings */
1191 if(c==0x49) {
1192 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1193 return 0x131;
1194 } else if(c==0x130) {
1195 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1196 return 0x69;
1197 }
1198 }
1199 }
1200 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1201 idx=UCASE_EXC_FOLD;
374ca955 1202 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1203 idx=UCASE_EXC_LOWER;
374ca955
A
1204 } else {
1205 return c;
1206 }
729e4ab9 1207 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
1208 }
1209 return c;
1210}
1211
1212/*
1213 * Issue for canonical caseless match (UAX #21):
1214 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1215 * canonical equivalence, unlike default-option casefolding.
1216 * For example, I-grave and I + grave fold to strings that are not canonically
1217 * equivalent.
1218 * For more details, see the comment in unorm_compare() in unorm.cpp
1219 * and the intermediate prototype changes for Jitterbug 2021.
1220 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1221 *
1222 * This did not get fixed because it appears that it is not possible to fix
1223 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1224 * together in a way that they still fold to common result strings.
1225 */
1226
1227U_CAPI int32_t U_EXPORT2
f3c0d7a5 1228ucase_toFullFolding(UChar32 c,
374ca955 1229 const UChar **pString,
f3c0d7a5
A
1230 uint32_t options) {
1231 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1232 U_ASSERT(c >= 0);
729e4ab9 1233 UChar32 result=c;
f3c0d7a5 1234 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
374ca955 1235 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
1236 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1237 result=c+UCASE_GET_DELTA(props);
374ca955
A
1238 }
1239 } else {
f3c0d7a5 1240 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
374ca955 1241 uint16_t excWord=*pe++;
729e4ab9 1242 int32_t full, idx;
374ca955
A
1243
1244 pe2=pe;
1245
1246 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1247 /* use hardcoded conditions and mappings */
1248 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1249 /* default mappings */
1250 if(c==0x49) {
1251 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1252 return 0x69;
1253 } else if(c==0x130) {
1254 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1255 *pString=iDot;
1256 return 2;
1257 }
1258 } else {
1259 /* Turkic mappings */
1260 if(c==0x49) {
1261 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1262 return 0x131;
1263 } else if(c==0x130) {
1264 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1265 return 0x69;
1266 }
1267 }
1268 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1269 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1270
1271 /* start of full case mapping strings */
1272 ++pe;
1273
1274 /* skip the lowercase result string */
1275 pe+=full&UCASE_FULL_LOWER;
1276 full=(full>>4)&0xf;
1277
1278 if(full!=0) {
1279 /* set the output pointer to the result string */
4388f060 1280 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1281
1282 /* return the string length */
1283 return full;
1284 }
1285 }
1286
1287 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1288 idx=UCASE_EXC_FOLD;
374ca955 1289 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1290 idx=UCASE_EXC_LOWER;
374ca955
A
1291 } else {
1292 return ~c;
1293 }
729e4ab9 1294 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1295 }
1296
1297 return (result==c) ? ~result : result;
1298}
73c04bcf
A
1299
1300/* case mapping properties API ---------------------------------------------- */
1301
73c04bcf
A
1302/* public API (see uchar.h) */
1303
1304U_CAPI UBool U_EXPORT2
1305u_isULowercase(UChar32 c) {
f3c0d7a5 1306 return (UBool)(UCASE_LOWER==ucase_getType(c));
73c04bcf
A
1307}
1308
1309U_CAPI UBool U_EXPORT2
1310u_isUUppercase(UChar32 c) {
f3c0d7a5 1311 return (UBool)(UCASE_UPPER==ucase_getType(c));
73c04bcf
A
1312}
1313
1314/* Transforms the Unicode character to its lower case equivalent.*/
1315U_CAPI UChar32 U_EXPORT2
1316u_tolower(UChar32 c) {
f3c0d7a5 1317 return ucase_tolower(c);
73c04bcf
A
1318}
1319
1320/* Transforms the Unicode character to its upper case equivalent.*/
1321U_CAPI UChar32 U_EXPORT2
1322u_toupper(UChar32 c) {
f3c0d7a5 1323 return ucase_toupper(c);
73c04bcf
A
1324}
1325
1326/* Transforms the Unicode character to its title case equivalent.*/
1327U_CAPI UChar32 U_EXPORT2
1328u_totitle(UChar32 c) {
f3c0d7a5 1329 return ucase_totitle(c);
73c04bcf
A
1330}
1331
1332/* return the simple case folding mapping for c */
1333U_CAPI UChar32 U_EXPORT2
1334u_foldCase(UChar32 c, uint32_t options) {
f3c0d7a5 1335 return ucase_fold(c, options);
73c04bcf
A
1336}
1337
1338U_CFUNC int32_t U_EXPORT2
1339ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1340 /* case mapping properties */
729e4ab9 1341 const UChar *resultString;
73c04bcf
A
1342 switch(which) {
1343 case UCHAR_LOWERCASE:
f3c0d7a5 1344 return (UBool)(UCASE_LOWER==ucase_getType(c));
73c04bcf 1345 case UCHAR_UPPERCASE:
f3c0d7a5 1346 return (UBool)(UCASE_UPPER==ucase_getType(c));
73c04bcf 1347 case UCHAR_SOFT_DOTTED:
f3c0d7a5 1348 return ucase_isSoftDotted(c);
73c04bcf 1349 case UCHAR_CASE_SENSITIVE:
f3c0d7a5 1350 return ucase_isCaseSensitive(c);
729e4ab9 1351 case UCHAR_CASED:
f3c0d7a5 1352 return (UBool)(UCASE_NONE!=ucase_getType(c));
729e4ab9 1353 case UCHAR_CASE_IGNORABLE:
f3c0d7a5 1354 return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
729e4ab9
A
1355 /*
1356 * Note: The following Changes_When_Xyz are defined as testing whether
1357 * the NFD form of the input changes when Xyz-case-mapped.
1358 * However, this simpler implementation of these properties,
1359 * ignoring NFD, passes the tests.
1360 * The implementation needs to be changed if the tests start failing.
1361 * When that happens, optimizations should be used to work with the
1362 * per-single-code point ucase_toFullXyz() functions unless
1363 * the NFD form has more than one code point,
1364 * and the property starts set needs to be the union of the
1365 * start sets for normalization and case mappings.
1366 */
1367 case UCHAR_CHANGES_WHEN_LOWERCASED:
f3c0d7a5 1368 return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
729e4ab9 1369 case UCHAR_CHANGES_WHEN_UPPERCASED:
f3c0d7a5 1370 return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
729e4ab9 1371 case UCHAR_CHANGES_WHEN_TITLECASED:
f3c0d7a5 1372 return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
729e4ab9
A
1373 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1374 case UCHAR_CHANGES_WHEN_CASEMAPPED:
729e4ab9 1375 return (UBool)(
f3c0d7a5
A
1376 ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1377 ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1378 ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
73c04bcf
A
1379 default:
1380 return FALSE;
1381 }
1382}