]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucase.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucase.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
b331163b 4* Copyright (C) 2004-2014, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
4388f060 8* file name: ucase.cpp
374ca955
A
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004aug30
14* created by: Markus W. Scherer
15*
16* Low-level Unicode character/string case mapping code.
17* Much code moved here (and modified) from uchar.c.
18*/
19
20#include "unicode/utypes.h"
4388f060 21#include "unicode/unistr.h"
374ca955
A
22#include "unicode/uset.h"
23#include "unicode/udata.h" /* UDataInfo */
4388f060 24#include "unicode/utf16.h"
374ca955
A
25#include "ucmndata.h" /* DataHeader */
26#include "udatamem.h"
27#include "umutex.h"
28#include "uassert.h"
29#include "cmemory.h"
729e4ab9 30#include "utrie2.h"
374ca955 31#include "ucase.h"
374ca955
A
32
33struct UCaseProps {
34 UDataMemory *mem;
35 const int32_t *indexes;
36 const uint16_t *exceptions;
4388f060 37 const uint16_t *unfold;
374ca955 38
729e4ab9 39 UTrie2 trie;
374ca955
A
40 uint8_t formatVersion[4];
41};
42
4388f060
A
43/* ucase_props_data.h is machine-generated by gencase --csource */
44#define INCLUDED_FROM_UCASE_CPP
45#include "ucase_props_data.h"
73c04bcf 46
374ca955
A
47/* UCaseProps singleton ----------------------------------------------------- */
48
73c04bcf 49U_CAPI const UCaseProps * U_EXPORT2
729e4ab9 50ucase_getSingleton() {
73c04bcf 51 return &ucase_props_singleton;
374ca955
A
52}
53
54/* set of property starts for UnicodeSet ------------------------------------ */
55
56static UBool U_CALLCONV
4388f060 57_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
374ca955 58 /* add the start code point to the USet */
73c04bcf 59 const USetAdder *sa=(const USetAdder *)context;
374ca955
A
60 sa->add(sa->set, start);
61 return TRUE;
62}
63
46f4442e 64U_CFUNC void U_EXPORT2
73c04bcf 65ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
374ca955
A
66 if(U_FAILURE(*pErrorCode)) {
67 return;
68 }
69
70 /* add the start code point of each same-value range of the trie */
729e4ab9 71 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
374ca955
A
72
73 /* add code points with hardcoded properties, plus the ones following them */
74
75 /* (none right now, see comment below) */
76
77 /*
78 * Omit code points with hardcoded specialcasing properties
79 * because we do not build property UnicodeSets for them right now.
80 */
81}
82
83/* data access primitives --------------------------------------------------- */
84
374ca955
A
/* pointer to the exceptions word for a properties word that has UCASE_EXCEPTION set */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* non-zero if the properties word refers to exception data instead of a simple delta */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
88
/*
 * flagsOffset[b] is the number of set bits in the 8-bit value b.
 * SLOT_OFFSET() uses it to count the slots that are present below a given slot index.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
108
729e4ab9
A
/* non-zero if bit idx is set in the exceptions word, i.e. the optional slot idx is present */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* number of present slots below idx; only the low 8 flag bits can be counted via flagsOffset[] */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
374ca955
A
111
/*
 * Get the value of an optional-value slot.
 * Precondition: HAS_SLOT(excWord, idx); the macro itself does not check this.
 *
 * With UCASE_EXC_DOUBLE_SLOTS clear each slot is one uint16_t; with it set each
 * slot is two uint16_t units that are combined as (first<<16)|second.
 *
 * The expansion is a single statement (do/while(0)) so that an invocation
 * followed by ';' is safe in any context, including an unbraced if/else.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t variable that receives the slot value
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
130
131/* simple case mappings ----------------------------------------------------- */
132
133U_CAPI UChar32 U_EXPORT2
134ucase_tolower(const UCaseProps *csp, UChar32 c) {
729e4ab9 135 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 136 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
137 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
138 c+=UCASE_GET_DELTA(props);
374ca955
A
139 }
140 } else {
141 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
142 uint16_t excWord=*pe++;
143 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
144 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
145 }
146 }
147 return c;
148}
149
150U_CAPI UChar32 U_EXPORT2
151ucase_toupper(const UCaseProps *csp, UChar32 c) {
729e4ab9 152 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 153 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
154 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
155 c+=UCASE_GET_DELTA(props);
374ca955
A
156 }
157 } else {
158 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
159 uint16_t excWord=*pe++;
160 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
161 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
162 }
163 }
164 return c;
165}
166
167U_CAPI UChar32 U_EXPORT2
168ucase_totitle(const UCaseProps *csp, UChar32 c) {
729e4ab9 169 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 170 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
171 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
172 c+=UCASE_GET_DELTA(props);
374ca955
A
173 }
174 } else {
175 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
176 uint16_t excWord=*pe++;
729e4ab9 177 int32_t idx;
374ca955 178 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 179 idx=UCASE_EXC_TITLE;
374ca955 180 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
729e4ab9 181 idx=UCASE_EXC_UPPER;
374ca955
A
182 } else {
183 return c;
184 }
729e4ab9 185 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
186 }
187 return c;
188}
189
46f4442e
A
190static const UChar iDot[2] = { 0x69, 0x307 };
191static const UChar jDot[2] = { 0x6a, 0x307 };
192static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
193static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
194static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
195static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
196
197
198U_CFUNC void U_EXPORT2
73c04bcf
A
199ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
200 uint16_t props;
201
202 /*
203 * Hardcode the case closure of i and its relatives and ignore the
204 * data file data for these characters.
205 * The Turkic dotless i and dotted I with their case mapping conditions
206 * and case folding option make the related characters behave specially.
207 * This code matches their closure behavior to their case folding behavior.
208 */
73c04bcf
A
209
210 switch(c) {
211 case 0x49:
212 /* regular i and I are in one equivalence class */
213 sa->add(sa->set, 0x69);
214 return;
215 case 0x69:
216 sa->add(sa->set, 0x49);
217 return;
218 case 0x130:
219 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
220 sa->addString(sa->set, iDot, 2);
221 return;
222 case 0x131:
223 /* dotless i is in a class by itself */
224 return;
225 default:
226 /* otherwise use the data file data */
227 break;
228 }
229
729e4ab9 230 props=UTRIE2_GET16(&csp->trie, c);
73c04bcf
A
231 if(!PROPS_HAS_EXCEPTION(props)) {
232 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
233 /* add the one simple case mapping, no matter what type it is */
234 int32_t delta=UCASE_GET_DELTA(props);
235 if(delta!=0) {
236 sa->add(sa->set, c+delta);
237 }
238 }
239 } else {
240 /*
241 * c has exceptions, so there may be multiple simple and/or
242 * full case mappings. Add them all.
243 */
244 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
245 const UChar *closure;
246 uint16_t excWord=*pe++;
729e4ab9 247 int32_t idx, closureLength, fullLength, length;
73c04bcf
A
248
249 pe0=pe;
250
251 /* add all simple case mappings */
729e4ab9
A
252 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
253 if(HAS_SLOT(excWord, idx)) {
73c04bcf 254 pe=pe0;
729e4ab9 255 GET_SLOT_VALUE(excWord, idx, pe, c);
73c04bcf
A
256 sa->add(sa->set, c);
257 }
258 }
259
260 /* get the closure string pointer & length */
261 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
262 pe=pe0;
263 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
264 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
265 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
266 } else {
267 closureLength=0;
268 closure=NULL;
269 }
270
271 /* add the full case folding */
272 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
273 pe=pe0;
274 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
275
276 /* start of full case mapping strings */
277 ++pe;
278
279 fullLength&=0xffff; /* bits 16 and higher are reserved */
280
281 /* skip the lowercase result string */
282 pe+=fullLength&UCASE_FULL_LOWER;
283 fullLength>>=4;
284
285 /* add the full case folding string */
286 length=fullLength&0xf;
287 if(length!=0) {
288 sa->addString(sa->set, (const UChar *)pe, length);
289 pe+=length;
290 }
291
292 /* skip the uppercase and titlecase strings */
293 fullLength>>=4;
294 pe+=fullLength&0xf;
295 fullLength>>=4;
296 pe+=fullLength;
297
298 closure=(const UChar *)pe; /* behind full case mappings */
299 }
300
301 /* add each code point in the closure string */
729e4ab9
A
302 for(idx=0; idx<closureLength;) {
303 U16_NEXT_UNSAFE(closure, idx, c);
73c04bcf
A
304 sa->add(sa->set, c);
305 }
306 }
307}
308
309/*
310 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
311 * must be length>0 and max>0 and length<=max
312 */
4388f060 313static inline int32_t
73c04bcf
A
314strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
315 int32_t c1, c2;
316
317 max-=length; /* we require length<=max, so no need to decrement max in the loop */
318 do {
319 c1=*s++;
320 c2=*t++;
321 if(c2==0) {
322 return 1; /* reached the end of t but not of s */
323 }
324 c1-=c2;
325 if(c1!=0) {
326 return c1; /* return difference result */
327 }
328 } while(--length>0);
329 /* ends with length==0 */
330
331 if(max==0 || *t==0) {
332 return 0; /* equal to length of both strings */
333 } else {
334 return -max; /* return lengh difference */
335 }
336}
337
46f4442e 338U_CFUNC UBool U_EXPORT2
73c04bcf 339ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
73c04bcf
A
340 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
341
342 if(csp->unfold==NULL || s==NULL) {
343 return FALSE; /* no reverse case folding data, or no string */
344 }
345 if(length<=1) {
346 /* the string is too short to find any match */
347 /*
348 * more precise would be:
349 * if(!u_strHasMoreChar32Than(s, length, 1))
350 * but this does not make much practical difference because
351 * a single supplementary code point would just not be found
352 */
353 return FALSE;
354 }
355
4388f060 356 const uint16_t *unfold=csp->unfold;
73c04bcf
A
357 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
358 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
359 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
360 unfold+=unfoldRowWidth;
361
362 if(length>unfoldStringWidth) {
363 /* the string is too long to find any match */
364 return FALSE;
365 }
366
367 /* do a binary search for the string */
368 start=0;
369 limit=unfoldRows;
370 while(start<limit) {
371 i=(start+limit)/2;
4388f060 372 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
73c04bcf
A
373 result=strcmpMax(s, length, p, unfoldStringWidth);
374
375 if(result==0) {
376 /* found the string: add each code point, and its case closure */
377 UChar32 c;
378
379 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
380 U16_NEXT_UNSAFE(p, i, c);
381 sa->add(sa->set, c);
382 ucase_addCaseClosure(csp, c, sa);
383 }
384 return TRUE;
385 } else if(result<0) {
386 limit=i;
387 } else /* result>0 */ {
388 start=i+1;
389 }
390 }
391
392 return FALSE; /* string not found */
393}
394
4388f060
A
395U_NAMESPACE_BEGIN
396
397FullCaseFoldingIterator::FullCaseFoldingIterator()
398 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
399 unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
400 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
401 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
402 currentRow(0),
403 rowCpIndex(unfoldStringWidth) {
404 unfold+=unfoldRowWidth;
405}
406
407UChar32
408FullCaseFoldingIterator::next(UnicodeString &full) {
409 // Advance past the last-delivered code point.
410 const UChar *p=unfold+(currentRow*unfoldRowWidth);
411 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
412 ++currentRow;
413 p+=unfoldRowWidth;
414 rowCpIndex=unfoldStringWidth;
415 }
416 if(currentRow>=unfoldRows) { return U_SENTINEL; }
417 // Set "full" to the NUL-terminated string in the first unfold column.
418 int32_t length=unfoldStringWidth;
419 while(length>0 && p[length-1]==0) { --length; }
420 full.setTo(FALSE, p, length);
421 // Return the code point.
422 UChar32 c;
423 U16_NEXT_UNSAFE(p, rowCpIndex, c);
424 return c;
425}
426
427U_NAMESPACE_END
428
374ca955
A
429/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
430U_CAPI int32_t U_EXPORT2
431ucase_getType(const UCaseProps *csp, UChar32 c) {
729e4ab9 432 uint16_t props=UTRIE2_GET16(&csp->trie, c);
73c04bcf 433 return UCASE_GET_TYPE(props);
374ca955
A
434}
435
729e4ab9 436/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
374ca955
A
437U_CAPI int32_t U_EXPORT2
438ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
729e4ab9 439 uint16_t props=UTRIE2_GET16(&csp->trie, c);
4388f060 440 return UCASE_GET_TYPE_AND_IGNORABLE(props);
374ca955
A
441}
442
443/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
4388f060 444static inline int32_t
374ca955 445getDotType(const UCaseProps *csp, UChar32 c) {
729e4ab9 446 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955
A
447 if(!PROPS_HAS_EXCEPTION(props)) {
448 return props&UCASE_DOT_MASK;
449 } else {
450 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
451 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
452 }
453}
454
455U_CAPI UBool U_EXPORT2
456ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
457 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
458}
459
460U_CAPI UBool U_EXPORT2
461ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
729e4ab9 462 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955
A
463 return (UBool)((props&UCASE_SENSITIVE)!=0);
464}
465
374ca955
A
466/* string casing ------------------------------------------------------------ */
467
468/*
469 * These internal functions form the core of string case mappings.
470 * They map single code points to result code points or strings and take
471 * all necessary conditions (context, locale ID, options) into account.
472 *
473 * They do not iterate over the source or write to the destination
474 * so that the same functions are useful for non-standard string storage,
475 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
476 * For the same reason, the "surrounding text" context is passed in as a
477 * UCaseContextIterator which does not make any assumptions about
478 * the underlying storage.
479 *
480 * This section contains helper functions that check for conditions
481 * in the input text surrounding the current code point
482 * according to SpecialCasing.txt.
483 *
484 * Each helper function gets the index
485 * - after the current code point if it looks at following text
486 * - before the current code point if it looks at preceding text
487 *
488 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
489 *
490 * Final_Sigma
491 * C is preceded by a sequence consisting of
492 * a cased letter and a case-ignorable sequence,
493 * and C is not followed by a sequence consisting of
494 * an ignorable sequence and then a cased letter.
495 *
496 * More_Above
497 * C is followed by one or more characters of combining class 230 (ABOVE)
498 * in the combining character sequence.
499 *
500 * After_Soft_Dotted
501 * The last preceding character with combining class of zero before C
502 * was Soft_Dotted,
503 * and there is no intervening combining character class 230 (ABOVE).
504 *
505 * Before_Dot
506 * C is followed by combining dot above (U+0307).
507 * Any sequence of characters with a combining class that is neither 0 nor 230
508 * may intervene between the current character and the combining dot above.
509 *
510 * The erratum from 2002-10-31 adds the condition
511 *
512 * After_I
513 * The last preceding base character was an uppercase I, and there is no
514 * intervening combining character class 230 (ABOVE).
515 *
516 * (See Jitterbug 2344 and the comments on After_I below.)
517 *
518 * Helper definitions in Unicode 3.2 UAX 21:
519 *
520 * D1. A character C is defined to be cased
521 * if it meets any of the following criteria:
522 *
523 * - The general category of C is Titlecase Letter (Lt)
524 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
525 * - Given D = NFD(C), then it is not the case that:
526 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
527 * (This third criterion does not add any characters to the list
528 * for Unicode 3.2. Ignored.)
529 *
530 * D2. A character C is defined to be case-ignorable
531 * if it meets either of the following criteria:
532 *
533 * - The general category of C is
534 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
535 * Letter Modifier (Lm), or Symbol Modifier (Sk)
536 * - C is one of the following characters
537 * U+0027 APOSTROPHE
538 * U+00AD SOFT HYPHEN (SHY)
539 * U+2019 RIGHT SINGLE QUOTATION MARK
540 * (the preferred character for apostrophe)
541 *
542 * D3. A case-ignorable sequence is a sequence of
543 * zero or more case-ignorable characters.
544 */
545
/* case-insensitive tests for single ASCII letters of a locale ID */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_n(c) ((c)=='n' || (c)=='N')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? (the terminating NUL counts as one, so a bare language code matches) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
559
73c04bcf 560/**
374ca955
A
561 * Requires non-NULL locale ID but otherwise does the equivalent of
562 * checking for language codes as if uloc_getLanguage() were called:
563 * Accepts both 2- and 3-letter codes and accepts case variants.
564 */
73c04bcf
A
565U_CFUNC int32_t
566ucase_getCaseLocale(const char *locale, int32_t *locCache) {
374ca955
A
567 int32_t result;
568 char c;
569
46f4442e 570 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
374ca955
A
571 return result;
572 }
573
46f4442e 574 result=UCASE_LOC_ROOT;
374ca955
A
575
576 /*
577 * This function used to use uloc_getLanguage(), but the current code
578 * removes the dependency of this low-level code on uloc implementation code
579 * and is faster because not the whole locale ID has to be
580 * examined and copied/transformed.
581 *
582 * Because this code does not want to depend on uloc, the caller must
583 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
584 */
585 c=*locale++;
586 if(is_t(c)) {
587 /* tr or tur? */
588 c=*locale++;
589 if(is_u(c)) {
590 c=*locale++;
591 }
592 if(is_r(c)) {
593 c=*locale;
594 if(is_sep(c)) {
46f4442e 595 result=UCASE_LOC_TURKISH;
374ca955
A
596 }
597 }
598 } else if(is_a(c)) {
599 /* az or aze? */
600 c=*locale++;
601 if(is_z(c)) {
602 c=*locale++;
603 if(is_e(c)) {
604 c=*locale;
605 }
606 if(is_sep(c)) {
46f4442e 607 result=UCASE_LOC_TURKISH;
374ca955
A
608 }
609 }
610 } else if(is_l(c)) {
611 /* lt or lit? */
612 c=*locale++;
613 if(is_i(c)) {
614 c=*locale++;
615 }
616 if(is_t(c)) {
617 c=*locale;
618 if(is_sep(c)) {
46f4442e
A
619 result=UCASE_LOC_LITHUANIAN;
620 }
621 }
622 } else if(is_n(c)) {
623 /* nl or nld? */
624 c=*locale++;
625 if(is_l(c)) {
626 c=*locale++;
627 if(is_d(c)) {
628 c=*locale;
629 }
630 if(is_sep(c)) {
631 result=UCASE_LOC_DUTCH;
374ca955
A
632 }
633 }
634 }
635
636 if(locCache!=NULL) {
637 *locCache=result;
638 }
639 return result;
640}
641
729e4ab9
A
642/*
643 * Is followed by
644 * {case-ignorable}* cased
645 * ?
646 * (dir determines looking forward/backward)
647 * If a character is case-ignorable, it is skipped regardless of whether
648 * it is also cased or not.
649 */
374ca955
A
650static UBool
651isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
652 UChar32 c;
374ca955
A
653
654 if(iter==NULL) {
655 return FALSE;
656 }
657
658 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
729e4ab9
A
659 int32_t type=ucase_getTypeOrIgnorable(csp, c);
660 if(type&4) {
374ca955 661 /* case-ignorable, continue with the loop */
729e4ab9
A
662 } else if(type!=UCASE_NONE) {
663 return TRUE; /* followed by cased letter */
374ca955 664 } else {
729e4ab9 665 return FALSE; /* uncased and not case-ignorable */
374ca955
A
666 }
667 }
668
669 return FALSE; /* not followed by cased letter */
670}
671
672/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
673static UBool
674isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
675 UChar32 c;
676 int32_t dotType;
677 int8_t dir;
678
679 if(iter==NULL) {
680 return FALSE;
681 }
682
683 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
684 dotType=getDotType(csp, c);
685 if(dotType==UCASE_SOFT_DOTTED) {
686 return TRUE; /* preceded by TYPE_i */
687 } else if(dotType!=UCASE_OTHER_ACCENT) {
688 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
689 }
690 }
691
692 return FALSE; /* not preceded by TYPE_i */
693}
694
695/*
696 * See Jitterbug 2344:
697 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
698 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
699 * we made those releases compatible with Unicode 3.2 which had not fixed
700 * a related bug in SpecialCasing.txt.
701 *
702 * From the Jitterbug 2344 text:
703 * ... this bug is listed as a Unicode erratum
704 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
705 * <quote>
706 * There are two errors in SpecialCasing.txt.
707 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
708 * 2. An incorrect context definition. Correct as follows:
709 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
710 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
711 * ---
712 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
713 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
714 * where the context After_I is defined as:
715 * The last preceding base character was an uppercase I, and there is no
716 * intervening combining character class 230 (ABOVE).
717 * </quote>
718 *
719 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
720 *
721 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
722 * # This matches the behavior of the canonically equivalent I-dot_above
723 *
724 * See also the description in this place in older versions of uchar.c (revision 1.100).
725 *
726 * Markus W. Scherer 2003-feb-15
727 */
728
729/* Is preceded by base character 'I' with no intervening cc=230 ? */
730static UBool
731isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
732 UChar32 c;
733 int32_t dotType;
734 int8_t dir;
735
736 if(iter==NULL) {
737 return FALSE;
738 }
739
740 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
741 if(c==0x49) {
742 return TRUE; /* preceded by I */
743 }
744 dotType=getDotType(csp, c);
745 if(dotType!=UCASE_OTHER_ACCENT) {
746 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
747 }
748 }
749
750 return FALSE; /* not preceded by I */
751}
752
753/* Is followed by one or more cc==230 ? */
754static UBool
755isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
756 UChar32 c;
757 int32_t dotType;
758 int8_t dir;
759
760 if(iter==NULL) {
761 return FALSE;
762 }
763
764 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
765 dotType=getDotType(csp, c);
766 if(dotType==UCASE_ABOVE) {
767 return TRUE; /* at least one cc==230 following */
768 } else if(dotType!=UCASE_OTHER_ACCENT) {
769 return FALSE; /* next base character, no more cc==230 following */
770 }
771 }
772
773 return FALSE; /* no more cc==230 following */
774}
775
776/* Is followed by a dot above (without cc==230 in between) ? */
777static UBool
778isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
779 UChar32 c;
780 int32_t dotType;
781 int8_t dir;
782
783 if(iter==NULL) {
784 return FALSE;
785 }
786
787 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
788 if(c==0x307) {
789 return TRUE;
790 }
791 dotType=getDotType(csp, c);
792 if(dotType!=UCASE_OTHER_ACCENT) {
793 return FALSE; /* next base character or cc==230 in between */
794 }
795 }
796
797 return FALSE; /* no dot above following */
798}
799
800U_CAPI int32_t U_EXPORT2
801ucase_toFullLower(const UCaseProps *csp, UChar32 c,
802 UCaseContextIterator *iter, void *context,
803 const UChar **pString,
46f4442e
A
804 const char *locale, int32_t *locCache)
805{
729e4ab9
A
806 UChar32 result=c;
807 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 808 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
809 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
810 result=c+UCASE_GET_DELTA(props);
374ca955
A
811 }
812 } else {
813 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
814 uint16_t excWord=*pe++;
815 int32_t full;
816
817 pe2=pe;
818
819 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
820 /* use hardcoded conditions and mappings */
73c04bcf 821 int32_t loc=ucase_getCaseLocale(locale, locCache);
374ca955
A
822
823 /*
824 * Test for conditional mappings first
825 * (otherwise the unconditional default mappings are always taken),
826 * then test for characters that have unconditional mappings in SpecialCasing.txt,
827 * then get the UnicodeData.txt mappings.
828 */
46f4442e 829 if( loc==UCASE_LOC_LITHUANIAN &&
374ca955
A
830 /* base characters, find accents above */
831 (((c==0x49 || c==0x4a || c==0x12e) &&
832 isFollowedByMoreAbove(csp, iter, context)) ||
833 /* precomposed with accent above, no need to find one */
834 (c==0xcc || c==0xcd || c==0x128))
835 ) {
836 /*
837 # Lithuanian
838
839 # Lithuanian retains the dot in a lowercase i when followed by accents.
840
841 # Introduce an explicit dot above when lowercasing capital I's and J's
842 # whenever there are more accents above.
843 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
844
845 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
846 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
847 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
848 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
849 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
850 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
851 */
852 switch(c) {
853 case 0x49: /* LATIN CAPITAL LETTER I */
854 *pString=iDot;
855 return 2;
856 case 0x4a: /* LATIN CAPITAL LETTER J */
857 *pString=jDot;
858 return 2;
859 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
860 *pString=iOgonekDot;
861 return 2;
862 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
863 *pString=iDotGrave;
864 return 3;
865 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
866 *pString=iDotAcute;
867 return 3;
868 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
869 *pString=iDotTilde;
870 return 3;
871 default:
872 return 0; /* will not occur */
873 }
874 /* # Turkish and Azeri */
46f4442e 875 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
374ca955
A
876 /*
877 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
878 # The following rules handle those cases.
879
880 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
881 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
882 */
883 return 0x69;
46f4442e 884 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
374ca955
A
885 /*
886 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
887 # This matches the behavior of the canonically equivalent I-dot_above
888
889 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
890 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
891 */
892 return 0; /* remove the dot (continue without output) */
46f4442e 893 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
374ca955
A
894 /*
895 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
896
897 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
898 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
899 */
900 return 0x131;
901 } else if(c==0x130) {
902 /*
903 # Preserve canonical equivalence for I with dot. Turkic is handled below.
904
905 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
906 */
907 *pString=iDot;
908 return 2;
909 } else if( c==0x3a3 &&
910 !isFollowedByCasedLetter(csp, iter, context, 1) &&
911 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
912 ) {
913 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
914 /*
915 # Special case for final form of sigma
916
917 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
918 */
919 return 0x3c2; /* greek small final sigma */
920 } else {
921 /* no known conditional special case mapping, use a normal mapping */
922 }
923 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
924 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
925 full&=UCASE_FULL_LOWER;
926 if(full!=0) {
927 /* set the output pointer to the lowercase mapping */
4388f060 928 *pString=reinterpret_cast<const UChar *>(pe+1);
374ca955
A
929
930 /* return the string length */
931 return full;
932 }
933 }
934
935 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
936 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
937 }
938 }
939
940 return (result==c) ? ~result : result;
941}
942
943/* internal */
944static int32_t
945toUpperOrTitle(const UCaseProps *csp, UChar32 c,
946 UCaseContextIterator *iter, void *context,
947 const UChar **pString,
948 const char *locale, int32_t *locCache,
949 UBool upperNotTitle) {
729e4ab9
A
950 UChar32 result=c;
951 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 952 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
953 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
954 result=c+UCASE_GET_DELTA(props);
374ca955
A
955 }
956 } else {
957 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
958 uint16_t excWord=*pe++;
729e4ab9 959 int32_t full, idx;
374ca955
A
960
961 pe2=pe;
962
963 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
964 /* use hardcoded conditions and mappings */
73c04bcf 965 int32_t loc=ucase_getCaseLocale(locale, locCache);
374ca955 966
46f4442e 967 if(loc==UCASE_LOC_TURKISH && c==0x69) {
374ca955
A
968 /*
969 # Turkish and Azeri
970
971 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
972 # The following rules handle those cases.
973
974 # When uppercasing, i turns into a dotted capital I
975
976 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
977 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
978 */
979 return 0x130;
46f4442e 980 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
374ca955
A
981 /*
982 # Lithuanian
983
984 # Lithuanian retains the dot in a lowercase i when followed by accents.
985
986 # Remove DOT ABOVE after "i" with upper or titlecase
987
988 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
989 */
990 return 0; /* remove the dot (continue without output) */
991 } else {
992 /* no known conditional special case mapping, use a normal mapping */
993 }
994 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
995 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
996
997 /* start of full case mapping strings */
998 ++pe;
999
1000 /* skip the lowercase and case-folding result strings */
1001 pe+=full&UCASE_FULL_LOWER;
1002 full>>=4;
1003 pe+=full&0xf;
1004 full>>=4;
1005
1006 if(upperNotTitle) {
1007 full&=0xf;
1008 } else {
1009 /* skip the uppercase result string */
1010 pe+=full&0xf;
1011 full=(full>>4)&0xf;
1012 }
1013
1014 if(full!=0) {
1015 /* set the output pointer to the result string */
4388f060 1016 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1017
1018 /* return the string length */
1019 return full;
1020 }
1021 }
1022
1023 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 1024 idx=UCASE_EXC_TITLE;
374ca955
A
1025 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1026 /* here, titlecase is same as uppercase */
729e4ab9 1027 idx=UCASE_EXC_UPPER;
374ca955
A
1028 } else {
1029 return ~c;
1030 }
729e4ab9 1031 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1032 }
1033
1034 return (result==c) ? ~result : result;
1035}
1036
1037U_CAPI int32_t U_EXPORT2
1038ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1039 UCaseContextIterator *iter, void *context,
1040 const UChar **pString,
1041 const char *locale, int32_t *locCache) {
1042 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1043}
1044
1045U_CAPI int32_t U_EXPORT2
1046ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1047 UCaseContextIterator *iter, void *context,
1048 const UChar **pString,
1049 const char *locale, int32_t *locCache) {
1050 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1051}
1052
1053/* case folding ------------------------------------------------------------- */
1054
/*
 * Case folding is similar to lowercasing.
 * The result may be a simple mapping, i.e., a single code point, or
 * a full mapping, i.e., a string.
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
 * then only the lowercase mapping is stored.
 *
 * Some special cases are hardcoded because their conditions cannot be
 * parsed and processed from CaseFolding.txt.
 *
 * Unicode 3.2 CaseFolding.txt specifies for its status field:

# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
#    - For non-Turkic languages, this mapping is normally not used.
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
#
# Usage:
#  A. To do a simple case folding, use the mappings with status C + S.
#  B. To do a full case folding, use the mappings with status C + F.
#
#    The mappings with status T can be used or omitted depending on the desired case-folding
#    behavior. (The default option is to exclude them.)

 * Unicode 3.2 has 'T' mappings as follows:

0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * while the default mappings for these code points are:

0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * U+0130 has no simple case folding (simple-case-folds to itself).
 */
1093
1094/* return the simple case folding mapping for c */
1095U_CAPI UChar32 U_EXPORT2
73c04bcf 1096ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
729e4ab9 1097 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 1098 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
1099 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1100 c+=UCASE_GET_DELTA(props);
374ca955
A
1101 }
1102 } else {
1103 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1104 uint16_t excWord=*pe++;
729e4ab9 1105 int32_t idx;
374ca955
A
1106 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1107 /* special case folding mappings, hardcoded */
1108 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1109 /* default mappings */
1110 if(c==0x49) {
1111 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1112 return 0x69;
1113 } else if(c==0x130) {
73c04bcf
A
1114 /* no simple case folding for U+0130 */
1115 return c;
374ca955
A
1116 }
1117 } else {
1118 /* Turkic mappings */
1119 if(c==0x49) {
1120 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1121 return 0x131;
1122 } else if(c==0x130) {
1123 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1124 return 0x69;
1125 }
1126 }
1127 }
1128 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1129 idx=UCASE_EXC_FOLD;
374ca955 1130 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1131 idx=UCASE_EXC_LOWER;
374ca955
A
1132 } else {
1133 return c;
1134 }
729e4ab9 1135 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
1136 }
1137 return c;
1138}
1139
/*
 * Issue for canonical caseless match (UAX #21):
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
 * canonical equivalence, unlike default-option casefolding.
 * For example, I-grave and I + grave fold to strings that are not canonically
 * equivalent.
 * For more details, see the comment in unorm_compare() in unorm.cpp
 * and the intermediate prototype changes for Jitterbug 2021.
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
 *
 * This did not get fixed because it appears that it is not possible to fix
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
 * together in a way that they still fold to common result strings.
 */
1154
1155U_CAPI int32_t U_EXPORT2
1156ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1157 const UChar **pString,
46f4442e
A
1158 uint32_t options)
1159{
729e4ab9
A
1160 UChar32 result=c;
1161 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 1162 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
1163 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1164 result=c+UCASE_GET_DELTA(props);
374ca955
A
1165 }
1166 } else {
1167 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1168 uint16_t excWord=*pe++;
729e4ab9 1169 int32_t full, idx;
374ca955
A
1170
1171 pe2=pe;
1172
1173 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1174 /* use hardcoded conditions and mappings */
1175 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1176 /* default mappings */
1177 if(c==0x49) {
1178 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1179 return 0x69;
1180 } else if(c==0x130) {
1181 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1182 *pString=iDot;
1183 return 2;
1184 }
1185 } else {
1186 /* Turkic mappings */
1187 if(c==0x49) {
1188 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1189 return 0x131;
1190 } else if(c==0x130) {
1191 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1192 return 0x69;
1193 }
1194 }
1195 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1196 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1197
1198 /* start of full case mapping strings */
1199 ++pe;
1200
1201 /* skip the lowercase result string */
1202 pe+=full&UCASE_FULL_LOWER;
1203 full=(full>>4)&0xf;
1204
1205 if(full!=0) {
1206 /* set the output pointer to the result string */
4388f060 1207 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1208
1209 /* return the string length */
1210 return full;
1211 }
1212 }
1213
1214 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1215 idx=UCASE_EXC_FOLD;
374ca955 1216 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1217 idx=UCASE_EXC_LOWER;
374ca955
A
1218 } else {
1219 return ~c;
1220 }
729e4ab9 1221 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1222 }
1223
1224 return (result==c) ? ~result : result;
1225}
73c04bcf
A
1226
1227/* case mapping properties API ---------------------------------------------- */
1228
/*
 * Pointer to the built-in case properties singleton.
 * The expansion is parenthesized so that it stays a single operand wherever
 * it is used, e.g. GET_CASE_PROPS()->member.
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
73c04bcf
A
1230
1231/* public API (see uchar.h) */
1232
1233U_CAPI UBool U_EXPORT2
1234u_isULowercase(UChar32 c) {
1235 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1236}
1237
1238U_CAPI UBool U_EXPORT2
1239u_isUUppercase(UChar32 c) {
1240 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1241}
1242
1243/* Transforms the Unicode character to its lower case equivalent.*/
1244U_CAPI UChar32 U_EXPORT2
1245u_tolower(UChar32 c) {
1246 return ucase_tolower(GET_CASE_PROPS(), c);
1247}
1248
1249/* Transforms the Unicode character to its upper case equivalent.*/
1250U_CAPI UChar32 U_EXPORT2
1251u_toupper(UChar32 c) {
1252 return ucase_toupper(GET_CASE_PROPS(), c);
1253}
1254
1255/* Transforms the Unicode character to its title case equivalent.*/
1256U_CAPI UChar32 U_EXPORT2
1257u_totitle(UChar32 c) {
1258 return ucase_totitle(GET_CASE_PROPS(), c);
1259}
1260
1261/* return the simple case folding mapping for c */
1262U_CAPI UChar32 U_EXPORT2
1263u_foldCase(UChar32 c, uint32_t options) {
1264 return ucase_fold(GET_CASE_PROPS(), c, options);
1265}
1266
1267U_CFUNC int32_t U_EXPORT2
1268ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1269 /* case mapping properties */
729e4ab9
A
1270 const UChar *resultString;
1271 int32_t locCache;
73c04bcf
A
1272 const UCaseProps *csp=GET_CASE_PROPS();
1273 if(csp==NULL) {
1274 return FALSE;
1275 }
1276 switch(which) {
1277 case UCHAR_LOWERCASE:
1278 return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1279 case UCHAR_UPPERCASE:
1280 return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1281 case UCHAR_SOFT_DOTTED:
1282 return ucase_isSoftDotted(csp, c);
1283 case UCHAR_CASE_SENSITIVE:
1284 return ucase_isCaseSensitive(csp, c);
729e4ab9
A
1285 case UCHAR_CASED:
1286 return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1287 case UCHAR_CASE_IGNORABLE:
1288 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1289 /*
1290 * Note: The following Changes_When_Xyz are defined as testing whether
1291 * the NFD form of the input changes when Xyz-case-mapped.
1292 * However, this simpler implementation of these properties,
1293 * ignoring NFD, passes the tests.
1294 * The implementation needs to be changed if the tests start failing.
1295 * When that happens, optimizations should be used to work with the
1296 * per-single-code point ucase_toFullXyz() functions unless
1297 * the NFD form has more than one code point,
1298 * and the property starts set needs to be the union of the
1299 * start sets for normalization and case mappings.
1300 */
1301 case UCHAR_CHANGES_WHEN_LOWERCASED:
1302 locCache=UCASE_LOC_ROOT;
1303 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1304 case UCHAR_CHANGES_WHEN_UPPERCASED:
1305 locCache=UCASE_LOC_ROOT;
1306 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1307 case UCHAR_CHANGES_WHEN_TITLECASED:
1308 locCache=UCASE_LOC_ROOT;
1309 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1310 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1311 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1312 locCache=UCASE_LOC_ROOT;
1313 return (UBool)(
1314 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1315 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1316 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
73c04bcf
A
1317 default:
1318 return FALSE;
1319 }
1320}