]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucase.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / common / ucase.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
4388f060 4* Copyright (C) 2004-2012, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
4388f060 8* file name: ucase.cpp
374ca955
A
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004aug30
14* created by: Markus W. Scherer
15*
16* Low-level Unicode character/string case mapping code.
17* Much code moved here (and modified) from uchar.c.
18*/
19
20#include "unicode/utypes.h"
4388f060 21#include "unicode/unistr.h"
374ca955
A
22#include "unicode/uset.h"
23#include "unicode/udata.h" /* UDataInfo */
4388f060 24#include "unicode/utf16.h"
374ca955
A
25#include "ucmndata.h" /* DataHeader */
26#include "udatamem.h"
27#include "umutex.h"
28#include "uassert.h"
29#include "cmemory.h"
729e4ab9 30#include "utrie2.h"
374ca955
A
31#include "ucase.h"
32#include "ucln_cmn.h"
33
34struct UCaseProps {
35 UDataMemory *mem;
36 const int32_t *indexes;
37 const uint16_t *exceptions;
4388f060 38 const uint16_t *unfold;
374ca955 39
729e4ab9 40 UTrie2 trie;
374ca955
A
41 uint8_t formatVersion[4];
42};
43
4388f060
A
44/* ucase_props_data.h is machine-generated by gencase --csource */
45#define INCLUDED_FROM_UCASE_CPP
46#include "ucase_props_data.h"
73c04bcf 47
374ca955
A
48/* UCaseProps singleton ----------------------------------------------------- */
49
73c04bcf 50U_CAPI const UCaseProps * U_EXPORT2
729e4ab9 51ucase_getSingleton() {
73c04bcf 52 return &ucase_props_singleton;
374ca955
A
53}
54
55/* set of property starts for UnicodeSet ------------------------------------ */
56
57static UBool U_CALLCONV
4388f060 58_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
374ca955 59 /* add the start code point to the USet */
73c04bcf 60 const USetAdder *sa=(const USetAdder *)context;
374ca955
A
61 sa->add(sa->set, start);
62 return TRUE;
63}
64
46f4442e 65U_CFUNC void U_EXPORT2
73c04bcf 66ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
374ca955
A
67 if(U_FAILURE(*pErrorCode)) {
68 return;
69 }
70
71 /* add the start code point of each same-value range of the trie */
729e4ab9 72 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
374ca955
A
73
74 /* add code points with hardcoded properties, plus the ones following them */
75
76 /* (none right now, see comment below) */
77
78 /*
79 * Omit code points with hardcoded specialcasing properties
80 * because we do not build property UnicodeSets for them right now.
81 */
82}
83
84/* data access primitives --------------------------------------------------- */
85
374ca955
A
/* Pointer to the exception word for a properties word that has the exception bit set. */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])

/* Non-zero if the properties word refers to exception data instead of a simple delta. */
#define PROPS_HAS_EXCEPTION(props) (UCASE_EXCEPTION&(props))
89
/*
 * flagsOffset[b] is the number of bits set in the 8-bit value b.
 * SLOT_OFFSET() indexes it with the flag bits below a slot's own bit.
 *
 * The table is built recursively: the bit count of (4*hi + lo) is
 * bitcount(hi) + bitcount(lo), and the counts for lo=0..3 are 0, 1, 1, 2.
 */
#define FLAGS_OFFSET_4(n)   (n), (n)+1, (n)+1, (n)+2
#define FLAGS_OFFSET_16(n)  FLAGS_OFFSET_4(n), FLAGS_OFFSET_4((n)+1), FLAGS_OFFSET_4((n)+1), FLAGS_OFFSET_4((n)+2)
#define FLAGS_OFFSET_64(n)  FLAGS_OFFSET_16(n), FLAGS_OFFSET_16((n)+1), FLAGS_OFFSET_16((n)+1), FLAGS_OFFSET_16((n)+2)

static const uint8_t flagsOffset[256]={
    FLAGS_OFFSET_64(0), FLAGS_OFFSET_64(1), FLAGS_OFFSET_64(1), FLAGS_OFFSET_64(2)
};

#undef FLAGS_OFFSET_64
#undef FLAGS_OFFSET_16
#undef FLAGS_OFFSET_4
109
729e4ab9
A
/* Non-zero if the optional slot with index idx is present according to the flag bits. */
#define HAS_SLOT(flags, idx) ((1<<(idx))&(flags))
/* Number of present slots that precede slot idx (= set flag bits below bit idx). */
#define SLOT_OFFSET(flags, idx) flagsOffset[((1<<(idx))-1)&(flags)]
374ca955
A
112
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * The macro expands to a single statement (do { ... } while(0)), so it is
 * safe in an unbraced if/else and must be followed by a semicolon.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            /* single-width slots: one uint16_t per slot */ \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            /* double-width slots: high uint16_t first, then low */ \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
131
132/* simple case mappings ----------------------------------------------------- */
133
134U_CAPI UChar32 U_EXPORT2
135ucase_tolower(const UCaseProps *csp, UChar32 c) {
729e4ab9 136 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 137 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
138 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
139 c+=UCASE_GET_DELTA(props);
374ca955
A
140 }
141 } else {
142 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
143 uint16_t excWord=*pe++;
144 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
145 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
146 }
147 }
148 return c;
149}
150
151U_CAPI UChar32 U_EXPORT2
152ucase_toupper(const UCaseProps *csp, UChar32 c) {
729e4ab9 153 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 154 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
155 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
156 c+=UCASE_GET_DELTA(props);
374ca955
A
157 }
158 } else {
159 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
160 uint16_t excWord=*pe++;
161 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
162 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
163 }
164 }
165 return c;
166}
167
168U_CAPI UChar32 U_EXPORT2
169ucase_totitle(const UCaseProps *csp, UChar32 c) {
729e4ab9 170 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 171 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
172 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
173 c+=UCASE_GET_DELTA(props);
374ca955
A
174 }
175 } else {
176 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
177 uint16_t excWord=*pe++;
729e4ab9 178 int32_t idx;
374ca955 179 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 180 idx=UCASE_EXC_TITLE;
374ca955 181 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
729e4ab9 182 idx=UCASE_EXC_UPPER;
374ca955
A
183 } else {
184 return c;
185 }
729e4ab9 186 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
187 }
188 return c;
189}
190
46f4442e
A
191static const UChar iDot[2] = { 0x69, 0x307 };
192static const UChar jDot[2] = { 0x6a, 0x307 };
193static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
194static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
195static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
196static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
197
198
199U_CFUNC void U_EXPORT2
73c04bcf
A
200ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
201 uint16_t props;
202
203 /*
204 * Hardcode the case closure of i and its relatives and ignore the
205 * data file data for these characters.
206 * The Turkic dotless i and dotted I with their case mapping conditions
207 * and case folding option make the related characters behave specially.
208 * This code matches their closure behavior to their case folding behavior.
209 */
73c04bcf
A
210
211 switch(c) {
212 case 0x49:
213 /* regular i and I are in one equivalence class */
214 sa->add(sa->set, 0x69);
215 return;
216 case 0x69:
217 sa->add(sa->set, 0x49);
218 return;
219 case 0x130:
220 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
221 sa->addString(sa->set, iDot, 2);
222 return;
223 case 0x131:
224 /* dotless i is in a class by itself */
225 return;
226 default:
227 /* otherwise use the data file data */
228 break;
229 }
230
729e4ab9 231 props=UTRIE2_GET16(&csp->trie, c);
73c04bcf
A
232 if(!PROPS_HAS_EXCEPTION(props)) {
233 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
234 /* add the one simple case mapping, no matter what type it is */
235 int32_t delta=UCASE_GET_DELTA(props);
236 if(delta!=0) {
237 sa->add(sa->set, c+delta);
238 }
239 }
240 } else {
241 /*
242 * c has exceptions, so there may be multiple simple and/or
243 * full case mappings. Add them all.
244 */
245 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
246 const UChar *closure;
247 uint16_t excWord=*pe++;
729e4ab9 248 int32_t idx, closureLength, fullLength, length;
73c04bcf
A
249
250 pe0=pe;
251
252 /* add all simple case mappings */
729e4ab9
A
253 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
254 if(HAS_SLOT(excWord, idx)) {
73c04bcf 255 pe=pe0;
729e4ab9 256 GET_SLOT_VALUE(excWord, idx, pe, c);
73c04bcf
A
257 sa->add(sa->set, c);
258 }
259 }
260
261 /* get the closure string pointer & length */
262 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
263 pe=pe0;
264 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
265 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
266 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
267 } else {
268 closureLength=0;
269 closure=NULL;
270 }
271
272 /* add the full case folding */
273 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
274 pe=pe0;
275 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
276
277 /* start of full case mapping strings */
278 ++pe;
279
280 fullLength&=0xffff; /* bits 16 and higher are reserved */
281
282 /* skip the lowercase result string */
283 pe+=fullLength&UCASE_FULL_LOWER;
284 fullLength>>=4;
285
286 /* add the full case folding string */
287 length=fullLength&0xf;
288 if(length!=0) {
289 sa->addString(sa->set, (const UChar *)pe, length);
290 pe+=length;
291 }
292
293 /* skip the uppercase and titlecase strings */
294 fullLength>>=4;
295 pe+=fullLength&0xf;
296 fullLength>>=4;
297 pe+=fullLength;
298
299 closure=(const UChar *)pe; /* behind full case mappings */
300 }
301
302 /* add each code point in the closure string */
729e4ab9
A
303 for(idx=0; idx<closureLength;) {
304 U16_NEXT_UNSAFE(closure, idx, c);
73c04bcf
A
305 sa->add(sa->set, c);
306 }
307 }
308}
309
310/*
311 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
312 * must be length>0 and max>0 and length<=max
313 */
4388f060 314static inline int32_t
73c04bcf
A
315strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
316 int32_t c1, c2;
317
318 max-=length; /* we require length<=max, so no need to decrement max in the loop */
319 do {
320 c1=*s++;
321 c2=*t++;
322 if(c2==0) {
323 return 1; /* reached the end of t but not of s */
324 }
325 c1-=c2;
326 if(c1!=0) {
327 return c1; /* return difference result */
328 }
329 } while(--length>0);
330 /* ends with length==0 */
331
332 if(max==0 || *t==0) {
333 return 0; /* equal to length of both strings */
334 } else {
335 return -max; /* return lengh difference */
336 }
337}
338
46f4442e 339U_CFUNC UBool U_EXPORT2
73c04bcf 340ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
73c04bcf
A
341 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
342
343 if(csp->unfold==NULL || s==NULL) {
344 return FALSE; /* no reverse case folding data, or no string */
345 }
346 if(length<=1) {
347 /* the string is too short to find any match */
348 /*
349 * more precise would be:
350 * if(!u_strHasMoreChar32Than(s, length, 1))
351 * but this does not make much practical difference because
352 * a single supplementary code point would just not be found
353 */
354 return FALSE;
355 }
356
4388f060 357 const uint16_t *unfold=csp->unfold;
73c04bcf
A
358 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
359 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
360 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
361 unfold+=unfoldRowWidth;
362
363 if(length>unfoldStringWidth) {
364 /* the string is too long to find any match */
365 return FALSE;
366 }
367
368 /* do a binary search for the string */
369 start=0;
370 limit=unfoldRows;
371 while(start<limit) {
372 i=(start+limit)/2;
4388f060 373 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
73c04bcf
A
374 result=strcmpMax(s, length, p, unfoldStringWidth);
375
376 if(result==0) {
377 /* found the string: add each code point, and its case closure */
378 UChar32 c;
379
380 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
381 U16_NEXT_UNSAFE(p, i, c);
382 sa->add(sa->set, c);
383 ucase_addCaseClosure(csp, c, sa);
384 }
385 return TRUE;
386 } else if(result<0) {
387 limit=i;
388 } else /* result>0 */ {
389 start=i+1;
390 }
391 }
392
393 return FALSE; /* string not found */
394}
395
4388f060
A
396U_NAMESPACE_BEGIN
397
398FullCaseFoldingIterator::FullCaseFoldingIterator()
399 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
400 unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
401 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
402 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
403 currentRow(0),
404 rowCpIndex(unfoldStringWidth) {
405 unfold+=unfoldRowWidth;
406}
407
408UChar32
409FullCaseFoldingIterator::next(UnicodeString &full) {
410 // Advance past the last-delivered code point.
411 const UChar *p=unfold+(currentRow*unfoldRowWidth);
412 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
413 ++currentRow;
414 p+=unfoldRowWidth;
415 rowCpIndex=unfoldStringWidth;
416 }
417 if(currentRow>=unfoldRows) { return U_SENTINEL; }
418 // Set "full" to the NUL-terminated string in the first unfold column.
419 int32_t length=unfoldStringWidth;
420 while(length>0 && p[length-1]==0) { --length; }
421 full.setTo(FALSE, p, length);
422 // Return the code point.
423 UChar32 c;
424 U16_NEXT_UNSAFE(p, rowCpIndex, c);
425 return c;
426}
427
428U_NAMESPACE_END
429
374ca955
A
430/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
431U_CAPI int32_t U_EXPORT2
432ucase_getType(const UCaseProps *csp, UChar32 c) {
729e4ab9 433 uint16_t props=UTRIE2_GET16(&csp->trie, c);
73c04bcf 434 return UCASE_GET_TYPE(props);
374ca955
A
435}
436
729e4ab9 437/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
374ca955
A
438U_CAPI int32_t U_EXPORT2
439ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
729e4ab9 440 uint16_t props=UTRIE2_GET16(&csp->trie, c);
4388f060 441 return UCASE_GET_TYPE_AND_IGNORABLE(props);
374ca955
A
442}
443
444/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
4388f060 445static inline int32_t
374ca955 446getDotType(const UCaseProps *csp, UChar32 c) {
729e4ab9 447 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955
A
448 if(!PROPS_HAS_EXCEPTION(props)) {
449 return props&UCASE_DOT_MASK;
450 } else {
451 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
452 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
453 }
454}
455
456U_CAPI UBool U_EXPORT2
457ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
458 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
459}
460
461U_CAPI UBool U_EXPORT2
462ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
729e4ab9 463 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955
A
464 return (UBool)((props&UCASE_SENSITIVE)!=0);
465}
466
374ca955
A
467/* string casing ------------------------------------------------------------ */
468
469/*
470 * These internal functions form the core of string case mappings.
471 * They map single code points to result code points or strings and take
472 * all necessary conditions (context, locale ID, options) into account.
473 *
474 * They do not iterate over the source or write to the destination
475 * so that the same functions are useful for non-standard string storage,
476 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
477 * For the same reason, the "surrounding text" context is passed in as a
478 * UCaseContextIterator which does not make any assumptions about
479 * the underlying storage.
480 *
481 * This section contains helper functions that check for conditions
482 * in the input text surrounding the current code point
483 * according to SpecialCasing.txt.
484 *
485 * Each helper function gets the index
486 * - after the current code point if it looks at following text
487 * - before the current code point if it looks at preceding text
488 *
489 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
490 *
491 * Final_Sigma
492 * C is preceded by a sequence consisting of
493 * a cased letter and a case-ignorable sequence,
494 * and C is not followed by a sequence consisting of
495 * an ignorable sequence and then a cased letter.
496 *
497 * More_Above
498 * C is followed by one or more characters of combining class 230 (ABOVE)
499 * in the combining character sequence.
500 *
501 * After_Soft_Dotted
502 * The last preceding character with combining class of zero before C
503 * was Soft_Dotted,
504 * and there is no intervening combining character class 230 (ABOVE).
505 *
506 * Before_Dot
507 * C is followed by combining dot above (U+0307).
508 * Any sequence of characters with a combining class that is neither 0 nor 230
509 * may intervene between the current character and the combining dot above.
510 *
511 * The erratum from 2002-10-31 adds the condition
512 *
513 * After_I
514 * The last preceding base character was an uppercase I, and there is no
515 * intervening combining character class 230 (ABOVE).
516 *
517 * (See Jitterbug 2344 and the comments on After_I below.)
518 *
519 * Helper definitions in Unicode 3.2 UAX 21:
520 *
521 * D1. A character C is defined to be cased
522 * if it meets any of the following criteria:
523 *
524 * - The general category of C is Titlecase Letter (Lt)
525 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
526 * - Given D = NFD(C), then it is not the case that:
527 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
528 * (This third criterion does not add any characters to the list
529 * for Unicode 3.2. Ignored.)
530 *
531 * D2. A character C is defined to be case-ignorable
532 * if it meets either of the following criteria:
533 *
534 * - The general category of C is
535 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
536 * Letter Modifier (Lm), or Symbol Modifier (Sk)
537 * - C is one of the following characters
538 * U+0027 APOSTROPHE
539 * U+00AD SOFT HYPHEN (SHY)
540 * U+2019 RIGHT SINGLE QUOTATION MARK
541 * (the preferred character for apostrophe)
542 *
543 * D3. A case-ignorable sequence is a sequence of
544 * zero or more case-ignorable characters.
545 */
546
/*
 * Case-insensitive tests for single letters of a locale ID.
 * Both case variants are spelled out as character literals.
 * The argument is evaluated more than once: pass a plain variable.
 */
#define UCASE_IS_EITHER(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_a(c) UCASE_IS_EITHER(c, 'a', 'A')
#define is_d(c) UCASE_IS_EITHER(c, 'd', 'D')
#define is_e(c) UCASE_IS_EITHER(c, 'e', 'E')
#define is_i(c) UCASE_IS_EITHER(c, 'i', 'I')
#define is_l(c) UCASE_IS_EITHER(c, 'l', 'L')
#define is_n(c) UCASE_IS_EITHER(c, 'n', 'N')
#define is_r(c) UCASE_IS_EITHER(c, 'r', 'R')
#define is_t(c) UCASE_IS_EITHER(c, 't', 'T')
#define is_u(c) UCASE_IS_EITHER(c, 'u', 'U')
#define is_z(c) UCASE_IS_EITHER(c, 'z', 'Z')

/* separator? (underscore, hyphen, or the terminating NUL of the locale ID) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
560
73c04bcf 561/**
374ca955
A
562 * Requires non-NULL locale ID but otherwise does the equivalent of
563 * checking for language codes as if uloc_getLanguage() were called:
564 * Accepts both 2- and 3-letter codes and accepts case variants.
565 */
73c04bcf
A
566U_CFUNC int32_t
567ucase_getCaseLocale(const char *locale, int32_t *locCache) {
374ca955
A
568 int32_t result;
569 char c;
570
46f4442e 571 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
374ca955
A
572 return result;
573 }
574
46f4442e 575 result=UCASE_LOC_ROOT;
374ca955
A
576
577 /*
578 * This function used to use uloc_getLanguage(), but the current code
579 * removes the dependency of this low-level code on uloc implementation code
580 * and is faster because not the whole locale ID has to be
581 * examined and copied/transformed.
582 *
583 * Because this code does not want to depend on uloc, the caller must
584 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
585 */
586 c=*locale++;
587 if(is_t(c)) {
588 /* tr or tur? */
589 c=*locale++;
590 if(is_u(c)) {
591 c=*locale++;
592 }
593 if(is_r(c)) {
594 c=*locale;
595 if(is_sep(c)) {
46f4442e 596 result=UCASE_LOC_TURKISH;
374ca955
A
597 }
598 }
599 } else if(is_a(c)) {
600 /* az or aze? */
601 c=*locale++;
602 if(is_z(c)) {
603 c=*locale++;
604 if(is_e(c)) {
605 c=*locale;
606 }
607 if(is_sep(c)) {
46f4442e 608 result=UCASE_LOC_TURKISH;
374ca955
A
609 }
610 }
611 } else if(is_l(c)) {
612 /* lt or lit? */
613 c=*locale++;
614 if(is_i(c)) {
615 c=*locale++;
616 }
617 if(is_t(c)) {
618 c=*locale;
619 if(is_sep(c)) {
46f4442e
A
620 result=UCASE_LOC_LITHUANIAN;
621 }
622 }
623 } else if(is_n(c)) {
624 /* nl or nld? */
625 c=*locale++;
626 if(is_l(c)) {
627 c=*locale++;
628 if(is_d(c)) {
629 c=*locale;
630 }
631 if(is_sep(c)) {
632 result=UCASE_LOC_DUTCH;
374ca955
A
633 }
634 }
635 }
636
637 if(locCache!=NULL) {
638 *locCache=result;
639 }
640 return result;
641}
642
729e4ab9
A
643/*
644 * Is followed by
645 * {case-ignorable}* cased
646 * ?
647 * (dir determines looking forward/backward)
648 * If a character is case-ignorable, it is skipped regardless of whether
649 * it is also cased or not.
650 */
374ca955
A
651static UBool
652isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
653 UChar32 c;
374ca955
A
654
655 if(iter==NULL) {
656 return FALSE;
657 }
658
659 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
729e4ab9
A
660 int32_t type=ucase_getTypeOrIgnorable(csp, c);
661 if(type&4) {
374ca955 662 /* case-ignorable, continue with the loop */
729e4ab9
A
663 } else if(type!=UCASE_NONE) {
664 return TRUE; /* followed by cased letter */
374ca955 665 } else {
729e4ab9 666 return FALSE; /* uncased and not case-ignorable */
374ca955
A
667 }
668 }
669
670 return FALSE; /* not followed by cased letter */
671}
672
673/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
674static UBool
675isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
676 UChar32 c;
677 int32_t dotType;
678 int8_t dir;
679
680 if(iter==NULL) {
681 return FALSE;
682 }
683
684 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
685 dotType=getDotType(csp, c);
686 if(dotType==UCASE_SOFT_DOTTED) {
687 return TRUE; /* preceded by TYPE_i */
688 } else if(dotType!=UCASE_OTHER_ACCENT) {
689 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
690 }
691 }
692
693 return FALSE; /* not preceded by TYPE_i */
694}
695
696/*
697 * See Jitterbug 2344:
698 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
699 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
700 * we made those releases compatible with Unicode 3.2 which had not fixed
701 * a related bug in SpecialCasing.txt.
702 *
703 * From the Jitterbug 2344 text:
704 * ... this bug is listed as a Unicode erratum
705 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
706 * <quote>
707 * There are two errors in SpecialCasing.txt.
708 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
709 * 2. An incorrect context definition. Correct as follows:
710 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
711 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
712 * ---
713 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
714 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
715 * where the context After_I is defined as:
716 * The last preceding base character was an uppercase I, and there is no
717 * intervening combining character class 230 (ABOVE).
718 * </quote>
719 *
720 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
721 *
722 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
723 * # This matches the behavior of the canonically equivalent I-dot_above
724 *
725 * See also the description in this place in older versions of uchar.c (revision 1.100).
726 *
727 * Markus W. Scherer 2003-feb-15
728 */
729
730/* Is preceded by base character 'I' with no intervening cc=230 ? */
731static UBool
732isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
733 UChar32 c;
734 int32_t dotType;
735 int8_t dir;
736
737 if(iter==NULL) {
738 return FALSE;
739 }
740
741 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
742 if(c==0x49) {
743 return TRUE; /* preceded by I */
744 }
745 dotType=getDotType(csp, c);
746 if(dotType!=UCASE_OTHER_ACCENT) {
747 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
748 }
749 }
750
751 return FALSE; /* not preceded by I */
752}
753
754/* Is followed by one or more cc==230 ? */
755static UBool
756isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
757 UChar32 c;
758 int32_t dotType;
759 int8_t dir;
760
761 if(iter==NULL) {
762 return FALSE;
763 }
764
765 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
766 dotType=getDotType(csp, c);
767 if(dotType==UCASE_ABOVE) {
768 return TRUE; /* at least one cc==230 following */
769 } else if(dotType!=UCASE_OTHER_ACCENT) {
770 return FALSE; /* next base character, no more cc==230 following */
771 }
772 }
773
774 return FALSE; /* no more cc==230 following */
775}
776
777/* Is followed by a dot above (without cc==230 in between) ? */
778static UBool
779isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
780 UChar32 c;
781 int32_t dotType;
782 int8_t dir;
783
784 if(iter==NULL) {
785 return FALSE;
786 }
787
788 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
789 if(c==0x307) {
790 return TRUE;
791 }
792 dotType=getDotType(csp, c);
793 if(dotType!=UCASE_OTHER_ACCENT) {
794 return FALSE; /* next base character or cc==230 in between */
795 }
796 }
797
798 return FALSE; /* no dot above following */
799}
800
801U_CAPI int32_t U_EXPORT2
802ucase_toFullLower(const UCaseProps *csp, UChar32 c,
803 UCaseContextIterator *iter, void *context,
804 const UChar **pString,
46f4442e
A
805 const char *locale, int32_t *locCache)
806{
729e4ab9
A
807 UChar32 result=c;
808 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 809 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
810 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
811 result=c+UCASE_GET_DELTA(props);
374ca955
A
812 }
813 } else {
814 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
815 uint16_t excWord=*pe++;
816 int32_t full;
817
818 pe2=pe;
819
820 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
821 /* use hardcoded conditions and mappings */
73c04bcf 822 int32_t loc=ucase_getCaseLocale(locale, locCache);
374ca955
A
823
824 /*
825 * Test for conditional mappings first
826 * (otherwise the unconditional default mappings are always taken),
827 * then test for characters that have unconditional mappings in SpecialCasing.txt,
828 * then get the UnicodeData.txt mappings.
829 */
46f4442e 830 if( loc==UCASE_LOC_LITHUANIAN &&
374ca955
A
831 /* base characters, find accents above */
832 (((c==0x49 || c==0x4a || c==0x12e) &&
833 isFollowedByMoreAbove(csp, iter, context)) ||
834 /* precomposed with accent above, no need to find one */
835 (c==0xcc || c==0xcd || c==0x128))
836 ) {
837 /*
838 # Lithuanian
839
840 # Lithuanian retains the dot in a lowercase i when followed by accents.
841
842 # Introduce an explicit dot above when lowercasing capital I's and J's
843 # whenever there are more accents above.
844 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
845
846 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
847 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
848 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
849 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
850 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
851 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
852 */
853 switch(c) {
854 case 0x49: /* LATIN CAPITAL LETTER I */
855 *pString=iDot;
856 return 2;
857 case 0x4a: /* LATIN CAPITAL LETTER J */
858 *pString=jDot;
859 return 2;
860 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
861 *pString=iOgonekDot;
862 return 2;
863 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
864 *pString=iDotGrave;
865 return 3;
866 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
867 *pString=iDotAcute;
868 return 3;
869 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
870 *pString=iDotTilde;
871 return 3;
872 default:
873 return 0; /* will not occur */
874 }
875 /* # Turkish and Azeri */
46f4442e 876 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
374ca955
A
877 /*
878 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
879 # The following rules handle those cases.
880
881 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
882 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
883 */
884 return 0x69;
46f4442e 885 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
374ca955
A
886 /*
887 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
888 # This matches the behavior of the canonically equivalent I-dot_above
889
890 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
891 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
892 */
893 return 0; /* remove the dot (continue without output) */
46f4442e 894 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
374ca955
A
895 /*
896 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
897
898 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
899 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
900 */
901 return 0x131;
902 } else if(c==0x130) {
903 /*
904 # Preserve canonical equivalence for I with dot. Turkic is handled below.
905
906 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
907 */
908 *pString=iDot;
909 return 2;
910 } else if( c==0x3a3 &&
911 !isFollowedByCasedLetter(csp, iter, context, 1) &&
912 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
913 ) {
914 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
915 /*
916 # Special case for final form of sigma
917
918 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
919 */
920 return 0x3c2; /* greek small final sigma */
921 } else {
922 /* no known conditional special case mapping, use a normal mapping */
923 }
924 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
925 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
926 full&=UCASE_FULL_LOWER;
927 if(full!=0) {
928 /* set the output pointer to the lowercase mapping */
4388f060 929 *pString=reinterpret_cast<const UChar *>(pe+1);
374ca955
A
930
931 /* return the string length */
932 return full;
933 }
934 }
935
936 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
937 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
938 }
939 }
940
941 return (result==c) ? ~result : result;
942}
943
944/* internal */
945static int32_t
946toUpperOrTitle(const UCaseProps *csp, UChar32 c,
947 UCaseContextIterator *iter, void *context,
948 const UChar **pString,
949 const char *locale, int32_t *locCache,
950 UBool upperNotTitle) {
729e4ab9
A
951 UChar32 result=c;
952 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 953 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
954 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
955 result=c+UCASE_GET_DELTA(props);
374ca955
A
956 }
957 } else {
958 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
959 uint16_t excWord=*pe++;
729e4ab9 960 int32_t full, idx;
374ca955
A
961
962 pe2=pe;
963
964 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
965 /* use hardcoded conditions and mappings */
73c04bcf 966 int32_t loc=ucase_getCaseLocale(locale, locCache);
374ca955 967
46f4442e 968 if(loc==UCASE_LOC_TURKISH && c==0x69) {
374ca955
A
969 /*
970 # Turkish and Azeri
971
972 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
973 # The following rules handle those cases.
974
975 # When uppercasing, i turns into a dotted capital I
976
977 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
978 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
979 */
980 return 0x130;
46f4442e 981 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
374ca955
A
982 /*
983 # Lithuanian
984
985 # Lithuanian retains the dot in a lowercase i when followed by accents.
986
987 # Remove DOT ABOVE after "i" with upper or titlecase
988
989 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
990 */
991 return 0; /* remove the dot (continue without output) */
992 } else {
993 /* no known conditional special case mapping, use a normal mapping */
994 }
995 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
996 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
997
998 /* start of full case mapping strings */
999 ++pe;
1000
1001 /* skip the lowercase and case-folding result strings */
1002 pe+=full&UCASE_FULL_LOWER;
1003 full>>=4;
1004 pe+=full&0xf;
1005 full>>=4;
1006
1007 if(upperNotTitle) {
1008 full&=0xf;
1009 } else {
1010 /* skip the uppercase result string */
1011 pe+=full&0xf;
1012 full=(full>>4)&0xf;
1013 }
1014
1015 if(full!=0) {
1016 /* set the output pointer to the result string */
4388f060 1017 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1018
1019 /* return the string length */
1020 return full;
1021 }
1022 }
1023
1024 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
729e4ab9 1025 idx=UCASE_EXC_TITLE;
374ca955
A
1026 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1027 /* here, titlecase is same as uppercase */
729e4ab9 1028 idx=UCASE_EXC_UPPER;
374ca955
A
1029 } else {
1030 return ~c;
1031 }
729e4ab9 1032 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1033 }
1034
1035 return (result==c) ? ~result : result;
1036}
1037
1038U_CAPI int32_t U_EXPORT2
1039ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1040 UCaseContextIterator *iter, void *context,
1041 const UChar **pString,
1042 const char *locale, int32_t *locCache) {
1043 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1044}
1045
1046U_CAPI int32_t U_EXPORT2
1047ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1048 UCaseContextIterator *iter, void *context,
1049 const UChar **pString,
1050 const char *locale, int32_t *locCache) {
1051 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1052}
1053
1054/* case folding ------------------------------------------------------------- */
1055
1056/*
1057 * Case folding is similar to lowercasing.
1058 * The result may be a simple mapping, i.e., a single code point, or
1059 * a full mapping, i.e., a string.
1060 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1061 * then only the lowercase mapping is stored.
1062 *
1063 * Some special cases are hardcoded because their conditions cannot be
1064 * parsed and processed from CaseFolding.txt.
1065 *
1066 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1067
1068# C: common case folding, common mappings shared by both simple and full mappings.
1069# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1070# S: simple case folding, mappings to single characters where different from F.
1071# T: special case for uppercase I and dotted uppercase I
1072# - For non-Turkic languages, this mapping is normally not used.
1073# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1074#
1075# Usage:
1076# A. To do a simple case folding, use the mappings with status C + S.
1077# B. To do a full case folding, use the mappings with status C + F.
1078#
1079# The mappings with status T can be used or omitted depending on the desired case-folding
1080# behavior. (The default option is to exclude them.)
1081
1082 * Unicode 3.2 has 'T' mappings as follows:
1083
10840049; T; 0131; # LATIN CAPITAL LETTER I
10850130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1086
1087 * while the default mappings for these code points are:
1088
10890049; C; 0069; # LATIN CAPITAL LETTER I
10900130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1091
73c04bcf 1092 * U+0130 has no simple case folding (simple-case-folds to itself).
374ca955
A
1093 */
1094
1095/* return the simple case folding mapping for c */
1096U_CAPI UChar32 U_EXPORT2
73c04bcf 1097ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
729e4ab9 1098 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 1099 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
1100 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1101 c+=UCASE_GET_DELTA(props);
374ca955
A
1102 }
1103 } else {
1104 const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1105 uint16_t excWord=*pe++;
729e4ab9 1106 int32_t idx;
374ca955
A
1107 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1108 /* special case folding mappings, hardcoded */
1109 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1110 /* default mappings */
1111 if(c==0x49) {
1112 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1113 return 0x69;
1114 } else if(c==0x130) {
73c04bcf
A
1115 /* no simple case folding for U+0130 */
1116 return c;
374ca955
A
1117 }
1118 } else {
1119 /* Turkic mappings */
1120 if(c==0x49) {
1121 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1122 return 0x131;
1123 } else if(c==0x130) {
1124 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1125 return 0x69;
1126 }
1127 }
1128 }
1129 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1130 idx=UCASE_EXC_FOLD;
374ca955 1131 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1132 idx=UCASE_EXC_LOWER;
374ca955
A
1133 } else {
1134 return c;
1135 }
729e4ab9 1136 GET_SLOT_VALUE(excWord, idx, pe, c);
374ca955
A
1137 }
1138 return c;
1139}
1140
1141/*
1142 * Issue for canonical caseless match (UAX #21):
1143 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1144 * canonical equivalence, unlike default-option casefolding.
1145 * For example, I-grave and I + grave fold to strings that are not canonically
1146 * equivalent.
1147 * For more details, see the comment in unorm_compare() in unorm.cpp
1148 * and the intermediate prototype changes for Jitterbug 2021.
1149 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1150 *
1151 * This did not get fixed because it appears that it is not possible to fix
1152 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1153 * together in a way that they still fold to common result strings.
1154 */
1155
1156U_CAPI int32_t U_EXPORT2
1157ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1158 const UChar **pString,
46f4442e
A
1159 uint32_t options)
1160{
729e4ab9
A
1161 UChar32 result=c;
1162 uint16_t props=UTRIE2_GET16(&csp->trie, c);
374ca955 1163 if(!PROPS_HAS_EXCEPTION(props)) {
73c04bcf
A
1164 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1165 result=c+UCASE_GET_DELTA(props);
374ca955
A
1166 }
1167 } else {
1168 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1169 uint16_t excWord=*pe++;
729e4ab9 1170 int32_t full, idx;
374ca955
A
1171
1172 pe2=pe;
1173
1174 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1175 /* use hardcoded conditions and mappings */
1176 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1177 /* default mappings */
1178 if(c==0x49) {
1179 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1180 return 0x69;
1181 } else if(c==0x130) {
1182 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1183 *pString=iDot;
1184 return 2;
1185 }
1186 } else {
1187 /* Turkic mappings */
1188 if(c==0x49) {
1189 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1190 return 0x131;
1191 } else if(c==0x130) {
1192 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1193 return 0x69;
1194 }
1195 }
1196 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1197 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1198
1199 /* start of full case mapping strings */
1200 ++pe;
1201
1202 /* skip the lowercase result string */
1203 pe+=full&UCASE_FULL_LOWER;
1204 full=(full>>4)&0xf;
1205
1206 if(full!=0) {
1207 /* set the output pointer to the result string */
4388f060 1208 *pString=reinterpret_cast<const UChar *>(pe);
374ca955
A
1209
1210 /* return the string length */
1211 return full;
1212 }
1213 }
1214
1215 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
729e4ab9 1216 idx=UCASE_EXC_FOLD;
374ca955 1217 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
729e4ab9 1218 idx=UCASE_EXC_LOWER;
374ca955
A
1219 } else {
1220 return ~c;
1221 }
729e4ab9 1222 GET_SLOT_VALUE(excWord, idx, pe2, result);
374ca955
A
1223 }
1224
1225 return (result==c) ? ~result : result;
1226}
73c04bcf
A
1227
1228/* case mapping properties API ---------------------------------------------- */
1229
/*
 * Yields a pointer to the built-in case properties singleton.
 * The expansion is parenthesized so that it stays a single operand
 * wherever the macro is used inside a larger expression.
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
73c04bcf
A
1231
1232/* public API (see uchar.h) */
1233
1234U_CAPI UBool U_EXPORT2
1235u_isULowercase(UChar32 c) {
1236 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1237}
1238
1239U_CAPI UBool U_EXPORT2
1240u_isUUppercase(UChar32 c) {
1241 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1242}
1243
1244/* Transforms the Unicode character to its lower case equivalent.*/
1245U_CAPI UChar32 U_EXPORT2
1246u_tolower(UChar32 c) {
1247 return ucase_tolower(GET_CASE_PROPS(), c);
1248}
1249
1250/* Transforms the Unicode character to its upper case equivalent.*/
1251U_CAPI UChar32 U_EXPORT2
1252u_toupper(UChar32 c) {
1253 return ucase_toupper(GET_CASE_PROPS(), c);
1254}
1255
1256/* Transforms the Unicode character to its title case equivalent.*/
1257U_CAPI UChar32 U_EXPORT2
1258u_totitle(UChar32 c) {
1259 return ucase_totitle(GET_CASE_PROPS(), c);
1260}
1261
1262/* return the simple case folding mapping for c */
1263U_CAPI UChar32 U_EXPORT2
1264u_foldCase(UChar32 c, uint32_t options) {
1265 return ucase_fold(GET_CASE_PROPS(), c, options);
1266}
1267
1268U_CFUNC int32_t U_EXPORT2
1269ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1270 /* case mapping properties */
729e4ab9
A
1271 const UChar *resultString;
1272 int32_t locCache;
73c04bcf
A
1273 const UCaseProps *csp=GET_CASE_PROPS();
1274 if(csp==NULL) {
1275 return FALSE;
1276 }
1277 switch(which) {
1278 case UCHAR_LOWERCASE:
1279 return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1280 case UCHAR_UPPERCASE:
1281 return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1282 case UCHAR_SOFT_DOTTED:
1283 return ucase_isSoftDotted(csp, c);
1284 case UCHAR_CASE_SENSITIVE:
1285 return ucase_isCaseSensitive(csp, c);
729e4ab9
A
1286 case UCHAR_CASED:
1287 return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1288 case UCHAR_CASE_IGNORABLE:
1289 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1290 /*
1291 * Note: The following Changes_When_Xyz are defined as testing whether
1292 * the NFD form of the input changes when Xyz-case-mapped.
1293 * However, this simpler implementation of these properties,
1294 * ignoring NFD, passes the tests.
1295 * The implementation needs to be changed if the tests start failing.
1296 * When that happens, optimizations should be used to work with the
1297 * per-single-code point ucase_toFullXyz() functions unless
1298 * the NFD form has more than one code point,
1299 * and the property starts set needs to be the union of the
1300 * start sets for normalization and case mappings.
1301 */
1302 case UCHAR_CHANGES_WHEN_LOWERCASED:
1303 locCache=UCASE_LOC_ROOT;
1304 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1305 case UCHAR_CHANGES_WHEN_UPPERCASED:
1306 locCache=UCASE_LOC_ROOT;
1307 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1308 case UCHAR_CHANGES_WHEN_TITLECASED:
1309 locCache=UCASE_LOC_ROOT;
1310 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1311 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1312 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1313 locCache=UCASE_LOC_ROOT;
1314 return (UBool)(
1315 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1316 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1317 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
73c04bcf
A
1318 default:
1319 return FALSE;
1320 }
1321}