icuSources/common/ucase.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2004-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  ucase.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2004aug30
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Low-level Unicode character/string case mapping code.
  19 *   Much code moved here (and modified) from uchar.c.
  20 */
  21
  22 #include "unicode/utypes.h"
  23 #include "unicode/unistr.h"
  24 #include "unicode/uset.h"
  25 #include "unicode/udata.h" /* UDataInfo */
  26 #include "unicode/utf16.h"
  27 #include "ucmndata.h" /* DataHeader */
  28 #include "udatamem.h"
  29 #include "umutex.h"
  30 #include "uassert.h"
  31 #include "cmemory.h"
  32 #include "utrie2.h"
  33 #include "ucase.h"
  34
/*
 * Container for the case-properties data used throughout this file.
 * All lookups below go through the instance named ucase_props_singleton
 * (presumably defined in the machine-generated ucase_props_data.h -- confirm there).
 * NOTE: the member order must stay as is if that generated file
 * initializes the struct positionally.
 */
struct UCaseProps {
    UDataMemory *mem;
    const int32_t *indexes;
    /* base of the exception words; GET_EXCEPTIONS() indexes into this array */
    const uint16_t *exceptions;
    /* reverse case folding ("unfold") table; ucase_addStringCaseClosure() checks it for NULL */
    const uint16_t *unfold;

    /* maps a code point to its 16-bit properties word (read via UTRIE2_GET16) */
    UTrie2 trie;
    uint8_t formatVersion[4];
};
  44
  45 /* ucase_props_data.h is machine-generated by gencase --csource */
  46 #define INCLUDED_FROM_UCASE_CPP
  47 #include "ucase_props_data.h"
  48
  49 /* set of property starts for UnicodeSet ------------------------------------ */
  50
  51 static UBool U_CALLCONV
  52 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
  53     /* add the start code point to the USet */
  54     const USetAdder *sa=(const USetAdder *)context;
  55     sa->add(sa->set, start);
  56     return TRUE;
  57 }
  58
  59 U_CFUNC void U_EXPORT2
  60 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
  61     if(U_FAILURE(*pErrorCode)) {
  62         return;
  63     }
  64
  65     /* add the start code point of each same-value range of the trie */
  66     utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
  67
  68     /* add code points with hardcoded properties, plus the ones following them */
  69
  70     /* (none right now, see comment below) */
  71
  72     /*
  73      * Omit code points with hardcoded specialcasing properties
  74      * because we do not build property UnicodeSets for them right now.
  75      */
  76 }
  77
  78 /* data access primitives --------------------------------------------------- */
  79
  80 U_CFUNC const UTrie2 * U_EXPORT2
  81 ucase_getTrie() {
  82     return &ucase_props_singleton.trie;
  83 }
  84
/*
 * Pointer to the exceptions word that belongs to a properties word:
 * props shifted right by UCASE_EXC_SHIFT is used as an index into (csp)->exceptions.
 * Only meaningful when UCASE_HAS_EXCEPTION(props); all callers in this file check that first.
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
  86
  87 /* number of bits in an 8-bit integer value */
  88 static const uint8_t flagsOffset[256]={
  89     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  90     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  91     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  92     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  93     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  94     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  95     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  96     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
  97     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  98     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
  99     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 100     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 101     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 102     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 103     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 104     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 105 };
 106
/* Nonzero if bit number idx is set in flags, i.e. if the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * Number of bits set in flags below bit number idx, looked up in flagsOffset[].
 * idx must not exceed 8 so that the masked value stays within the 256-entry table.
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
 109
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * The slot position is found by counting the slot bits below idx (SLOT_OFFSET).
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, each slot is one uint16_t.
 * If it is set, each slot is two uint16_t which are combined into one value,
 * with the first unit in the upper 16 bits.
 *
 * This is a statement macro: excWord and pExc16 are evaluated more than once,
 * and pExc16 must be a modifiable pointer variable.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output; the macro does not test HAS_SLOT itself,
 *              the caller must do so before using it
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END
 129
 130 /* simple case mappings ----------------------------------------------------- */
 131
 132 U_CAPI UChar32 U_EXPORT2
 133 ucase_tolower(UChar32 c) {
 134     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 135     if(!UCASE_HAS_EXCEPTION(props)) {
 136         if(UCASE_IS_UPPER_OR_TITLE(props)) {
 137             c+=UCASE_GET_DELTA(props);
 138         }
 139     } else {
 140         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
 141         uint16_t excWord=*pe++;
 142         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
 143             int32_t delta;
 144             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
 145             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
 146         }
 147         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
 148             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
 149         }
 150     }
 151     return c;
 152 }
 153
 154 U_CAPI UChar32 U_EXPORT2
 155 ucase_toupper(UChar32 c) {
 156     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 157     if(!UCASE_HAS_EXCEPTION(props)) {
 158         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 159             c+=UCASE_GET_DELTA(props);
 160         }
 161     } else {
 162         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
 163         uint16_t excWord=*pe++;
 164         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
 165             int32_t delta;
 166             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
 167             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
 168         }
 169         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 170             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
 171         }
 172     }
 173     return c;
 174 }
 175
 176 U_CAPI UChar32 U_EXPORT2
 177 ucase_totitle(UChar32 c) {
 178     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 179     if(!UCASE_HAS_EXCEPTION(props)) {
 180         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 181             c+=UCASE_GET_DELTA(props);
 182         }
 183     } else {
 184         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
 185         uint16_t excWord=*pe++;
 186         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
 187             int32_t delta;
 188             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
 189             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
 190         }
 191         int32_t idx;
 192         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
 193             idx=UCASE_EXC_TITLE;
 194         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 195             idx=UCASE_EXC_UPPER;
 196         } else {
 197             return c;
 198         }
 199         GET_SLOT_VALUE(excWord, idx, pe, c);
 200     }
 201     return c;
 202 }
 203
/*
 * Constant UTF-16 strings: a base letter (i, j, or U+012F) followed by
 * U+0307 (combining dot above), optionally followed by one more
 * combining mark (U+0300, U+0301 or U+0303).
 * In the part of the file reviewed here only iDot is used (ucase_addCaseClosure()).
 *
 * NOTE(review): iOgonekDot is declared with 3 elements but has only two
 * initializers, so its third element is 0. Confirm that its users pass
 * a length of 2, or shrink the array to [2].
 */
static const UChar iDot[2] = { 0x69, 0x307 };
static const UChar jDot[2] = { 0x6a, 0x307 };
static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
 210
 211
 212 U_CFUNC void U_EXPORT2
 213 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
 214     uint16_t props;
 215
 216     /*
 217      * Hardcode the case closure of i and its relatives and ignore the
 218      * data file data for these characters.
 219      * The Turkic dotless i and dotted I with their case mapping conditions
 220      * and case folding option make the related characters behave specially.
 221      * This code matches their closure behavior to their case folding behavior.
 222      */
 223
 224     switch(c) {
 225     case 0x49:
 226         /* regular i and I are in one equivalence class */
 227         sa->add(sa->set, 0x69);
 228         return;
 229     case 0x69:
 230         sa->add(sa->set, 0x49);
 231         return;
 232     case 0x130:
 233         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
 234         sa->addString(sa->set, iDot, 2);
 235         return;
 236     case 0x131:
 237         /* dotless i is in a class by itself */
 238         return;
 239     default:
 240         /* otherwise use the data file data */
 241         break;
 242     }
 243
 244     props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 245     if(!UCASE_HAS_EXCEPTION(props)) {
 246         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
 247             /* add the one simple case mapping, no matter what type it is */
 248             int32_t delta=UCASE_GET_DELTA(props);
 249             if(delta!=0) {
 250                 sa->add(sa->set, c+delta);
 251             }
 252         }
 253     } else {
 254         /*
 255          * c has exceptions, so there may be multiple simple and/or
 256          * full case mappings. Add them all.
 257          */
 258         const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
 259         const UChar *closure;
 260         uint16_t excWord=*pe++;
 261         int32_t idx, closureLength, fullLength, length;
 262
 263         pe0=pe;
 264
 265         /* add all simple case mappings */
 266         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
 267             if(HAS_SLOT(excWord, idx)) {
 268                 pe=pe0;
 269                 GET_SLOT_VALUE(excWord, idx, pe, c);
 270                 sa->add(sa->set, c);
 271             }
 272         }
 273         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
 274             pe=pe0;
 275             int32_t delta;
 276             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
 277             sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
 278         }
 279
 280         /* get the closure string pointer & length */
 281         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
 282             pe=pe0;
 283             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
 284             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
 285             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
 286         } else {
 287             closureLength=0;
 288             closure=NULL;
 289         }
 290
 291         /* add the full case folding */
 292         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
 293             pe=pe0;
 294             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
 295
 296             /* start of full case mapping strings */
 297             ++pe;
 298
 299             fullLength&=0xffff; /* bits 16 and higher are reserved */
 300
 301             /* skip the lowercase result string */
 302             pe+=fullLength&UCASE_FULL_LOWER;
 303             fullLength>>=4;
 304
 305             /* add the full case folding string */
 306             length=fullLength&0xf;
 307             if(length!=0) {
 308                 sa->addString(sa->set, (const UChar *)pe, length);
 309                 pe+=length;
 310             }
 311
 312             /* skip the uppercase and titlecase strings */
 313             fullLength>>=4;
 314             pe+=fullLength&0xf;
 315             fullLength>>=4;
 316             pe+=fullLength;
 317
 318             closure=(const UChar *)pe; /* behind full case mappings */
 319         }
 320
 321         /* add each code point in the closure string */
 322         for(idx=0; idx<closureLength;) {
 323             U16_NEXT_UNSAFE(closure, idx, c);
 324             sa->add(sa->set, c);
 325         }
 326     }
 327 }
 328
 329 /*
 330  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
 331  * must be length>0 and max>0 and length<=max
 332  */
 333 static inline int32_t
 334 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
 335     int32_t c1, c2;
 336
 337     max-=length; /* we require length<=max, so no need to decrement max in the loop */
 338     do {
 339         c1=*s++;
 340         c2=*t++;
 341         if(c2==0) {
 342             return 1; /* reached the end of t but not of s */
 343         }
 344         c1-=c2;
 345         if(c1!=0) {
 346             return c1; /* return difference result */
 347         }
 348     } while(--length>0);
 349     /* ends with length==0 */
 350
 351     if(max==0 || *t==0) {
 352         return 0; /* equal to length of both strings */
 353     } else {
 354         return -max; /* return lengh difference */
 355     }
 356 }
 357
 358 U_CFUNC UBool U_EXPORT2
 359 ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
 360     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
 361
 362     if(ucase_props_singleton.unfold==NULL || s==NULL) {
 363         return FALSE; /* no reverse case folding data, or no string */
 364     }
 365     if(length<=1) {
 366         /* the string is too short to find any match */
 367         /*
 368          * more precise would be:
 369          * if(!u_strHasMoreChar32Than(s, length, 1))
 370          * but this does not make much practical difference because
 371          * a single supplementary code point would just not be found
 372          */
 373         return FALSE;
 374     }
 375
 376     const uint16_t *unfold=ucase_props_singleton.unfold;
 377     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
 378     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
 379     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
 380     unfold+=unfoldRowWidth;
 381
 382     if(length>unfoldStringWidth) {
 383         /* the string is too long to find any match */
 384         return FALSE;
 385     }
 386
 387     /* do a binary search for the string */
 388     start=0;
 389     limit=unfoldRows;
 390     while(start<limit) {
 391         i=(start+limit)/2;
 392         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
 393         result=strcmpMax(s, length, p, unfoldStringWidth);
 394
 395         if(result==0) {
 396             /* found the string: add each code point, and its case closure */
 397             UChar32 c;
 398
 399             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
 400                 U16_NEXT_UNSAFE(p, i, c);
 401                 sa->add(sa->set, c);
 402                 ucase_addCaseClosure(c, sa);
 403             }
 404             return TRUE;
 405         } else if(result<0) {
 406             limit=i;
 407         } else /* result>0 */ {
 408             start=i+1;
 409         }
 410     }
 411
 412     return FALSE; /* string not found */
 413 }
 414
 415 U_NAMESPACE_BEGIN
 416
 417 FullCaseFoldingIterator::FullCaseFoldingIterator()
 418         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
 419           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
 420           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
 421           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
 422           currentRow(0),
 423           rowCpIndex(unfoldStringWidth) {
 424     unfold+=unfoldRowWidth;
 425 }
 426
 427 UChar32
 428 FullCaseFoldingIterator::next(UnicodeString &full) {
 429     // Advance past the last-delivered code point.
 430     const UChar *p=unfold+(currentRow*unfoldRowWidth);
 431     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
 432         ++currentRow;
 433         p+=unfoldRowWidth;
 434         rowCpIndex=unfoldStringWidth;
 435     }
 436     if(currentRow>=unfoldRows) { return U_SENTINEL; }
 437     // Set "full" to the NUL-terminated string in the first unfold column.
 438     int32_t length=unfoldStringWidth;
 439     while(length>0 && p[length-1]==0) { --length; }
 440     full.setTo(FALSE, p, length);
 441     // Return the code point.
 442     UChar32 c;
 443     U16_NEXT_UNSAFE(p, rowCpIndex, c);
 444     return c;
 445 }
 446
/*
 * Per-code-point lookup tables for the start of the code point range.
 * Each initializer below lists 24 rows of 16 values, one value per code point
 * starting at U+0000 (so LIMIT is expected to be 0x180 -- confirm in the header).
 *
 * A value is either a small signed number or EXC. The numbers look like case
 * mapping deltas (for example 32 at 'A'..'Z' in the lowercase tables and -32 at
 * 'a'..'z' in the uppercase tables); EXC presumably marks code points that the
 * users of these tables must handle separately. Both LIMIT and EXC are defined
 * outside of this part of the file.
 */
namespace LatinCase {

/* Lowercasing, general case. */
const int8_t TO_LOWER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

/*
 * Lowercasing variant; going by the name, for Turkic and Lithuanian (confirm at the users).
 * Differs from TO_LOWER_NORMAL only by additional EXC entries
 * (U+0049, U+004A, U+00CC, U+00CD, U+0128, U+012E).
 */
const int8_t TO_LOWER_TR_LT[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

/* Uppercasing, general case. */
const int8_t TO_UPPER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

/*
 * Uppercasing variant; going by the name, for Turkic (confirm at the users).
 * Differs from TO_UPPER_NORMAL only by the EXC entry at U+0069.
 */
const int8_t TO_UPPER_TR[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

}  // namespace LatinCase
 578
 579 U_NAMESPACE_END
 580
 581 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
 582 U_CAPI int32_t U_EXPORT2
 583 ucase_getType(UChar32 c) {
 584     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 585     return UCASE_GET_TYPE(props);
 586 }
 587
 588 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
 589 U_CAPI int32_t U_EXPORT2
 590 ucase_getTypeOrIgnorable(UChar32 c) {
 591     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 592     return UCASE_GET_TYPE_AND_IGNORABLE(props);
 593 }
 594
 595 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
 596 static inline int32_t
 597 getDotType(UChar32 c) {
 598     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 599     if(!UCASE_HAS_EXCEPTION(props)) {
 600         return props&UCASE_DOT_MASK;
 601     } else {
 602         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
 603         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
 604     }
 605 }
 606
 607 U_CAPI UBool U_EXPORT2
 608 ucase_isSoftDotted(UChar32 c) {
 609     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
 610 }
 611
 612 U_CAPI UBool U_EXPORT2
 613 ucase_isCaseSensitive(UChar32 c) {
 614     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
 615     if(!UCASE_HAS_EXCEPTION(props)) {
 616         return (UBool)((props&UCASE_SENSITIVE)!=0);
 617     } else {
 618         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
 619         return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
 620     }
 621 }
 622
 623 /* string casing ------------------------------------------------------------ */
 624
 625 /*
 626  * These internal functions form the core of string case mappings.
 627  * They map single code points to result code points or strings and take
 628  * all necessary conditions (context, locale ID, options) into account.
 629  *
 630  * They do not iterate over the source or write to the destination
 631  * so that the same functions are useful for non-standard string storage,
 632  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
 633  * For the same reason, the "surrounding text" context is passed in as a
 634  * UCaseContextIterator which does not make any assumptions about
 635  * the underlying storage.
 636  *
 637  * This section contains helper functions that check for conditions
 638  * in the input text surrounding the current code point
 639  * according to SpecialCasing.txt.
 640  *
 641  * Each helper function gets the index
 642  * - after the current code point if it looks at following text
 643  * - before the current code point if it looks at preceding text
 644  *
 645  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
 646  *
 647  * Final_Sigma
 648  *   C is preceded by a sequence consisting of
 649  *     a cased letter and a case-ignorable sequence,
 650  *   and C is not followed by a sequence consisting of
 651  *     an ignorable sequence and then a cased letter.
 652  *
 653  * More_Above
 654  *   C is followed by one or more characters of combining class 230 (ABOVE)
 655  *   in the combining character sequence.
 656  *
 657  * After_Soft_Dotted
 658  *   The last preceding character with combining class of zero before C
 659  *   was Soft_Dotted,
 660  *   and there is no intervening combining character class 230 (ABOVE).
 661  *
 662  * Before_Dot
 663  *   C is followed by combining dot above (U+0307).
 664  *   Any sequence of characters with a combining class that is neither 0 nor 230
 665  *   may intervene between the current character and the combining dot above.
 666  *
 667  * The erratum from 2002-10-31 adds the condition
 668  *
 669  * After_I
 670  *   The last preceding base character was an uppercase I, and there is no
 671  *   intervening combining character class 230 (ABOVE).
 672  *
 673  *   (See Jitterbug 2344 and the comments on After_I below.)
 674  *
 675  * Helper definitions in Unicode 3.2 UAX 21:
 676  *
 677  * D1. A character C is defined to be cased
 678  *     if it meets any of the following criteria:
 679  *
 680  *   - The general category of C is Titlecase Letter (Lt)
 681  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
 682  *   - Given D = NFD(C), then it is not the case that:
 683  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
 *      for Unicode 3.2. Ignored.)
 686  *
 687  * D2. A character C is defined to be case-ignorable
 688  *     if it meets either of the following criteria:
 689  *
 690  *   - The general category of C is
 691  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
 692  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
 693  *   - C is one of the following characters
 694  *     U+0027 APOSTROPHE
 695  *     U+00AD SOFT HYPHEN (SHY)
 696  *     U+2019 RIGHT SINGLE QUOTATION MARK
 697  *            (the preferred character for apostrophe)
 698  *
 699  * D3. A case-ignorable sequence is a sequence of
 700  *     zero or more case-ignorable characters.
 701  */
 702
/*
 * Tests for single letters in either case.
 * ucase_getCaseLocale() applies them to the characters of a locale ID
 * to recognize language codes regardless of case.
 * The argument is evaluated twice (three times in is_sep).
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? An underscore, a hyphen, or the terminating NUL all end the language code. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
 714
 715 /**
 716  * Requires non-NULL locale ID but otherwise does the equivalent of
 717  * checking for language codes as if uloc_getLanguage() were called:
 718  * Accepts both 2- and 3-letter codes and accepts case variants.
 719  */
 720 U_CFUNC int32_t
 721 ucase_getCaseLocale(const char *locale) {
 722     /*
 723      * This function used to use uloc_getLanguage(), but the current code
 724      * removes the dependency of this low-level code on uloc implementation code
 725      * and is faster because not the whole locale ID has to be
 726      * examined and copied/transformed.
 727      *
 728      * Because this code does not want to depend on uloc, the caller must
 729      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
 730      */
 731     char c=*locale++;
 732     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
 733     // and for Chinese "zh": Very common but no special case mapping behavior.
 734     // Then check lowercase vs. uppercase to reduce the number of comparisons
 735     // for other locales without special behavior.
 736     if(c=='e') {
 737         /* el or ell? */
 738         c=*locale++;
 739         if(is_l(c)) {
 740             c=*locale++;
 741             if(is_l(c)) {
 742                 c=*locale;
 743             }
 744             if(is_sep(c)) {
 745                 return UCASE_LOC_GREEK;
 746             }
 747         }
 748         // en, es, ... -> root
 749     } else if(c=='z') {
 750         return UCASE_LOC_ROOT;
 751 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 752     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
 753 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
 754     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
 755 #else
 756 #   error Unknown charset family!
 757 #endif
 758         // lowercase c
 759         if(c=='t') {
 760             /* tr or tur? */
 761             c=*locale++;
 762             if(is_u(c)) {
 763                 c=*locale++;
 764             }
 765             if(is_r(c)) {
 766                 c=*locale;
 767                 if(is_sep(c)) {
 768                     return UCASE_LOC_TURKISH;
 769                 }
 770             }
 771         } else if(c=='a') {
 772             /* az or aze? */
 773             c=*locale++;
 774             if(is_z(c)) {
 775                 c=*locale++;
 776                 if(is_e(c)) {
 777                     c=*locale;
 778                 }
 779                 if(is_sep(c)) {
 780                     return UCASE_LOC_TURKISH;
 781                 }
 782             }
 783         } else if(c=='l') {
 784             /* lt or lit? */
 785             c=*locale++;
 786             if(is_i(c)) {
 787                 c=*locale++;
 788             }
 789             if(is_t(c)) {
 790                 c=*locale;
 791                 if(is_sep(c)) {
 792                     return UCASE_LOC_LITHUANIAN;
 793                 }
 794             }
 795         } else if(c=='n') {
 796             /* nl or nld? */
 797             c=*locale++;
 798             if(is_l(c)) {
 799                 c=*locale++;
 800                 if(is_d(c)) {
 801                     c=*locale;
 802                 }
 803                 if(is_sep(c)) {
 804                     return UCASE_LOC_DUTCH;
 805                 }
 806             }
 807         }
 808     } else {
 809         // uppercase c
 810         // Same code as for lowercase c but also check for 'E'.
 811         if(c=='T') {
 812             /* tr or tur? */
 813             c=*locale++;
 814             if(is_u(c)) {
 815                 c=*locale++;
 816             }
 817             if(is_r(c)) {
 818                 c=*locale;
 819                 if(is_sep(c)) {
 820                     return UCASE_LOC_TURKISH;
 821                 }
 822             }
 823         } else if(c=='A') {
 824             /* az or aze? */
 825             c=*locale++;
 826             if(is_z(c)) {
 827                 c=*locale++;
 828                 if(is_e(c)) {
 829                     c=*locale;
 830                 }
 831                 if(is_sep(c)) {
 832                     return UCASE_LOC_TURKISH;
 833                 }
 834             }
 835         } else if(c=='L') {
 836             /* lt or lit? */
 837             c=*locale++;
 838             if(is_i(c)) {
 839                 c=*locale++;
 840             }
 841             if(is_t(c)) {
 842                 c=*locale;
 843                 if(is_sep(c)) {
 844                     return UCASE_LOC_LITHUANIAN;
 845                 }
 846             }
 847         } else if(c=='E') {
 848             /* el or ell? */
 849             c=*locale++;
 850             if(is_l(c)) {
 851                 c=*locale++;
 852                 if(is_l(c)) {
 853                     c=*locale;
 854                 }
 855                 if(is_sep(c)) {
 856                     return UCASE_LOC_GREEK;
 857                 }
 858             }
 859         } else if(c=='N') {
 860             /* nl or nld? */
 861             c=*locale++;
 862             if(is_l(c)) {
 863                 c=*locale++;
 864                 if(is_d(c)) {
 865                     c=*locale;
 866                 }
 867                 if(is_sep(c)) {
 868                     return UCASE_LOC_DUTCH;
 869                 }
 870             }
 871         }
 872     }
 873     return UCASE_LOC_ROOT;
 874 }
 875
 876 /*
 877  * Is followed by
 878  *   {case-ignorable}* cased
 879  * ?
 880  * (dir determines looking forward/backward)
 881  * If a character is case-ignorable, it is skipped regardless of whether
 882  * it is also cased or not.
 883  */
 884 static UBool
 885 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
 886     UChar32 c;
 887
 888     if(iter==NULL) {
 889         return FALSE;
 890     }
 891
 892     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
 893         int32_t type=ucase_getTypeOrIgnorable(c);
 894         if(type&4) {
 895             /* case-ignorable, continue with the loop */
 896         } else if(type!=UCASE_NONE) {
 897             return TRUE; /* followed by cased letter */
 898         } else {
 899             return FALSE; /* uncased and not case-ignorable */
 900         }
 901     }
 902
 903     return FALSE; /* not followed by cased letter */
 904 }
 905
 906 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
 907 static UBool
 908 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
 909     UChar32 c;
 910     int32_t dotType;
 911     int8_t dir;
 912
 913     if(iter==NULL) {
 914         return FALSE;
 915     }
 916
 917     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 918         dotType=getDotType(c);
 919         if(dotType==UCASE_SOFT_DOTTED) {
 920             return TRUE; /* preceded by TYPE_i */
 921         } else if(dotType!=UCASE_OTHER_ACCENT) {
 922             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
 923         }
 924     }
 925
 926     return FALSE; /* not preceded by TYPE_i */
 927 }
 928
 929 /*
 930  * See Jitterbug 2344:
 931  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 932  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 933  * we made those releases compatible with Unicode 3.2 which had not fixed
 934  * a related bug in SpecialCasing.txt.
 935  *
 936  * From the Jitterbug 2344 text:
 937  * ... this bug is listed as a Unicode erratum
 938  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 939  * <quote>
 940  * There are two errors in SpecialCasing.txt.
 941  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 942  * 2. An incorrect context definition. Correct as follows:
 943  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 944  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 945  * ---
 946  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 947  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 948  * where the context After_I is defined as:
 949  * The last preceding base character was an uppercase I, and there is no
 950  * intervening combining character class 230 (ABOVE).
 951  * </quote>
 952  *
 953  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 954  *
 955  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 956  * # This matches the behavior of the canonically equivalent I-dot_above
 957  *
 958  * See also the description in this place in older versions of uchar.c (revision 1.100).
 959  *
 960  * Markus W. Scherer 2003-feb-15
 961  */
 962
 963 /* Is preceded by base character 'I' with no intervening cc=230 ? */
 964 static UBool
 965 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
 966     UChar32 c;
 967     int32_t dotType;
 968     int8_t dir;
 969
 970     if(iter==NULL) {
 971         return FALSE;
 972     }
 973
 974     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 975         if(c==0x49) {
 976             return TRUE; /* preceded by I */
 977         }
 978         dotType=getDotType(c);
 979         if(dotType!=UCASE_OTHER_ACCENT) {
 980             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
 981         }
 982     }
 983
 984     return FALSE; /* not preceded by I */
 985 }
 986
 987 /* Is followed by one or more cc==230 ? */
 988 static UBool
 989 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
 990     UChar32 c;
 991     int32_t dotType;
 992     int8_t dir;
 993
 994     if(iter==NULL) {
 995         return FALSE;
 996     }
 997
 998     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
 999         dotType=getDotType(c);
1000         if(dotType==UCASE_ABOVE) {
1001             return TRUE; /* at least one cc==230 following */
1002         } else if(dotType!=UCASE_OTHER_ACCENT) {
1003             return FALSE; /* next base character, no more cc==230 following */
1004         }
1005     }
1006
1007     return FALSE; /* no more cc==230 following */
1008 }
1009
1010 /* Is followed by a dot above (without cc==230 in between) ? */
1011 static UBool
1012 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1013     UChar32 c;
1014     int32_t dotType;
1015     int8_t dir;
1016
1017     if(iter==NULL) {
1018         return FALSE;
1019     }
1020
1021     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1022         if(c==0x307) {
1023             return TRUE;
1024         }
1025         dotType=getDotType(c);
1026         if(dotType!=UCASE_OTHER_ACCENT) {
1027             return FALSE; /* next base character or cc==230 in between */
1028         }
1029     }
1030
1031     return FALSE; /* no dot above following */
1032 }
1033
1034 U_CAPI int32_t U_EXPORT2
1035 ucase_toFullLower(UChar32 c,
1036                   UCaseContextIterator *iter, void *context,
1037                   const UChar **pString,
1038                   int32_t loc) {
1039     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1040     U_ASSERT(c >= 0);
1041     UChar32 result=c;
1042     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1043     if(!UCASE_HAS_EXCEPTION(props)) {
1044         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1045             result=c+UCASE_GET_DELTA(props);
1046         }
1047     } else {
1048         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1049         uint16_t excWord=*pe++;
1050         int32_t full;
1051
1052         pe2=pe;
1053
1054         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1055             /* use hardcoded conditions and mappings */
1056
1057             /*
1058              * Test for conditional mappings first
1059              *   (otherwise the unconditional default mappings are always taken),
1060              * then test for characters that have unconditional mappings in SpecialCasing.txt,
1061              * then get the UnicodeData.txt mappings.
1062              */
1063             if( loc==UCASE_LOC_LITHUANIAN &&
1064                     /* base characters, find accents above */
1065                     (((c==0x49 || c==0x4a || c==0x12e) &&
1066                         isFollowedByMoreAbove(iter, context)) ||
1067                     /* precomposed with accent above, no need to find one */
1068                     (c==0xcc || c==0xcd || c==0x128))
1069             ) {
1070                 /*
1071                     # Lithuanian
1072
1073                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1074
1075                     # Introduce an explicit dot above when lowercasing capital I's and J's
1076                     # whenever there are more accents above.
1077                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1078
1079                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1080                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1081                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1082                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1083                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1084                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1085                  */
1086                 switch(c) {
1087                 case 0x49:  /* LATIN CAPITAL LETTER I */
1088                     *pString=iDot;
1089                     return 2;
1090                 case 0x4a:  /* LATIN CAPITAL LETTER J */
1091                     *pString=jDot;
1092                     return 2;
1093                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1094                     *pString=iOgonekDot;
1095                     return 2;
1096                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
1097                     *pString=iDotGrave;
1098                     return 3;
1099                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
1100                     *pString=iDotAcute;
1101                     return 3;
1102                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1103                     *pString=iDotTilde;
1104                     return 3;
1105                 default:
1106                     return 0; /* will not occur */
1107                 }
1108             /* # Turkish and Azeri */
1109             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1110                 /*
1111                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1112                     # The following rules handle those cases.
1113
1114                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1115                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1116                  */
1117                 return 0x69;
1118             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1119                 /*
1120                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1121                     # This matches the behavior of the canonically equivalent I-dot_above
1122
1123                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1124                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1125                  */
1126                 *pString=nullptr;
1127                 return 0; /* remove the dot (continue without output) */
1128             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1129                 /*
1130                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1131
1132                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1133                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1134                  */
1135                 return 0x131;
1136             } else if(c==0x130) {
1137                 /*
1138                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1139
1140                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1141                  */
1142                 *pString=iDot;
1143                 return 2;
1144             } else if(  c==0x3a3 &&
1145                         !isFollowedByCasedLetter(iter, context, 1) &&
1146                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1147             ) {
1148                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1149                 /*
1150                     # Special case for final form of sigma
1151
1152                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1153                  */
1154                 return 0x3c2; /* greek small final sigma */
1155             } else {
1156                 /* no known conditional special case mapping, use a normal mapping */
1157             }
1158         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1159             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1160             full&=UCASE_FULL_LOWER;
1161             if(full!=0) {
1162                 /* set the output pointer to the lowercase mapping */
1163                 *pString=reinterpret_cast<const UChar *>(pe+1);
1164
1165                 /* return the string length */
1166                 return full;
1167             }
1168         }
1169
1170         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1171             int32_t delta;
1172             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1173             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1174         }
1175         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1176             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1177         }
1178     }
1179
1180     return (result==c) ? ~result : result;
1181 }
1182
1183 /* internal */
1184 static int32_t
1185 toUpperOrTitle(UChar32 c,
1186                UCaseContextIterator *iter, void *context,
1187                const UChar **pString,
1188                int32_t loc,
1189                UBool upperNotTitle) {
1190     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1191     U_ASSERT(c >= 0);
1192     UChar32 result=c;
1193     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1194     if(!UCASE_HAS_EXCEPTION(props)) {
1195         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1196             result=c+UCASE_GET_DELTA(props);
1197         }
1198     } else {
1199         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1200         uint16_t excWord=*pe++;
1201         int32_t full, idx;
1202
1203         pe2=pe;
1204
1205         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1206             /* use hardcoded conditions and mappings */
1207             if(loc==UCASE_LOC_TURKISH && c==0x69) {
1208                 /*
1209                     # Turkish and Azeri
1210
1211                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1212                     # The following rules handle those cases.
1213
1214                     # When uppercasing, i turns into a dotted capital I
1215
1216                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1217                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1218                 */
1219                 return 0x130;
1220             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1221                 /*
1222                     # Lithuanian
1223
1224                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1225
1226                     # Remove DOT ABOVE after "i" with upper or titlecase
1227
1228                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1229                  */
1230                 *pString=nullptr;
1231                 return 0; /* remove the dot (continue without output) */
1232             } else {
1233                 /* no known conditional special case mapping, use a normal mapping */
1234             }
1235         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1236             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1237
1238             /* start of full case mapping strings */
1239             ++pe;
1240
1241             /* skip the lowercase and case-folding result strings */
1242             pe+=full&UCASE_FULL_LOWER;
1243             full>>=4;
1244             pe+=full&0xf;
1245             full>>=4;
1246
1247             if(upperNotTitle) {
1248                 full&=0xf;
1249             } else {
1250                 /* skip the uppercase result string */
1251                 pe+=full&0xf;
1252                 full=(full>>4)&0xf;
1253             }
1254
1255             if(full!=0) {
1256                 /* set the output pointer to the result string */
1257                 *pString=reinterpret_cast<const UChar *>(pe);
1258
1259                 /* return the string length */
1260                 return full;
1261             }
1262         }
1263
1264         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1265             int32_t delta;
1266             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1267             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1268         }
1269         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1270             idx=UCASE_EXC_TITLE;
1271         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1272             /* here, titlecase is same as uppercase */
1273             idx=UCASE_EXC_UPPER;
1274         } else {
1275             return ~c;
1276         }
1277         GET_SLOT_VALUE(excWord, idx, pe2, result);
1278     }
1279
1280     return (result==c) ? ~result : result;
1281 }
1282
1283 U_CAPI int32_t U_EXPORT2
1284 ucase_toFullUpper(UChar32 c,
1285                   UCaseContextIterator *iter, void *context,
1286                   const UChar **pString,
1287                   int32_t caseLocale) {
1288     return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
1289 }
1290
1291 U_CAPI int32_t U_EXPORT2
1292 ucase_toFullTitle(UChar32 c,
1293                   UCaseContextIterator *iter, void *context,
1294                   const UChar **pString,
1295                   int32_t caseLocale) {
1296     return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
1297 }
1298
1299 /* case folding ------------------------------------------------------------- */
1300
1301 /*
1302  * Case folding is similar to lowercasing.
1303  * The result may be a simple mapping, i.e., a single code point, or
1304  * a full mapping, i.e., a string.
1305  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1306  * then only the lowercase mapping is stored.
1307  *
1308  * Some special cases are hardcoded because their conditions cannot be
1309  * parsed and processed from CaseFolding.txt.
1310  *
1311  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1312
1313 # C: common case folding, common mappings shared by both simple and full mappings.
1314 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1315 # S: simple case folding, mappings to single characters where different from F.
1316 # T: special case for uppercase I and dotted uppercase I
1317 #    - For non-Turkic languages, this mapping is normally not used.
1318 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1319 #
1320 # Usage:
1321 #  A. To do a simple case folding, use the mappings with status C + S.
1322 #  B. To do a full case folding, use the mappings with status C + F.
1323 #
1324 #    The mappings with status T can be used or omitted depending on the desired case-folding
1325 #    behavior. (The default option is to exclude them.)
1326
1327  * Unicode 3.2 has 'T' mappings as follows:
1328
1329 0049; T; 0131; # LATIN CAPITAL LETTER I
1330 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1331
1332  * while the default mappings for these code points are:
1333
1334 0049; C; 0069; # LATIN CAPITAL LETTER I
1335 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1336
1337  * U+0130 has no simple case folding (simple-case-folds to itself).
1338  */
1339
1340 /* return the simple case folding mapping for c */
1341 U_CAPI UChar32 U_EXPORT2
1342 ucase_fold(UChar32 c, uint32_t options) {
1343     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1344     if(!UCASE_HAS_EXCEPTION(props)) {
1345         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1346             c+=UCASE_GET_DELTA(props);
1347         }
1348     } else {
1349         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1350         uint16_t excWord=*pe++;
1351         int32_t idx;
1352         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1353             /* special case folding mappings, hardcoded */
1354             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1355                 /* default mappings */
1356                 if(c==0x49) {
1357                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1358                     return 0x69;
1359                 } else if(c==0x130) {
1360                     /* no simple case folding for U+0130 */
1361                     return c;
1362                 }
1363             } else {
1364                 /* Turkic mappings */
1365                 if(c==0x49) {
1366                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1367                     return 0x131;
1368                 } else if(c==0x130) {
1369                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1370                     return 0x69;
1371                 }
1372             }
1373         }
1374         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1375             return c;
1376         }
1377         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1378             int32_t delta;
1379             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1380             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1381         }
1382         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1383             idx=UCASE_EXC_FOLD;
1384         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1385             idx=UCASE_EXC_LOWER;
1386         } else {
1387             return c;
1388         }
1389         GET_SLOT_VALUE(excWord, idx, pe, c);
1390     }
1391     return c;
1392 }
1393
1394 /*
1395  * Issue for canonical caseless match (UAX #21):
1396  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1397  * canonical equivalence, unlike default-option casefolding.
1398  * For example, I-grave and I + grave fold to strings that are not canonically
1399  * equivalent.
1400  * For more details, see the comment in unorm_compare() in unorm.cpp
1401  * and the intermediate prototype changes for Jitterbug 2021.
1402  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1403  *
1404  * This did not get fixed because it appears that it is not possible to fix
1405  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1406  * together in a way that they still fold to common result strings.
1407  */
1408
1409 U_CAPI int32_t U_EXPORT2
1410 ucase_toFullFolding(UChar32 c,
1411                     const UChar **pString,
1412                     uint32_t options) {
1413     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1414     U_ASSERT(c >= 0);
1415     UChar32 result=c;
1416     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1417     if(!UCASE_HAS_EXCEPTION(props)) {
1418         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1419             result=c+UCASE_GET_DELTA(props);
1420         }
1421     } else {
1422         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1423         uint16_t excWord=*pe++;
1424         int32_t full, idx;
1425
1426         pe2=pe;
1427
1428         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1429             /* use hardcoded conditions and mappings */
1430             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1431                 /* default mappings */
1432                 if(c==0x49) {
1433                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1434                     return 0x69;
1435                 } else if(c==0x130) {
1436                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1437                     *pString=iDot;
1438                     return 2;
1439                 }
1440             } else {
1441                 /* Turkic mappings */
1442                 if(c==0x49) {
1443                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1444                     return 0x131;
1445                 } else if(c==0x130) {
1446                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1447                     return 0x69;
1448                 }
1449             }
1450         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1451             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1452
1453             /* start of full case mapping strings */
1454             ++pe;
1455
1456             /* skip the lowercase result string */
1457             pe+=full&UCASE_FULL_LOWER;
1458             full=(full>>4)&0xf;
1459
1460             if(full!=0) {
1461                 /* set the output pointer to the result string */
1462                 *pString=reinterpret_cast<const UChar *>(pe);
1463
1464                 /* return the string length */
1465                 return full;
1466             }
1467         }
1468
1469         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1470             return ~c;
1471         }
1472         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1473             int32_t delta;
1474             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1475             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1476         }
1477         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1478             idx=UCASE_EXC_FOLD;
1479         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1480             idx=UCASE_EXC_LOWER;
1481         } else {
1482             return ~c;
1483         }
1484         GET_SLOT_VALUE(excWord, idx, pe2, result);
1485     }
1486
1487     return (result==c) ? ~result : result;
1488 }
1489
1490 /* case mapping properties API ---------------------------------------------- */
1491
1492 /* public API (see uchar.h) */
1493
1494 U_CAPI UBool U_EXPORT2
1495 u_isULowercase(UChar32 c) {
1496     return (UBool)(UCASE_LOWER==ucase_getType(c));
1497 }
1498
1499 U_CAPI UBool U_EXPORT2
1500 u_isUUppercase(UChar32 c) {
1501     return (UBool)(UCASE_UPPER==ucase_getType(c));
1502 }
1503
1504 /* Transforms the Unicode character to its lower case equivalent.*/
1505 U_CAPI UChar32 U_EXPORT2
1506 u_tolower(UChar32 c) {
1507     return ucase_tolower(c);
1508 }
1509
1510 /* Transforms the Unicode character to its upper case equivalent.*/
1511 U_CAPI UChar32 U_EXPORT2
1512 u_toupper(UChar32 c) {
1513     return ucase_toupper(c);
1514 }
1515
1516 /* Transforms the Unicode character to its title case equivalent.*/
1517 U_CAPI UChar32 U_EXPORT2
1518 u_totitle(UChar32 c) {
1519     return ucase_totitle(c);
1520 }
1521
1522 /* return the simple case folding mapping for c */
1523 U_CAPI UChar32 U_EXPORT2
1524 u_foldCase(UChar32 c, uint32_t options) {
1525     return ucase_fold(c, options);
1526 }
1527
1528 U_CFUNC int32_t U_EXPORT2
1529 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1530     /* case mapping properties */
1531     const UChar *resultString;
1532     switch(which) {
1533     case UCHAR_LOWERCASE:
1534         return (UBool)(UCASE_LOWER==ucase_getType(c));
1535     case UCHAR_UPPERCASE:
1536         return (UBool)(UCASE_UPPER==ucase_getType(c));
1537     case UCHAR_SOFT_DOTTED:
1538         return ucase_isSoftDotted(c);
1539     case UCHAR_CASE_SENSITIVE:
1540         return ucase_isCaseSensitive(c);
1541     case UCHAR_CASED:
1542         return (UBool)(UCASE_NONE!=ucase_getType(c));
1543     case UCHAR_CASE_IGNORABLE:
1544         return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1545     /*
1546      * Note: The following Changes_When_Xyz are defined as testing whether
1547      * the NFD form of the input changes when Xyz-case-mapped.
1548      * However, this simpler implementation of these properties,
1549      * ignoring NFD, passes the tests.
1550      * The implementation needs to be changed if the tests start failing.
1551      * When that happens, optimizations should be used to work with the
1552      * per-single-code point ucase_toFullXyz() functions unless
1553      * the NFD form has more than one code point,
1554      * and the property starts set needs to be the union of the
1555      * start sets for normalization and case mappings.
1556      */
1557     case UCHAR_CHANGES_WHEN_LOWERCASED:
1558         return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1559     case UCHAR_CHANGES_WHEN_UPPERCASED:
1560         return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1561     case UCHAR_CHANGES_WHEN_TITLECASED:
1562         return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1563     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1564     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1565         return (UBool)(
1566             ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1567             ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1568             ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1569     default:
1570         return FALSE;
1571     }
1572 }