icuSources/common/ucase.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucase.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Low-level Unicode character/string case mapping code.
  17 *   Much code moved here (and modified) from uchar.c.
  18 */
  19
  20 #include "unicode/utypes.h"
  21 #include "unicode/unistr.h"
  22 #include "unicode/uset.h"
  23 #include "unicode/udata.h" /* UDataInfo */
  24 #include "unicode/utf16.h"
  25 #include "ucmndata.h" /* DataHeader */
  26 #include "udatamem.h"
  27 #include "umutex.h"
  28 #include "uassert.h"
  29 #include "cmemory.h"
  30 #include "utrie2.h"
  31 #include "ucase.h"
  32 #include "ucln_cmn.h"
  33
/*
 * Runtime view of the Unicode case mapping properties data.
 * The functions in this file read the exceptions table, the unfold table
 * and the trie through a pointer to this struct.
 */
struct UCaseProps {
    UDataMemory *mem;            /* not read in this part of the file; presumably the data handle -- TODO confirm */
    const int32_t *indexes;      /* not read in this part of the file */
    const uint16_t *exceptions;  /* exception words and their slots, addressed via GET_EXCEPTIONS() */
    const uint16_t *unfold;      /* reverse case folding table; NULL is tolerated by ucase_addStringCaseClosure() */

    UTrie2 trie;                 /* maps a code point to its 16-bit properties word (read with UTRIE2_GET16) */
    uint8_t formatVersion[4];    /* not read in this part of the file */
};
  43
  44 /* ucase_props_data.h is machine-generated by gencase --csource */
  45 #define INCLUDED_FROM_UCASE_CPP
  46 #include "ucase_props_data.h"
  47
  48 /* UCaseProps singleton ----------------------------------------------------- */
  49
  50 U_CAPI const UCaseProps * U_EXPORT2
  51 ucase_getSingleton() {
  52     return &ucase_props_singleton;
  53 }
  54
  55 /* set of property starts for UnicodeSet ------------------------------------ */
  56
  57 static UBool U_CALLCONV
  58 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
  59     /* add the start code point to the USet */
  60     const USetAdder *sa=(const USetAdder *)context;
  61     sa->add(sa->set, start);
  62     return TRUE;
  63 }
  64
  65 U_CFUNC void U_EXPORT2
  66 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
  67     if(U_FAILURE(*pErrorCode)) {
  68         return;
  69     }
  70
  71     /* add the start code point of each same-value range of the trie */
  72     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
  73
  74     /* add code points with hardcoded properties, plus the ones following them */
  75
  76     /* (none right now, see comment below) */
  77
  78     /*
  79      * Omit code points with hardcoded specialcasing properties
  80      * because we do not build property UnicodeSets for them right now.
  81      */
  82 }
  83
  84 /* data access primitives --------------------------------------------------- */
  85
/*
 * Pointer into csp->exceptions for a properties word that has an exception:
 * the offset is the properties word shifted right by UCASE_EXC_SHIFT.
 * Callers read the exception word from the returned pointer first.
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* Non-zero if the UCASE_EXCEPTION bit is set in the properties word. */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
  89
/*
 * Population count table: flagsOffset[b] is the number of 1 bits
 * in the 8-bit value b.
 * SLOT_OFFSET() uses it to count how many slots are present below a given
 * slot index, which is that slot's offset among the present slots.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/* Non-zero if bit idx is set in flags, i.e. the slot with that index is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of flag bits set below bit idx; only the low 8 bits of the masked value index the table. */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
 112
 113 /*
 114  * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 115  *
 116  * @param excWord (in) initial exceptions word
 117  * @param idx (in) desired slot index
 118  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 119  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 120  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 121  */
 122 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
 123     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
 124         (pExc16)+=SLOT_OFFSET(excWord, idx); \
 125         (value)=*pExc16; \
 126     } else { \
 127         (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
 128         (value)=*pExc16++; \
 129         (value)=((value)<<16)|*pExc16; \
 130     }
 131
 132 /* simple case mappings ----------------------------------------------------- */
 133
 134 U_CAPI UChar32 U_EXPORT2
 135 ucase_tolower(const UCaseProps *csp, UChar32 c) {
 136     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 137     if(!PROPS_HAS_EXCEPTION(props)) {
 138         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
 139             c+=UCASE_GET_DELTA(props);
 140         }
 141     } else {
 142         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 143         uint16_t excWord=*pe++;
 144         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
 145             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
 146         }
 147     }
 148     return c;
 149 }
 150
 151 U_CAPI UChar32 U_EXPORT2
 152 ucase_toupper(const UCaseProps *csp, UChar32 c) {
 153     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 154     if(!PROPS_HAS_EXCEPTION(props)) {
 155         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 156             c+=UCASE_GET_DELTA(props);
 157         }
 158     } else {
 159         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 160         uint16_t excWord=*pe++;
 161         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 162             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
 163         }
 164     }
 165     return c;
 166 }
 167
 168 U_CAPI UChar32 U_EXPORT2
 169 ucase_totitle(const UCaseProps *csp, UChar32 c) {
 170     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 171     if(!PROPS_HAS_EXCEPTION(props)) {
 172         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 173             c+=UCASE_GET_DELTA(props);
 174         }
 175     } else {
 176         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 177         uint16_t excWord=*pe++;
 178         int32_t idx;
 179         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
 180             idx=UCASE_EXC_TITLE;
 181         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 182             idx=UCASE_EXC_UPPER;
 183         } else {
 184             return c;
 185         }
 186         GET_SLOT_VALUE(excWord, idx, pe, c);
 187     }
 188     return c;
 189 }
 190
 191 static const UChar iDot[2] = { 0x69, 0x307 };
 192 static const UChar jDot[2] = { 0x6a, 0x307 };
 193 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
 194 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
 195 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
 196 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
 197
 198
 199 U_CFUNC void U_EXPORT2
 200 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
 201     uint16_t props;
 202
 203     /*
 204      * Hardcode the case closure of i and its relatives and ignore the
 205      * data file data for these characters.
 206      * The Turkic dotless i and dotted I with their case mapping conditions
 207      * and case folding option make the related characters behave specially.
 208      * This code matches their closure behavior to their case folding behavior.
 209      */
 210
 211     switch(c) {
 212     case 0x49:
 213         /* regular i and I are in one equivalence class */
 214         sa->add(sa->set, 0x69);
 215         return;
 216     case 0x69:
 217         sa->add(sa->set, 0x49);
 218         return;
 219     case 0x130:
 220         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
 221         sa->addString(sa->set, iDot, 2);
 222         return;
 223     case 0x131:
 224         /* dotless i is in a class by itself */
 225         return;
 226     default:
 227         /* otherwise use the data file data */
 228         break;
 229     }
 230
 231     props=UTRIE2_GET16(&csp->trie, c);
 232     if(!PROPS_HAS_EXCEPTION(props)) {
 233         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
 234             /* add the one simple case mapping, no matter what type it is */
 235             int32_t delta=UCASE_GET_DELTA(props);
 236             if(delta!=0) {
 237                 sa->add(sa->set, c+delta);
 238             }
 239         }
 240     } else {
 241         /*
 242          * c has exceptions, so there may be multiple simple and/or
 243          * full case mappings. Add them all.
 244          */
 245         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
 246         const UChar *closure;
 247         uint16_t excWord=*pe++;
 248         int32_t idx, closureLength, fullLength, length;
 249
 250         pe0=pe;
 251
 252         /* add all simple case mappings */
 253         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
 254             if(HAS_SLOT(excWord, idx)) {
 255                 pe=pe0;
 256                 GET_SLOT_VALUE(excWord, idx, pe, c);
 257                 sa->add(sa->set, c);
 258             }
 259         }
 260
 261         /* get the closure string pointer & length */
 262         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
 263             pe=pe0;
 264             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
 265             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
 266             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
 267         } else {
 268             closureLength=0;
 269             closure=NULL;
 270         }
 271
 272         /* add the full case folding */
 273         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
 274             pe=pe0;
 275             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
 276
 277             /* start of full case mapping strings */
 278             ++pe;
 279
 280             fullLength&=0xffff; /* bits 16 and higher are reserved */
 281
 282             /* skip the lowercase result string */
 283             pe+=fullLength&UCASE_FULL_LOWER;
 284             fullLength>>=4;
 285
 286             /* add the full case folding string */
 287             length=fullLength&0xf;
 288             if(length!=0) {
 289                 sa->addString(sa->set, (const UChar *)pe, length);
 290                 pe+=length;
 291             }
 292
 293             /* skip the uppercase and titlecase strings */
 294             fullLength>>=4;
 295             pe+=fullLength&0xf;
 296             fullLength>>=4;
 297             pe+=fullLength;
 298
 299             closure=(const UChar *)pe; /* behind full case mappings */
 300         }
 301
 302         /* add each code point in the closure string */
 303         for(idx=0; idx<closureLength;) {
 304             U16_NEXT_UNSAFE(closure, idx, c);
 305             sa->add(sa->set, c);
 306         }
 307     }
 308 }
 309
 310 /*
 311  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
 312  * must be length>0 and max>0 and length<=max
 313  */
 314 static inline int32_t
 315 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
 316     int32_t c1, c2;
 317
 318     max-=length; /* we require length<=max, so no need to decrement max in the loop */
 319     do {
 320         c1=*s++;
 321         c2=*t++;
 322         if(c2==0) {
 323             return 1; /* reached the end of t but not of s */
 324         }
 325         c1-=c2;
 326         if(c1!=0) {
 327             return c1; /* return difference result */
 328         }
 329     } while(--length>0);
 330     /* ends with length==0 */
 331
 332     if(max==0 || *t==0) {
 333         return 0; /* equal to length of both strings */
 334     } else {
 335         return -max; /* return lengh difference */
 336     }
 337 }
 338
 339 U_CFUNC UBool U_EXPORT2
 340 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
 341     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
 342
 343     if(csp->unfold==NULL || s==NULL) {
 344         return FALSE; /* no reverse case folding data, or no string */
 345     }
 346     if(length<=1) {
 347         /* the string is too short to find any match */
 348         /*
 349          * more precise would be:
 350          * if(!u_strHasMoreChar32Than(s, length, 1))
 351          * but this does not make much practical difference because
 352          * a single supplementary code point would just not be found
 353          */
 354         return FALSE;
 355     }
 356
 357     const uint16_t *unfold=csp->unfold;
 358     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
 359     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
 360     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
 361     unfold+=unfoldRowWidth;
 362
 363     if(length>unfoldStringWidth) {
 364         /* the string is too long to find any match */
 365         return FALSE;
 366     }
 367
 368     /* do a binary search for the string */
 369     start=0;
 370     limit=unfoldRows;
 371     while(start<limit) {
 372         i=(start+limit)/2;
 373         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
 374         result=strcmpMax(s, length, p, unfoldStringWidth);
 375
 376         if(result==0) {
 377             /* found the string: add each code point, and its case closure */
 378             UChar32 c;
 379
 380             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
 381                 U16_NEXT_UNSAFE(p, i, c);
 382                 sa->add(sa->set, c);
 383                 ucase_addCaseClosure(csp, c, sa);
 384             }
 385             return TRUE;
 386         } else if(result<0) {
 387             limit=i;
 388         } else /* result>0 */ {
 389             start=i+1;
 390         }
 391     }
 392
 393     return FALSE; /* string not found */
 394 }
 395
 396 U_NAMESPACE_BEGIN
 397
/*
 * Sets up iteration over the unfold table of the built-in properties
 * singleton: reads the row count, row width and string-column width from
 * the start of the table, positions the iterator on row 0 with the code
 * point index at the end of the string column, and then moves unfold
 * forward by one row width so that it points at the first data row.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          /*
           * The next three initializers read through the unfold member.
           * NOTE(review): this relies on unfold being declared before them
           * in the class (members are initialized in declaration order) --
           * confirm against the class definition in the header.
           */
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    /* skip the header; data rows start one row width into the table */
    unfold+=unfoldRowWidth;
}
 407
 408 UChar32
 409 FullCaseFoldingIterator::next(UnicodeString &full) {
 410     // Advance past the last-delivered code point.
 411     const UChar *p=unfold+(currentRow*unfoldRowWidth);
 412     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
 413         ++currentRow;
 414         p+=unfoldRowWidth;
 415         rowCpIndex=unfoldStringWidth;
 416     }
 417     if(currentRow>=unfoldRows) { return U_SENTINEL; }
 418     // Set "full" to the NUL-terminated string in the first unfold column.
 419     int32_t length=unfoldStringWidth;
 420     while(length>0 && p[length-1]==0) { --length; }
 421     full.setTo(FALSE, p, length);
 422     // Return the code point.
 423     UChar32 c;
 424     U16_NEXT_UNSAFE(p, rowCpIndex, c);
 425     return c;
 426 }
 427
 428 U_NAMESPACE_END
 429
 430 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
 431 U_CAPI int32_t U_EXPORT2
 432 ucase_getType(const UCaseProps *csp, UChar32 c) {
 433     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 434     return UCASE_GET_TYPE(props);
 435 }
 436
 437 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
 438 U_CAPI int32_t U_EXPORT2
 439 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
 440     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 441     return UCASE_GET_TYPE_AND_IGNORABLE(props);
 442 }
 443
 444 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
 445 static inline int32_t
 446 getDotType(const UCaseProps *csp, UChar32 c) {
 447     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 448     if(!PROPS_HAS_EXCEPTION(props)) {
 449         return props&UCASE_DOT_MASK;
 450     } else {
 451         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 452         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
 453     }
 454 }
 455
 456 U_CAPI UBool U_EXPORT2
 457 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
 458     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
 459 }
 460
 461 U_CAPI UBool U_EXPORT2
 462 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
 463     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 464     return (UBool)((props&UCASE_SENSITIVE)!=0);
 465 }
 466
 467 /* string casing ------------------------------------------------------------ */
 468
 469 /*
 470  * These internal functions form the core of string case mappings.
 471  * They map single code points to result code points or strings and take
 472  * all necessary conditions (context, locale ID, options) into account.
 473  *
 474  * They do not iterate over the source or write to the destination
 475  * so that the same functions are useful for non-standard string storage,
 476  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
 477  * For the same reason, the "surrounding text" context is passed in as a
 478  * UCaseContextIterator which does not make any assumptions about
 479  * the underlying storage.
 480  *
 481  * This section contains helper functions that check for conditions
 482  * in the input text surrounding the current code point
 483  * according to SpecialCasing.txt.
 484  *
 485  * Each helper function gets the index
 486  * - after the current code point if it looks at following text
 487  * - before the current code point if it looks at preceding text
 488  *
 489  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
 490  *
 491  * Final_Sigma
 492  *   C is preceded by a sequence consisting of
 493  *     a cased letter and a case-ignorable sequence,
 494  *   and C is not followed by a sequence consisting of
 495  *     an ignorable sequence and then a cased letter.
 496  *
 497  * More_Above
 498  *   C is followed by one or more characters of combining class 230 (ABOVE)
 499  *   in the combining character sequence.
 500  *
 501  * After_Soft_Dotted
 502  *   The last preceding character with combining class of zero before C
 503  *   was Soft_Dotted,
 504  *   and there is no intervening combining character class 230 (ABOVE).
 505  *
 506  * Before_Dot
 507  *   C is followed by combining dot above (U+0307).
 508  *   Any sequence of characters with a combining class that is neither 0 nor 230
 509  *   may intervene between the current character and the combining dot above.
 510  *
 511  * The erratum from 2002-10-31 adds the condition
 512  *
 513  * After_I
 514  *   The last preceding base character was an uppercase I, and there is no
 515  *   intervening combining character class 230 (ABOVE).
 516  *
 517  *   (See Jitterbug 2344 and the comments on After_I below.)
 518  *
 519  * Helper definitions in Unicode 3.2 UAX 21:
 520  *
 521  * D1. A character C is defined to be cased
 522  *     if it meets any of the following criteria:
 523  *
 524  *   - The general category of C is Titlecase Letter (Lt)
 525  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
 526  *   - Given D = NFD(C), then it is not the case that:
 527  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
 529  *      for Unicode 3.2. Ignored.)
 530  *
 531  * D2. A character C is defined to be case-ignorable
 532  *     if it meets either of the following criteria:
 533  *
 534  *   - The general category of C is
 535  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
 536  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
 537  *   - C is one of the following characters
 538  *     U+0027 APOSTROPHE
 539  *     U+00AD SOFT HYPHEN (SHY)
 540  *     U+2019 RIGHT SINGLE QUOTATION MARK
 541  *            (the preferred character for apostrophe)
 542  *
 543  * D3. A case-ignorable sequence is a sequence of
 544  *     zero or more case-ignorable characters.
 545  */
 546
/*
 * Single-letter tests that accept the lowercase and the uppercase ASCII
 * letter; ucase_getCaseLocale() uses them to match language codes without
 * regard to case. Each macro evaluates its argument twice.
 */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_n(c) ((c)=='n' || (c)=='N')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? ('_' or '-', or the terminating NUL which ends the language code as well) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
 560
 561 /**
 562  * Requires non-NULL locale ID but otherwise does the equivalent of
 563  * checking for language codes as if uloc_getLanguage() were called:
 564  * Accepts both 2- and 3-letter codes and accepts case variants.
 565  */
 566 U_CFUNC int32_t
 567 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
 568     int32_t result;
 569     char c;
 570
 571     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
 572         return result;
 573     }
 574
 575     result=UCASE_LOC_ROOT;
 576
 577     /*
 578      * This function used to use uloc_getLanguage(), but the current code
 579      * removes the dependency of this low-level code on uloc implementation code
 580      * and is faster because not the whole locale ID has to be
 581      * examined and copied/transformed.
 582      *
 583      * Because this code does not want to depend on uloc, the caller must
 584      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
 585      */
 586     c=*locale++;
 587     if(is_t(c)) {
 588         /* tr or tur? */
 589         c=*locale++;
 590         if(is_u(c)) {
 591             c=*locale++;
 592         }
 593         if(is_r(c)) {
 594             c=*locale;
 595             if(is_sep(c)) {
 596                 result=UCASE_LOC_TURKISH;
 597             }
 598         }
 599     } else if(is_a(c)) {
 600         /* az or aze? */
 601         c=*locale++;
 602         if(is_z(c)) {
 603             c=*locale++;
 604             if(is_e(c)) {
 605                 c=*locale;
 606             }
 607             if(is_sep(c)) {
 608                 result=UCASE_LOC_TURKISH;
 609             }
 610         }
 611     } else if(is_l(c)) {
 612         /* lt or lit? */
 613         c=*locale++;
 614         if(is_i(c)) {
 615             c=*locale++;
 616         }
 617         if(is_t(c)) {
 618             c=*locale;
 619             if(is_sep(c)) {
 620                 result=UCASE_LOC_LITHUANIAN;
 621             }
 622         }
 623     } else if(is_n(c)) {
 624         /* nl or nld? */
 625         c=*locale++;
 626         if(is_l(c)) {
 627             c=*locale++;
 628             if(is_d(c)) {
 629                 c=*locale;
 630             }
 631             if(is_sep(c)) {
 632                 result=UCASE_LOC_DUTCH;
 633             }
 634         }
 635     }
 636
 637     if(locCache!=NULL) {
 638         *locCache=result;
 639     }
 640     return result;
 641 }
 642
 643 /*
 644  * Is followed by
 645  *   {case-ignorable}* cased
 646  * ?
 647  * (dir determines looking forward/backward)
 648  * If a character is case-ignorable, it is skipped regardless of whether
 649  * it is also cased or not.
 650  */
 651 static UBool
 652 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
 653     UChar32 c;
 654
 655     if(iter==NULL) {
 656         return FALSE;
 657     }
 658
 659     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
 660         int32_t type=ucase_getTypeOrIgnorable(csp, c);
 661         if(type&4) {
 662             /* case-ignorable, continue with the loop */
 663         } else if(type!=UCASE_NONE) {
 664             return TRUE; /* followed by cased letter */
 665         } else {
 666             return FALSE; /* uncased and not case-ignorable */
 667         }
 668     }
 669
 670     return FALSE; /* not followed by cased letter */
 671 }
 672
 673 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
 674 static UBool
 675 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 676     UChar32 c;
 677     int32_t dotType;
 678     int8_t dir;
 679
 680     if(iter==NULL) {
 681         return FALSE;
 682     }
 683
 684     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 685         dotType=getDotType(csp, c);
 686         if(dotType==UCASE_SOFT_DOTTED) {
 687             return TRUE; /* preceded by TYPE_i */
 688         } else if(dotType!=UCASE_OTHER_ACCENT) {
 689             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
 690         }
 691     }
 692
 693     return FALSE; /* not preceded by TYPE_i */
 694 }
 695
 696 /*
 697  * See Jitterbug 2344:
 698  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 699  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 700  * we made those releases compatible with Unicode 3.2 which had not fixed
 701  * a related bug in SpecialCasing.txt.
 702  *
 703  * From the Jitterbug 2344 text:
 704  * ... this bug is listed as a Unicode erratum
 705  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 706  * <quote>
 707  * There are two errors in SpecialCasing.txt.
 708  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 709  * 2. An incorrect context definition. Correct as follows:
 710  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 711  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 712  * ---
 713  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 714  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 715  * where the context After_I is defined as:
 716  * The last preceding base character was an uppercase I, and there is no
 717  * intervening combining character class 230 (ABOVE).
 718  * </quote>
 719  *
 720  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 721  *
 722  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 723  * # This matches the behavior of the canonically equivalent I-dot_above
 724  *
 725  * See also the description in this place in older versions of uchar.c (revision 1.100).
 726  *
 727  * Markus W. Scherer 2003-feb-15
 728  */
 729
 730 /* Is preceded by base character 'I' with no intervening cc=230 ? */
 731 static UBool
 732 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 733     UChar32 c;
 734     int32_t dotType;
 735     int8_t dir;
 736
 737     if(iter==NULL) {
 738         return FALSE;
 739     }
 740
 741     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 742         if(c==0x49) {
 743             return TRUE; /* preceded by I */
 744         }
 745         dotType=getDotType(csp, c);
 746         if(dotType!=UCASE_OTHER_ACCENT) {
 747             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
 748         }
 749     }
 750
 751     return FALSE; /* not preceded by I */
 752 }
 753
 754 /* Is followed by one or more cc==230 ? */
 755 static UBool
 756 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 757     UChar32 c;
 758     int32_t dotType;
 759     int8_t dir;
 760
 761     if(iter==NULL) {
 762         return FALSE;
 763     }
 764
 765     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
 766         dotType=getDotType(csp, c);
 767         if(dotType==UCASE_ABOVE) {
 768             return TRUE; /* at least one cc==230 following */
 769         } else if(dotType!=UCASE_OTHER_ACCENT) {
 770             return FALSE; /* next base character, no more cc==230 following */
 771         }
 772     }
 773
 774     return FALSE; /* no more cc==230 following */
 775 }
 776
 777 /* Is followed by a dot above (without cc==230 in between) ? */
 778 static UBool
 779 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 780     UChar32 c;
 781     int32_t dotType;
 782     int8_t dir;
 783
 784     if(iter==NULL) {
 785         return FALSE;
 786     }
 787
 788     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
 789         if(c==0x307) {
 790             return TRUE;
 791         }
 792         dotType=getDotType(csp, c);
 793         if(dotType!=UCASE_OTHER_ACCENT) {
 794             return FALSE; /* next base character or cc==230 in between */
 795         }
 796     }
 797
 798     return FALSE; /* no dot above following */
 799 }
 800
 801 U_CAPI int32_t U_EXPORT2
 802 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
 803                   UCaseContextIterator *iter, void *context,
 804                   const UChar **pString,
 805                   const char *locale, int32_t *locCache)
 806 {
 807     UChar32 result=c;
 808     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 809     if(!PROPS_HAS_EXCEPTION(props)) {
 810         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
 811             result=c+UCASE_GET_DELTA(props);
 812         }
 813     } else {
 814         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
 815         uint16_t excWord=*pe++;
 816         int32_t full;
 817
 818         pe2=pe;
 819
 820         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
 821             /* use hardcoded conditions and mappings */
 822             int32_t loc=ucase_getCaseLocale(locale, locCache);
 823
 824             /*
 825              * Test for conditional mappings first
 826              *   (otherwise the unconditional default mappings are always taken),
 827              * then test for characters that have unconditional mappings in SpecialCasing.txt,
 828              * then get the UnicodeData.txt mappings.
 829              */
 830             if( loc==UCASE_LOC_LITHUANIAN &&
 831                     /* base characters, find accents above */
 832                     (((c==0x49 || c==0x4a || c==0x12e) &&
 833                         isFollowedByMoreAbove(csp, iter, context)) ||
 834                     /* precomposed with accent above, no need to find one */
 835                     (c==0xcc || c==0xcd || c==0x128))
 836             ) {
 837                 /*
 838                     # Lithuanian
 839
 840                     # Lithuanian retains the dot in a lowercase i when followed by accents.
 841
 842                     # Introduce an explicit dot above when lowercasing capital I's and J's
 843                     # whenever there are more accents above.
 844                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
 845
 846                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
 847                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
 848                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
 849                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
 850                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
 851                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
 852                  */
 853                 switch(c) {
 854                 case 0x49:  /* LATIN CAPITAL LETTER I */
 855                     *pString=iDot;
 856                     return 2;
 857                 case 0x4a:  /* LATIN CAPITAL LETTER J */
 858                     *pString=jDot;
 859                     return 2;
 860                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
 861                     *pString=iOgonekDot;
 862                     return 2;
 863                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
 864                     *pString=iDotGrave;
 865                     return 3;
 866                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
 867                     *pString=iDotAcute;
 868                     return 3;
 869                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
 870                     *pString=iDotTilde;
 871                     return 3;
 872                 default:
 873                     return 0; /* will not occur */
 874                 }
 875             /* # Turkish and Azeri */
 876             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
 877                 /*
 878                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
 879                     # The following rules handle those cases.
 880
 881                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
 882                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
 883                  */
 884                 return 0x69;
 885             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
 886                 /*
 887                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 888                     # This matches the behavior of the canonically equivalent I-dot_above
 889
 890                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 891                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 892                  */
 893                 return 0; /* remove the dot (continue without output) */
 894             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
 895                 /*
 896                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
 897
 898                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
 899                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
 900                  */
 901                 return 0x131;
 902             } else if(c==0x130) {
 903                 /*
 904                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
 905
 906                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 907                  */
 908                 *pString=iDot;
 909                 return 2;
 910             } else if(  c==0x3a3 &&
 911                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
 912                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
 913             ) {
 914                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
 915                 /*
 916                     # Special case for final form of sigma
 917
 918                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
 919                  */
 920                 return 0x3c2; /* greek small final sigma */
 921             } else {
 922                 /* no known conditional special case mapping, use a normal mapping */
 923             }
 924         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
 925             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
 926             full&=UCASE_FULL_LOWER;
 927             if(full!=0) {
 928                 /* set the output pointer to the lowercase mapping */
 929                 *pString=reinterpret_cast<const UChar *>(pe+1);
 930
 931                 /* return the string length */
 932                 return full;
 933             }
 934         }
 935
 936         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
 937             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
 938         }
 939     }
 940
 941     return (result==c) ? ~result : result;
 942 }
 943
 944 /* internal */
 945 static int32_t
 946 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
 947                UCaseContextIterator *iter, void *context,
 948                const UChar **pString,
 949                const char *locale, int32_t *locCache,
 950                UBool upperNotTitle) {
 951     UChar32 result=c;
 952     uint16_t props=UTRIE2_GET16(&csp->trie, c);
 953     if(!PROPS_HAS_EXCEPTION(props)) {
 954         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 955             result=c+UCASE_GET_DELTA(props);
 956         }
 957     } else {
 958         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
 959         uint16_t excWord=*pe++;
 960         int32_t full, idx;
 961
 962         pe2=pe;
 963
 964         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
 965             /* use hardcoded conditions and mappings */
 966             int32_t loc=ucase_getCaseLocale(locale, locCache);
 967
 968             if(loc==UCASE_LOC_TURKISH && c==0x69) {
 969                 /*
 970                     # Turkish and Azeri
 971
 972                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
 973                     # The following rules handle those cases.
 974
 975                     # When uppercasing, i turns into a dotted capital I
 976
 977                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
 978                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
 979                 */
 980                 return 0x130;
 981             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
 982                 /*
 983                     # Lithuanian
 984
 985                     # Lithuanian retains the dot in a lowercase i when followed by accents.
 986
 987                     # Remove DOT ABOVE after "i" with upper or titlecase
 988
 989                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
 990                  */
 991                 return 0; /* remove the dot (continue without output) */
 992             } else {
 993                 /* no known conditional special case mapping, use a normal mapping */
 994             }
 995         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
 996             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
 997
 998             /* start of full case mapping strings */
 999             ++pe;
1000
1001             /* skip the lowercase and case-folding result strings */
1002             pe+=full&UCASE_FULL_LOWER;
1003             full>>=4;
1004             pe+=full&0xf;
1005             full>>=4;
1006
1007             if(upperNotTitle) {
1008                 full&=0xf;
1009             } else {
1010                 /* skip the uppercase result string */
1011                 pe+=full&0xf;
1012                 full=(full>>4)&0xf;
1013             }
1014
1015             if(full!=0) {
1016                 /* set the output pointer to the result string */
1017                 *pString=reinterpret_cast<const UChar *>(pe);
1018
1019                 /* return the string length */
1020                 return full;
1021             }
1022         }
1023
1024         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1025             idx=UCASE_EXC_TITLE;
1026         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1027             /* here, titlecase is same as uppercase */
1028             idx=UCASE_EXC_UPPER;
1029         } else {
1030             return ~c;
1031         }
1032         GET_SLOT_VALUE(excWord, idx, pe2, result);
1033     }
1034
1035     return (result==c) ? ~result : result;
1036 }
1037
1038 U_CAPI int32_t U_EXPORT2
1039 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1040                   UCaseContextIterator *iter, void *context,
1041                   const UChar **pString,
1042                   const char *locale, int32_t *locCache) {
1043     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1044 }
1045
1046 U_CAPI int32_t U_EXPORT2
1047 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1048                   UCaseContextIterator *iter, void *context,
1049                   const UChar **pString,
1050                   const char *locale, int32_t *locCache) {
1051     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1052 }
1053
1054 /* case folding ------------------------------------------------------------- */
1055
1056 /*
1057  * Case folding is similar to lowercasing.
1058  * The result may be a simple mapping, i.e., a single code point, or
1059  * a full mapping, i.e., a string.
1060  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1061  * then only the lowercase mapping is stored.
1062  *
1063  * Some special cases are hardcoded because their conditions cannot be
1064  * parsed and processed from CaseFolding.txt.
1065  *
1066  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1067
1068 # C: common case folding, common mappings shared by both simple and full mappings.
1069 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1070 # S: simple case folding, mappings to single characters where different from F.
1071 # T: special case for uppercase I and dotted uppercase I
1072 #    - For non-Turkic languages, this mapping is normally not used.
1073 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1074 #
1075 # Usage:
1076 #  A. To do a simple case folding, use the mappings with status C + S.
1077 #  B. To do a full case folding, use the mappings with status C + F.
1078 #
1079 #    The mappings with status T can be used or omitted depending on the desired case-folding
1080 #    behavior. (The default option is to exclude them.)
1081
1082  * Unicode 3.2 has 'T' mappings as follows:
1083
1084 0049; T; 0131; # LATIN CAPITAL LETTER I
1085 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1086
1087  * while the default mappings for these code points are:
1088
1089 0049; C; 0069; # LATIN CAPITAL LETTER I
1090 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1091
1092  * U+0130 has no simple case folding (simple-case-folds to itself).
1093  */
1094
1095 /* return the simple case folding mapping for c */
1096 U_CAPI UChar32 U_EXPORT2
1097 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1098     uint16_t props=UTRIE2_GET16(&csp->trie, c);
1099     if(!PROPS_HAS_EXCEPTION(props)) {
1100         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1101             c+=UCASE_GET_DELTA(props);
1102         }
1103     } else {
1104         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1105         uint16_t excWord=*pe++;
1106         int32_t idx;
1107         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1108             /* special case folding mappings, hardcoded */
1109             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1110                 /* default mappings */
1111                 if(c==0x49) {
1112                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1113                     return 0x69;
1114                 } else if(c==0x130) {
1115                     /* no simple case folding for U+0130 */
1116                     return c;
1117                 }
1118             } else {
1119                 /* Turkic mappings */
1120                 if(c==0x49) {
1121                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1122                     return 0x131;
1123                 } else if(c==0x130) {
1124                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1125                     return 0x69;
1126                 }
1127             }
1128         }
1129         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1130             idx=UCASE_EXC_FOLD;
1131         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1132             idx=UCASE_EXC_LOWER;
1133         } else {
1134             return c;
1135         }
1136         GET_SLOT_VALUE(excWord, idx, pe, c);
1137     }
1138     return c;
1139 }
1140
1141 /*
1142  * Issue for canonical caseless match (UAX #21):
1143  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1144  * canonical equivalence, unlike default-option casefolding.
1145  * For example, I-grave and I + grave fold to strings that are not canonically
1146  * equivalent.
1147  * For more details, see the comment in unorm_compare() in unorm.cpp
1148  * and the intermediate prototype changes for Jitterbug 2021.
1149  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1150  *
1151  * This did not get fixed because it appears that it is not possible to fix
1152  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1153  * together in a way that they still fold to common result strings.
1154  */
1155
1156 U_CAPI int32_t U_EXPORT2
1157 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1158                     const UChar **pString,
1159                     uint32_t options)
1160 {
1161     UChar32 result=c;
1162     uint16_t props=UTRIE2_GET16(&csp->trie, c);
1163     if(!PROPS_HAS_EXCEPTION(props)) {
1164         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1165             result=c+UCASE_GET_DELTA(props);
1166         }
1167     } else {
1168         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1169         uint16_t excWord=*pe++;
1170         int32_t full, idx;
1171
1172         pe2=pe;
1173
1174         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1175             /* use hardcoded conditions and mappings */
1176             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1177                 /* default mappings */
1178                 if(c==0x49) {
1179                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1180                     return 0x69;
1181                 } else if(c==0x130) {
1182                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1183                     *pString=iDot;
1184                     return 2;
1185                 }
1186             } else {
1187                 /* Turkic mappings */
1188                 if(c==0x49) {
1189                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1190                     return 0x131;
1191                 } else if(c==0x130) {
1192                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1193                     return 0x69;
1194                 }
1195             }
1196         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1197             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1198
1199             /* start of full case mapping strings */
1200             ++pe;
1201
1202             /* skip the lowercase result string */
1203             pe+=full&UCASE_FULL_LOWER;
1204             full=(full>>4)&0xf;
1205
1206             if(full!=0) {
1207                 /* set the output pointer to the result string */
1208                 *pString=reinterpret_cast<const UChar *>(pe);
1209
1210                 /* return the string length */
1211                 return full;
1212             }
1213         }
1214
1215         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1216             idx=UCASE_EXC_FOLD;
1217         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1218             idx=UCASE_EXC_LOWER;
1219         } else {
1220             return ~c;
1221         }
1222         GET_SLOT_VALUE(excWord, idx, pe2, result);
1223     }
1224
1225     return (result==c) ? ~result : result;
1226 }
1227
1228 /* case mapping properties API ---------------------------------------------- */
1229
/*
 * Pointer to the built-in case properties singleton.
 * The expansion is parenthesized so that it stays a single operand when it is
 * followed by a postfix operator such as -> or [].
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
1231
1232 /* public API (see uchar.h) */
1233
1234 U_CAPI UBool U_EXPORT2
1235 u_isULowercase(UChar32 c) {
1236     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1237 }
1238
1239 U_CAPI UBool U_EXPORT2
1240 u_isUUppercase(UChar32 c) {
1241     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1242 }
1243
1244 /* Transforms the Unicode character to its lower case equivalent.*/
1245 U_CAPI UChar32 U_EXPORT2
1246 u_tolower(UChar32 c) {
1247     return ucase_tolower(GET_CASE_PROPS(), c);
1248 }
1249
1250 /* Transforms the Unicode character to its upper case equivalent.*/
1251 U_CAPI UChar32 U_EXPORT2
1252 u_toupper(UChar32 c) {
1253     return ucase_toupper(GET_CASE_PROPS(), c);
1254 }
1255
1256 /* Transforms the Unicode character to its title case equivalent.*/
1257 U_CAPI UChar32 U_EXPORT2
1258 u_totitle(UChar32 c) {
1259     return ucase_totitle(GET_CASE_PROPS(), c);
1260 }
1261
1262 /* return the simple case folding mapping for c */
1263 U_CAPI UChar32 U_EXPORT2
1264 u_foldCase(UChar32 c, uint32_t options) {
1265     return ucase_fold(GET_CASE_PROPS(), c, options);
1266 }
1267
1268 U_CFUNC int32_t U_EXPORT2
1269 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1270     /* case mapping properties */
1271     const UChar *resultString;
1272     int32_t locCache;
1273     const UCaseProps *csp=GET_CASE_PROPS();
1274     if(csp==NULL) {
1275         return FALSE;
1276     }
1277     switch(which) {
1278     case UCHAR_LOWERCASE:
1279         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1280     case UCHAR_UPPERCASE:
1281         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1282     case UCHAR_SOFT_DOTTED:
1283         return ucase_isSoftDotted(csp, c);
1284     case UCHAR_CASE_SENSITIVE:
1285         return ucase_isCaseSensitive(csp, c);
1286     case UCHAR_CASED:
1287         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1288     case UCHAR_CASE_IGNORABLE:
1289         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1290     /*
1291      * Note: The following Changes_When_Xyz are defined as testing whether
1292      * the NFD form of the input changes when Xyz-case-mapped.
1293      * However, this simpler implementation of these properties,
1294      * ignoring NFD, passes the tests.
1295      * The implementation needs to be changed if the tests start failing.
1296      * When that happens, optimizations should be used to work with the
1297      * per-single-code point ucase_toFullXyz() functions unless
1298      * the NFD form has more than one code point,
1299      * and the property starts set needs to be the union of the
1300      * start sets for normalization and case mappings.
1301      */
1302     case UCHAR_CHANGES_WHEN_LOWERCASED:
1303         locCache=UCASE_LOC_ROOT;
1304         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1305     case UCHAR_CHANGES_WHEN_UPPERCASED:
1306         locCache=UCASE_LOC_ROOT;
1307         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1308     case UCHAR_CHANGES_WHEN_TITLECASED:
1309         locCache=UCASE_LOC_ROOT;
1310         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1311     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1312     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1313         locCache=UCASE_LOC_ROOT;
1314         return (UBool)(
1315             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1316             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1317             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1318     default:
1319         return FALSE;
1320     }
1321 }