icuSources/common/ucase.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004-2006, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucase.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Low-level Unicode character/string case mapping code.
  17 *   Much code moved here (and modified) from uchar.c.
  18 */
  19
  20 #include "unicode/utypes.h"
  21 #include "unicode/uset.h"
  22 #include "unicode/udata.h" /* UDataInfo */
  23 #include "ucmndata.h" /* DataHeader */
  24 #include "udatamem.h"
  25 #include "umutex.h"
  26 #include "uassert.h"
  27 #include "cmemory.h"
  28 #include "utrie.h"
  29 #include "ucase.h"
  30 #include "ucln_cmn.h"
  31
  32 struct UCaseProps {
  33     UDataMemory *mem;
  34     const int32_t *indexes;
  35     const uint16_t *exceptions;
  36     const UChar *unfold;
  37
  38     UTrie trie;
  39     uint8_t formatVersion[4];
  40 };
  41
  42 /* data loading etc. -------------------------------------------------------- */
  43
  44 #define UCASE_HARDCODE_DATA 1
  45
  46 #if UCASE_HARDCODE_DATA
  47
  48 /* ucase_props_data.c is machine-generated by gencase --csource */
  49 #include "ucase_props_data.c"
  50
  51 #else
  52
  53 static UBool U_CALLCONV
  54 isAcceptable(void *context,
  55              const char *type, const char *name,
  56              const UDataInfo *pInfo) {
  57     if(
  58         pInfo->size>=20 &&
  59         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  60         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  61         pInfo->dataFormat[0]==UCASE_FMT_0 &&    /* dataFormat="cAsE" */
  62         pInfo->dataFormat[1]==UCASE_FMT_1 &&
  63         pInfo->dataFormat[2]==UCASE_FMT_2 &&
  64         pInfo->dataFormat[3]==UCASE_FMT_3 &&
  65         pInfo->formatVersion[0]==1 &&
  66         pInfo->formatVersion[2]==UTRIE_SHIFT &&
  67         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
  68     ) {
  69         UCaseProps *csp=(UCaseProps *)context;
  70         uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
  71         return TRUE;
  72     } else {
  73         return FALSE;
  74     }
  75 }
  76
  77 static UCaseProps *
  78 ucase_openData(UCaseProps *cspProto,
  79                const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
  80     UCaseProps *csp;
  81     int32_t size;
  82
  83     cspProto->indexes=(const int32_t *)bin;
  84     if( (length>=0 && length<16*4) ||
  85         cspProto->indexes[UCASE_IX_INDEX_TOP]<16
  86     ) {
  87         /* length or indexes[] too short for minimum indexes[] length of 16 */
  88         *pErrorCode=U_INVALID_FORMAT_ERROR;
  89         return NULL;
  90     }
  91     size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
  92     if(length>=0) {
  93         if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
  94             length-=size;
  95         } else {
  96             /* length too short for indexes[] or for the whole data length */
  97             *pErrorCode=U_INVALID_FORMAT_ERROR;
  98             return NULL;
  99         }
 100     }
 101     bin+=size;
 102     /* from here on, assume that the sizes of the items fit into the total length */
 103
 104     /* unserialize the trie, after indexes[] */
 105     size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
 106     utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
 107     if(U_FAILURE(*pErrorCode)) {
 108         return NULL;
 109     }
 110     bin+=size;
 111
 112     /* get exceptions[] */
 113     size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
 114     cspProto->exceptions=(const uint16_t *)bin;
 115     bin+=size;
 116
 117     /* get unfold[] */
 118     size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
 119     if(size!=0) {
 120         cspProto->unfold=(const UChar *)bin;
 121         bin+=size;
 122     } else {
 123         cspProto->unfold=NULL;
 124     }
 125
 126     /* allocate, copy, and return the new UCaseProps */
 127     csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
 128     if(csp==NULL) {
 129         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 130         return NULL;
 131     } else {
 132         uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
 133         return csp;
 134     }
 135 }
 136
 137 U_CAPI UCaseProps * U_EXPORT2
 138 ucase_open(UErrorCode *pErrorCode) {
 139     UCaseProps cspProto={ NULL }, *csp;
 140
 141     cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
 142     if(U_FAILURE(*pErrorCode)) {
 143         return NULL;
 144     }
 145
 146     csp=ucase_openData(
 147             &cspProto,
 148             udata_getMemory(cspProto.mem),
 149             udata_getLength(cspProto.mem),
 150             pErrorCode);
 151     if(U_FAILURE(*pErrorCode)) {
 152         udata_close(cspProto.mem);
 153         return NULL;
 154     } else {
 155         return csp;
 156     }
 157 }
 158
 159 U_CAPI UCaseProps * U_EXPORT2
 160 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
 161     UCaseProps cspProto={ NULL };
 162     const DataHeader *hdr;
 163
 164     if(U_FAILURE(*pErrorCode)) {
 165         return NULL;
 166     }
 167     if(bin==NULL) {
 168         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 169         return NULL;
 170     }
 171
 172     /* check the header */
 173     if(length>=0 && length<20) {
 174         *pErrorCode=U_INVALID_FORMAT_ERROR;
 175         return NULL;
 176     }
 177     hdr=(const DataHeader *)bin;
 178     if(
 179         !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
 180           hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
 181           isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
 182     ) {
 183         *pErrorCode=U_INVALID_FORMAT_ERROR;
 184         return NULL;
 185     }
 186
 187     bin+=hdr->dataHeader.headerSize;
 188     if(length>=0) {
 189         length-=hdr->dataHeader.headerSize;
 190     }
 191     return ucase_openData(&cspProto, bin, length, pErrorCode);
 192 }
 193
 194 #endif
 195
 196 U_CAPI void U_EXPORT2
 197 ucase_close(UCaseProps *csp) {
 198     if(csp!=NULL) {
 199 #if !UCASE_HARDCODE_DATA
 200         udata_close(csp->mem);
 201 #endif
 202         uprv_free(csp);
 203     }
 204 }
 205
 206 /* UCaseProps singleton ----------------------------------------------------- */
 207
 208 static UCaseProps *gCsp=NULL, *gCspDummy=NULL;
 209 #if !UCASE_HARDCODE_DATA
 210 static UErrorCode gErrorCode=U_ZERO_ERROR;
 211 static int8_t gHaveData=0;
 212 #endif
 213
 214 static UBool U_CALLCONV ucase_cleanup(void) {
 215     ucase_close(gCsp);
 216     gCsp=NULL;
 217     ucase_close(gCspDummy);
 218     gCspDummy=NULL;
 219 #if !UCASE_HARDCODE_DATA
 220     gErrorCode=U_ZERO_ERROR;
 221     gHaveData=0;
 222 #endif
 223     return TRUE;
 224 }
 225
 226 U_CAPI const UCaseProps * U_EXPORT2
 227 ucase_getSingleton(UErrorCode *pErrorCode) {
 228 #if UCASE_HARDCODE_DATA
 229     if(U_FAILURE(*pErrorCode)) {
 230         return NULL;
 231     }
 232     return &ucase_props_singleton;
 233 #else
 234     int8_t haveData;
 235
 236     if(U_FAILURE(*pErrorCode)) {
 237         return NULL;
 238     }
 239
 240     UMTX_CHECK(NULL, gHaveData, haveData);
 241
 242     if(haveData>0) {
 243         /* data was loaded */
 244         return gCsp;
 245     } else if(haveData<0) {
 246         /* data loading failed */
 247         *pErrorCode=gErrorCode;
 248         return NULL;
 249     } else /* haveData==0 */ {
 250         /* load the data */
 251         UCaseProps *csp=ucase_open(pErrorCode);
 252         if(U_FAILURE(*pErrorCode)) {
 253             gHaveData=-1;
 254             gErrorCode=*pErrorCode;
 255             return NULL;
 256         }
 257
 258         /* set the static variables */
 259         umtx_lock(NULL);
 260         if(gCsp==NULL) {
 261             gCsp=csp;
 262             csp=NULL;
 263             gHaveData=1;
 264             ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
 265         }
 266         umtx_unlock(NULL);
 267
 268         ucase_close(csp);
 269         return gCsp;
 270     }
 271 #endif
 272 }
 273
 274 U_CAPI const UCaseProps * U_EXPORT2
 275 ucase_getDummy(UErrorCode *pErrorCode) {
 276     UCaseProps *csp;
 277
 278     if(U_FAILURE(*pErrorCode)) {
 279         return NULL;
 280     }
 281
 282     UMTX_CHECK(NULL, gCspDummy, csp);
 283
 284     if(csp!=NULL) {
 285         /* the dummy object was already created */
 286         return csp;
 287     } else /* csp==NULL */ {
 288         /* create the dummy object */
 289         int32_t *indexes;
 290
 291         csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)+UCASE_IX_TOP*4+UTRIE_DUMMY_SIZE);
 292         if(csp==NULL) {
 293             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 294             return NULL;
 295         }
 296         uprv_memset(csp, 0, sizeof(UCaseProps)+UCASE_IX_TOP*4);
 297
 298         csp->indexes=indexes=(int32_t *)(csp+1);
 299         indexes[UCASE_IX_INDEX_TOP]=UCASE_IX_TOP;
 300
 301         indexes[UCASE_IX_TRIE_SIZE]=
 302             utrie_unserializeDummy(&csp->trie, indexes+UCASE_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode);
 303         if(U_FAILURE(*pErrorCode)) {
 304             uprv_free(csp);
 305             return NULL;
 306         }
 307
 308         csp->formatVersion[0]=1;
 309         csp->formatVersion[2]=UTRIE_SHIFT;
 310         csp->formatVersion[3]=UTRIE_INDEX_SHIFT;
 311
 312         /* set the static variables */
 313         umtx_lock(NULL);
 314         if(gCspDummy==NULL) {
 315             gCspDummy=csp;
 316             csp=NULL;
 317             ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
 318         }
 319         umtx_unlock(NULL);
 320
 321         uprv_free(csp);
 322         return gCspDummy;
 323     }
 324 }
 325
 326 /* set of property starts for UnicodeSet ------------------------------------ */
 327
 328 static UBool U_CALLCONV
 329 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 330     /* add the start code point to the USet */
 331     const USetAdder *sa=(const USetAdder *)context;
 332     sa->add(sa->set, start);
 333     return TRUE;
 334 }
 335
 336 U_CAPI void U_EXPORT2
 337 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
 338     if(U_FAILURE(*pErrorCode)) {
 339         return;
 340     }
 341
 342     /* add the start code point of each same-value range of the trie */
 343     utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
 344
 345     /* add code points with hardcoded properties, plus the ones following them */
 346
 347     /* (none right now, see comment below) */
 348
 349     /*
 350      * Omit code points with hardcoded specialcasing properties
 351      * because we do not build property UnicodeSets for them right now.
 352      */
 353 }
 354
 355 /* data access primitives --------------------------------------------------- */
 356
 357 /* UTRIE_GET16() itself validates c */
 358 #define GET_PROPS(csp, c, result) \
 359     UTRIE_GET16(&(csp)->trie, c, result);
 360
 361 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
 362
 363 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
 364
 365 /* number of bits in an 8-bit integer value */
 366 static const uint8_t flagsOffset[256]={
 367     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
 368     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 369     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 370     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 371     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 372     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 373     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 374     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 375     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 376     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 377     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 378     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 379     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 380     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 381     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 382     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 383 };
 384
 385 #define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
 386 #define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]
 387
 388 /*
 389  * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
 390  *
 391  * @param excWord (in) initial exceptions word
 392  * @param index (in) desired slot index
 393  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 394  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 395  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 396  */
 397 #define GET_SLOT_VALUE(excWord, index, pExc16, value) \
 398     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
 399         (pExc16)+=SLOT_OFFSET(excWord, index); \
 400         (value)=*pExc16; \
 401     } else { \
 402         (pExc16)+=2*SLOT_OFFSET(excWord, index); \
 403         (value)=*pExc16++; \
 404         (value)=((value)<<16)|*pExc16; \
 405     }
 406
 407 /* simple case mappings ----------------------------------------------------- */
 408
 409 U_CAPI UChar32 U_EXPORT2
 410 ucase_tolower(const UCaseProps *csp, UChar32 c) {
 411     uint16_t props;
 412     GET_PROPS(csp, c, props);
 413     if(!PROPS_HAS_EXCEPTION(props)) {
 414         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
 415             c+=UCASE_GET_DELTA(props);
 416         }
 417     } else {
 418         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 419         uint16_t excWord=*pe++;
 420         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
 421             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
 422         }
 423     }
 424     return c;
 425 }
 426
 427 U_CAPI UChar32 U_EXPORT2
 428 ucase_toupper(const UCaseProps *csp, UChar32 c) {
 429     uint16_t props;
 430     GET_PROPS(csp, c, props);
 431     if(!PROPS_HAS_EXCEPTION(props)) {
 432         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 433             c+=UCASE_GET_DELTA(props);
 434         }
 435     } else {
 436         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 437         uint16_t excWord=*pe++;
 438         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 439             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
 440         }
 441     }
 442     return c;
 443 }
 444
 445 U_CAPI UChar32 U_EXPORT2
 446 ucase_totitle(const UCaseProps *csp, UChar32 c) {
 447     uint16_t props;
 448     GET_PROPS(csp, c, props);
 449     if(!PROPS_HAS_EXCEPTION(props)) {
 450         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 451             c+=UCASE_GET_DELTA(props);
 452         }
 453     } else {
 454         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 455         uint16_t excWord=*pe++;
 456         int32_t index;
 457         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
 458             index=UCASE_EXC_TITLE;
 459         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 460             index=UCASE_EXC_UPPER;
 461         } else {
 462             return c;
 463         }
 464         GET_SLOT_VALUE(excWord, index, pe, c);
 465     }
 466     return c;
 467 }
 468
 469 U_CAPI void U_EXPORT2
 470 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
 471     uint16_t props;
 472
 473     /*
 474      * Hardcode the case closure of i and its relatives and ignore the
 475      * data file data for these characters.
 476      * The Turkic dotless i and dotted I with their case mapping conditions
 477      * and case folding option make the related characters behave specially.
 478      * This code matches their closure behavior to their case folding behavior.
 479      */
 480     static const UChar
 481         iDot[2]=        { 0x69, 0x307 };
 482
 483     switch(c) {
 484     case 0x49:
 485         /* regular i and I are in one equivalence class */
 486         sa->add(sa->set, 0x69);
 487         return;
 488     case 0x69:
 489         sa->add(sa->set, 0x49);
 490         return;
 491     case 0x130:
 492         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
 493         sa->addString(sa->set, iDot, 2);
 494         return;
 495     case 0x131:
 496         /* dotless i is in a class by itself */
 497         return;
 498     default:
 499         /* otherwise use the data file data */
 500         break;
 501     }
 502
 503     GET_PROPS(csp, c, props);
 504     if(!PROPS_HAS_EXCEPTION(props)) {
 505         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
 506             /* add the one simple case mapping, no matter what type it is */
 507             int32_t delta=UCASE_GET_DELTA(props);
 508             if(delta!=0) {
 509                 sa->add(sa->set, c+delta);
 510             }
 511         }
 512     } else {
 513         /*
 514          * c has exceptions, so there may be multiple simple and/or
 515          * full case mappings. Add them all.
 516          */
 517         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
 518         const UChar *closure;
 519         uint16_t excWord=*pe++;
 520         int32_t index, closureLength, fullLength, length;
 521
 522         pe0=pe;
 523
 524         /* add all simple case mappings */
 525         for(index=UCASE_EXC_LOWER; index<=UCASE_EXC_TITLE; ++index) {
 526             if(HAS_SLOT(excWord, index)) {
 527                 pe=pe0;
 528                 GET_SLOT_VALUE(excWord, index, pe, c);
 529                 sa->add(sa->set, c);
 530             }
 531         }
 532
 533         /* get the closure string pointer & length */
 534         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
 535             pe=pe0;
 536             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
 537             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
 538             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
 539         } else {
 540             closureLength=0;
 541             closure=NULL;
 542         }
 543
 544         /* add the full case folding */
 545         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
 546             pe=pe0;
 547             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
 548
 549             /* start of full case mapping strings */
 550             ++pe;
 551
 552             fullLength&=0xffff; /* bits 16 and higher are reserved */
 553
 554             /* skip the lowercase result string */
 555             pe+=fullLength&UCASE_FULL_LOWER;
 556             fullLength>>=4;
 557
 558             /* add the full case folding string */
 559             length=fullLength&0xf;
 560             if(length!=0) {
 561                 sa->addString(sa->set, (const UChar *)pe, length);
 562                 pe+=length;
 563             }
 564
 565             /* skip the uppercase and titlecase strings */
 566             fullLength>>=4;
 567             pe+=fullLength&0xf;
 568             fullLength>>=4;
 569             pe+=fullLength;
 570
 571             closure=(const UChar *)pe; /* behind full case mappings */
 572         }
 573
 574         /* add each code point in the closure string */
 575         for(index=0; index<closureLength;) {
 576             U16_NEXT_UNSAFE(closure, index, c);
 577             sa->add(sa->set, c);
 578         }
 579     }
 580 }
 581
 582 /*
 583  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
 584  * must be length>0 and max>0 and length<=max
 585  */
 586 static U_INLINE int32_t
 587 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
 588     int32_t c1, c2;
 589
 590     max-=length; /* we require length<=max, so no need to decrement max in the loop */
 591     do {
 592         c1=*s++;
 593         c2=*t++;
 594         if(c2==0) {
 595             return 1; /* reached the end of t but not of s */
 596         }
 597         c1-=c2;
 598         if(c1!=0) {
 599             return c1; /* return difference result */
 600         }
 601     } while(--length>0);
 602     /* ends with length==0 */
 603
 604     if(max==0 || *t==0) {
 605         return 0; /* equal to length of both strings */
 606     } else {
 607         return -max; /* return lengh difference */
 608     }
 609 }
 610
 611 U_CAPI UBool U_EXPORT2
 612 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
 613     const UChar *unfold, *p;
 614     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
 615
 616     if(csp->unfold==NULL || s==NULL) {
 617         return FALSE; /* no reverse case folding data, or no string */
 618     }
 619     if(length<=1) {
 620         /* the string is too short to find any match */
 621         /*
 622          * more precise would be:
 623          * if(!u_strHasMoreChar32Than(s, length, 1))
 624          * but this does not make much practical difference because
 625          * a single supplementary code point would just not be found
 626          */
 627         return FALSE;
 628     }
 629
 630     unfold=csp->unfold;
 631     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
 632     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
 633     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
 634     unfold+=unfoldRowWidth;
 635
 636     if(length>unfoldStringWidth) {
 637         /* the string is too long to find any match */
 638         return FALSE;
 639     }
 640
 641     /* do a binary search for the string */
 642     start=0;
 643     limit=unfoldRows;
 644     while(start<limit) {
 645         i=(start+limit)/2;
 646         p=unfold+(i*unfoldRowWidth);
 647         result=strcmpMax(s, length, p, unfoldStringWidth);
 648
 649         if(result==0) {
 650             /* found the string: add each code point, and its case closure */
 651             UChar32 c;
 652
 653             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
 654                 U16_NEXT_UNSAFE(p, i, c);
 655                 sa->add(sa->set, c);
 656                 ucase_addCaseClosure(csp, c, sa);
 657             }
 658             return TRUE;
 659         } else if(result<0) {
 660             limit=i;
 661         } else /* result>0 */ {
 662             start=i+1;
 663         }
 664     }
 665
 666     return FALSE; /* string not found */
 667 }
 668
 669 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
 670 U_CAPI int32_t U_EXPORT2
 671 ucase_getType(const UCaseProps *csp, UChar32 c) {
 672     uint16_t props;
 673     GET_PROPS(csp, c, props);
 674     return UCASE_GET_TYPE(props);
 675 }
 676
 677 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
 678 U_CAPI int32_t U_EXPORT2
 679 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
 680     int32_t type;
 681     uint16_t props;
 682     GET_PROPS(csp, c, props);
 683     type=UCASE_GET_TYPE(props);
 684     if(type!=UCASE_NONE) {
 685         return type;
 686     } else if(
 687         c==0x307 ||
 688         (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE
 689     ) {
 690         return -1; /* case-ignorable */
 691     } else {
 692         return 0; /* c is neither cased nor case-ignorable */
 693     }
 694 }
 695
 696 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
 697 static U_INLINE int32_t
 698 getDotType(const UCaseProps *csp, UChar32 c) {
 699     uint16_t props;
 700     GET_PROPS(csp, c, props);
 701     if(!PROPS_HAS_EXCEPTION(props)) {
 702         return props&UCASE_DOT_MASK;
 703     } else {
 704         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 705         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
 706     }
 707 }
 708
 709 U_CAPI UBool U_EXPORT2
 710 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
 711     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
 712 }
 713
 714 U_CAPI UBool U_EXPORT2
 715 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
 716     uint16_t props;
 717     GET_PROPS(csp, c, props);
 718     return (UBool)((props&UCASE_SENSITIVE)!=0);
 719 }
 720
 721 /* string casing ------------------------------------------------------------ */
 722
 723 /*
 724  * These internal functions form the core of string case mappings.
 725  * They map single code points to result code points or strings and take
 726  * all necessary conditions (context, locale ID, options) into account.
 727  *
 728  * They do not iterate over the source or write to the destination
 729  * so that the same functions are useful for non-standard string storage,
 730  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
 731  * For the same reason, the "surrounding text" context is passed in as a
 732  * UCaseContextIterator which does not make any assumptions about
 733  * the underlying storage.
 734  *
 735  * This section contains helper functions that check for conditions
 736  * in the input text surrounding the current code point
 737  * according to SpecialCasing.txt.
 738  *
 739  * Each helper function gets the index
 740  * - after the current code point if it looks at following text
 741  * - before the current code point if it looks at preceding text
 742  *
 743  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
 744  *
 745  * Final_Sigma
 746  *   C is preceded by a sequence consisting of
 747  *     a cased letter and a case-ignorable sequence,
 748  *   and C is not followed by a sequence consisting of
 749  *     an ignorable sequence and then a cased letter.
 750  *
 751  * More_Above
 752  *   C is followed by one or more characters of combining class 230 (ABOVE)
 753  *   in the combining character sequence.
 754  *
 755  * After_Soft_Dotted
 756  *   The last preceding character with combining class of zero before C
 757  *   was Soft_Dotted,
 758  *   and there is no intervening combining character class 230 (ABOVE).
 759  *
 760  * Before_Dot
 761  *   C is followed by combining dot above (U+0307).
 762  *   Any sequence of characters with a combining class that is neither 0 nor 230
 763  *   may intervene between the current character and the combining dot above.
 764  *
 765  * The erratum from 2002-10-31 adds the condition
 766  *
 767  * After_I
 768  *   The last preceding base character was an uppercase I, and there is no
 769  *   intervening combining character class 230 (ABOVE).
 770  *
 771  *   (See Jitterbug 2344 and the comments on After_I below.)
 772  *
 773  * Helper definitions in Unicode 3.2 UAX 21:
 774  *
 775  * D1. A character C is defined to be cased
 776  *     if it meets any of the following criteria:
 777  *
 778  *   - The general category of C is Titlecase Letter (Lt)
 779  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
 780  *   - Given D = NFD(C), then it is not the case that:
 781  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
  782  *     (This third criterion does not add any characters to the list
 783  *      for Unicode 3.2. Ignored.)
 784  *
 785  * D2. A character C is defined to be case-ignorable
 786  *     if it meets either of the following criteria:
 787  *
 788  *   - The general category of C is
 789  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
 790  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
 791  *   - C is one of the following characters
 792  *     U+0027 APOSTROPHE
 793  *     U+00AD SOFT HYPHEN (SHY)
 794  *     U+2019 RIGHT SINGLE QUOTATION MARK
 795  *            (the preferred character for apostrophe)
 796  *
 797  * D3. A case-ignorable sequence is a sequence of
 798  *     zero or more case-ignorable characters.
 799  */
 800
 801 enum {
 802     LOC_UNKNOWN,
 803     LOC_ROOT,
 804     LOC_TURKISH,
 805     LOC_LITHUANIAN
 806 };
 807
 808 #define is_a(c) ((c)=='a' || (c)=='A')
 809 #define is_e(c) ((c)=='e' || (c)=='E')
 810 #define is_i(c) ((c)=='i' || (c)=='I')
 811 #define is_l(c) ((c)=='l' || (c)=='L')
 812 #define is_r(c) ((c)=='r' || (c)=='R')
 813 #define is_t(c) ((c)=='t' || (c)=='T')
 814 #define is_u(c) ((c)=='u' || (c)=='U')
 815 #define is_z(c) ((c)=='z' || (c)=='Z')
 816
 817 /* separator? */
 818 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
 819
 820 /**
 821  * Requires non-NULL locale ID but otherwise does the equivalent of
 822  * checking for language codes as if uloc_getLanguage() were called:
 823  * Accepts both 2- and 3-letter codes and accepts case variants.
 824  */
 825 U_CFUNC int32_t
 826 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
 827     int32_t result;
 828     char c;
 829
 830     if(locCache!=NULL && (result=*locCache)!=LOC_UNKNOWN) {
 831         return result;
 832     }
 833
 834     result=LOC_ROOT;
 835
 836     /*
 837      * This function used to use uloc_getLanguage(), but the current code
 838      * removes the dependency of this low-level code on uloc implementation code
 839      * and is faster because not the whole locale ID has to be
 840      * examined and copied/transformed.
 841      *
 842      * Because this code does not want to depend on uloc, the caller must
 843      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
 844      */
 845     c=*locale++;
 846     if(is_t(c)) {
 847         /* tr or tur? */
 848         c=*locale++;
 849         if(is_u(c)) {
 850             c=*locale++;
 851         }
 852         if(is_r(c)) {
 853             c=*locale;
 854             if(is_sep(c)) {
 855                 result=LOC_TURKISH;
 856             }
 857         }
 858     } else if(is_a(c)) {
 859         /* az or aze? */
 860         c=*locale++;
 861         if(is_z(c)) {
 862             c=*locale++;
 863             if(is_e(c)) {
 864                 c=*locale;
 865             }
 866             if(is_sep(c)) {
 867                 result=LOC_TURKISH;
 868             }
 869         }
 870     } else if(is_l(c)) {
 871         /* lt or lit? */
 872         c=*locale++;
 873         if(is_i(c)) {
 874             c=*locale++;
 875         }
 876         if(is_t(c)) {
 877             c=*locale;
 878             if(is_sep(c)) {
 879                 result=LOC_LITHUANIAN;
 880             }
 881         }
 882     }
 883
 884     if(locCache!=NULL) {
 885         *locCache=result;
 886     }
 887     return result;
 888 }
 889
 890 /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
 891 static UBool
 892 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
 893     UChar32 c;
 894     uint16_t props;
 895
 896     if(iter==NULL) {
 897         return FALSE;
 898     }
 899
 900     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
 901         GET_PROPS(csp, c, props);
 902         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
 903             return TRUE; /* followed by cased letter */
 904         } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
 905             /* case-ignorable, continue with the loop */
 906         } else {
 907             return FALSE; /* not ignorable */
 908         }
 909     }
 910
 911     return FALSE; /* not followed by cased letter */
 912 }
 913
 914 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
 915 static UBool
 916 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 917     UChar32 c;
 918     int32_t dotType;
 919     int8_t dir;
 920
 921     if(iter==NULL) {
 922         return FALSE;
 923     }
 924
 925     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 926         dotType=getDotType(csp, c);
 927         if(dotType==UCASE_SOFT_DOTTED) {
 928             return TRUE; /* preceded by TYPE_i */
 929         } else if(dotType!=UCASE_OTHER_ACCENT) {
 930             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
 931         }
 932     }
 933
 934     return FALSE; /* not preceded by TYPE_i */
 935 }
 936
 937 /*
 938  * See Jitterbug 2344:
 939  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 940  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 941  * we made those releases compatible with Unicode 3.2 which had not fixed
 942  * a related bug in SpecialCasing.txt.
 943  *
 944  * From the Jitterbug 2344 text:
 945  * ... this bug is listed as a Unicode erratum
 946  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 947  * <quote>
 948  * There are two errors in SpecialCasing.txt.
 949  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 950  * 2. An incorrect context definition. Correct as follows:
 951  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 952  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 953  * ---
 954  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 955  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 956  * where the context After_I is defined as:
 957  * The last preceding base character was an uppercase I, and there is no
 958  * intervening combining character class 230 (ABOVE).
 959  * </quote>
 960  *
 961  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 962  *
 963  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 964  * # This matches the behavior of the canonically equivalent I-dot_above
 965  *
 966  * See also the description in this place in older versions of uchar.c (revision 1.100).
 967  *
 968  * Markus W. Scherer 2003-feb-15
 969  */
 970
 971 /* Is preceded by base character 'I' with no intervening cc=230 ? */
 972 static UBool
 973 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 974     UChar32 c;
 975     int32_t dotType;
 976     int8_t dir;
 977
 978     if(iter==NULL) {
 979         return FALSE;
 980     }
 981
 982     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 983         if(c==0x49) {
 984             return TRUE; /* preceded by I */
 985         }
 986         dotType=getDotType(csp, c);
 987         if(dotType!=UCASE_OTHER_ACCENT) {
 988             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
 989         }
 990     }
 991
 992     return FALSE; /* not preceded by I */
 993 }
 994
 995 /* Is followed by one or more cc==230 ? */
 996 static UBool
 997 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 998     UChar32 c;
 999     int32_t dotType;
1000     int8_t dir;
1001
1002     if(iter==NULL) {
1003         return FALSE;
1004     }
1005
1006     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1007         dotType=getDotType(csp, c);
1008         if(dotType==UCASE_ABOVE) {
1009             return TRUE; /* at least one cc==230 following */
1010         } else if(dotType!=UCASE_OTHER_ACCENT) {
1011             return FALSE; /* next base character, no more cc==230 following */
1012         }
1013     }
1014
1015     return FALSE; /* no more cc==230 following */
1016 }
1017
1018 /* Is followed by a dot above (without cc==230 in between) ? */
1019 static UBool
1020 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
1021     UChar32 c;
1022     int32_t dotType;
1023     int8_t dir;
1024
1025     if(iter==NULL) {
1026         return FALSE;
1027     }
1028
1029     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1030         if(c==0x307) {
1031             return TRUE;
1032         }
1033         dotType=getDotType(csp, c);
1034         if(dotType!=UCASE_OTHER_ACCENT) {
1035             return FALSE; /* next base character or cc==230 in between */
1036         }
1037     }
1038
1039     return FALSE; /* no dot above following */
1040 }
1041
/*
 * Full lowercase mapping of c, including the hardcoded locale- and
 * context-sensitive special cases (Lithuanian, Turkish/Azeri, final sigma).
 *
 * csp      case properties used to look up c
 * c        code point to map
 * iter     context iterator callback handed to the context tests; the tests
 *          in this file report FALSE when it is NULL
 * context  opaque pointer passed through to iter
 * pString  output: set to the mapping string on the paths that return a
 *          string length; not written on any other path
 * locale, locCache
 *          passed to ucase_getCaseLocale(), but only when c is flagged
 *          UCASE_EXC_CONDITIONAL_SPECIAL
 *
 * Return value, as produced by the code below:
 *   ~c  when the mapping equals c itself
 *   0   when c is to be removed (Turkic U+0307 after I)
 *   2, 3, or the stored lowercase string length, with *pString set
 *   otherwise the single mapped code point
 * NOTE(review): callers presumably tell string lengths apart from code points
 * by magnitude -- confirm against the declaration in ucase.h.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  const char *locale, int32_t *locCache) {
    /*
     * Hardcoded result strings for the special cases below.
     * NOTE(review): iOgonekDot is declared with 3 elements but has only 2
     * initializers, and length 2 is returned for it; the third element is
     * never handed out.
     */
    static const UChar
        iDot[2]=        { 0x69, 0x307 },
        jDot[2]=        { 0x6a, 0x307 },
        iOgonekDot[3]= { 0x12f, 0x307 },
        iDotGrave[3]=   { 0x69, 0x307, 0x300 },
        iDotAcute[3]=   { 0x69, 0x307, 0x301 },
        iDotTilde[3]=   { 0x69, 0x307, 0x303 };

    UChar32 result;
    uint16_t props;

    result=c;
    GET_PROPS(csp, c, props);
    if(!PROPS_HAS_EXCEPTION(props)) {
        /* plain entry: types at or above UCASE_UPPER map by adding the stored delta */
        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
            result=c+UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
        uint16_t excWord=*pe++;
        int32_t full;

        /* pe2 remembers the position right after the exception word for the simple-mapping lookup at the end */
        pe2=pe;

        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
            /* use hardcoded conditions and mappings */
            int32_t loc=ucase_getCaseLocale(locale, locCache);

            /*
             * Test for conditional mappings first
             *   (otherwise the unconditional default mappings are always taken),
             * then test for characters that have unconditional mappings in SpecialCasing.txt,
             * then get the UnicodeData.txt mappings.
             */
            if( loc==LOC_LITHUANIAN &&
                    /* base characters, find accents above */
                    (((c==0x49 || c==0x4a || c==0x12e) &&
                        isFollowedByMoreAbove(csp, iter, context)) ||
                    /* precomposed with accent above, no need to find one */
                    (c==0xcc || c==0xcd || c==0x128))
            ) {
                /*
                    # Lithuanian

                    # Lithuanian retains the dot in a lowercase i when followed by accents.

                    # Introduce an explicit dot above when lowercasing capital I's and J's
                    # whenever there are more accents above.
                    # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

                    0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                    004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                    012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                    00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                    00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                    0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
                 */
                switch(c) {
                case 0x49:  /* LATIN CAPITAL LETTER I */
                    *pString=iDot;
                    return 2;
                case 0x4a:  /* LATIN CAPITAL LETTER J */
                    *pString=jDot;
                    return 2;
                case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                    *pString=iOgonekDot;
                    return 2;
                case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                    *pString=iDotGrave;
                    return 3;
                case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                    *pString=iDotAcute;
                    return 3;
                case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                    *pString=iDotTilde;
                    return 3;
                default:
                    /* the enclosing condition only admits the six code points handled above */
                    return 0; /* will not occur */
                }
            /* # Turkish and Azeri */
            } else if(loc==LOC_TURKISH && c==0x130) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                    # The following rules handle those cases.

                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            } else if(c==0x130) {
                /*
                    # Preserve canonical equivalence for I with dot. Turkic is handled below.

                    0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                /* (the "below" in the quoted data-file text refers to SpecialCasing.txt; in this code the Turkic branch is the one above) */
                *pString=iDot;
                return 2;
            } else if(  c==0x3a3 &&
                        !isFollowedByCasedLetter(csp, iter, context, 1) &&
                        isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
            ) {
                /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
                /*
                    # Special case for final form of sigma

                    03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
                 */
                return 0x3c2; /* greek small final sigma */
            } else {
                /* no known conditional special case mapping, use a normal mapping */
            }
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
            /* keep only the lowercase string length from the slot value */
            full&=UCASE_FULL_LOWER;
            if(full!=0) {
                /* set the output pointer to the lowercase mapping */
                *pString=pe+1;

                /* return the string length */
                return full;
            }
        }

        /* fall back to the simple lowercase slot, if there is one; otherwise result stays c */
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
        }
    }

    /* an unchanged code point is reported as its bitwise complement */
    return (result==c) ? ~result : result;
}
1194
1195 /* internal */
1196 static int32_t
1197 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
1198                UCaseContextIterator *iter, void *context,
1199                const UChar **pString,
1200                const char *locale, int32_t *locCache,
1201                UBool upperNotTitle) {
1202     UChar32 result;
1203     uint16_t props;
1204
1205     result=c;
1206     GET_PROPS(csp, c, props);
1207     if(!PROPS_HAS_EXCEPTION(props)) {
1208         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1209             result=c+UCASE_GET_DELTA(props);
1210         }
1211     } else {
1212         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1213         uint16_t excWord=*pe++;
1214         int32_t full, index;
1215
1216         pe2=pe;
1217
1218         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1219             /* use hardcoded conditions and mappings */
1220             int32_t loc=ucase_getCaseLocale(locale, locCache);
1221
1222             if(loc==LOC_TURKISH && c==0x69) {
1223                 /*
1224                     # Turkish and Azeri
1225
1226                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1227                     # The following rules handle those cases.
1228
1229                     # When uppercasing, i turns into a dotted capital I
1230
1231                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1232                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1233                 */
1234                 return 0x130;
1235             } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
1236                 /*
1237                     # Lithuanian
1238
1239                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1240
1241                     # Remove DOT ABOVE after "i" with upper or titlecase
1242
1243                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1244                  */
1245                 return 0; /* remove the dot (continue without output) */
1246             } else {
1247                 /* no known conditional special case mapping, use a normal mapping */
1248             }
1249         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1250             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1251
1252             /* start of full case mapping strings */
1253             ++pe;
1254
1255             /* skip the lowercase and case-folding result strings */
1256             pe+=full&UCASE_FULL_LOWER;
1257             full>>=4;
1258             pe+=full&0xf;
1259             full>>=4;
1260
1261             if(upperNotTitle) {
1262                 full&=0xf;
1263             } else {
1264                 /* skip the uppercase result string */
1265                 pe+=full&0xf;
1266                 full=(full>>4)&0xf;
1267             }
1268
1269             if(full!=0) {
1270                 /* set the output pointer to the result string */
1271                 *pString=pe;
1272
1273                 /* return the string length */
1274                 return full;
1275             }
1276         }
1277
1278         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1279             index=UCASE_EXC_TITLE;
1280         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1281             /* here, titlecase is same as uppercase */
1282             index=UCASE_EXC_UPPER;
1283         } else {
1284             return ~c;
1285         }
1286         GET_SLOT_VALUE(excWord, index, pe2, result);
1287     }
1288
1289     return (result==c) ? ~result : result;
1290 }
1291
/*
 * Full uppercase mapping of c.
 * Forwards all arguments unchanged to toUpperOrTitle() with
 * upperNotTitle=TRUE and returns its result.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  const char *locale, int32_t *locCache) {
    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
}
1299
/*
 * Full titlecase mapping of c.
 * Forwards all arguments unchanged to toUpperOrTitle() with
 * upperNotTitle=FALSE and returns its result.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  const char *locale, int32_t *locCache) {
    return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
}
1307
1308 /* case folding ------------------------------------------------------------- */
1309
1310 /*
1311  * Case folding is similar to lowercasing.
1312  * The result may be a simple mapping, i.e., a single code point, or
1313  * a full mapping, i.e., a string.
1314  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1315  * then only the lowercase mapping is stored.
1316  *
1317  * Some special cases are hardcoded because their conditions cannot be
1318  * parsed and processed from CaseFolding.txt.
1319  *
1320  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1321
1322 # C: common case folding, common mappings shared by both simple and full mappings.
1323 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1324 # S: simple case folding, mappings to single characters where different from F.
1325 # T: special case for uppercase I and dotted uppercase I
1326 #    - For non-Turkic languages, this mapping is normally not used.
1327 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1328 #
1329 # Usage:
1330 #  A. To do a simple case folding, use the mappings with status C + S.
1331 #  B. To do a full case folding, use the mappings with status C + F.
1332 #
1333 #    The mappings with status T can be used or omitted depending on the desired case-folding
1334 #    behavior. (The default option is to exclude them.)
1335
1336  * Unicode 3.2 has 'T' mappings as follows:
1337
1338 0049; T; 0131; # LATIN CAPITAL LETTER I
1339 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1340
1341  * while the default mappings for these code points are:
1342
1343 0049; C; 0069; # LATIN CAPITAL LETTER I
1344 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1345
1346  * U+0130 has no simple case folding (simple-case-folds to itself).
1347  */
1348
1349 /* return the simple case folding mapping for c */
1350 U_CAPI UChar32 U_EXPORT2
1351 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1352     uint16_t props;
1353     GET_PROPS(csp, c, props);
1354     if(!PROPS_HAS_EXCEPTION(props)) {
1355         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1356             c+=UCASE_GET_DELTA(props);
1357         }
1358     } else {
1359         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1360         uint16_t excWord=*pe++;
1361         int32_t index;
1362         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1363             /* special case folding mappings, hardcoded */
1364             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1365                 /* default mappings */
1366                 if(c==0x49) {
1367                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1368                     return 0x69;
1369                 } else if(c==0x130) {
1370                     /* no simple case folding for U+0130 */
1371                     return c;
1372                 }
1373             } else {
1374                 /* Turkic mappings */
1375                 if(c==0x49) {
1376                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1377                     return 0x131;
1378                 } else if(c==0x130) {
1379                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1380                     return 0x69;
1381                 }
1382             }
1383         }
1384         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1385             index=UCASE_EXC_FOLD;
1386         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1387             index=UCASE_EXC_LOWER;
1388         } else {
1389             return c;
1390         }
1391         GET_SLOT_VALUE(excWord, index, pe, c);
1392     }
1393     return c;
1394 }
1395
1396 /*
1397  * Issue for canonical caseless match (UAX #21):
1398  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1399  * canonical equivalence, unlike default-option casefolding.
1400  * For example, I-grave and I + grave fold to strings that are not canonically
1401  * equivalent.
1402  * For more details, see the comment in unorm_compare() in unorm.cpp
1403  * and the intermediate prototype changes for Jitterbug 2021.
1404  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1405  *
1406  * This did not get fixed because it appears that it is not possible to fix
1407  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1408  * together in a way that they still fold to common result strings.
1409  */
1410
1411 U_CAPI int32_t U_EXPORT2
1412 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1413                     const UChar **pString,
1414                     uint32_t options) {
1415     static const UChar
1416         iDot[2]=        { 0x69, 0x307 };
1417
1418     UChar32 result;
1419     uint16_t props;
1420
1421     result=c;
1422     GET_PROPS(csp, c, props);
1423     if(!PROPS_HAS_EXCEPTION(props)) {
1424         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1425             result=c+UCASE_GET_DELTA(props);
1426         }
1427     } else {
1428         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1429         uint16_t excWord=*pe++;
1430         int32_t full, index;
1431
1432         pe2=pe;
1433
1434         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1435             /* use hardcoded conditions and mappings */
1436             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1437                 /* default mappings */
1438                 if(c==0x49) {
1439                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1440                     return 0x69;
1441                 } else if(c==0x130) {
1442                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1443                     *pString=iDot;
1444                     return 2;
1445                 }
1446             } else {
1447                 /* Turkic mappings */
1448                 if(c==0x49) {
1449                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1450                     return 0x131;
1451                 } else if(c==0x130) {
1452                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1453                     return 0x69;
1454                 }
1455             }
1456         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1457             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1458
1459             /* start of full case mapping strings */
1460             ++pe;
1461
1462             /* skip the lowercase result string */
1463             pe+=full&UCASE_FULL_LOWER;
1464             full=(full>>4)&0xf;
1465
1466             if(full!=0) {
1467                 /* set the output pointer to the result string */
1468                 *pString=pe;
1469
1470                 /* return the string length */
1471                 return full;
1472             }
1473         }
1474
1475         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1476             index=UCASE_EXC_FOLD;
1477         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1478             index=UCASE_EXC_LOWER;
1479         } else {
1480             return ~c;
1481         }
1482         GET_SLOT_VALUE(excWord, index, pe2, result);
1483     }
1484
1485     return (result==c) ? ~result : result;
1486 }
1487
1488 /* case mapping properties API ---------------------------------------------- */
1489
1490 /* get the UCaseProps singleton, or else its dummy, once and for all */
1491 static const UCaseProps *
1492 getCaseProps() {
1493     /*
1494      * This lazy intialization with double-checked locking (without mutex protection for
1495      * the initial check) is transiently unsafe under certain circumstances.
1496      * Check the readme and use u_init() if necessary.
1497      */
1498
1499     /* the initial check is performed by the GET_CASE_PROPS() macro */
1500     const UCaseProps *csp;
1501     UErrorCode errorCode=U_ZERO_ERROR;
1502
1503     csp=ucase_getSingleton(&errorCode);
1504     if(U_FAILURE(errorCode)) {
1505         errorCode=U_ZERO_ERROR;
1506         csp=ucase_getDummy(&errorCode);
1507         if(U_FAILURE(errorCode)) {
1508             return NULL;
1509         }
1510     }
1511
1512     return csp;
1513 }
1514
/*
 * In ICU 3.0, most Unicode properties were loaded from uprops.icu.
 * ICU 3.2 adds ucase.icu for case mapping properties.
 * ICU 3.4 adds ubidi.icu for bidi/shaping properties and
 * removes case/bidi/shaping properties from uprops.icu.
 *
 * Loading of uprops.icu was never mutex-protected and required u_init()
 * for thread safety.
 * In order to maintain performance for all such properties,
 * ucase.icu and ubidi.icu are loaded lazily, without mutexing.
 * u_init() will try to load them for thread safety,
 * but u_init() will not fail if they are missing.
 *
 * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
 * and an error code for load failure.
 * Instead, here we try to load at most once.
 * If it works, we use the resulting singleton object.
 * If it fails, then we get a dummy object, which always works unless
 * we are seriously out of memory.
 * After the first try, we have a never-changing pointer to either the
 * real singleton or the dummy.
 *
 * This method is used in Unicode properties APIs (uchar.h) that
 * do not have a service object and also do not have an error code parameter.
 * Other API implementations get the singleton themselves
 * (with mutexing), store it in the service object, and report errors.
 *
 * GET_CASE_PROPS() evaluates to gCsp when gCsp is non-NULL and otherwise to
 * the result of getCaseProps(). Since getCaseProps() returns NULL when
 * neither the singleton nor the dummy can be obtained, the macro can
 * evaluate to NULL as well.
 */
#define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
1543
1544 /* public API (see uchar.h) */
1545
1546 U_CAPI UBool U_EXPORT2
1547 u_isULowercase(UChar32 c) {
1548     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1549 }
1550
1551 U_CAPI UBool U_EXPORT2
1552 u_isUUppercase(UChar32 c) {
1553     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1554 }
1555
1556 /* Transforms the Unicode character to its lower case equivalent.*/
1557 U_CAPI UChar32 U_EXPORT2
1558 u_tolower(UChar32 c) {
1559     return ucase_tolower(GET_CASE_PROPS(), c);
1560 }
1561
1562 /* Transforms the Unicode character to its upper case equivalent.*/
1563 U_CAPI UChar32 U_EXPORT2
1564 u_toupper(UChar32 c) {
1565     return ucase_toupper(GET_CASE_PROPS(), c);
1566 }
1567
1568 /* Transforms the Unicode character to its title case equivalent.*/
1569 U_CAPI UChar32 U_EXPORT2
1570 u_totitle(UChar32 c) {
1571     return ucase_totitle(GET_CASE_PROPS(), c);
1572 }
1573
1574 /* return the simple case folding mapping for c */
1575 U_CAPI UChar32 U_EXPORT2
1576 u_foldCase(UChar32 c, uint32_t options) {
1577     return ucase_fold(GET_CASE_PROPS(), c, options);
1578 }
1579
1580 U_CFUNC int32_t U_EXPORT2
1581 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1582     /* case mapping properties */
1583     const UCaseProps *csp=GET_CASE_PROPS();
1584     if(csp==NULL) {
1585         return FALSE;
1586     }
1587     switch(which) {
1588     case UCHAR_LOWERCASE:
1589         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1590     case UCHAR_UPPERCASE:
1591         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1592     case UCHAR_SOFT_DOTTED:
1593         return ucase_isSoftDotted(csp, c);
1594     case UCHAR_CASE_SENSITIVE:
1595         return ucase_isCaseSensitive(csp, c);
1596     default:
1597         return FALSE;
1598     }
1599 }