icuSources/common/ucase.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucase.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Low-level Unicode character/string case mapping code.
  17 *   Much code moved here (and modified) from uchar.c.
  18 */
  19
  20 #include "unicode/utypes.h"
  21 #include "unicode/uset.h"
  22 #include "unicode/udata.h" /* UDataInfo */
  23 #include "ucmndata.h" /* DataHeader */
  24 #include "udatamem.h"
  25 #include "umutex.h"
  26 #include "uassert.h"
  27 #include "cmemory.h"
  28 #include "utrie.h"
  29 #include "ucase.h"
  30 #include "ucln_cmn.h"
  31
/*
 * One set of case-mapping property data as used by the functions in this file.
 * Do not reorder the members: the hardcoded-data build (ucase_props_data.c)
 * may initialize this struct positionally -- TODO confirm against the generated file.
 */
struct UCaseProps {
    UDataMemory *mem;           /* handle from udata_openChoice(); released by ucase_close() when data is not hardcoded */
    const int32_t *indexes;     /* indexes[] array at the start of the binary data */
    const uint16_t *exceptions; /* exceptions[] words, located directly after the serialized trie */
    const UChar *unfold;        /* unfold[] table after exceptions[], or NULL if its length is 0 */

    UTrie trie;                 /* trie that maps code points to 16-bit properties words */
    uint8_t formatVersion[4];   /* copy of UDataInfo.formatVersion, filled in by isAcceptable() */
};
  41
  42 /* data loading etc. -------------------------------------------------------- */
  43
  44 #if UCASE_HARDCODE_DATA
  45
  46 /* ucase_props_data.c is machine-generated by gencase --csource */
  47 #include "ucase_props_data.c"
  48
  49 #else
  50
  51 static UBool U_CALLCONV
  52 isAcceptable(void *context,
  53              const char *type, const char *name,
  54              const UDataInfo *pInfo) {
  55     if(
  56         pInfo->size>=20 &&
  57         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  58         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  59         pInfo->dataFormat[0]==UCASE_FMT_0 &&    /* dataFormat="cAsE" */
  60         pInfo->dataFormat[1]==UCASE_FMT_1 &&
  61         pInfo->dataFormat[2]==UCASE_FMT_2 &&
  62         pInfo->dataFormat[3]==UCASE_FMT_3 &&
  63         pInfo->formatVersion[0]==1 &&
  64         pInfo->formatVersion[2]==UTRIE_SHIFT &&
  65         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
  66     ) {
  67         UCaseProps *csp=(UCaseProps *)context;
  68         uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
  69         return TRUE;
  70     } else {
  71         return FALSE;
  72     }
  73 }
  74
  75 static UCaseProps *
  76 ucase_openData(UCaseProps *cspProto,
  77                const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
  78     UCaseProps *csp;
  79     int32_t size;
  80
  81     cspProto->indexes=(const int32_t *)bin;
  82     if( (length>=0 && length<16*4) ||
  83         cspProto->indexes[UCASE_IX_INDEX_TOP]<16
  84     ) {
  85         /* length or indexes[] too short for minimum indexes[] length of 16 */
  86         *pErrorCode=U_INVALID_FORMAT_ERROR;
  87         return NULL;
  88     }
  89     size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
  90     if(length>=0) {
  91         if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
  92             length-=size;
  93         } else {
  94             /* length too short for indexes[] or for the whole data length */
  95             *pErrorCode=U_INVALID_FORMAT_ERROR;
  96             return NULL;
  97         }
  98     }
  99     bin+=size;
 100     /* from here on, assume that the sizes of the items fit into the total length */
 101
 102     /* unserialize the trie, after indexes[] */
 103     size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
 104     utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
 105     if(U_FAILURE(*pErrorCode)) {
 106         return NULL;
 107     }
 108     bin+=size;
 109
 110     /* get exceptions[] */
 111     size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
 112     cspProto->exceptions=(const uint16_t *)bin;
 113     bin+=size;
 114
 115     /* get unfold[] */
 116     size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
 117     if(size!=0) {
 118         cspProto->unfold=(const UChar *)bin;
 119         bin+=size;
 120     } else {
 121         cspProto->unfold=NULL;
 122     }
 123
 124     /* allocate, copy, and return the new UCaseProps */
 125     csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
 126     if(csp==NULL) {
 127         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 128         return NULL;
 129     } else {
 130         uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
 131         return csp;
 132     }
 133 }
 134
 135 U_CAPI UCaseProps * U_EXPORT2
 136 ucase_open(UErrorCode *pErrorCode) {
 137     UCaseProps cspProto={ NULL }, *csp;
 138
 139     cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
 140     if(U_FAILURE(*pErrorCode)) {
 141         return NULL;
 142     }
 143
 144     csp=ucase_openData(
 145             &cspProto,
 146             udata_getMemory(cspProto.mem),
 147             udata_getLength(cspProto.mem),
 148             pErrorCode);
 149     if(U_FAILURE(*pErrorCode)) {
 150         udata_close(cspProto.mem);
 151         return NULL;
 152     } else {
 153         return csp;
 154     }
 155 }
 156
 157 U_CAPI UCaseProps * U_EXPORT2
 158 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
 159     UCaseProps cspProto={ NULL };
 160     const DataHeader *hdr;
 161
 162     if(U_FAILURE(*pErrorCode)) {
 163         return NULL;
 164     }
 165     if(bin==NULL) {
 166         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 167         return NULL;
 168     }
 169
 170     /* check the header */
 171     if(length>=0 && length<20) {
 172         *pErrorCode=U_INVALID_FORMAT_ERROR;
 173         return NULL;
 174     }
 175     hdr=(const DataHeader *)bin;
 176     if(
 177         !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
 178           hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
 179           isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
 180     ) {
 181         *pErrorCode=U_INVALID_FORMAT_ERROR;
 182         return NULL;
 183     }
 184
 185     bin+=hdr->dataHeader.headerSize;
 186     if(length>=0) {
 187         length-=hdr->dataHeader.headerSize;
 188     }
 189     return ucase_openData(&cspProto, bin, length, pErrorCode);
 190 }
 191
 192 #endif
 193
 194 U_CAPI void U_EXPORT2
 195 ucase_close(UCaseProps *csp) {
 196     if(csp!=NULL) {
 197 #if !UCASE_HARDCODE_DATA
 198         udata_close(csp->mem);
 199 #endif
 200         uprv_free(csp);
 201     }
 202 }
 203
 204 /* UCaseProps singleton ----------------------------------------------------- */
 205
/* State of the lazily created singletons; only needed when the data is loaded at runtime. */
#if !UCASE_HARDCODE_DATA
/* object returned by ucase_getSingleton() once the data has been loaded */
static UCaseProps *gCsp=NULL;
/* object returned by ucase_getDummy() once it has been created */
static UCaseProps *gCspDummy=NULL;
/* error code from a failed load; reported again by later ucase_getSingleton() calls */
static UErrorCode gErrorCode=U_ZERO_ERROR;
/* 0: loading not attempted yet, >0: gCsp is set, <0: loading failed (see gErrorCode) */
static int8_t gHaveData=0;
#endif
 212
 213 #if !UCASE_HARDCODE_DATA
 214 static UBool U_CALLCONV ucase_cleanup(void) {
 215     ucase_close(gCsp);
 216     gCsp=NULL;
 217     ucase_close(gCspDummy);
 218     gCspDummy=NULL;
 219     gErrorCode=U_ZERO_ERROR;
 220     gHaveData=0;
 221     return TRUE;
 222 }
 223 #endif
 224
 225 U_CAPI const UCaseProps * U_EXPORT2
 226 ucase_getSingleton(UErrorCode *pErrorCode) {
 227 #if UCASE_HARDCODE_DATA
 228     if(U_FAILURE(*pErrorCode)) {
 229         return NULL;
 230     }
 231     return &ucase_props_singleton;
 232 #else
 233     int8_t haveData;
 234
 235     if(U_FAILURE(*pErrorCode)) {
 236         return NULL;
 237     }
 238
 239     UMTX_CHECK(NULL, gHaveData, haveData);
 240
 241     if(haveData>0) {
 242         /* data was loaded */
 243         return gCsp;
 244     } else if(haveData<0) {
 245         /* data loading failed */
 246         *pErrorCode=gErrorCode;
 247         return NULL;
 248     } else /* haveData==0 */ {
 249         /* load the data */
 250         UCaseProps *csp=ucase_open(pErrorCode);
 251         if(U_FAILURE(*pErrorCode)) {
 252             gHaveData=-1;
 253             gErrorCode=*pErrorCode;
 254             return NULL;
 255         }
 256
 257         /* set the static variables */
 258         umtx_lock(NULL);
 259         if(gCsp==NULL) {
 260             gCsp=csp;
 261             csp=NULL;
 262             gHaveData=1;
 263             ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
 264         }
 265         umtx_unlock(NULL);
 266
 267         ucase_close(csp);
 268         return gCsp;
 269     }
 270 #endif
 271 }
 272
 273 #if !UCASE_HARDCODE_DATA
 274 U_CAPI const UCaseProps * U_EXPORT2
 275 ucase_getDummy(UErrorCode *pErrorCode) {
 276     UCaseProps *csp;
 277
 278     if(U_FAILURE(*pErrorCode)) {
 279         return NULL;
 280     }
 281
 282     UMTX_CHECK(NULL, gCspDummy, csp);
 283
 284     if(csp!=NULL) {
 285         /* the dummy object was already created */
 286         return csp;
 287     } else /* csp==NULL */ {
 288         /* create the dummy object */
 289         int32_t *indexes;
 290
 291         csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)+UCASE_IX_TOP*4+UTRIE_DUMMY_SIZE);
 292         if(csp==NULL) {
 293             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 294             return NULL;
 295         }
 296         uprv_memset(csp, 0, sizeof(UCaseProps)+UCASE_IX_TOP*4);
 297
 298         csp->indexes=indexes=(int32_t *)(csp+1);
 299         indexes[UCASE_IX_INDEX_TOP]=UCASE_IX_TOP;
 300
 301         indexes[UCASE_IX_TRIE_SIZE]=
 302             utrie_unserializeDummy(&csp->trie, indexes+UCASE_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode);
 303         if(U_FAILURE(*pErrorCode)) {
 304             uprv_free(csp);
 305             return NULL;
 306         }
 307
 308         csp->formatVersion[0]=1;
 309         csp->formatVersion[2]=UTRIE_SHIFT;
 310         csp->formatVersion[3]=UTRIE_INDEX_SHIFT;
 311
 312         /* set the static variables */
 313         umtx_lock(NULL);
 314         if(gCspDummy==NULL) {
 315             gCspDummy=csp;
 316             csp=NULL;
 317             ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
 318         }
 319         umtx_unlock(NULL);
 320
 321         uprv_free(csp);
 322         return gCspDummy;
 323     }
 324 }
 325 #endif
 326
 327 /* set of property starts for UnicodeSet ------------------------------------ */
 328
 329 static UBool U_CALLCONV
 330 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 331     /* add the start code point to the USet */
 332     const USetAdder *sa=(const USetAdder *)context;
 333     sa->add(sa->set, start);
 334     return TRUE;
 335 }
 336
 337 U_CFUNC void U_EXPORT2
 338 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
 339     if(U_FAILURE(*pErrorCode)) {
 340         return;
 341     }
 342
 343     /* add the start code point of each same-value range of the trie */
 344     utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
 345
 346     /* add code points with hardcoded properties, plus the ones following them */
 347
 348     /* (none right now, see comment below) */
 349
 350     /*
 351      * Omit code points with hardcoded specialcasing properties
 352      * because we do not build property UnicodeSets for them right now.
 353      */
 354 }
 355
 356 /* data access primitives --------------------------------------------------- */
 357
/*
 * Look up the 16-bit properties word for code point c in the trie of csp
 * and store it in result.
 * UTRIE_GET16() itself validates c.
 * Note: the expansion already ends with a semicolon, so the usual
 * "GET_PROPS(...);" yields an additional empty statement.
 */
#define GET_PROPS(csp, c, result) \
    UTRIE_GET16(&(csp)->trie, c, result);

/* pointer into exceptions[] at the offset stored in the upper bits (from UCASE_EXC_SHIFT) of props */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* non-zero if the UCASE_EXCEPTION bit is set in props */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
 365
 366 /* number of bits in an 8-bit integer value */
 367 static const uint8_t flagsOffset[256]={
 368     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
 369     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 370     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 371     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 372     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 373     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 374     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 375     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 376     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 377     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 378     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 379     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 380     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 381     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 382     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 383     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 384 };
 385
/* non-zero if bit number index is set in flags, i.e., if the slot with that index is present */
#define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
/* number of flag bits set below bit number index, i.e., how many present slots precede slot index */
#define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
 *
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, each slot is one uint16_t.
 * Otherwise each slot is two uint16_t which are combined with the first one
 * as the upper 16 bits.
 *
 * This macro expands to a bare if/else statement, so it must be used as a
 * statement inside braces. pExc16 must be a plain pointer variable because
 * the expansion applies ++ to it without parentheses.
 *
 * @param excWord (in) initial exceptions word
 * @param index (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t variable that receives the slot value
 */
#define GET_SLOT_VALUE(excWord, index, pExc16, value) \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, index); \
        (value)=*pExc16; \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, index); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    }
 407
 408 /* simple case mappings ----------------------------------------------------- */
 409
 410 U_CAPI UChar32 U_EXPORT2
 411 ucase_tolower(const UCaseProps *csp, UChar32 c) {
 412     uint16_t props;
 413     GET_PROPS(csp, c, props);
 414     if(!PROPS_HAS_EXCEPTION(props)) {
 415         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
 416             c+=UCASE_GET_DELTA(props);
 417         }
 418     } else {
 419         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 420         uint16_t excWord=*pe++;
 421         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
 422             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
 423         }
 424     }
 425     return c;
 426 }
 427
 428 U_CAPI UChar32 U_EXPORT2
 429 ucase_toupper(const UCaseProps *csp, UChar32 c) {
 430     uint16_t props;
 431     GET_PROPS(csp, c, props);
 432     if(!PROPS_HAS_EXCEPTION(props)) {
 433         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 434             c+=UCASE_GET_DELTA(props);
 435         }
 436     } else {
 437         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 438         uint16_t excWord=*pe++;
 439         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 440             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
 441         }
 442     }
 443     return c;
 444 }
 445
 446 U_CAPI UChar32 U_EXPORT2
 447 ucase_totitle(const UCaseProps *csp, UChar32 c) {
 448     uint16_t props;
 449     GET_PROPS(csp, c, props);
 450     if(!PROPS_HAS_EXCEPTION(props)) {
 451         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
 452             c+=UCASE_GET_DELTA(props);
 453         }
 454     } else {
 455         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 456         uint16_t excWord=*pe++;
 457         int32_t index;
 458         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
 459             index=UCASE_EXC_TITLE;
 460         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 461             index=UCASE_EXC_UPPER;
 462         } else {
 463             return c;
 464         }
 465         GET_SLOT_VALUE(excWord, index, pe, c);
 466     }
 467     return c;
 468 }
 469
/*
 * Fixed UTF-16 strings: i (U+0069), j (U+006A) or i with ogonek (U+012F)
 * followed by combining dot above (U+0307), optionally followed by
 * combining grave (U+0300), acute (U+0301) or tilde (U+0303).
 * In the visible part of this file only iDot is used, by ucase_addCaseClosure().
 */
static const UChar iDot[2] = { 0x69, 0x307 };
static const UChar jDot[2] = { 0x6a, 0x307 };
/* NOTE(review): declared with 3 elements but only 2 initializers; the third element is implicitly 0 */
static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
 476
 477
 478 U_CFUNC void U_EXPORT2
 479 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
 480     uint16_t props;
 481
 482     /*
 483      * Hardcode the case closure of i and its relatives and ignore the
 484      * data file data for these characters.
 485      * The Turkic dotless i and dotted I with their case mapping conditions
 486      * and case folding option make the related characters behave specially.
 487      * This code matches their closure behavior to their case folding behavior.
 488      */
 489
 490     switch(c) {
 491     case 0x49:
 492         /* regular i and I are in one equivalence class */
 493         sa->add(sa->set, 0x69);
 494         return;
 495     case 0x69:
 496         sa->add(sa->set, 0x49);
 497         return;
 498     case 0x130:
 499         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
 500         sa->addString(sa->set, iDot, 2);
 501         return;
 502     case 0x131:
 503         /* dotless i is in a class by itself */
 504         return;
 505     default:
 506         /* otherwise use the data file data */
 507         break;
 508     }
 509
 510     GET_PROPS(csp, c, props);
 511     if(!PROPS_HAS_EXCEPTION(props)) {
 512         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
 513             /* add the one simple case mapping, no matter what type it is */
 514             int32_t delta=UCASE_GET_DELTA(props);
 515             if(delta!=0) {
 516                 sa->add(sa->set, c+delta);
 517             }
 518         }
 519     } else {
 520         /*
 521          * c has exceptions, so there may be multiple simple and/or
 522          * full case mappings. Add them all.
 523          */
 524         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
 525         const UChar *closure;
 526         uint16_t excWord=*pe++;
 527         int32_t index, closureLength, fullLength, length;
 528
 529         pe0=pe;
 530
 531         /* add all simple case mappings */
 532         for(index=UCASE_EXC_LOWER; index<=UCASE_EXC_TITLE; ++index) {
 533             if(HAS_SLOT(excWord, index)) {
 534                 pe=pe0;
 535                 GET_SLOT_VALUE(excWord, index, pe, c);
 536                 sa->add(sa->set, c);
 537             }
 538         }
 539
 540         /* get the closure string pointer & length */
 541         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
 542             pe=pe0;
 543             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
 544             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
 545             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
 546         } else {
 547             closureLength=0;
 548             closure=NULL;
 549         }
 550
 551         /* add the full case folding */
 552         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
 553             pe=pe0;
 554             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
 555
 556             /* start of full case mapping strings */
 557             ++pe;
 558
 559             fullLength&=0xffff; /* bits 16 and higher are reserved */
 560
 561             /* skip the lowercase result string */
 562             pe+=fullLength&UCASE_FULL_LOWER;
 563             fullLength>>=4;
 564
 565             /* add the full case folding string */
 566             length=fullLength&0xf;
 567             if(length!=0) {
 568                 sa->addString(sa->set, (const UChar *)pe, length);
 569                 pe+=length;
 570             }
 571
 572             /* skip the uppercase and titlecase strings */
 573             fullLength>>=4;
 574             pe+=fullLength&0xf;
 575             fullLength>>=4;
 576             pe+=fullLength;
 577
 578             closure=(const UChar *)pe; /* behind full case mappings */
 579         }
 580
 581         /* add each code point in the closure string */
 582         for(index=0; index<closureLength;) {
 583             U16_NEXT_UNSAFE(closure, index, c);
 584             sa->add(sa->set, c);
 585         }
 586     }
 587 }
 588
 589 /*
 590  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
 591  * must be length>0 and max>0 and length<=max
 592  */
 593 static U_INLINE int32_t
 594 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
 595     int32_t c1, c2;
 596
 597     max-=length; /* we require length<=max, so no need to decrement max in the loop */
 598     do {
 599         c1=*s++;
 600         c2=*t++;
 601         if(c2==0) {
 602             return 1; /* reached the end of t but not of s */
 603         }
 604         c1-=c2;
 605         if(c1!=0) {
 606             return c1; /* return difference result */
 607         }
 608     } while(--length>0);
 609     /* ends with length==0 */
 610
 611     if(max==0 || *t==0) {
 612         return 0; /* equal to length of both strings */
 613     } else {
 614         return -max; /* return lengh difference */
 615     }
 616 }
 617
 618 U_CFUNC UBool U_EXPORT2
 619 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
 620     const UChar *unfold, *p;
 621     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
 622
 623     if(csp->unfold==NULL || s==NULL) {
 624         return FALSE; /* no reverse case folding data, or no string */
 625     }
 626     if(length<=1) {
 627         /* the string is too short to find any match */
 628         /*
 629          * more precise would be:
 630          * if(!u_strHasMoreChar32Than(s, length, 1))
 631          * but this does not make much practical difference because
 632          * a single supplementary code point would just not be found
 633          */
 634         return FALSE;
 635     }
 636
 637     unfold=csp->unfold;
 638     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
 639     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
 640     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
 641     unfold+=unfoldRowWidth;
 642
 643     if(length>unfoldStringWidth) {
 644         /* the string is too long to find any match */
 645         return FALSE;
 646     }
 647
 648     /* do a binary search for the string */
 649     start=0;
 650     limit=unfoldRows;
 651     while(start<limit) {
 652         i=(start+limit)/2;
 653         p=unfold+(i*unfoldRowWidth);
 654         result=strcmpMax(s, length, p, unfoldStringWidth);
 655
 656         if(result==0) {
 657             /* found the string: add each code point, and its case closure */
 658             UChar32 c;
 659
 660             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
 661                 U16_NEXT_UNSAFE(p, i, c);
 662                 sa->add(sa->set, c);
 663                 ucase_addCaseClosure(csp, c, sa);
 664             }
 665             return TRUE;
 666         } else if(result<0) {
 667             limit=i;
 668         } else /* result>0 */ {
 669             start=i+1;
 670         }
 671     }
 672
 673     return FALSE; /* string not found */
 674 }
 675
 676 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
 677 U_CAPI int32_t U_EXPORT2
 678 ucase_getType(const UCaseProps *csp, UChar32 c) {
 679     uint16_t props;
 680     GET_PROPS(csp, c, props);
 681     return UCASE_GET_TYPE(props);
 682 }
 683
 684 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
 685 U_CAPI int32_t U_EXPORT2
 686 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
 687     int32_t type;
 688     uint16_t props;
 689     GET_PROPS(csp, c, props);
 690     type=UCASE_GET_TYPE(props);
 691     if(type!=UCASE_NONE) {
 692         return type;
 693     } else if(
 694         c==0x307 ||
 695         (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE
 696     ) {
 697         return -1; /* case-ignorable */
 698     } else {
 699         return 0; /* c is neither cased nor case-ignorable */
 700     }
 701 }
 702
 703 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
 704 static U_INLINE int32_t
 705 getDotType(const UCaseProps *csp, UChar32 c) {
 706     uint16_t props;
 707     GET_PROPS(csp, c, props);
 708     if(!PROPS_HAS_EXCEPTION(props)) {
 709         return props&UCASE_DOT_MASK;
 710     } else {
 711         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 712         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
 713     }
 714 }
 715
 716 U_CAPI UBool U_EXPORT2
 717 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
 718     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
 719 }
 720
 721 U_CAPI UBool U_EXPORT2
 722 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
 723     uint16_t props;
 724     GET_PROPS(csp, c, props);
 725     return (UBool)((props&UCASE_SENSITIVE)!=0);
 726 }
 727
 728 /* string casing ------------------------------------------------------------ */
 729
 730 /*
 731  * These internal functions form the core of string case mappings.
 732  * They map single code points to result code points or strings and take
 733  * all necessary conditions (context, locale ID, options) into account.
 734  *
 735  * They do not iterate over the source or write to the destination
 736  * so that the same functions are useful for non-standard string storage,
 737  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
 738  * For the same reason, the "surrounding text" context is passed in as a
 739  * UCaseContextIterator which does not make any assumptions about
 740  * the underlying storage.
 741  *
 742  * This section contains helper functions that check for conditions
 743  * in the input text surrounding the current code point
 744  * according to SpecialCasing.txt.
 745  *
 746  * Each helper function gets the index
 747  * - after the current code point if it looks at following text
 748  * - before the current code point if it looks at preceding text
 749  *
 750  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
 751  *
 752  * Final_Sigma
 753  *   C is preceded by a sequence consisting of
 754  *     a cased letter and a case-ignorable sequence,
 755  *   and C is not followed by a sequence consisting of
 756  *     an ignorable sequence and then a cased letter.
 757  *
 758  * More_Above
 759  *   C is followed by one or more characters of combining class 230 (ABOVE)
 760  *   in the combining character sequence.
 761  *
 762  * After_Soft_Dotted
 763  *   The last preceding character with combining class of zero before C
 764  *   was Soft_Dotted,
 765  *   and there is no intervening combining character class 230 (ABOVE).
 766  *
 767  * Before_Dot
 768  *   C is followed by combining dot above (U+0307).
 769  *   Any sequence of characters with a combining class that is neither 0 nor 230
 770  *   may intervene between the current character and the combining dot above.
 771  *
 772  * The erratum from 2002-10-31 adds the condition
 773  *
 774  * After_I
 775  *   The last preceding base character was an uppercase I, and there is no
 776  *   intervening combining character class 230 (ABOVE).
 777  *
 778  *   (See Jitterbug 2344 and the comments on After_I below.)
 779  *
 780  * Helper definitions in Unicode 3.2 UAX 21:
 781  *
 782  * D1. A character C is defined to be cased
 783  *     if it meets any of the following criteria:
 784  *
 785  *   - The general category of C is Titlecase Letter (Lt)
 786  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
 787  *   - Given D = NFD(C), then it is not the case that:
 788  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 789  *     (This third criterium does not add any characters to the list
 790  *      for Unicode 3.2. Ignored.)
 791  *
 792  * D2. A character C is defined to be case-ignorable
 793  *     if it meets either of the following criteria:
 794  *
 795  *   - The general category of C is
 796  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
 797  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
 798  *   - C is one of the following characters
 799  *     U+0027 APOSTROPHE
 800  *     U+00AD SOFT HYPHEN (SHY)
 801  *     U+2019 RIGHT SINGLE QUOTATION MARK
 802  *            (the preferred character for apostrophe)
 803  *
 804  * D3. A case-ignorable sequence is a sequence of
 805  *     zero or more case-ignorable characters.
 806  */
 807
/*
 * Case-insensitive tests for single letters of a locale ID, as used by
 * ucase_getCaseLocale(). Each macro evaluates its argument twice, so the
 * argument must not have side effects.
 */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_n(c) ((c)=='n' || (c)=='N')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* Does c end the language code? True for '_', '-' and the terminating NUL. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
 821
 822 /**
 823  * Requires non-NULL locale ID but otherwise does the equivalent of
 824  * checking for language codes as if uloc_getLanguage() were called:
 825  * Accepts both 2- and 3-letter codes and accepts case variants.
 826  */
 827 U_CFUNC int32_t
 828 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
 829     int32_t result;
 830     char c;
 831
 832     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
 833         return result;
 834     }
 835
 836     result=UCASE_LOC_ROOT;
 837
 838     /*
 839      * This function used to use uloc_getLanguage(), but the current code
 840      * removes the dependency of this low-level code on uloc implementation code
 841      * and is faster because not the whole locale ID has to be
 842      * examined and copied/transformed.
 843      *
 844      * Because this code does not want to depend on uloc, the caller must
 845      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
 846      */
 847     c=*locale++;
 848     if(is_t(c)) {
 849         /* tr or tur? */
 850         c=*locale++;
 851         if(is_u(c)) {
 852             c=*locale++;
 853         }
 854         if(is_r(c)) {
 855             c=*locale;
 856             if(is_sep(c)) {
 857                 result=UCASE_LOC_TURKISH;
 858             }
 859         }
 860     } else if(is_a(c)) {
 861         /* az or aze? */
 862         c=*locale++;
 863         if(is_z(c)) {
 864             c=*locale++;
 865             if(is_e(c)) {
 866                 c=*locale;
 867             }
 868             if(is_sep(c)) {
 869                 result=UCASE_LOC_TURKISH;
 870             }
 871         }
 872     } else if(is_l(c)) {
 873         /* lt or lit? */
 874         c=*locale++;
 875         if(is_i(c)) {
 876             c=*locale++;
 877         }
 878         if(is_t(c)) {
 879             c=*locale;
 880             if(is_sep(c)) {
 881                 result=UCASE_LOC_LITHUANIAN;
 882             }
 883         }
 884     } else if(is_n(c)) {
 885         /* nl or nld? */
 886         c=*locale++;
 887         if(is_l(c)) {
 888             c=*locale++;
 889             if(is_d(c)) {
 890                 c=*locale;
 891             }
 892             if(is_sep(c)) {
 893                 result=UCASE_LOC_DUTCH;
 894             }
 895         }
 896     }
 897
 898     if(locCache!=NULL) {
 899         *locCache=result;
 900     }
 901     return result;
 902 }
 903
 904 /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
 905 static UBool
 906 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
 907     UChar32 c;
 908     uint16_t props;
 909
 910     if(iter==NULL) {
 911         return FALSE;
 912     }
 913
 914     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
 915         GET_PROPS(csp, c, props);
 916         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
 917             return TRUE; /* followed by cased letter */
 918         } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
 919             /* case-ignorable, continue with the loop */
 920         } else {
 921             return FALSE; /* not ignorable */
 922         }
 923     }
 924
 925     return FALSE; /* not followed by cased letter */
 926 }
 927
 928 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
 929 static UBool
 930 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 931     UChar32 c;
 932     int32_t dotType;
 933     int8_t dir;
 934
 935     if(iter==NULL) {
 936         return FALSE;
 937     }
 938
 939     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 940         dotType=getDotType(csp, c);
 941         if(dotType==UCASE_SOFT_DOTTED) {
 942             return TRUE; /* preceded by TYPE_i */
 943         } else if(dotType!=UCASE_OTHER_ACCENT) {
 944             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
 945         }
 946     }
 947
 948     return FALSE; /* not preceded by TYPE_i */
 949 }
 950
 951 /*
 952  * See Jitterbug 2344:
 953  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 954  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 955  * we made those releases compatible with Unicode 3.2 which had not fixed
 956  * a related bug in SpecialCasing.txt.
 957  *
 958  * From the Jitterbug 2344 text:
 959  * ... this bug is listed as a Unicode erratum
 960  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 961  * <quote>
 962  * There are two errors in SpecialCasing.txt.
 963  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 964  * 2. An incorrect context definition. Correct as follows:
 965  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 966  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 967  * ---
 968  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 969  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 970  * where the context After_I is defined as:
 971  * The last preceding base character was an uppercase I, and there is no
 972  * intervening combining character class 230 (ABOVE).
 973  * </quote>
 974  *
 975  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 976  *
 977  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 978  * # This matches the behavior of the canonically equivalent I-dot_above
 979  *
 980  * See also the description in this place in older versions of uchar.c (revision 1.100).
 981  *
 982  * Markus W. Scherer 2003-feb-15
 983  */
 984
 985 /* Is preceded by base character 'I' with no intervening cc=230 ? */
 986 static UBool
 987 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 988     UChar32 c;
 989     int32_t dotType;
 990     int8_t dir;
 991
 992     if(iter==NULL) {
 993         return FALSE;
 994     }
 995
 996     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 997         if(c==0x49) {
 998             return TRUE; /* preceded by I */
 999         }
1000         dotType=getDotType(csp, c);
1001         if(dotType!=UCASE_OTHER_ACCENT) {
1002             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
1003         }
1004     }
1005
1006     return FALSE; /* not preceded by I */
1007 }
1008
1009 /* Is followed by one or more cc==230 ? */
1010 static UBool
1011 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
1012     UChar32 c;
1013     int32_t dotType;
1014     int8_t dir;
1015
1016     if(iter==NULL) {
1017         return FALSE;
1018     }
1019
1020     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1021         dotType=getDotType(csp, c);
1022         if(dotType==UCASE_ABOVE) {
1023             return TRUE; /* at least one cc==230 following */
1024         } else if(dotType!=UCASE_OTHER_ACCENT) {
1025             return FALSE; /* next base character, no more cc==230 following */
1026         }
1027     }
1028
1029     return FALSE; /* no more cc==230 following */
1030 }
1031
1032 /* Is followed by a dot above (without cc==230 in between) ? */
1033 static UBool
1034 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
1035     UChar32 c;
1036     int32_t dotType;
1037     int8_t dir;
1038
1039     if(iter==NULL) {
1040         return FALSE;
1041     }
1042
1043     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1044         if(c==0x307) {
1045             return TRUE;
1046         }
1047         dotType=getDotType(csp, c);
1048         if(dotType!=UCASE_OTHER_ACCENT) {
1049             return FALSE; /* next base character or cc==230 in between */
1050         }
1051     }
1052
1053     return FALSE; /* no dot above following */
1054 }
1055
1056 U_CAPI int32_t U_EXPORT2
1057 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
1058                   UCaseContextIterator *iter, void *context,
1059                   const UChar **pString,
1060                   const char *locale, int32_t *locCache)
1061 {
1062     UChar32 result;
1063     uint16_t props;
1064
1065     result=c;
1066     GET_PROPS(csp, c, props);
1067     if(!PROPS_HAS_EXCEPTION(props)) {
1068         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1069             result=c+UCASE_GET_DELTA(props);
1070         }
1071     } else {
1072         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1073         uint16_t excWord=*pe++;
1074         int32_t full;
1075
1076         pe2=pe;
1077
1078         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1079             /* use hardcoded conditions and mappings */
1080             int32_t loc=ucase_getCaseLocale(locale, locCache);
1081
1082             /*
1083              * Test for conditional mappings first
1084              *   (otherwise the unconditional default mappings are always taken),
1085              * then test for characters that have unconditional mappings in SpecialCasing.txt,
1086              * then get the UnicodeData.txt mappings.
1087              */
1088             if( loc==UCASE_LOC_LITHUANIAN &&
1089                     /* base characters, find accents above */
1090                     (((c==0x49 || c==0x4a || c==0x12e) &&
1091                         isFollowedByMoreAbove(csp, iter, context)) ||
1092                     /* precomposed with accent above, no need to find one */
1093                     (c==0xcc || c==0xcd || c==0x128))
1094             ) {
1095                 /*
1096                     # Lithuanian
1097
1098                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1099
1100                     # Introduce an explicit dot above when lowercasing capital I's and J's
1101                     # whenever there are more accents above.
1102                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1103
1104                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1105                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1106                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1107                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1108                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1109                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1110                  */
1111                 switch(c) {
1112                 case 0x49:  /* LATIN CAPITAL LETTER I */
1113                     *pString=iDot;
1114                     return 2;
1115                 case 0x4a:  /* LATIN CAPITAL LETTER J */
1116                     *pString=jDot;
1117                     return 2;
1118                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1119                     *pString=iOgonekDot;
1120                     return 2;
1121                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
1122                     *pString=iDotGrave;
1123                     return 3;
1124                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
1125                     *pString=iDotAcute;
1126                     return 3;
1127                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1128                     *pString=iDotTilde;
1129                     return 3;
1130                 default:
1131                     return 0; /* will not occur */
1132                 }
1133             /* # Turkish and Azeri */
1134             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1135                 /*
1136                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1137                     # The following rules handle those cases.
1138
1139                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1140                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1141                  */
1142                 return 0x69;
1143             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
1144                 /*
1145                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1146                     # This matches the behavior of the canonically equivalent I-dot_above
1147
1148                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1149                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1150                  */
1151                 return 0; /* remove the dot (continue without output) */
1152             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
1153                 /*
1154                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1155
1156                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1157                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1158                  */
1159                 return 0x131;
1160             } else if(c==0x130) {
1161                 /*
1162                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1163
1164                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1165                  */
1166                 *pString=iDot;
1167                 return 2;
1168             } else if(  c==0x3a3 &&
1169                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
1170                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
1171             ) {
1172                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1173                 /*
1174                     # Special case for final form of sigma
1175
1176                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1177                  */
1178                 return 0x3c2; /* greek small final sigma */
1179             } else {
1180                 /* no known conditional special case mapping, use a normal mapping */
1181             }
1182         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1183             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1184             full&=UCASE_FULL_LOWER;
1185             if(full!=0) {
1186                 /* set the output pointer to the lowercase mapping */
1187                 *pString=pe+1;
1188
1189                 /* return the string length */
1190                 return full;
1191             }
1192         }
1193
1194         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1195             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1196         }
1197     }
1198
1199     return (result==c) ? ~result : result;
1200 }
1201
1202 /* internal */
1203 static int32_t
1204 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
1205                UCaseContextIterator *iter, void *context,
1206                const UChar **pString,
1207                const char *locale, int32_t *locCache,
1208                UBool upperNotTitle) {
1209     UChar32 result;
1210     uint16_t props;
1211
1212     result=c;
1213     GET_PROPS(csp, c, props);
1214     if(!PROPS_HAS_EXCEPTION(props)) {
1215         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1216             result=c+UCASE_GET_DELTA(props);
1217         }
1218     } else {
1219         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1220         uint16_t excWord=*pe++;
1221         int32_t full, index;
1222
1223         pe2=pe;
1224
1225         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1226             /* use hardcoded conditions and mappings */
1227             int32_t loc=ucase_getCaseLocale(locale, locCache);
1228
1229             if(loc==UCASE_LOC_TURKISH && c==0x69) {
1230                 /*
1231                     # Turkish and Azeri
1232
1233                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1234                     # The following rules handle those cases.
1235
1236                     # When uppercasing, i turns into a dotted capital I
1237
1238                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1239                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1240                 */
1241                 return 0x130;
1242             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
1243                 /*
1244                     # Lithuanian
1245
1246                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1247
1248                     # Remove DOT ABOVE after "i" with upper or titlecase
1249
1250                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1251                  */
1252                 return 0; /* remove the dot (continue without output) */
1253             } else {
1254                 /* no known conditional special case mapping, use a normal mapping */
1255             }
1256         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1257             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1258
1259             /* start of full case mapping strings */
1260             ++pe;
1261
1262             /* skip the lowercase and case-folding result strings */
1263             pe+=full&UCASE_FULL_LOWER;
1264             full>>=4;
1265             pe+=full&0xf;
1266             full>>=4;
1267
1268             if(upperNotTitle) {
1269                 full&=0xf;
1270             } else {
1271                 /* skip the uppercase result string */
1272                 pe+=full&0xf;
1273                 full=(full>>4)&0xf;
1274             }
1275
1276             if(full!=0) {
1277                 /* set the output pointer to the result string */
1278                 *pString=pe;
1279
1280                 /* return the string length */
1281                 return full;
1282             }
1283         }
1284
1285         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1286             index=UCASE_EXC_TITLE;
1287         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1288             /* here, titlecase is same as uppercase */
1289             index=UCASE_EXC_UPPER;
1290         } else {
1291             return ~c;
1292         }
1293         GET_SLOT_VALUE(excWord, index, pe2, result);
1294     }
1295
1296     return (result==c) ? ~result : result;
1297 }
1298
1299 U_CAPI int32_t U_EXPORT2
1300 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1301                   UCaseContextIterator *iter, void *context,
1302                   const UChar **pString,
1303                   const char *locale, int32_t *locCache) {
1304     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1305 }
1306
1307 U_CAPI int32_t U_EXPORT2
1308 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1309                   UCaseContextIterator *iter, void *context,
1310                   const UChar **pString,
1311                   const char *locale, int32_t *locCache) {
1312     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1313 }
1314
1315 /* case folding ------------------------------------------------------------- */
1316
1317 /*
1318  * Case folding is similar to lowercasing.
1319  * The result may be a simple mapping, i.e., a single code point, or
1320  * a full mapping, i.e., a string.
1321  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1322  * then only the lowercase mapping is stored.
1323  *
1324  * Some special cases are hardcoded because their conditions cannot be
1325  * parsed and processed from CaseFolding.txt.
1326  *
1327  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1328
1329 # C: common case folding, common mappings shared by both simple and full mappings.
1330 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1331 # S: simple case folding, mappings to single characters where different from F.
1332 # T: special case for uppercase I and dotted uppercase I
1333 #    - For non-Turkic languages, this mapping is normally not used.
1334 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1335 #
1336 # Usage:
1337 #  A. To do a simple case folding, use the mappings with status C + S.
1338 #  B. To do a full case folding, use the mappings with status C + F.
1339 #
1340 #    The mappings with status T can be used or omitted depending on the desired case-folding
1341 #    behavior. (The default option is to exclude them.)
1342
1343  * Unicode 3.2 has 'T' mappings as follows:
1344
1345 0049; T; 0131; # LATIN CAPITAL LETTER I
1346 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1347
1348  * while the default mappings for these code points are:
1349
1350 0049; C; 0069; # LATIN CAPITAL LETTER I
1351 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1352
1353  * U+0130 has no simple case folding (simple-case-folds to itself).
1354  */
1355
1356 /* return the simple case folding mapping for c */
1357 U_CAPI UChar32 U_EXPORT2
1358 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1359     uint16_t props;
1360     GET_PROPS(csp, c, props);
1361     if(!PROPS_HAS_EXCEPTION(props)) {
1362         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1363             c+=UCASE_GET_DELTA(props);
1364         }
1365     } else {
1366         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1367         uint16_t excWord=*pe++;
1368         int32_t index;
1369         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1370             /* special case folding mappings, hardcoded */
1371             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1372                 /* default mappings */
1373                 if(c==0x49) {
1374                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1375                     return 0x69;
1376                 } else if(c==0x130) {
1377                     /* no simple case folding for U+0130 */
1378                     return c;
1379                 }
1380             } else {
1381                 /* Turkic mappings */
1382                 if(c==0x49) {
1383                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1384                     return 0x131;
1385                 } else if(c==0x130) {
1386                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1387                     return 0x69;
1388                 }
1389             }
1390         }
1391         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1392             index=UCASE_EXC_FOLD;
1393         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1394             index=UCASE_EXC_LOWER;
1395         } else {
1396             return c;
1397         }
1398         GET_SLOT_VALUE(excWord, index, pe, c);
1399     }
1400     return c;
1401 }
1402
1403 /*
1404  * Issue for canonical caseless match (UAX #21):
1405  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1406  * canonical equivalence, unlike default-option casefolding.
1407  * For example, I-grave and I + grave fold to strings that are not canonically
1408  * equivalent.
1409  * For more details, see the comment in unorm_compare() in unorm.cpp
1410  * and the intermediate prototype changes for Jitterbug 2021.
1411  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1412  *
1413  * This did not get fixed because it appears that it is not possible to fix
1414  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1415  * together in a way that they still fold to common result strings.
1416  */
1417
1418 U_CAPI int32_t U_EXPORT2
1419 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1420                     const UChar **pString,
1421                     uint32_t options)
1422 {
1423     UChar32 result;
1424     uint16_t props;
1425
1426     result=c;
1427     GET_PROPS(csp, c, props);
1428     if(!PROPS_HAS_EXCEPTION(props)) {
1429         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1430             result=c+UCASE_GET_DELTA(props);
1431         }
1432     } else {
1433         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1434         uint16_t excWord=*pe++;
1435         int32_t full, index;
1436
1437         pe2=pe;
1438
1439         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1440             /* use hardcoded conditions and mappings */
1441             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1442                 /* default mappings */
1443                 if(c==0x49) {
1444                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1445                     return 0x69;
1446                 } else if(c==0x130) {
1447                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1448                     *pString=iDot;
1449                     return 2;
1450                 }
1451             } else {
1452                 /* Turkic mappings */
1453                 if(c==0x49) {
1454                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1455                     return 0x131;
1456                 } else if(c==0x130) {
1457                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1458                     return 0x69;
1459                 }
1460             }
1461         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1462             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1463
1464             /* start of full case mapping strings */
1465             ++pe;
1466
1467             /* skip the lowercase result string */
1468             pe+=full&UCASE_FULL_LOWER;
1469             full=(full>>4)&0xf;
1470
1471             if(full!=0) {
1472                 /* set the output pointer to the result string */
1473                 *pString=pe;
1474
1475                 /* return the string length */
1476                 return full;
1477             }
1478         }
1479
1480         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1481             index=UCASE_EXC_FOLD;
1482         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1483             index=UCASE_EXC_LOWER;
1484         } else {
1485             return ~c;
1486         }
1487         GET_SLOT_VALUE(excWord, index, pe2, result);
1488     }
1489
1490     return (result==c) ? ~result : result;
1491 }
1492
1493 /* case mapping properties API ---------------------------------------------- */
1494
1495 /* get the UCaseProps singleton, or else its dummy, once and for all */
1496 #if !UCASE_HARDCODE_DATA
1497 static const UCaseProps *
1498 getCaseProps() {
1499     /*
1500      * This lazy intialization with double-checked locking (without mutex protection for
1501      * the initial check) is transiently unsafe under certain circumstances.
1502      * Check the readme and use u_init() if necessary.
1503      */
1504
1505     /* the initial check is performed by the GET_CASE_PROPS() macro */
1506     const UCaseProps *csp;
1507     UErrorCode errorCode=U_ZERO_ERROR;
1508
1509     csp=ucase_getSingleton(&errorCode);
1510     if(U_FAILURE(errorCode)) {
1511         errorCode=U_ZERO_ERROR;
1512         csp=ucase_getDummy(&errorCode);
1513         if(U_FAILURE(errorCode)) {
1514             return NULL;
1515         }
1516     }
1517
1518     return csp;
1519 }
1520 #endif
1521
1522 /*
1523  * In ICU 3.0, most Unicode properties were loaded from uprops.icu.
1524  * ICU 3.2 adds ucase.icu for case mapping properties.
1525  * ICU 3.4 adds ubidi.icu for bidi/shaping properties and
1526  * removes case/bidi/shaping properties from uprops.icu.
1527  *
1528  * Loading of uprops.icu was never mutex-protected and required u_init()
1529  * for thread safety.
1530  * In order to maintain performance for all such properties,
1531  * ucase.icu and ubidi.icu are loaded lazily, without mutexing.
1532  * u_init() will try to load them for thread safety,
1533  * but u_init() will not fail if they are missing.
1534  *
1535  * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
1536  * and an error code for load failure.
1537  * Instead, here we try to load at most once.
1538  * If it works, we use the resulting singleton object.
1539  * If it fails, then we get a dummy object, which always works unless
1540  * we are seriously out of memory.
1541  * After the first try, we have a never-changing pointer to either the
1542  * real singleton or the dummy.
1543  *
1544  * This method is used in Unicode properties APIs (uchar.h) that
1545  * do not have a service object and also do not have an error code parameter.
1546  * Other API implementations get the singleton themselves
1547  * (with mutexing), store it in the service object, and report errors.
1548  */
/*
 * GET_CASE_PROPS() yields the const UCaseProps * used by the uchar.h-style
 * functions in this file: the cached gCsp pointer if it is already set,
 * otherwise the result of getCaseProps() (which may be NULL); with
 * hardcoded data it is simply the address of the built-in singleton.
 * Both expansions are fully parenthesized so that the macro can be used
 * inside any larger expression.
 */
#if !UCASE_HARDCODE_DATA
#define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
#else
#define GET_CASE_PROPS() (&ucase_props_singleton)
#endif
1554
1555 /* public API (see uchar.h) */
1556
1557 U_CAPI UBool U_EXPORT2
1558 u_isULowercase(UChar32 c) {
1559     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1560 }
1561
1562 U_CAPI UBool U_EXPORT2
1563 u_isUUppercase(UChar32 c) {
1564     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1565 }
1566
1567 /* Transforms the Unicode character to its lower case equivalent.*/
1568 U_CAPI UChar32 U_EXPORT2
1569 u_tolower(UChar32 c) {
1570     return ucase_tolower(GET_CASE_PROPS(), c);
1571 }
1572
1573 /* Transforms the Unicode character to its upper case equivalent.*/
1574 U_CAPI UChar32 U_EXPORT2
1575 u_toupper(UChar32 c) {
1576     return ucase_toupper(GET_CASE_PROPS(), c);
1577 }
1578
1579 /* Transforms the Unicode character to its title case equivalent.*/
1580 U_CAPI UChar32 U_EXPORT2
1581 u_totitle(UChar32 c) {
1582     return ucase_totitle(GET_CASE_PROPS(), c);
1583 }
1584
1585 /* return the simple case folding mapping for c */
1586 U_CAPI UChar32 U_EXPORT2
1587 u_foldCase(UChar32 c, uint32_t options) {
1588     return ucase_fold(GET_CASE_PROPS(), c, options);
1589 }
1590
1591 U_CFUNC int32_t U_EXPORT2
1592 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1593     /* case mapping properties */
1594     const UCaseProps *csp=GET_CASE_PROPS();
1595     if(csp==NULL) {
1596         return FALSE;
1597     }
1598     switch(which) {
1599     case UCHAR_LOWERCASE:
1600         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1601     case UCHAR_UPPERCASE:
1602         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1603     case UCHAR_SOFT_DOTTED:
1604         return ucase_isSoftDotted(csp, c);
1605     case UCHAR_CASE_SENSITIVE:
1606         return ucase_isCaseSensitive(csp, c);
1607     default:
1608         return FALSE;
1609     }
1610 }