icuSources/common/ucase.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucase.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004aug30
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Low-level Unicode character/string case mapping code.
  17 *   Much code moved here (and modified) from uchar.c.
  18 */
  19
  20 #include "unicode/utypes.h"
  21 #include "unicode/uset.h"
  22 #include "unicode/udata.h" /* UDataInfo */
  23 #include "ucmndata.h" /* DataHeader */
  24 #include "udatamem.h"
  25 #include "umutex.h"
  26 #include "uassert.h"
  27 #include "cmemory.h"
  28 #include "utrie.h"
  29 #include "ucase.h"
  30 #include "ucln_cmn.h"
  31
  32 struct UCaseProps { /* runtime handle for one set of loaded case-mapping data */
  33     UDataMemory *mem; /* handle from udata_openChoice() in ucase_open(); stays NULL in ucase_openBinary() */
  34     const int32_t *indexes; /* indexes[] array at the very start of the binary data */
  35     const uint16_t *exceptions; /* exceptions[] words, located right after the serialized trie */
  36
  37     UTrie trie; /* unserialized trie; UTRIE_GET16() on it yields the 16-bit props word */
  38     uint8_t formatVersion[4]; /* copied from UDataInfo.formatVersion by isAcceptable() */
  39 };
  40
  41 /* data loading etc. -------------------------------------------------------- */
  42
  43 static UBool U_CALLCONV
  44 isAcceptable(void *context,
  45              const char *type, const char *name,
  46              const UDataInfo *pInfo) {
  47     if(
  48         pInfo->size>=20 &&
  49         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  50         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  51         pInfo->dataFormat[0]==UCASE_FMT_0 &&    /* dataFormat="cAsE" */
  52         pInfo->dataFormat[1]==UCASE_FMT_1 &&
  53         pInfo->dataFormat[2]==UCASE_FMT_2 &&
  54         pInfo->dataFormat[3]==UCASE_FMT_3 &&
  55         pInfo->formatVersion[0]==1 &&
  56         pInfo->formatVersion[2]==UTRIE_SHIFT &&
  57         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
  58     ) {
  59         UCaseProps *csp=(UCaseProps *)context;
  60         uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
  61         return TRUE;
  62     } else {
  63         return FALSE;
  64     }
  65 }
  66
  67 static UCaseProps *
  68 ucase_openData(UCaseProps *cspProto,
  69                const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
  70     UCaseProps *csp;
  71     int32_t size, trieSize;
  72
  73     cspProto->indexes=(const int32_t *)bin;
  74     if( cspProto->indexes[UCASE_IX_INDEX_TOP]<16 ||
  75         (length>=0 && length<cspProto->indexes[UCASE_IX_LENGTH])
  76     ) {
  77         *pErrorCode=U_INVALID_FORMAT_ERROR;
  78         return NULL;
  79     }
  80
  81     /* get the trie address, after indexes[] */
  82     size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
  83     bin+=size;
  84     if(length>=0 && (length-=size)<16) {
  85         *pErrorCode=U_INVALID_FORMAT_ERROR;
  86         return NULL;
  87     }
  88
  89     /* unserialize the trie */
  90     trieSize=cspProto->indexes[UCASE_IX_TRIE_SIZE];
  91     trieSize=utrie_unserialize(&cspProto->trie, bin, length>=0 ? length : trieSize, pErrorCode);
  92     if(U_FAILURE(*pErrorCode)) {
  93         return NULL;
  94     }
  95
  96     /* get exceptions[] */
  97     bin+=trieSize;
  98     if(length>=0 && (length-=trieSize)<2*cspProto->indexes[UCASE_IX_EXC_LENGTH]) {
  99         *pErrorCode=U_INVALID_FORMAT_ERROR;
 100         return NULL;
 101     }
 102     cspProto->exceptions=(const uint16_t *)bin;
 103
 104     /* allocate, copy, and return the new UCaseProps */
 105     csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
 106     if(csp==NULL) {
 107         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 108         return NULL;
 109     } else {
 110         uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
 111         return csp;
 112     }
 113 }
 114
 115 U_CAPI UCaseProps * U_EXPORT2
 116 ucase_open(UErrorCode *pErrorCode) {
 117     UCaseProps cspProto={ NULL }, *csp;
 118
 119     cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
 120     if(U_FAILURE(*pErrorCode)) {
 121         return NULL;
 122     }
 123
 124     csp=ucase_openData(
 125             &cspProto,
 126             udata_getMemory(cspProto.mem),
 127             udata_getLength(cspProto.mem),
 128             pErrorCode);
 129     if(U_FAILURE(*pErrorCode)) {
 130         udata_close(cspProto.mem);
 131         return NULL;
 132     } else {
 133         return csp;
 134     }
 135 }
 136
 137 U_CAPI UCaseProps * U_EXPORT2
 138 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
 139     UCaseProps cspProto={ NULL };
 140     const DataHeader *hdr;
 141
 142     if(U_FAILURE(*pErrorCode)) {
 143         return NULL;
 144     }
 145     if(bin==NULL) {
 146         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 147         return NULL;
 148     }
 149
 150     /* check the header */
 151     if(length>=0 && length<20) {
 152         *pErrorCode=U_INVALID_FORMAT_ERROR;
 153         return NULL;
 154     }
 155     hdr=(const DataHeader *)bin;
 156     if(
 157         !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
 158           hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
 159           isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
 160     ) {
 161         *pErrorCode=U_INVALID_FORMAT_ERROR;
 162         return NULL;
 163     }
 164
 165     bin+=hdr->dataHeader.headerSize;
 166     if(length>=0) {
 167         length-=hdr->dataHeader.headerSize;
 168     }
 169     return ucase_openData(&cspProto, bin, length, pErrorCode);
 170 }
 171
 172 U_CAPI void U_EXPORT2
 173 ucase_close(UCaseProps *csp) {
 174     if(csp!=NULL) {
 175         udata_close(csp->mem);
 176         uprv_free(csp);
 177     }
 178 }
 179
 180 /* UCaseProps singleton ----------------------------------------------------- */
 181
 182 static UCaseProps *gCsp=NULL;
 183 static UErrorCode gErrorCode=U_ZERO_ERROR;
 184 static int8_t gHaveData=0;
 185
 186 static UBool U_CALLCONV ucase_cleanup(void) {
 187     ucase_close(gCsp);
 188     gCsp=NULL;
 189     gErrorCode=U_ZERO_ERROR;
 190     gHaveData=0;
 191     return TRUE;
 192 }
 193
 194 U_CAPI UCaseProps * U_EXPORT2
 195 ucase_getSingleton(UErrorCode *pErrorCode) {
 196     int8_t haveData;
 197
 198     if(U_FAILURE(*pErrorCode)) {
 199         return NULL;
 200     }
 201
 202     UMTX_CHECK(NULL, gHaveData, haveData);
 203
 204     if(haveData>0) {
 205         /* data was loaded */
 206         return gCsp;
 207     } else if(haveData<0) {
 208         /* data loading failed */
 209         *pErrorCode=gErrorCode;
 210         return NULL;
 211     } else /* haveData==0 */ {
 212         /* load the data */
 213         UCaseProps *csp=ucase_open(pErrorCode);
 214         if(U_FAILURE(*pErrorCode)) {
 215             gHaveData=-1;
 216             gErrorCode=*pErrorCode;
 217             return NULL;
 218         }
 219
 220         /* set the static variables */
 221         umtx_lock(NULL);
 222         if(gCsp==NULL) {
 223             gCsp=csp;
 224             csp=NULL;
 225             gHaveData=1;
 226             ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
 227         }
 228         umtx_unlock(NULL);
 229
 230         ucase_close(csp);
 231         return gCsp;
 232     }
 233 }
 234
 235 /* Unicode case mapping data swapping --------------------------------------- */
 236
 237 U_CAPI int32_t U_EXPORT2
 238 ucase_swap(const UDataSwapper *ds,
 239            const void *inData, int32_t length, void *outData,
 240            UErrorCode *pErrorCode) {
 241     const UDataInfo *pInfo;
 242     int32_t headerSize;
 243
 244     const uint8_t *inBytes;
 245     uint8_t *outBytes;
 246
 247     const int32_t *inIndexes;
 248     int32_t indexes[16];
 249
 250     int32_t i, offset, count, size;
 251
 252     /* udata_swapDataHeader checks the arguments */
 253     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
 254     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 255         return 0;
 256     }
 257
 258     /* check data format and format version */
 259     pInfo=(const UDataInfo *)((const char *)inData+4);
 260     if(!(
 261         pInfo->dataFormat[0]==UCASE_FMT_0 &&    /* dataFormat="cAsE" */
 262         pInfo->dataFormat[1]==UCASE_FMT_1 &&
 263         pInfo->dataFormat[2]==UCASE_FMT_2 &&
 264         pInfo->dataFormat[3]==UCASE_FMT_3 &&
 265         pInfo->formatVersion[0]==1 &&
 266         pInfo->formatVersion[2]==UTRIE_SHIFT &&
 267         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
 268     )) {
 269         udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n",
 270                          pInfo->dataFormat[0], pInfo->dataFormat[1],
 271                          pInfo->dataFormat[2], pInfo->dataFormat[3],
 272                          pInfo->formatVersion[0]);
 273         *pErrorCode=U_UNSUPPORTED_ERROR;
 274         return 0;
 275     }
 276
 277     inBytes=(const uint8_t *)inData+headerSize;
 278     outBytes=(uint8_t *)outData+headerSize;
 279
 280     inIndexes=(const int32_t *)inBytes;
 281
 282     if(length>=0) {
 283         length-=headerSize;
 284         if(length<16*4) {
 285             udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n",
 286                              length);
 287             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 288             return 0;
 289         }
 290     }
 291
 292     /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */
 293     for(i=0; i<16; ++i) {
 294         indexes[i]=udata_readInt32(ds, inIndexes[i]);
 295     }
 296
 297     /* get the total length of the data */
 298     size=indexes[UCASE_IX_LENGTH];
 299
 300     if(length>=0) {
 301         if(length<size) {
 302             udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n",
 303                              length);
 304             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 305             return 0;
 306         }
 307
 308         /* copy the data for inaccessible bytes */
 309         if(inBytes!=outBytes) {
 310             uprv_memcpy(outBytes, inBytes, size);
 311         }
 312
 313         offset=0;
 314
 315         /* swap the int32_t indexes[] */
 316         count=indexes[UCASE_IX_INDEX_TOP]*4;
 317         ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
 318         offset+=count;
 319
 320         /* swap the UTrie */
 321         count=indexes[UCASE_IX_TRIE_SIZE];
 322         utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
 323         offset+=count;
 324
 325         /* swap the uint16_t exceptions[] */
 326         count=indexes[UCASE_IX_EXC_LENGTH]*2;
 327         ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
 328         offset+=count;
 329
 330         U_ASSERT(offset==size);
 331     }
 332
 333     return headerSize+size;
 334 }
 335
 336 /* set of property starts for UnicodeSet ------------------------------------ */
 337
 338 static UBool U_CALLCONV
 339 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 340     /* add the start code point to the USet */
 341     USetAdder *sa=(USetAdder *)context;
 342     sa->add(sa->set, start);
 343     return TRUE;
 344 }
 345
 346 U_CAPI void U_EXPORT2
 347 ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode) {
 348     if(U_FAILURE(*pErrorCode)) {
 349         return;
 350     }
 351
 352     /* add the start code point of each same-value range of the trie */
 353     utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
 354
 355     /* add code points with hardcoded properties, plus the ones following them */
 356
 357     /* (none right now, see comment below) */
 358
 359     /*
 360      * Omit code points with hardcoded specialcasing properties
 361      * because we do not build property UnicodeSets for them right now.
 362      */
 363 }
 364
 365 /* data access primitives --------------------------------------------------- */
 366
 367 /* UTRIE_GET16() itself validates c */
 368 #define GET_PROPS(csp, c, result) \
 369     UTRIE_GET16(&(csp)->trie, c, result);
 370
 371 #define GET_CASE_TYPE(props) ((props)&UCASE_TYPE_MASK)
 372 #define GET_SIGNED_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
 373 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
 374
 375 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
 376
 377 /* number of bits in an 8-bit integer value */
 378 static const uint8_t flagsOffset[256]={
 379     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
 380     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 381     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 382     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 383     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 384     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 385     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 386     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 387     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
 388     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 389     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 390     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 391     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
 392     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 393     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
 394     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 395 };
 396
 397 #define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
 398 #define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]
 399
 400 /*
 401  * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
 402  *
 403  * @param excWord (in) initial exceptions word
 404  * @param index (in) desired slot index
 405  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 406  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 407  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 408  */
 409 #define GET_SLOT_VALUE(excWord, index, pExc16, value) \
 410     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
 411         (pExc16)+=SLOT_OFFSET(excWord, index); \
 412         (value)=*pExc16; \
 413     } else { \
 414         (pExc16)+=2*SLOT_OFFSET(excWord, index); \
 415         (value)=*pExc16++; \
 416         (value)=((value)<<16)|*pExc16; \
 417     }
 418
 419 /* simple case mappings ----------------------------------------------------- */
 420
 421 U_CAPI UChar32 U_EXPORT2
 422 ucase_tolower(const UCaseProps *csp, UChar32 c) {
 423     uint16_t props;
 424     GET_PROPS(csp, c, props);
 425     if(!PROPS_HAS_EXCEPTION(props)) {
 426         if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
 427             c+=GET_SIGNED_DELTA(props);
 428         }
 429     } else {
 430         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 431         uint16_t excWord=*pe++;
 432         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
 433             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
 434         }
 435     }
 436     return c;
 437 }
 438
 439 U_CAPI UChar32 U_EXPORT2
 440 ucase_toupper(const UCaseProps *csp, UChar32 c) {
 441     uint16_t props;
 442     GET_PROPS(csp, c, props);
 443     if(!PROPS_HAS_EXCEPTION(props)) {
 444         if(GET_CASE_TYPE(props)==UCASE_LOWER) {
 445             c+=GET_SIGNED_DELTA(props);
 446         }
 447     } else {
 448         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 449         uint16_t excWord=*pe++;
 450         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 451             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
 452         }
 453     }
 454     return c;
 455 }
 456
 457 U_CAPI UChar32 U_EXPORT2
 458 ucase_totitle(const UCaseProps *csp, UChar32 c) {
 459     uint16_t props;
 460     GET_PROPS(csp, c, props);
 461     if(!PROPS_HAS_EXCEPTION(props)) {
 462         if(GET_CASE_TYPE(props)==UCASE_LOWER) {
 463             c+=GET_SIGNED_DELTA(props);
 464         }
 465     } else {
 466         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 467         uint16_t excWord=*pe++;
 468         int32_t index;
 469         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
 470             index=UCASE_EXC_TITLE;
 471         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
 472             index=UCASE_EXC_UPPER;
 473         } else {
 474             return c;
 475         }
 476         GET_SLOT_VALUE(excWord, index, pe, c);
 477     }
 478     return c;
 479 }
 480
 481 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
 482 U_CAPI int32_t U_EXPORT2
 483 ucase_getType(const UCaseProps *csp, UChar32 c) {
 484     uint16_t props;
 485     GET_PROPS(csp, c, props);
 486     return GET_CASE_TYPE(props);
 487 }
 488
 489 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
 490 U_CAPI int32_t U_EXPORT2
 491 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
 492     int32_t type;
 493     uint16_t props;
 494     GET_PROPS(csp, c, props);
 495     type=GET_CASE_TYPE(props);
 496     if(type!=UCASE_NONE) {
 497         return type;
 498     } else if(
 499         c==0x307 ||
 500         (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE
 501     ) {
 502         return -1; /* case-ignorable */
 503     } else {
 504         return 0; /* c is neither cased nor case-ignorable */
 505     }
 506 }
 507
 508 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
 509 static U_INLINE int32_t
 510 getDotType(const UCaseProps *csp, UChar32 c) {
 511     uint16_t props;
 512     GET_PROPS(csp, c, props);
 513     if(!PROPS_HAS_EXCEPTION(props)) {
 514         return props&UCASE_DOT_MASK;
 515     } else {
 516         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
 517         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
 518     }
 519 }
 520
 521 U_CAPI UBool U_EXPORT2
 522 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
 523     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
 524 }
 525
 526 U_CAPI UBool U_EXPORT2
 527 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
 528     uint16_t props;
 529     GET_PROPS(csp, c, props);
 530     return (UBool)((props&UCASE_SENSITIVE)!=0);
 531 }
 532
 533 /* public API (see uchar.h) ------------------------------------------------- */
 534
 535 U_CAPI UBool U_EXPORT2
 536 u_isULowercase(UChar32 c) {
 537     UErrorCode errorCode=U_ZERO_ERROR;
 538     UCaseProps *csp=ucase_getSingleton(&errorCode);
 539     return (UBool)(csp!=NULL && UCASE_LOWER==ucase_getType(csp, c));
 540 }
 541
 542 U_CAPI UBool U_EXPORT2
 543 u_isUUppercase(UChar32 c) {
 544     UErrorCode errorCode=U_ZERO_ERROR;
 545     UCaseProps *csp=ucase_getSingleton(&errorCode);
 546     return (UBool)(csp!=NULL && UCASE_UPPER==ucase_getType(csp, c));
 547 }
 548
 549 /* Transforms the Unicode character to its lower case equivalent.*/
 550 U_CAPI UChar32 U_EXPORT2
 551 u_tolower(UChar32 c) {
 552     UErrorCode errorCode=U_ZERO_ERROR;
 553     UCaseProps *csp=ucase_getSingleton(&errorCode);
 554     if(csp!=NULL) {
 555         return ucase_tolower(csp, c);
 556     } else {
 557         return c;
 558     }
 559 }
 560
 561 /* Transforms the Unicode character to its upper case equivalent.*/
 562 U_CAPI UChar32 U_EXPORT2
 563 u_toupper(UChar32 c) {
 564     UErrorCode errorCode=U_ZERO_ERROR;
 565     UCaseProps *csp=ucase_getSingleton(&errorCode);
 566     if(csp!=NULL) {
 567         return ucase_toupper(csp, c);
 568     } else {
 569         return c;
 570     }
 571 }
 572
 573 /* Transforms the Unicode character to its title case equivalent.*/
 574 U_CAPI UChar32 U_EXPORT2
 575 u_totitle(UChar32 c) {
 576     UErrorCode errorCode=U_ZERO_ERROR;
 577     UCaseProps *csp=ucase_getSingleton(&errorCode);
 578     if(csp!=NULL) {
 579         return ucase_totitle(csp, c);
 580     } else {
 581         return c;
 582     }
 583 }
 584
 585 /* return the simple case folding mapping for c */
 586 U_CAPI UChar32 U_EXPORT2
 587 u_foldCase(UChar32 c, uint32_t options) {
 588     UErrorCode errorCode=U_ZERO_ERROR;
 589     UCaseProps *csp=ucase_getSingleton(&errorCode);
 590     if(csp!=NULL) {
 591         return ucase_fold(csp, c, options);
 592     } else {
 593         return c;
 594     }
 595 }
 596
 597 /* string casing ------------------------------------------------------------ */
 598
 599 /*
 600  * These internal functions form the core of string case mappings.
 601  * They map single code points to result code points or strings and take
 602  * all necessary conditions (context, locale ID, options) into account.
 603  *
 604  * They do not iterate over the source or write to the destination
 605  * so that the same functions are useful for non-standard string storage,
 606  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
 607  * For the same reason, the "surrounding text" context is passed in as a
 608  * UCaseContextIterator which does not make any assumptions about
 609  * the underlying storage.
 610  *
 611  * This section contains helper functions that check for conditions
 612  * in the input text surrounding the current code point
 613  * according to SpecialCasing.txt.
 614  *
 615  * Each helper function gets the index
 616  * - after the current code point if it looks at following text
 617  * - before the current code point if it looks at preceding text
 618  *
 619  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
 620  *
 621  * Final_Sigma
 622  *   C is preceded by a sequence consisting of
 623  *     a cased letter and a case-ignorable sequence,
 624  *   and C is not followed by a sequence consisting of
 625  *     an ignorable sequence and then a cased letter.
 626  *
 627  * More_Above
 628  *   C is followed by one or more characters of combining class 230 (ABOVE)
 629  *   in the combining character sequence.
 630  *
 631  * After_Soft_Dotted
 632  *   The last preceding character with combining class of zero before C
 633  *   was Soft_Dotted,
 634  *   and there is no intervening combining character class 230 (ABOVE).
 635  *
 636  * Before_Dot
 637  *   C is followed by combining dot above (U+0307).
 638  *   Any sequence of characters with a combining class that is neither 0 nor 230
 639  *   may intervene between the current character and the combining dot above.
 640  *
 641  * The erratum from 2002-10-31 adds the condition
 642  *
 643  * After_I
 644  *   The last preceding base character was an uppercase I, and there is no
 645  *   intervening combining character class 230 (ABOVE).
 646  *
 647  *   (See Jitterbug 2344 and the comments on After_I below.)
 648  *
 649  * Helper definitions in Unicode 3.2 UAX 21:
 650  *
 651  * D1. A character C is defined to be cased
 652  *     if it meets any of the following criteria:
 653  *
 654  *   - The general category of C is Titlecase Letter (Lt)
 655  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
 656  *   - Given D = NFD(C), then it is not the case that:
 657  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 658  *     (This third criterion does not add any characters to the list
 659  *      for Unicode 3.2. Ignored.)
 660  *
 661  * D2. A character C is defined to be case-ignorable
 662  *     if it meets either of the following criteria:
 663  *
 664  *   - The general category of C is
 665  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
 666  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
 667  *   - C is one of the following characters
 668  *     U+0027 APOSTROPHE
 669  *     U+00AD SOFT HYPHEN (SHY)
 670  *     U+2019 RIGHT SINGLE QUOTATION MARK
 671  *            (the preferred character for apostrophe)
 672  *
 673  * D3. A case-ignorable sequence is a sequence of
 674  *     zero or more case-ignorable characters.
 675  */
 676
 677 enum {
 678     LOC_UNKNOWN,
 679     LOC_ROOT,
 680     LOC_TURKISH,
 681     LOC_LITHUANIAN
 682 };
 683
 684 #define is_a(c) ((c)=='a' || (c)=='A')
 685 #define is_e(c) ((c)=='e' || (c)=='E')
 686 #define is_i(c) ((c)=='i' || (c)=='I')
 687 #define is_l(c) ((c)=='l' || (c)=='L')
 688 #define is_r(c) ((c)=='r' || (c)=='R')
 689 #define is_t(c) ((c)=='t' || (c)=='T')
 690 #define is_u(c) ((c)=='u' || (c)=='U')
 691 #define is_z(c) ((c)=='z' || (c)=='Z')
 692
 693 /* separator? */
 694 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
 695
 696 /*
 697  * Requires non-NULL locale ID but otherwise does the equivalent of
 698  * checking for language codes as if uloc_getLanguage() were called:
 699  * Accepts both 2- and 3-letter codes and accepts case variants.
 700  */
 701 static int32_t
 702 getCaseLocale(const char *locale, int32_t *locCache) {
 703     int32_t result;
 704     char c;
 705
 706     if(locCache!=NULL && (result=*locCache)!=LOC_UNKNOWN) {
 707         return result;
 708     }
 709
 710     result=LOC_ROOT;
 711
 712     /*
 713      * This function used to use uloc_getLanguage(), but the current code
 714      * removes the dependency of this low-level code on uloc implementation code
 715      * and is faster because not the whole locale ID has to be
 716      * examined and copied/transformed.
 717      *
 718      * Because this code does not want to depend on uloc, the caller must
 719      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
 720      */
 721     c=*locale++;
 722     if(is_t(c)) {
 723         /* tr or tur? */
 724         c=*locale++;
 725         if(is_u(c)) {
 726             c=*locale++;
 727         }
 728         if(is_r(c)) {
 729             c=*locale;
 730             if(is_sep(c)) {
 731                 result=LOC_TURKISH;
 732             }
 733         }
 734     } else if(is_a(c)) {
 735         /* az or aze? */
 736         c=*locale++;
 737         if(is_z(c)) {
 738             c=*locale++;
 739             if(is_e(c)) {
 740                 c=*locale;
 741             }
 742             if(is_sep(c)) {
 743                 result=LOC_TURKISH;
 744             }
 745         }
 746     } else if(is_l(c)) {
 747         /* lt or lit? */
 748         c=*locale++;
 749         if(is_i(c)) {
 750             c=*locale++;
 751         }
 752         if(is_t(c)) {
 753             c=*locale;
 754             if(is_sep(c)) {
 755                 result=LOC_LITHUANIAN;
 756             }
 757         }
 758     }
 759
 760     if(locCache!=NULL) {
 761         *locCache=result;
 762     }
 763     return result;
 764 }
 765
 766 /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
 767 static UBool
 768 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
 769     UChar32 c;
 770     uint16_t props;
 771
 772     if(iter==NULL) {
 773         return FALSE;
 774     }
 775
 776     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
 777         GET_PROPS(csp, c, props);
 778         if(GET_CASE_TYPE(props)!=UCASE_NONE) {
 779             return TRUE; /* followed by cased letter */
 780         } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
 781             /* case-ignorable, continue with the loop */
 782         } else {
 783             return FALSE; /* not ignorable */
 784         }
 785     }
 786
 787     return FALSE; /* not followed by cased letter */
 788 }
 789
 790 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
 791 static UBool
 792 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 793     UChar32 c;
 794     int32_t dotType;
 795     int8_t dir;
 796
 797     if(iter==NULL) {
 798         return FALSE;
 799     }
 800
 801     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 802         dotType=getDotType(csp, c);
 803         if(dotType==UCASE_SOFT_DOTTED) {
 804             return TRUE; /* preceded by TYPE_i */
 805         } else if(dotType!=UCASE_OTHER_ACCENT) {
 806             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
 807         }
 808     }
 809
 810     return FALSE; /* not preceded by TYPE_i */
 811 }
 812
 813 /*
 814  * See Jitterbug 2344:
 815  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 816  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 817  * we made those releases compatible with Unicode 3.2 which had not fixed
 818  * a related bug in SpecialCasing.txt.
 819  *
 820  * From the Jitterbug 2344 text:
 821  * ... this bug is listed as a Unicode erratum
 822  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 823  * <quote>
 824  * There are two errors in SpecialCasing.txt.
 825  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 826  * 2. An incorrect context definition. Correct as follows:
 827  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 828  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 829  * ---
 830  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 831  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 832  * where the context After_I is defined as:
 833  * The last preceding base character was an uppercase I, and there is no
 834  * intervening combining character class 230 (ABOVE).
 835  * </quote>
 836  *
 837  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 838  *
 839  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 840  * # This matches the behavior of the canonically equivalent I-dot_above
 841  *
 842  * See also the description in this place in older versions of uchar.c (revision 1.100).
 843  *
 844  * Markus W. Scherer 2003-feb-15
 845  */
 846
 847 /* Is preceded by base character 'I' with no intervening cc=230 ? */
 848 static UBool
 849 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 850     UChar32 c;
 851     int32_t dotType;
 852     int8_t dir;
 853
 854     if(iter==NULL) {
 855         return FALSE;
 856     }
 857
 858     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
 859         if(c==0x49) {
 860             return TRUE; /* preceded by I */
 861         }
 862         dotType=getDotType(csp, c);
 863         if(dotType!=UCASE_OTHER_ACCENT) {
 864             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
 865         }
 866     }
 867
 868     return FALSE; /* not preceded by I */
 869 }
 870
 871 /* Is followed by one or more cc==230 ? */
 872 static UBool
 873 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 874     UChar32 c;
 875     int32_t dotType;
 876     int8_t dir;
 877
 878     if(iter==NULL) {
 879         return FALSE;
 880     }
 881
 882     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
 883         dotType=getDotType(csp, c);
 884         if(dotType==UCASE_ABOVE) {
 885             return TRUE; /* at least one cc==230 following */
 886         } else if(dotType!=UCASE_OTHER_ACCENT) {
 887             return FALSE; /* next base character, no more cc==230 following */
 888         }
 889     }
 890
 891     return FALSE; /* no more cc==230 following */
 892 }
 893
 894 /* Is followed by a dot above (without cc==230 in between) ? */
     /*
      * Pulls characters from the context iterator, passing dir=1 on the first
      * call and dir=0 on every later call, until the iterator returns a
      * negative value.
      * Returns TRUE as soon as U+0307 is seen (checked before the dot type
      * is looked up).
      * Returns FALSE at the first other character whose getDotType() is not
      * UCASE_OTHER_ACCENT, when the iterator runs out, or when iter is NULL.
      */
 895 static UBool
 896 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
 897     UChar32 c;
 898     int32_t dotType;
 899     int8_t dir;
 900
 901     if(iter==NULL) {
 902         return FALSE;
 903     }
 904
 905     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
 906         if(c==0x307) {
 907             return TRUE;
 908         }
 909         dotType=getDotType(csp, c);
 910         if(dotType!=UCASE_OTHER_ACCENT) {
 911             return FALSE; /* next base character or cc==230 in between */
 912         }
 913     }
 914
 915     return FALSE; /* no dot above following */
 916 }
 917
 918 U_CAPI int32_t U_EXPORT2
 919 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
 920                   UCaseContextIterator *iter, void *context,
 921                   const UChar **pString,
 922                   const char *locale, int32_t *locCache) {
     /*
      * Full (possibly context- and locale-sensitive) lowercase mapping of c.
      *
      * Return value, as implemented below:
      * - ~c (bitwise complement) if the mapping result equals c itself
      * - the single mapped code point if it differs from c
      * - a string length with *pString set to the result string:
      *   2 or 3 for the hardcoded SpecialCasing mappings, or the nonzero
      *   lowercase length taken from the full-mappings exception slot
      * - 0 when COMBINING DOT ABOVE is removed (Turkic After_I)
      * NOTE(review): how callers tell a string length from a code point is
      * not visible in this function - presumably a maximum-length threshold
      * declared in ucase.h; confirm.
      *
      * iter/context are only handed to the context-test helpers;
      * isPrecededBy_I(), isFollowedByMoreAbove() and isFollowedByDotAbove()
      * return FALSE when iter is NULL.
      * locale/locCache are only handed to getCaseLocale().
      */
     /* result strings for the hardcoded conditional mappings */
 923     static const UChar
 924         iDot[2]=        { 0x69, 0x307 },
 925         jDot[2]=        { 0x6a, 0x307 },
     /* NOTE(review): declared with 3 elements but only 2 initializers; only length 2 is returned for it */
 926         iOgonekDot[3]= { 0x12f, 0x307 },
 927         iDotGrave[3]=   { 0x69, 0x307, 0x300 },
 928         iDotAcute[3]=   { 0x69, 0x307, 0x301 },
 929         iDotTilde[3]=   { 0x69, 0x307, 0x303 };
 930
 931     UChar32 result;
 932     uint16_t props;
 933
 934     result=c;
 935     GET_PROPS(csp, c, props);
 936     if(!PROPS_HAS_EXCEPTION(props)) {
     /* no exception data: add the signed delta for case types >=UCASE_UPPER */
 937         if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
 938             result=c+GET_SIGNED_DELTA(props);
 939         }
 940     } else {
 941         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
 942         uint16_t excWord=*pe++;
 943         int32_t full;
 944
     /*
      * keep a second pointer to the position just after excWord for the
      * simple-lowercase slot lookup at the end
      * NOTE(review): GET_SLOT_VALUE presumably advances its pointer argument,
      * which would explain the separate copy - confirm with the macro definition
      */
 945         pe2=pe;
 946
 947         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
 948             /* use hardcoded conditions and mappings */
 949             int32_t loc=getCaseLocale(locale, locCache);
 950
 951             /*
 952              * Test for conditional mappings first
 953              *   (otherwise the unconditional default mappings are always taken),
 954              * then test for characters that have unconditional mappings in SpecialCasing.txt,
 955              * then get the UnicodeData.txt mappings.
 956              */
 957             if( loc==LOC_LITHUANIAN &&
 958                     /* base characters, find accents above */
 959                     (((c==0x49 || c==0x4a || c==0x12e) &&
 960                         isFollowedByMoreAbove(csp, iter, context)) ||
 961                     /* precomposed with accent above, no need to find one */
 962                     (c==0xcc || c==0xcd || c==0x128))
 963             ) {
 964                 /*
 965                     # Lithuanian
 966
 967                     # Lithuanian retains the dot in a lowercase i when followed by accents.
 968
 969                     # Introduce an explicit dot above when lowercasing capital I's and J's
 970                     # whenever there are more accents above.
 971                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
 972
 973                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
 974                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
 975                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
 976                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
 977                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
 978                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
 979                  */
 980                 switch(c) {
 981                 case 0x49:  /* LATIN CAPITAL LETTER I */
 982                     *pString=iDot;
 983                     return 2;
 984                 case 0x4a:  /* LATIN CAPITAL LETTER J */
 985                     *pString=jDot;
 986                     return 2;
 987                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
 988                     *pString=iOgonekDot;
 989                     return 2;
 990                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
 991                     *pString=iDotGrave;
 992                     return 3;
 993                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
 994                     *pString=iDotAcute;
 995                     return 3;
 996                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
 997                     *pString=iDotTilde;
 998                     return 3;
 999                 default:
1000                     return 0; /* will not occur */
1001                 }
1002             /* # Turkish and Azeri */
1003             } else if(loc==LOC_TURKISH && c==0x130) {
1004                 /*
1005                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1006                     # The following rules handle those cases.
1007
1008                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1009                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1010                  */
1011                 return 0x69;
1012             } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
1013                 /*
1014                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1015                     # This matches the behavior of the canonically equivalent I-dot_above
1016
1017                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1018                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1019                  */
1020                 return 0; /* remove the dot (continue without output) */
1021             } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
1022                 /*
1023                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1024
1025                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1026                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1027                  */
1028                 return 0x131;
1029             } else if(c==0x130) {
1030                 /*
1031                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1032
1033                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1034                  */
     /* reached for U+0130 only when loc!=LOC_TURKISH: that case returned in an earlier branch of this chain */
1035                 *pString=iDot;
1036                 return 2;
1037             } else if(  c==0x3a3 &&
1038                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
1039                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
1040             ) {
1041                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1042                 /*
1043                     # Special case for final form of sigma
1044
1045                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1046                  */
1047                 return 0x3c2; /* greek small final sigma */
1048             } else {
1049                 /* no known conditional special case mapping, use a normal mapping */
1050             }
     /* this else-if means the full-mappings slot is not consulted when UCASE_EXC_CONDITIONAL_SPECIAL is set */
1051         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1052             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
     /* keep only the lowercase string length from the slot value */
1053             full&=UCASE_FULL_LOWER;
1054             if(full!=0) {
1055                 /* set the output pointer to the lowercase mapping */
1056                 *pString=pe+1;
1057
1058                 /* return the string length */
1059                 return full;
1060             }
1061         }
1062
     /* fall back to the simple lowercase slot, if present; otherwise result stays c */
1063         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1064             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1065         }
1066     }
1067
     /* an unchanged code point is reported as its bitwise complement */
1068     return (result==c) ? ~result : result;
1069 }
1070
1071 /* internal */
     /*
      * Shared implementation of ucase_toFullUpper() (upperNotTitle=TRUE) and
      * ucase_toFullTitle() (upperNotTitle=FALSE).
      *
      * Return value, as implemented below:
      * - ~c (bitwise complement) if there is no mapping or it equals c
      * - the single mapped code point if it differs from c
      * - a nonzero string length with *pString set to the result string
      *   for a full mapping from the exception data
      * - 0 when COMBINING DOT ABOVE is removed (Lithuanian After_Soft_Dotted)
      *
      * iter/context are only handed to isPrecededBySoftDotted();
      * locale/locCache are only handed to getCaseLocale().
      */
1072 static int32_t
1073 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
1074                UCaseContextIterator *iter, void *context,
1075                const UChar **pString,
1076                const char *locale, int32_t *locCache,
1077                UBool upperNotTitle) {
1078     UChar32 result;
1079     uint16_t props;
1080
1081     result=c;
1082     GET_PROPS(csp, c, props);
1083     if(!PROPS_HAS_EXCEPTION(props)) {
     /* no exception data: only UCASE_LOWER characters get the signed delta added */
1084         if(GET_CASE_TYPE(props)==UCASE_LOWER) {
1085             result=c+GET_SIGNED_DELTA(props);
1086         }
1087     } else {
1088         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1089         uint16_t excWord=*pe++;
1090         int32_t full, index;
1091
     /* second pointer to the position just after excWord, used for the simple-mapping slot lookup at the end */
1092         pe2=pe;
1093
1094         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1095             /* use hardcoded conditions and mappings */
1096             int32_t loc=getCaseLocale(locale, locCache);
1097
1098             if(loc==LOC_TURKISH && c==0x69) {
1099                 /*
1100                     # Turkish and Azeri
1101
1102                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1103                     # The following rules handle those cases.
1104
1105                     # When uppercasing, i turns into a dotted capital I
1106
1107                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1108                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1109                 */
1110                 return 0x130;
1111             } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
1112                 /*
1113                     # Lithuanian
1114
1115                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1116
1117                     # Remove DOT ABOVE after "i" with upper or titlecase
1118
1119                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1120                  */
1121                 return 0; /* remove the dot (continue without output) */
1122             } else {
1123                 /* no known conditional special case mapping, use a normal mapping */
1124             }
     /* this else-if means the full-mappings slot is not consulted when UCASE_EXC_CONDITIONAL_SPECIAL is set */
1125         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1126             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1127
     /*
      * As consumed below, full holds a sequence of string lengths:
      * lowercase (full&UCASE_FULL_LOWER), then case folding, uppercase and
      * titlecase in successive 4-bit fields, with the strings stored
      * back to back in the same order.
      * NOTE(review): UCASE_FULL_LOWER is assumed to be the 0xf nibble mask -
      * confirm in ucase.h.
      */
1128             /* start of full case mapping strings */
1129             ++pe;
1130
1131             /* skip the lowercase and case-folding result strings */
1132             pe+=full&UCASE_FULL_LOWER;
1133             full>>=4;
1134             pe+=full&0xf;
1135             full>>=4;
1136
1137             if(upperNotTitle) {
1138                 full&=0xf;
1139             } else {
1140                 /* skip the uppercase result string */
1141                 pe+=full&0xf;
1142                 full=(full>>4)&0xf;
1143             }
1144
1145             if(full!=0) {
1146                 /* set the output pointer to the result string */
1147                 *pString=pe;
1148
1149                 /* return the string length */
1150                 return full;
1151             }
1152         }
1153
     /* simple mapping: prefer the title slot for titlecasing, else the upper slot, else no mapping */
1154         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1155             index=UCASE_EXC_TITLE;
1156         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1157             /* here, titlecase is same as uppercase */
1158             index=UCASE_EXC_UPPER;
1159         } else {
1160             return ~c;
1161         }
1162         GET_SLOT_VALUE(excWord, index, pe2, result);
1163     }
1164
     /* an unchanged code point is reported as its bitwise complement */
1165     return (result==c) ? ~result : result;
1166 }
1167
1168 U_CAPI int32_t U_EXPORT2
1169 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1170                   UCaseContextIterator *iter, void *context,
1171                   const UChar **pString,
1172                   const char *locale, int32_t *locCache) {
     /* full uppercase mapping: forwards all arguments to toUpperOrTitle() with upperNotTitle=TRUE and returns its result */
1173     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1174 }
1175
1176 U_CAPI int32_t U_EXPORT2
1177 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1178                   UCaseContextIterator *iter, void *context,
1179                   const UChar **pString,
1180                   const char *locale, int32_t *locCache) {
     /* full titlecase mapping: forwards all arguments to toUpperOrTitle() with upperNotTitle=FALSE and returns its result */
1181     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1182 }
1183
1184 /* case folding ------------------------------------------------------------- */
1185
1186 /*
1187  * Case folding is similar to lowercasing.
1188  * The result may be a simple mapping, i.e., a single code point, or
1189  * a full mapping, i.e., a string.
1190  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1191  * then only the lowercase mapping is stored.
1192  *
1193  * Some special cases are hardcoded because their conditions cannot be
1194  * parsed and processed from CaseFolding.txt.
1195  *
1196  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1197
1198 # C: common case folding, common mappings shared by both simple and full mappings.
1199 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1200 # S: simple case folding, mappings to single characters where different from F.
1201 # T: special case for uppercase I and dotted uppercase I
1202 #    - For non-Turkic languages, this mapping is normally not used.
1203 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1204 #
1205 # Usage:
1206 #  A. To do a simple case folding, use the mappings with status C + S.
1207 #  B. To do a full case folding, use the mappings with status C + F.
1208 #
1209 #    The mappings with status T can be used or omitted depending on the desired case-folding
1210 #    behavior. (The default option is to exclude them.)
1211
1212  * Unicode 3.2 has 'T' mappings as follows:
1213
1214 0049; T; 0131; # LATIN CAPITAL LETTER I
1215 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1216
1217  * while the default mappings for these code points are:
1218
1219 0049; C; 0069; # LATIN CAPITAL LETTER I
1220 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1221
1222  * U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt).
1223  *
1224  * In case this code is used with CaseFolding.txt from an older version of Unicode
1225  * where CaseFolding.txt contains mappings with a status of 'I' that
1226  * have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
1227  * we must also hardcode the Unicode 3.2 mappings for the code points
1228  * with 'I' mappings.
1229  * Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
1230  * Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
1231  */
1232
1233 /* return the simple case folding mapping for c */
     /*
      * Always returns a single code point; c itself is returned when no
      * mapping applies (no bitwise-complement convention here, unlike the
      * ucase_toFull...() functions).
      * options selects between the default and the Turkic hardcoded mappings
      * for U+0049 and U+0130: (options&_FOLD_CASE_OPTIONS_MASK) equal to
      * U_FOLD_CASE_DEFAULT picks the default ones, anything else the Turkic ones.
      * NOTE(review): csp is not const-qualified here although the other
      * mapping functions in this file take a const UCaseProps *; the function
      * body does not write through it.
      */
1234 U_CAPI UChar32 U_EXPORT2
1235 ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options) {
1236     uint16_t props;
1237     GET_PROPS(csp, c, props);
1238     if(!PROPS_HAS_EXCEPTION(props)) {
     /* no exception data: add the signed delta for case types >=UCASE_UPPER */
1239         if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
1240             c+=GET_SIGNED_DELTA(props);
1241         }
1242     } else {
1243         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1244         uint16_t excWord=*pe++;
1245         int32_t index;
1246         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1247             /* special case folding mappings, hardcoded */
1248             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1249                 /* default mappings */
1250                 if(c==0x49) {
1251                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1252                     return 0x69;
1253                 } else if(c==0x130) {
1254                     /* no simple default mapping for U+0130, use UnicodeData.txt */
1255                     return 0x69;
1256                 }
1257             } else {
1258                 /* Turkic mappings */
1259                 if(c==0x49) {
1260                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1261                     return 0x131;
1262                 } else if(c==0x130) {
1263                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1264                     return 0x69;
1265                 }
1266             }
     /* any other code point with the conditional-fold flag falls through to the slot lookup below */
1267         }
     /* prefer the case folding slot, else the lowercase slot, else no mapping */
1268         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1269             index=UCASE_EXC_FOLD;
1270         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1271             index=UCASE_EXC_LOWER;
1272         } else {
1273             return c;
1274         }
1275         GET_SLOT_VALUE(excWord, index, pe, c);
1276     }
1277     return c;
1278 }
1279
1280 /*
1281  * Issue for canonical caseless match (UAX #21):
1282  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1283  * canonical equivalence, unlike default-option casefolding.
1284  * For example, I-grave and I + grave fold to strings that are not canonically
1285  * equivalent.
1286  * For more details, see the comment in unorm_compare() in unorm.cpp
1287  * and the intermediate prototype changes for Jitterbug 2021.
1288  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1289  *
1290  * This did not get fixed because it appears that it is not possible to fix
1291  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1292  * together in a way that they still fold to common result strings.
1293  */
1294
1295 U_CAPI int32_t U_EXPORT2
1296 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1297                     const UChar **pString,
1298                     uint32_t options) {
     /*
      * Full case folding mapping of c.
      *
      * Return value, as implemented below:
      * - ~c (bitwise complement) if there is no mapping or it equals c
      * - the single mapped code point if it differs from c
      * - a string length with *pString set to the result string:
      *   2 for the hardcoded default folding of U+0130, or the nonzero
      *   case folding length taken from the full-mappings exception slot
      *
      * options selects between the default and the Turkic hardcoded mappings
      * for U+0049 and U+0130: (options&_FOLD_CASE_OPTIONS_MASK) equal to
      * U_FOLD_CASE_DEFAULT picks the default ones, anything else the Turkic ones.
      */
     /* result string for the default full folding of U+0130 */
1299     static const UChar
1300         iDot[2]=        { 0x69, 0x307 };
1301
1302     UChar32 result;
1303     uint16_t props;
1304
1305     result=c;
1306     GET_PROPS(csp, c, props);
1307     if(!PROPS_HAS_EXCEPTION(props)) {
     /* no exception data: add the signed delta for case types >=UCASE_UPPER */
1308         if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
1309             result=c+GET_SIGNED_DELTA(props);
1310         }
1311     } else {
1312         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1313         uint16_t excWord=*pe++;
1314         int32_t full, index;
1315
     /* second pointer to the position just after excWord, used for the simple-mapping slot lookup at the end */
1316         pe2=pe;
1317
1318         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1319             /* use hardcoded conditions and mappings */
1320             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1321                 /* default mappings */
1322                 if(c==0x49) {
1323                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1324                     return 0x69;
1325                 } else if(c==0x130) {
1326                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1327                     *pString=iDot;
1328                     return 2;
1329                 }
1330             } else {
1331                 /* Turkic mappings */
1332                 if(c==0x49) {
1333                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1334                     return 0x131;
1335                 } else if(c==0x130) {
1336                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1337                     return 0x69;
1338                 }
1339             }
     /* this else-if means the full-mappings slot is not consulted when UCASE_EXC_CONDITIONAL_FOLD is set */
1340         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1341             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1342
1343             /* start of full case mapping strings */
1344             ++pe;
1345
1346             /* skip the lowercase result string */
1347             pe+=full&UCASE_FULL_LOWER;
     /* the next 4-bit field of the slot value is the case folding string length */
1348             full=(full>>4)&0xf;
1349
1350             if(full!=0) {
1351                 /* set the output pointer to the result string */
1352                 *pString=pe;
1353
1354                 /* return the string length */
1355                 return full;
1356             }
1357         }
1358
     /* simple mapping: prefer the case folding slot, else the lowercase slot, else no mapping */
1359         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1360             index=UCASE_EXC_FOLD;
1361         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1362             index=UCASE_EXC_LOWER;
1363         } else {
1364             return ~c;
1365         }
1366         GET_SLOT_VALUE(excWord, index, pe2, result);
1367     }
1368
     /* an unchanged code point is reported as its bitwise complement */
1369     return (result==c) ? ~result : result;
1370 }