icuSources/common/ustrcase.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ustrcase.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002feb20
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Implementation file for string casing C API functions.
  17 *   Uses functions from uchar.c for basic functionality that requires access
  18 *   to the Unicode Character Database (uprops.dat).
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/uloc.h"
  23 #include "unicode/ustring.h"
  24 #include "unicode/ucasemap.h"
  25 #include "unicode/ubrk.h"
  26 #include "cmemory.h"
  27 #include "ucase.h"
  28 #include "unormimp.h"
  29 #include "ustr_imp.h"
  30
  31 /* string casing ------------------------------------------------------------ */
  32
  33 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
  34 static U_INLINE int32_t
  35 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
  36              int32_t result, const UChar *s) {
  37     UChar32 c;
  38     int32_t length;
  39
  40     /* decode the result */
  41     if(result<0) {
  42         /* (not) original code point */
  43         c=~result;
  44         length=-1;
  45     } else if(result<=UCASE_MAX_STRING_LENGTH) {
  46         c=U_SENTINEL;
  47         length=result;
  48     } else {
  49         c=result;
  50         length=-1;
  51     }
  52
  53     if(destIndex<destCapacity) {
  54         /* append the result */
  55         if(length<0) {
  56             /* code point */
  57             UBool isError=FALSE;
  58             U16_APPEND(dest, destIndex, destCapacity, c, isError);
  59             if(isError) {
  60                 /* overflow, nothing written */
  61                 destIndex+=U16_LENGTH(c);
  62             }
  63         } else {
  64             /* string */
  65             if((destIndex+length)<=destCapacity) {
  66                 while(length>0) {
  67                     dest[destIndex++]=*s++;
  68                     --length;
  69                 }
  70             } else {
  71                 /* overflow */
  72                 destIndex+=length;
  73             }
  74         }
  75     } else {
  76         /* preflight */
  77         if(length<0) {
  78             destIndex+=U16_LENGTH(c);
  79         } else {
  80             destIndex+=length;
  81         }
  82     }
  83     return destIndex;
  84 }
  85
  86 static UChar32 U_CALLCONV
  87 utf16_caseContextIterator(void *context, int8_t dir) {
  88     UCaseContext *csc=(UCaseContext *)context;
  89     UChar32 c;
  90
  91     if(dir<0) {
  92         /* reset for backward iteration */
  93         csc->index=csc->cpStart;
  94         csc->dir=dir;
  95     } else if(dir>0) {
  96         /* reset for forward iteration */
  97         csc->index=csc->cpLimit;
  98         csc->dir=dir;
  99     } else {
 100         /* continue current iteration direction */
 101         dir=csc->dir;
 102     }
 103
 104     if(dir<0) {
 105         if(csc->start<csc->index) {
 106             U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
 107             return c;
 108         }
 109     } else {
 110         if(csc->index<csc->limit) {
 111             U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
 112             return c;
 113         }
 114     }
 115     return U_SENTINEL;
 116 }
 117
 118 /*
 119  * Case-maps [srcStart..srcLimit[ but takes
 120  * context [0..srcLength[ into account.
 121  */
 122 static int32_t
 123 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
 124          UChar *dest, int32_t destCapacity,
 125          const UChar *src, UCaseContext *csc,
 126          int32_t srcStart, int32_t srcLimit,
 127          UErrorCode *pErrorCode) {
 128     const UChar *s;
 129     UChar32 c, c2;
 130     int32_t srcIndex, destIndex;
 131     int32_t locCache;
 132
 133     locCache=csm->locCache;
 134
 135     /* case mapping loop */
 136     srcIndex=srcStart;
 137     destIndex=0;
 138     while(srcIndex<srcLimit) {
 139         csc->cpStart=srcIndex;
 140         U16_NEXT(src, srcIndex, srcLimit, c);
 141         csc->cpLimit=srcIndex;
 142         c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
 143         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
 144             /* fast path version of appendResult() for BMP results */
 145             dest[destIndex++]=(UChar)c2;
 146         } else {
 147             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 148         }
 149     }
 150
 151     if(destIndex>destCapacity) {
 152         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 153     }
 154     return destIndex;
 155 }
 156
 157 static void
 158 setTempCaseMapLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
 159     /*
 160      * We could call ucasemap_setLocale(), but here we really only care about
 161      * the initial language subtag, we need not return the real string via
 162      * ucasemap_getLocale(), and we don't care about only getting "x" from
 163      * "x-some-thing" etc.
 164      *
 165      * We ignore locales with a longer-than-3 initial subtag.
 166      *
 167      * We also do not fill in the locCache because it is rarely used,
 168      * and not worth setting unless we reuse it for many case mapping operations.
 169      * (That's why UCaseMap was created.)
 170      */
 171     int i;
 172     char c;
 173
 174     /* the internal functions require locale!=NULL */
 175     if(locale==NULL) {
 176         locale=uloc_getDefault();
 177     }
 178     for(i=0; i<4 && (c=locale[i])!=0 && c!='-' && c!='_'; ++i) {
 179         csm->locale[i]=c;
 180     }
 181     if(i<=3) {
 182         csm->locale[i]=0;  /* Up to 3 non-separator characters. */
 183     } else {
 184         csm->locale[0]=0;  /* Longer-than-3 initial subtag: Ignore. */
 185     }
 186 }
 187
 188 /*
 189  * Set parameters on an empty UCaseMap, for UCaseMap-less API functions.
 190  * Do this fast because it is called with every function call.
 191  */
 192 static U_INLINE void
 193 setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
 194     if(csm->csp==NULL) {
 195         csm->csp=ucase_getSingleton(pErrorCode);
 196         if(U_FAILURE(*pErrorCode)) {
 197             return;
 198         }
 199     }
 200     if(locale!=NULL && locale[0]==0) {
 201         csm->locale[0]=0;
 202     } else {
 203         setTempCaseMapLocale(csm, locale, pErrorCode);
 204     }
 205 }
 206
 207 #if !UCONFIG_NO_BREAK_ITERATION
 208
 209 /*
 210  * Internal titlecasing function.
 211  */
 212 static int32_t
 213 _toTitle(UCaseMap *csm,
 214          UChar *dest, int32_t destCapacity,
 215          const UChar *src, UCaseContext *csc,
 216          int32_t srcLength,
 217          UErrorCode *pErrorCode) {
 218     const UChar *s;
 219     UChar32 c;
 220     int32_t prev, titleStart, titleLimit, titleLimitSave, index, indexSave, destIndex, length;
 221     UBool isFirstIndex;
 222
 223     if(csm->iter!=NULL) {
 224         ubrk_setText(csm->iter, src, srcLength, pErrorCode);
 225     } else {
 226         csm->iter=ubrk_open(UBRK_WORD, csm->locale,
 227                             src, srcLength,
 228                             pErrorCode);
 229     }
 230     if(U_FAILURE(*pErrorCode)) {
 231         return 0;
 232     }
 233
 234     /* set up local variables */
 235     destIndex=0;
 236     prev=0;
 237     isFirstIndex=TRUE;
 238
 239     /* titlecasing loop */
 240     while(prev<srcLength) {
 241         /* find next index where to titlecase */
 242         if(isFirstIndex) {
 243             isFirstIndex=FALSE;
 244             index=ubrk_first(csm->iter);
 245         } else {
 246             index=ubrk_next(csm->iter);
 247         }
 248         if(index==UBRK_DONE || index>srcLength) {
 249             index=srcLength;
 250         }
 251
 252         /*
 253          * Unicode 4 & 5 section 3.13 Default Case Operations:
 254          *
 255          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
 256          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
 257          * cased character F. If F exists, map F to default_title(F); then map each
 258          * subsequent character C to default_lower(C).
 259          *
 260          * In this implementation, segment [prev..index[ into 3 parts:
 261          * a) uncased characters (copy as-is) [prev..titleStart[
 262          * b) first case letter (titlecase)         [titleStart..titleLimit[
 263          * c) subsequent characters (lowercase)                 [titleLimit..index[
 264          */
 265         if(prev<index) {
 266             /* find and copy uncased characters [prev..titleStart[ */
 267             titleStart=titleLimit=prev;
 268             U16_NEXT(src, titleLimit, index, c);
 269             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
 270                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
 271                 for(;;) {
 272                     titleStart=titleLimit;
 273                     if(titleLimit==index) {
 274                         /*
 275                          * only uncased characters in [prev..index[
 276                          * stop with titleStart==titleLimit==index
 277                          */
 278                         break;
 279                     }
 280                     U16_NEXT(src, titleLimit, index, c);
 281                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
 282                         break; /* cased letter at [titleStart..titleLimit[ */
 283                     }
 284                 }
 285                 length=titleStart-prev;
 286                 if(length>0) {
 287                     if((destIndex+length)<=destCapacity) {
 288                         uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
 289                     }
 290                     destIndex+=length;
 291                 }
 292             }
 293
 294             if(titleStart<titleLimit) {
 295                 /* titlecase c which is from [titleStart..titleLimit[ */
 296                 csc->cpStart=titleStart;
 297                 csc->cpLimit=titleLimit;
 298                 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
 299                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 300
 301                 /* Special case Dutch IJ titlecasing */
 302                 if ( titleStart+1 < index &&
 303                      ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
 304                      ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
 305                      ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) {
 306                             c=(UChar32) 0x004A;
 307                             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 308                             titleLimit++;
 309                 }
 310
 311                 /* lowercase [titleLimit..index[ */
 312                 if(titleLimit<index) {
 313                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
 314                         /* Normal operation: Lowercase the rest of the word. */
 315                         destIndex+=
 316                             _caseMap(
 317                                 csm, ucase_toFullLower,
 318                                 dest+destIndex, destCapacity-destIndex,
 319                                 src, csc,
 320                                 titleLimit, index,
 321                                 pErrorCode);
 322                     } else {
 323                         /* Optionally just copy the rest of the word unchanged. */
 324                         length=index-titleLimit;
 325                         if((destIndex+length)<=destCapacity) {
 326                             uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
 327                         }
 328                         destIndex+=length;
 329                     }
 330                 }
 331             }
 332         }
 333
 334         prev=index;
 335     }
 336
 337     if(destIndex>destCapacity) {
 338         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 339     }
 340     return destIndex;
 341 }
 342
 343 #endif
 344
 345 /* functions available in the common library (for unistr_case.cpp) */
 346
 347 U_CFUNC int32_t
 348 ustr_toLower(const UCaseProps *csp,
 349              UChar *dest, int32_t destCapacity,
 350              const UChar *src, int32_t srcLength,
 351              const char *locale,
 352              UErrorCode *pErrorCode) {
 353     UCaseMap csm={ NULL };
 354     UCaseContext csc={ NULL };
 355
 356     csm.csp=csp;
 357     setTempCaseMap(&csm, locale, pErrorCode);
 358     csc.p=(void *)src;
 359     csc.limit=srcLength;
 360
 361     return _caseMap(&csm, ucase_toFullLower,
 362                     dest, destCapacity,
 363                     src, &csc, 0, srcLength,
 364                     pErrorCode);
 365 }
 366
 367 U_CFUNC int32_t
 368 ustr_toUpper(const UCaseProps *csp,
 369              UChar *dest, int32_t destCapacity,
 370              const UChar *src, int32_t srcLength,
 371              const char *locale,
 372              UErrorCode *pErrorCode) {
 373     UCaseMap csm={ NULL };
 374     UCaseContext csc={ NULL };
 375
 376     csm.csp=csp;
 377     setTempCaseMap(&csm, locale, pErrorCode);
 378     csc.p=(void *)src;
 379     csc.limit=srcLength;
 380
 381     return _caseMap(&csm, ucase_toFullUpper,
 382                     dest, destCapacity,
 383                     src, &csc, 0, srcLength,
 384                     pErrorCode);
 385 }
 386
 387 #if !UCONFIG_NO_BREAK_ITERATION
 388
 389 U_CFUNC int32_t
 390 ustr_toTitle(const UCaseProps *csp,
 391              UChar *dest, int32_t destCapacity,
 392              const UChar *src, int32_t srcLength,
 393              UBreakIterator *titleIter,
 394              const char *locale, uint32_t options,
 395              UErrorCode *pErrorCode) {
 396     UCaseMap csm={ NULL };
 397     UCaseContext csc={ NULL };
 398     int32_t length;
 399
 400     csm.csp=csp;
 401     csm.iter=titleIter;
 402     csm.options=options;
 403     setTempCaseMap(&csm, locale, pErrorCode);
 404     csc.p=(void *)src;
 405     csc.limit=srcLength;
 406
 407     length=_toTitle(&csm,
 408                     dest, destCapacity,
 409                     src, &csc, srcLength,
 410                     pErrorCode);
 411     if(titleIter==NULL && csm.iter!=NULL) {
 412         ubrk_close(csm.iter);
 413     }
 414     return length;
 415 }
 416
 417 #endif
 418
 419 U_CFUNC int32_t
 420 ustr_foldCase(const UCaseProps *csp,
 421               UChar *dest, int32_t destCapacity,
 422               const UChar *src, int32_t srcLength,
 423               uint32_t options,
 424               UErrorCode *pErrorCode) {
 425     int32_t srcIndex, destIndex;
 426
 427     const UChar *s;
 428     UChar32 c, c2;
 429
 430     /* case mapping loop */
 431     srcIndex=destIndex=0;
 432     while(srcIndex<srcLength) {
 433         U16_NEXT(src, srcIndex, srcLength, c);
 434         c=ucase_toFullFolding(csp, c, &s, options);
 435         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
 436             /* fast path version of appendResult() for BMP results */
 437             dest[destIndex++]=(UChar)c2;
 438         } else {
 439             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 440         }
 441     }
 442
 443     if(destIndex>destCapacity) {
 444         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 445     }
 446     return destIndex;
 447 }
 448
 449 /*
 450  * Implement argument checking and buffer handling
 451  * for string case mapping as a common function.
 452  */
 453
 454 /* common internal function for public API functions */
 455
 456 static int32_t
 457 caseMap(const UCaseMap *csm,
 458         UChar *dest, int32_t destCapacity,
 459         const UChar *src, int32_t srcLength,
 460         int32_t toWhichCase,
 461         UErrorCode *pErrorCode) {
 462     UChar buffer[300];
 463     UChar *temp;
 464
 465     int32_t destLength;
 466
 467     /* check argument values */
 468     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
 469         return 0;
 470     }
 471     if( destCapacity<0 ||
 472         (dest==NULL && destCapacity>0) ||
 473         src==NULL ||
 474         srcLength<-1
 475     ) {
 476         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 477         return 0;
 478     }
 479
 480     /* get the string length */
 481     if(srcLength==-1) {
 482         srcLength=u_strlen(src);
 483     }
 484
 485     /* check for overlapping source and destination */
 486     if( dest!=NULL &&
 487         ((src>=dest && src<(dest+destCapacity)) ||
 488          (dest>=src && dest<(src+srcLength)))
 489     ) {
 490         /* overlap: provide a temporary destination buffer and later copy the result */
 491         if(destCapacity<=(sizeof(buffer)/U_SIZEOF_UCHAR)) {
 492             /* the stack buffer is large enough */
 493             temp=buffer;
 494         } else {
 495             /* allocate a buffer */
 496             temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
 497             if(temp==NULL) {
 498                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 499                 return 0;
 500             }
 501         }
 502     } else {
 503         temp=dest;
 504     }
 505
 506     destLength=0;
 507
 508     if(toWhichCase==FOLD_CASE) {
 509         destLength=ustr_foldCase(csm->csp, temp, destCapacity, src, srcLength,
 510                                  csm->options, pErrorCode);
 511     } else {
 512         UCaseContext csc={ NULL };
 513
 514         csc.p=(void *)src;
 515         csc.limit=srcLength;
 516
 517         if(toWhichCase==TO_LOWER) {
 518             destLength=_caseMap(csm, ucase_toFullLower,
 519                                 temp, destCapacity,
 520                                 src, &csc,
 521                                 0, srcLength,
 522                                 pErrorCode);
 523         } else if(toWhichCase==TO_UPPER) {
 524             destLength=_caseMap(csm, ucase_toFullUpper,
 525                                 temp, destCapacity,
 526                                 src, &csc,
 527                                 0, srcLength,
 528                                 pErrorCode);
 529         } else /* if(toWhichCase==TO_TITLE) */ {
 530 #if UCONFIG_NO_BREAK_ITERATION
 531             *pErrorCode=U_UNSUPPORTED_ERROR;
 532 #else
 533             /* UCaseMap is actually non-const in toTitle() APIs. */
 534             destLength=_toTitle((UCaseMap *)csm, temp, destCapacity,
 535                                 src, &csc, srcLength,
 536                                 pErrorCode);
 537 #endif
 538         }
 539     }
 540     if(temp!=dest) {
 541         /* copy the result string to the destination buffer */
 542         if(destLength>0) {
 543             int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
 544             if(copyLength>0) {
 545                 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR);
 546             }
 547         }
 548         if(temp!=buffer) {
 549             uprv_free(temp);
 550         }
 551     }
 552
 553     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
 554 }
 555
 556 /* public API functions */
 557
 558 U_CAPI int32_t U_EXPORT2
 559 u_strToLower(UChar *dest, int32_t destCapacity,
 560              const UChar *src, int32_t srcLength,
 561              const char *locale,
 562              UErrorCode *pErrorCode) {
 563     UCaseMap csm={ NULL };
 564     setTempCaseMap(&csm, locale, pErrorCode);
 565     return caseMap(&csm,
 566                    dest, destCapacity,
 567                    src, srcLength,
 568                    TO_LOWER, pErrorCode);
 569 }
 570
 571 U_CAPI int32_t U_EXPORT2
 572 u_strToUpper(UChar *dest, int32_t destCapacity,
 573              const UChar *src, int32_t srcLength,
 574              const char *locale,
 575              UErrorCode *pErrorCode) {
 576     UCaseMap csm={ NULL };
 577     setTempCaseMap(&csm, locale, pErrorCode);
 578     return caseMap(&csm,
 579                    dest, destCapacity,
 580                    src, srcLength,
 581                    TO_UPPER, pErrorCode);
 582 }
 583
 584 #if !UCONFIG_NO_BREAK_ITERATION
 585
 586 U_CAPI int32_t U_EXPORT2
 587 u_strToTitle(UChar *dest, int32_t destCapacity,
 588              const UChar *src, int32_t srcLength,
 589              UBreakIterator *titleIter,
 590              const char *locale,
 591              UErrorCode *pErrorCode) {
 592     UCaseMap csm={ NULL };
 593     int32_t length;
 594
 595     csm.iter=titleIter;
 596     setTempCaseMap(&csm, locale, pErrorCode);
 597     length=caseMap(&csm,
 598                    dest, destCapacity,
 599                    src, srcLength,
 600                    TO_TITLE, pErrorCode);
 601     if(titleIter==NULL && csm.iter!=NULL) {
 602         ubrk_close(csm.iter);
 603     }
 604     return length;
 605 }
 606
 607 U_CAPI int32_t U_EXPORT2
 608 ucasemap_toTitle(UCaseMap *csm,
 609                  UChar *dest, int32_t destCapacity,
 610                  const UChar *src, int32_t srcLength,
 611                  UErrorCode *pErrorCode) {
 612     return caseMap(csm,
 613                    dest, destCapacity,
 614                    src, srcLength,
 615                    TO_TITLE, pErrorCode);
 616 }
 617
 618 #endif
 619
 620 U_CAPI int32_t U_EXPORT2
 621 u_strFoldCase(UChar *dest, int32_t destCapacity,
 622               const UChar *src, int32_t srcLength,
 623               uint32_t options,
 624               UErrorCode *pErrorCode) {
 625     UCaseMap csm={ NULL };
 626     csm.csp=ucase_getSingleton(pErrorCode);
 627     csm.options=options;
 628     return caseMap(&csm,
 629                    dest, destCapacity,
 630                    src, srcLength,
 631                    FOLD_CASE, pErrorCode);
 632 }
 633
 634 /* case-insensitive string comparisons -------------------------------------- */
 635
 636 /*
 637  * This function is a copy of unorm_cmpEquivFold() minus the parts for
 638  * canonical equivalence.
 639  * Keep the functions in sync, and see there for how this works.
 640  * The duplication is for modularization:
 641  * It makes caseless (but not canonical caseless) matches independent of
 642  * the normalization code.
 643  */
 644
 645 /* stack element for previous-level source/decomposition pointers */
 646 struct CmpEquivLevel {
 647     const UChar *start, *s, *limit;
 648 };
 649 typedef struct CmpEquivLevel CmpEquivLevel;
 650
 651 /* internal function */
 652 U_CFUNC int32_t
 653 u_strcmpFold(const UChar *s1, int32_t length1,
 654              const UChar *s2, int32_t length2,
 655              uint32_t options,
 656              UErrorCode *pErrorCode) {
 657     const UCaseProps *csp;
 658
 659     /* current-level start/limit - s1/s2 as current */
 660     const UChar *start1, *start2, *limit1, *limit2;
 661
 662     /* case folding variables */
 663     const UChar *p;
 664     int32_t length;
 665
 666     /* stacks of previous-level start/current/limit */
 667     CmpEquivLevel stack1[2], stack2[2];
 668
 669     /* case folding buffers, only use current-level start/limit */
 670     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
 671
 672     /* track which is the current level per string */
 673     int32_t level1, level2;
 674
 675     /* current code units, and code points for lookups */
 676     UChar32 c1, c2, cp1, cp2;
 677
 678     /* no argument error checking because this itself is not an API */
 679
 680     /*
 681      * assume that at least the option U_COMPARE_IGNORE_CASE is set
 682      * otherwise this function would have to behave exactly as uprv_strCompare()
 683      */
 684     csp=ucase_getSingleton(pErrorCode);
 685     if(U_FAILURE(*pErrorCode)) {
 686         return 0;
 687     }
 688
 689     /* initialize */
 690     start1=s1;
 691     if(length1==-1) {
 692         limit1=NULL;
 693     } else {
 694         limit1=s1+length1;
 695     }
 696
 697     start2=s2;
 698     if(length2==-1) {
 699         limit2=NULL;
 700     } else {
 701         limit2=s2+length2;
 702     }
 703
 704     level1=level2=0;
 705     c1=c2=-1;
 706
 707     /* comparison loop */
 708     for(;;) {
 709         /*
 710          * here a code unit value of -1 means "get another code unit"
 711          * below it will mean "this source is finished"
 712          */
 713
 714         if(c1<0) {
 715             /* get next code unit from string 1, post-increment */
 716             for(;;) {
 717                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
 718                     if(level1==0) {
 719                         c1=-1;
 720                         break;
 721                     }
 722                 } else {
 723                     ++s1;
 724                     break;
 725                 }
 726
 727                 /* reached end of level buffer, pop one level */
 728                 do {
 729                     --level1;
 730                     start1=stack1[level1].start;
 731                 } while(start1==NULL);
 732                 s1=stack1[level1].s;
 733                 limit1=stack1[level1].limit;
 734             }
 735         }
 736
 737         if(c2<0) {
 738             /* get next code unit from string 2, post-increment */
 739             for(;;) {
 740                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
 741                     if(level2==0) {
 742                         c2=-1;
 743                         break;
 744                     }
 745                 } else {
 746                     ++s2;
 747                     break;
 748                 }
 749
 750                 /* reached end of level buffer, pop one level */
 751                 do {
 752                     --level2;
 753                     start2=stack2[level2].start;
 754                 } while(start2==NULL);
 755                 s2=stack2[level2].s;
 756                 limit2=stack2[level2].limit;
 757             }
 758         }
 759
 760         /*
 761          * compare c1 and c2
 762          * either variable c1, c2 is -1 only if the corresponding string is finished
 763          */
 764         if(c1==c2) {
 765             if(c1<0) {
 766                 return 0;   /* c1==c2==-1 indicating end of strings */
 767             }
 768             c1=c2=-1;       /* make us fetch new code units */
 769             continue;
 770         } else if(c1<0) {
 771             return -1;      /* string 1 ends before string 2 */
 772         } else if(c2<0) {
 773             return 1;       /* string 2 ends before string 1 */
 774         }
 775         /* c1!=c2 && c1>=0 && c2>=0 */
 776
 777         /* get complete code points for c1, c2 for lookups if either is a surrogate */
 778         cp1=c1;
 779         if(U_IS_SURROGATE(c1)) {
 780             UChar c;
 781
 782             if(U_IS_SURROGATE_LEAD(c1)) {
 783                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
 784                     /* advance ++s1; only below if cp1 decomposes/case-folds */
 785                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
 786                 }
 787             } else /* isTrail(c1) */ {
 788                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
 789                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
 790                 }
 791             }
 792         }
 793
 794         cp2=c2;
 795         if(U_IS_SURROGATE(c2)) {
 796             UChar c;
 797
 798             if(U_IS_SURROGATE_LEAD(c2)) {
 799                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
 800                     /* advance ++s2; only below if cp2 decomposes/case-folds */
 801                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
 802                 }
 803             } else /* isTrail(c2) */ {
 804                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
 805                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
 806                 }
 807             }
 808         }
 809
 810         /*
 811          * go down one level for each string
 812          * continue with the main loop as soon as there is a real change
 813          */
 814
 815         if( level1==0 &&
 816             (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
 817         ) {
 818             /* cp1 case-folds to the code point "length" or to p[length] */
 819             if(U_IS_SURROGATE(c1)) {
 820                 if(U_IS_SURROGATE_LEAD(c1)) {
 821                     /* advance beyond source surrogate pair if it case-folds */
 822                     ++s1;
 823                 } else /* isTrail(c1) */ {
 824                     /*
 825                      * we got a supplementary code point when hitting its trail surrogate,
 826                      * therefore the lead surrogate must have been the same as in the other string;
 827                      * compare this decomposition with the lead surrogate in the other string
 828                      * remember that this simulates bulk text replacement:
 829                      * the decomposition would replace the entire code point
 830                      */
 831                     --s2;
 832                     c2=*(s2-1);
 833                 }
 834             }
 835
 836             /* push current level pointers */
 837             stack1[0].start=start1;
 838             stack1[0].s=s1;
 839             stack1[0].limit=limit1;
 840             ++level1;
 841
 842             /* copy the folding result to fold1[] */
 843             if(length<=UCASE_MAX_STRING_LENGTH) {
 844                 u_memcpy(fold1, p, length);
 845             } else {
 846                 int32_t i=0;
 847                 U16_APPEND_UNSAFE(fold1, i, length);
 848                 length=i;
 849             }
 850
 851             /* set next level pointers to case folding */
 852             start1=s1=fold1;
 853             limit1=fold1+length;
 854
 855             /* get ready to read from decomposition, continue with loop */
 856             c1=-1;
 857             continue;
 858         }
 859
 860         if( level2==0 &&
 861             (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
 862         ) {
 863             /* cp2 case-folds to the code point "length" or to p[length] */
 864             if(U_IS_SURROGATE(c2)) {
 865                 if(U_IS_SURROGATE_LEAD(c2)) {
 866                     /* advance beyond source surrogate pair if it case-folds */
 867                     ++s2;
 868                 } else /* isTrail(c2) */ {
 869                     /*
 870                      * we got a supplementary code point when hitting its trail surrogate,
 871                      * therefore the lead surrogate must have been the same as in the other string;
 872                      * compare this decomposition with the lead surrogate in the other string
 873                      * remember that this simulates bulk text replacement:
 874                      * the decomposition would replace the entire code point
 875                      */
 876                     --s1;
 877                     c1=*(s1-1);
 878                 }
 879             }
 880
 881             /* push current level pointers */
 882             stack2[0].start=start2;
 883             stack2[0].s=s2;
 884             stack2[0].limit=limit2;
 885             ++level2;
 886
 887             /* copy the folding result to fold2[] */
 888             if(length<=UCASE_MAX_STRING_LENGTH) {
 889                 u_memcpy(fold2, p, length);
 890             } else {
 891                 int32_t i=0;
 892                 U16_APPEND_UNSAFE(fold2, i, length);
 893                 length=i;
 894             }
 895
 896             /* set next level pointers to case folding */
 897             start2=s2=fold2;
 898             limit2=fold2+length;
 899
 900             /* get ready to read from decomposition, continue with loop */
 901             c2=-1;
 902             continue;
 903         }
 904
 905         /*
 906          * no decomposition/case folding, max level for both sides:
 907          * return difference result
 908          *
 909          * code point order comparison must not just return cp1-cp2
 910          * because when single surrogates are present then the surrogate pairs
 911          * that formed cp1 and cp2 may be from different string indexes
 912          *
 913          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
 914          * c1=d800 cp1=10001 c2=dc00 cp2=10000
 915          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
 916          *
 917          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
 919          * so we have slightly different pointer/start/limit comparisons here
 920          */
 921
 922         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
 923             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
 924             if(
 925                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
 926                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
 927             ) {
 928                 /* part of a surrogate pair, leave >=d800 */
 929             } else {
 930                 /* BMP code point - may be surrogate code point - make <d800 */
 931                 c1-=0x2800;
 932             }
 933
 934             if(
 935                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
 936                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
 937             ) {
 938                 /* part of a surrogate pair, leave >=d800 */
 939             } else {
 940                 /* BMP code point - may be surrogate code point - make <d800 */
 941                 c2-=0x2800;
 942             }
 943         }
 944
 945         return c1-c2;
 946     }
 947 }
 948
 949 /* public API functions */
 950
 951 U_CAPI int32_t U_EXPORT2
 952 u_strCaseCompare(const UChar *s1, int32_t length1,
 953                  const UChar *s2, int32_t length2,
 954                  uint32_t options,
 955                  UErrorCode *pErrorCode) {
 956     /* argument checking */
 957     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
 958         return 0;
 959     }
 960     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
 961         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 962         return 0;
 963     }
 964     return u_strcmpFold(s1, length1, s2, length2,
 965                         options|U_COMPARE_IGNORE_CASE,
 966                         pErrorCode);
 967 }
 968
 969 U_CAPI int32_t U_EXPORT2
 970 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
 971     UErrorCode errorCode=U_ZERO_ERROR;
 972     return u_strcmpFold(s1, -1, s2, -1,
 973                         options|U_COMPARE_IGNORE_CASE,
 974                         &errorCode);
 975 }
 976
 977 U_CAPI int32_t U_EXPORT2
 978 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
 979     UErrorCode errorCode=U_ZERO_ERROR;
 980     return u_strcmpFold(s1, length, s2, length,
 981                         options|U_COMPARE_IGNORE_CASE,
 982                         &errorCode);
 983 }
 984
 985 U_CAPI int32_t U_EXPORT2
 986 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
 987     UErrorCode errorCode=U_ZERO_ERROR;
 988     return u_strcmpFold(s1, n, s2, n,
 989                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
 990                         &errorCode);
 991 }