icuSources/common/ucasemap.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2005-2016, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucasemap.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2005may06
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Case mapping service object and functions using it.
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "unicode/brkiter.h"
  21 #include "unicode/ubrk.h"
  22 #include "unicode/uloc.h"
  23 #include "unicode/ustring.h"
  24 #include "unicode/ucasemap.h"
  25 #if !UCONFIG_NO_BREAK_ITERATION
  26 #include "unicode/utext.h"
  27 #endif
  28 #include "unicode/utf.h"
  29 #include "unicode/utf8.h"
  30 #include "unicode/utf16.h"
  31 #include "cmemory.h"
  32 #include "cstring.h"
  33 #include "ucase.h"
  34 #include "ustr_imp.h"
  35
  36 U_NAMESPACE_USE
  37
  38 /* UCaseMap service object -------------------------------------------------- */
  39
  40 U_CAPI UCaseMap * U_EXPORT2
  41 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
  42     UCaseMap *csm;
  43
  44     if(U_FAILURE(*pErrorCode)) {
  45         return NULL;
  46     }
  47
  48     csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap));
  49     if(csm==NULL) {
  50         return NULL;
  51     }
  52     uprv_memset(csm, 0, sizeof(UCaseMap));
  53
  54     csm->csp=ucase_getSingleton();
  55     ucasemap_setLocale(csm, locale, pErrorCode);
  56     if(U_FAILURE(*pErrorCode)) {
  57         uprv_free(csm);
  58         return NULL;
  59     }
  60
  61     csm->options=options;
  62     return csm;
  63 }
  64
  65 U_CAPI void U_EXPORT2
  66 ucasemap_close(UCaseMap *csm) {
  67     if(csm!=NULL) {
  68 #if !UCONFIG_NO_BREAK_ITERATION
  69         // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
  70         delete reinterpret_cast<BreakIterator *>(csm->iter);
  71 #endif
  72         uprv_free(csm);
  73     }
  74 }
  75
  76 U_CAPI const char * U_EXPORT2
  77 ucasemap_getLocale(const UCaseMap *csm) {
  78     return csm->locale;
  79 }
  80
  81 U_CAPI uint32_t U_EXPORT2
  82 ucasemap_getOptions(const UCaseMap *csm) {
  83     return csm->options;
  84 }
  85
  86 U_CAPI void U_EXPORT2
  87 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
  88     int32_t length;
  89
  90     if(U_FAILURE(*pErrorCode)) {
  91         return;
  92     }
  93
  94     length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
  95     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
  96         *pErrorCode=U_ZERO_ERROR;
  97         /* we only really need the language code for case mappings */
  98         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
  99     }
 100     if(length==sizeof(csm->locale)) {
 101         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 102     }
 103     csm->locCache=0;
 104     if(U_SUCCESS(*pErrorCode)) {
 105         ucase_getCaseLocale(csm->locale, &csm->locCache);
 106     } else {
 107         csm->locale[0]=0;
 108     }
 109 }
 110
 111 U_CAPI void U_EXPORT2
 112 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode * /*pErrorCode*/) {
 113     csm->options=options;
 114 }
 115
 116 /* UTF-8 string case mappings ----------------------------------------------- */
 117
 118 /* TODO(markus): Move to a new, separate utf8case.c file. */
 119
 120 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
 121 static inline int32_t
 122 appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
 123              int32_t result, const UChar *s) {
 124     UChar32 c;
 125     int32_t length, destLength;
 126     UErrorCode errorCode;
 127
 128     /* decode the result */
 129     if(result<0) {
 130         /* (not) original code point */
 131         c=~result;
 132         length=-1;
 133     } else if(result<=UCASE_MAX_STRING_LENGTH) {
 134         c=U_SENTINEL;
 135         length=result;
 136     } else {
 137         c=result;
 138         length=-1;
 139     }
 140
 141     if(destIndex<destCapacity) {
 142         /* append the result */
 143         if(length<0) {
 144             /* code point */
 145             UBool isError=FALSE;
 146             U8_APPEND(dest, destIndex, destCapacity, c, isError);
 147             if(isError) {
 148                 /* overflow, nothing written */
 149                 destIndex+=U8_LENGTH(c);
 150             }
 151         } else {
 152             /* string */
 153             errorCode=U_ZERO_ERROR;
 154             u_strToUTF8(
 155                 (char *)(dest+destIndex), destCapacity-destIndex, &destLength,
 156                 s, length,
 157                 &errorCode);
 158             destIndex+=destLength;
 159             /* we might have an overflow, but we know the actual length */
 160         }
 161     } else {
 162         /* preflight */
 163         if(length<0) {
 164             destIndex+=U8_LENGTH(c);
 165         } else {
 166             errorCode=U_ZERO_ERROR;
 167             u_strToUTF8(
 168                 NULL, 0, &destLength,
 169                 s, length,
 170                 &errorCode);
 171             destIndex+=destLength;
 172         }
 173     }
 174     return destIndex;
 175 }
 176
 177 static UChar32 U_CALLCONV
 178 utf8_caseContextIterator(void *context, int8_t dir) {
 179     UCaseContext *csc=(UCaseContext *)context;
 180     UChar32 c;
 181
 182     if(dir<0) {
 183         /* reset for backward iteration */
 184         csc->index=csc->cpStart;
 185         csc->dir=dir;
 186     } else if(dir>0) {
 187         /* reset for forward iteration */
 188         csc->index=csc->cpLimit;
 189         csc->dir=dir;
 190     } else {
 191         /* continue current iteration direction */
 192         dir=csc->dir;
 193     }
 194
 195     if(dir<0) {
 196         if(csc->start<csc->index) {
 197             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
 198             return c;
 199         }
 200     } else {
 201         if(csc->index<csc->limit) {
 202             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
 203             return c;
 204         }
 205     }
 206     return U_SENTINEL;
 207 }
 208
 209 /*
 210  * Case-maps [srcStart..srcLimit[ but takes
 211  * context [0..srcLength[ into account.
 212  */
 213 static int32_t
 214 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
 215          uint8_t *dest, int32_t destCapacity,
 216          const uint8_t *src, UCaseContext *csc,
 217          int32_t srcStart, int32_t srcLimit,
 218          UErrorCode *pErrorCode) {
 219     const UChar *s = NULL;
 220     UChar32 c, c2 = 0;
 221     int32_t srcIndex, destIndex;
 222     int32_t locCache;
 223
 224     locCache=csm->locCache;
 225
 226     /* case mapping loop */
 227     srcIndex=srcStart;
 228     destIndex=0;
 229     while(srcIndex<srcLimit) {
 230         csc->cpStart=srcIndex;
 231         U8_NEXT(src, srcIndex, srcLimit, c);
 232         csc->cpLimit=srcIndex;
 233         if(c<0) {
 234             int32_t i=csc->cpStart;
 235             while(destIndex<destCapacity && i<srcIndex) {
 236                 dest[destIndex++]=src[i++];
 237             }
 238             continue;
 239         }
 240         c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
 241         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
 242             /* fast path version of appendResult() for ASCII results */
 243             dest[destIndex++]=(uint8_t)c2;
 244         } else {
 245             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 246         }
 247     }
 248
 249     if(destIndex>destCapacity) {
 250         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 251     }
 252     return destIndex;
 253 }
 254
 255 #if !UCONFIG_NO_BREAK_ITERATION
 256
 257 U_CFUNC int32_t U_CALLCONV
 258 ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
 259          uint8_t *dest, int32_t destCapacity,
 260          const uint8_t *src, int32_t srcLength,
 261          UErrorCode *pErrorCode) {
 262     const UChar *s;
 263     UChar32 c;
 264     int32_t prev, titleStart, titleLimit, idx, destIndex, length;
 265     UBool isFirstIndex;
 266
 267     if(U_FAILURE(*pErrorCode)) {
 268         return 0;
 269     }
 270
 271     // Use the C++ abstract base class to minimize dependencies.
 272     // TODO: Change UCaseMap.iter to store a BreakIterator directly.
 273     BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
 274
 275     /* set up local variables */
 276     int32_t locCache=csm->locCache;
 277     UCaseContext csc=UCASECONTEXT_INITIALIZER;
 278     csc.p=(void *)src;
 279     csc.limit=srcLength;
 280     destIndex=0;
 281     prev=0;
 282     isFirstIndex=TRUE;
 283
 284     /* titlecasing loop */
 285     while(prev<srcLength) {
 286         /* find next index where to titlecase */
 287         if(isFirstIndex) {
 288             isFirstIndex=FALSE;
 289             idx=bi->first();
 290         } else {
 291             idx=bi->next();
 292         }
 293         if(idx==UBRK_DONE || idx>srcLength) {
 294             idx=srcLength;
 295         }
 296
 297         /*
 298          * Unicode 4 & 5 section 3.13 Default Case Operations:
 299          *
 300          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
 301          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
 302          * cased character F. If F exists, map F to default_title(F); then map each
 303          * subsequent character C to default_lower(C).
 304          *
 305          * In this implementation, segment [prev..index[ into 3 parts:
 306          * a) uncased characters (copy as-is) [prev..titleStart[
 307          * b) first case letter (titlecase)         [titleStart..titleLimit[
 308          * c) subsequent characters (lowercase)                 [titleLimit..index[
 309          */
 310         if(prev<idx) {
 311             /* find and copy uncased characters [prev..titleStart[ */
 312             titleStart=titleLimit=prev;
 313             U8_NEXT(src, titleLimit, idx, c);
 314             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
 315                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
 316                 for(;;) {
 317                     titleStart=titleLimit;
 318                     if(titleLimit==idx) {
 319                         /*
 320                          * only uncased characters in [prev..index[
 321                          * stop with titleStart==titleLimit==index
 322                          */
 323                         break;
 324                     }
 325                     U8_NEXT(src, titleLimit, idx, c);
 326                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
 327                         break; /* cased letter at [titleStart..titleLimit[ */
 328                     }
 329                 }
 330                 length=titleStart-prev;
 331                 if(length>0) {
 332                     if((destIndex+length)<=destCapacity) {
 333                         uprv_memcpy(dest+destIndex, src+prev, length);
 334                     }
 335                     destIndex+=length;
 336                 }
 337             }
 338
 339             if(titleStart<titleLimit) {
 340                 /* titlecase c which is from [titleStart..titleLimit[ */
 341                 csc.cpStart=titleStart;
 342                 csc.cpLimit=titleLimit;
 343                 c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
 344                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 345
 346                 /* Special case Dutch IJ titlecasing */
 347                 if ( titleStart+1 < idx &&
 348                      ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH &&
 349                      ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) &&
 350                      ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) {
 351                             c=0x004A;
 352                             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 353                             titleLimit++;
 354                 }
 355                 /* lowercase [titleLimit..index[ */
 356                 if(titleLimit<idx) {
 357                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
 358                         /* Normal operation: Lowercase the rest of the word. */
 359                         destIndex+=
 360                             _caseMap(
 361                                 csm, ucase_toFullLower,
 362                                 dest+destIndex, destCapacity-destIndex,
 363                                 src, &csc,
 364                                 titleLimit, idx,
 365                                 pErrorCode);
 366                     } else {
 367                         /* Optionally just copy the rest of the word unchanged. */
 368                         length=idx-titleLimit;
 369                         if((destIndex+length)<=destCapacity) {
 370                             uprv_memcpy(dest+destIndex, src+titleLimit, length);
 371                         }
 372                         destIndex+=length;
 373                     }
 374                 }
 375             }
 376         }
 377
 378         prev=idx;
 379     }
 380
 381     if(destIndex>destCapacity) {
 382         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 383     }
 384     return destIndex;
 385 }
 386
 387 #endif
 388
 389 static int32_t U_CALLCONV
 390 ucasemap_internalUTF8ToLower(const UCaseMap *csm,
 391                              uint8_t *dest, int32_t destCapacity,
 392                              const uint8_t *src, int32_t srcLength,
 393                              UErrorCode *pErrorCode) {
 394     UCaseContext csc=UCASECONTEXT_INITIALIZER;
 395     csc.p=(void *)src;
 396     csc.limit=srcLength;
 397     return _caseMap(
 398         csm, ucase_toFullLower,
 399         dest, destCapacity,
 400         src, &csc, 0, srcLength,
 401         pErrorCode);
 402 }
 403
 404 static int32_t U_CALLCONV
 405 ucasemap_internalUTF8ToUpper(const UCaseMap *csm,
 406                              uint8_t *dest, int32_t destCapacity,
 407                              const uint8_t *src, int32_t srcLength,
 408                              UErrorCode *pErrorCode) {
 409     UCaseContext csc=UCASECONTEXT_INITIALIZER;
 410     csc.p=(void *)src;
 411     csc.limit=srcLength;
 412     return _caseMap(
 413         csm, ucase_toFullUpper,
 414         dest, destCapacity,
 415         src, &csc, 0, srcLength,
 416         pErrorCode);
 417 }
 418
 419 static int32_t
 420 utf8_foldCase(const UCaseProps *csp,
 421               uint8_t *dest, int32_t destCapacity,
 422               const uint8_t *src, int32_t srcLength,
 423               uint32_t options,
 424               UErrorCode *pErrorCode) {
 425     int32_t srcIndex, destIndex;
 426
 427     const UChar *s;
 428     UChar32 c, c2;
 429     int32_t start;
 430
 431     /* case mapping loop */
 432     srcIndex=destIndex=0;
 433     while(srcIndex<srcLength) {
 434         start=srcIndex;
 435         U8_NEXT(src, srcIndex, srcLength, c);
 436         if(c<0) {
 437             while(destIndex<destCapacity && start<srcIndex) {
 438                 dest[destIndex++]=src[start++];
 439             }
 440             continue;
 441         }
 442         c=ucase_toFullFolding(csp, c, &s, options);
 443         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
 444             /* fast path version of appendResult() for ASCII results */
 445             dest[destIndex++]=(uint8_t)c2;
 446         } else {
 447             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 448         }
 449     }
 450
 451     if(destIndex>destCapacity) {
 452         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 453     }
 454     return destIndex;
 455 }
 456
 457 static int32_t U_CALLCONV
 458 ucasemap_internalUTF8Fold(const UCaseMap *csm,
 459                           uint8_t *dest, int32_t destCapacity,
 460                           const uint8_t *src, int32_t srcLength,
 461                           UErrorCode *pErrorCode) {
 462     return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
 463 }
 464
 465 U_CFUNC int32_t
 466 ucasemap_mapUTF8(const UCaseMap *csm,
 467                  uint8_t *dest, int32_t destCapacity,
 468                  const uint8_t *src, int32_t srcLength,
 469                  UTF8CaseMapper *stringCaseMapper,
 470                  UErrorCode *pErrorCode) {
 471     int32_t destLength;
 472
 473     /* check argument values */
 474     if(U_FAILURE(*pErrorCode)) {
 475         return 0;
 476     }
 477     if( destCapacity<0 ||
 478         (dest==NULL && destCapacity>0) ||
 479         src==NULL ||
 480         srcLength<-1
 481     ) {
 482         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 483         return 0;
 484     }
 485
 486     /* get the string length */
 487     if(srcLength==-1) {
 488         srcLength=(int32_t)uprv_strlen((const char *)src);
 489     }
 490
 491     /* check for overlapping source and destination */
 492     if( dest!=NULL &&
 493         ((src>=dest && src<(dest+destCapacity)) ||
 494          (dest>=src && dest<(src+srcLength)))
 495     ) {
 496         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 497         return 0;
 498     }
 499
 500     destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode);
 501     return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
 502 }
 503
 504 /* public API functions */
 505
 506 U_CAPI int32_t U_EXPORT2
 507 ucasemap_utf8ToLower(const UCaseMap *csm,
 508                      char *dest, int32_t destCapacity,
 509                      const char *src, int32_t srcLength,
 510                      UErrorCode *pErrorCode) {
 511     return ucasemap_mapUTF8(csm,
 512                    (uint8_t *)dest, destCapacity,
 513                    (const uint8_t *)src, srcLength,
 514                    ucasemap_internalUTF8ToLower, pErrorCode);
 515 }
 516
 517 U_CAPI int32_t U_EXPORT2
 518 ucasemap_utf8ToUpper(const UCaseMap *csm,
 519                      char *dest, int32_t destCapacity,
 520                      const char *src, int32_t srcLength,
 521                      UErrorCode *pErrorCode) {
 522     return ucasemap_mapUTF8(csm,
 523                    (uint8_t *)dest, destCapacity,
 524                    (const uint8_t *)src, srcLength,
 525                    ucasemap_internalUTF8ToUpper, pErrorCode);
 526 }
 527
 528 U_CAPI int32_t U_EXPORT2
 529 ucasemap_utf8FoldCase(const UCaseMap *csm,
 530                       char *dest, int32_t destCapacity,
 531                       const char *src, int32_t srcLength,
 532                       UErrorCode *pErrorCode) {
 533     return ucasemap_mapUTF8(csm,
 534                    (uint8_t *)dest, destCapacity,
 535                    (const uint8_t *)src, srcLength,
 536                    ucasemap_internalUTF8Fold, pErrorCode);
 537 }