icuSources/common/unistr_cnv.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 1999-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  unistr_cnv.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:2
  14 *
  15 *   created on: 2004aug19
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Character conversion functions moved here from unistr.cpp
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_CONVERSION
  24
  25 #include "unicode/putil.h"
  26 #include "cstring.h"
  27 #include "cmemory.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/unistr.h"
  30 #include "unicode/ucnv.h"
  31 #include "ucnv_imp.h"
  32 #include "putilimp.h"
  33 #include "ustr_cnv.h"
  34 #include "ustr_imp.h"
  35
  36 U_NAMESPACE_BEGIN
  37
  38 //========================================
  39 // Constructors
  40 //========================================
  41
  42 #if !U_CHARSET_IS_UTF8
  43
  44 UnicodeString::UnicodeString(const char *codepageData) {
  45     fUnion.fFields.fLengthAndFlags = kShortString;
  46     if(codepageData != 0) {
  47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
  48     }
  49 }
  50
  51 UnicodeString::UnicodeString(const char *codepageData,
  52                              int32_t dataLength) {
  53     fUnion.fFields.fLengthAndFlags = kShortString;
  54     if(codepageData != 0) {
  55         doCodepageCreate(codepageData, dataLength, 0);
  56     }
  57 }
  58
  59 // else see unistr.cpp
  60 #endif
  61
  62 UnicodeString::UnicodeString(const char *codepageData,
  63                              const char *codepage) {
  64     fUnion.fFields.fLengthAndFlags = kShortString;
  65     if(codepageData != 0) {
  66         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
  67     }
  68 }
  69
  70 UnicodeString::UnicodeString(const char *codepageData,
  71                              int32_t dataLength,
  72                              const char *codepage) {
  73     fUnion.fFields.fLengthAndFlags = kShortString;
  74     if(codepageData != 0) {
  75         doCodepageCreate(codepageData, dataLength, codepage);
  76     }
  77 }
  78
  79 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
  80                              UConverter *cnv,
  81                              UErrorCode &errorCode) {
  82     fUnion.fFields.fLengthAndFlags = kShortString;
  83     if(U_SUCCESS(errorCode)) {
  84         // check arguments
  85         if(src==NULL) {
  86             // treat as an empty string, do nothing more
  87         } else if(srcLength<-1) {
  88             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  89         } else {
  90             // get input length
  91             if(srcLength==-1) {
  92                 srcLength=(int32_t)uprv_strlen(src);
  93             }
  94             if(srcLength>0) {
  95                 if(cnv!=0) {
  96                     // use the provided converter
  97                     ucnv_resetToUnicode(cnv);
  98                     doCodepageCreate(src, srcLength, cnv, errorCode);
  99                 } else {
 100                     // use the default converter
 101                     cnv=u_getDefaultConverter(&errorCode);
 102                     doCodepageCreate(src, srcLength, cnv, errorCode);
 103                     u_releaseDefaultConverter(cnv);
 104                 }
 105             }
 106         }
 107
 108         if(U_FAILURE(errorCode)) {
 109             setToBogus();
 110         }
 111     }
 112 }
 113
 114 //========================================
 115 // Codeset conversion
 116 //========================================
 117
 118 #if !U_CHARSET_IS_UTF8
 119
 120 int32_t
 121 UnicodeString::extract(int32_t start,
 122                        int32_t length,
 123                        char *target,
 124                        uint32_t dstSize) const {
 125     return extract(start, length, target, dstSize, 0);
 126 }
 127
 128 // else see unistr.cpp
 129 #endif
 130
 131 int32_t
 132 UnicodeString::extract(int32_t start,
 133                        int32_t length,
 134                        char *target,
 135                        uint32_t dstSize,
 136                        const char *codepage) const
 137 {
 138     // if the arguments are illegal, then do nothing
 139     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 140         return 0;
 141     }
 142
 143     // pin the indices to legal values
 144     pinIndices(start, length);
 145
 146     // We need to cast dstSize to int32_t for all subsequent code.
 147     // I don't know why the API was defined with uint32_t but we are stuck with it.
 148     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
 149     // as a limit in some functions, it may wrap around and yield a pointer
 150     // that compares less-than target.
 151     int32_t capacity;
 152     if(dstSize < 0x7fffffff) {
 153         // Assume that the capacity is real and a limit pointer won't wrap around.
 154         capacity = (int32_t)dstSize;
 155     } else {
 156         // Pin the capacity so that a limit pointer does not wrap around.
 157         char *targetLimit = (char *)U_MAX_PTR(target);
 158         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
 159         // greater than target and does not wrap around the top of the address space.
 160         capacity = (int32_t)(targetLimit - target);
 161     }
 162
 163     // create the converter
 164     UConverter *converter;
 165     UErrorCode status = U_ZERO_ERROR;
 166
 167     // just write the NUL if the string length is 0
 168     if(length == 0) {
 169         return u_terminateChars(target, capacity, 0, &status);
 170     }
 171
 172     // if the codepage is the default, use our cache
 173     // if it is an empty string, then use the "invariant character" conversion
 174     if (codepage == 0) {
 175         const char *defaultName = ucnv_getDefaultName();
 176         if(UCNV_FAST_IS_UTF8(defaultName)) {
 177             return toUTF8(start, length, target, capacity);
 178         }
 179         converter = u_getDefaultConverter(&status);
 180     } else if (*codepage == 0) {
 181         // use the "invariant characters" conversion
 182         int32_t destLength;
 183         if(length <= capacity) {
 184             destLength = length;
 185         } else {
 186             destLength = capacity;
 187         }
 188         u_UCharsToChars(getArrayStart() + start, target, destLength);
 189         return u_terminateChars(target, capacity, length, &status);
 190     } else {
 191         converter = ucnv_open(codepage, &status);
 192     }
 193
 194     length = doExtract(start, length, target, capacity, converter, status);
 195
 196     // close the converter
 197     if (codepage == 0) {
 198         u_releaseDefaultConverter(converter);
 199     } else {
 200         ucnv_close(converter);
 201     }
 202
 203     return length;
 204 }
 205
 206 int32_t
 207 UnicodeString::extract(char *dest, int32_t destCapacity,
 208                        UConverter *cnv,
 209                        UErrorCode &errorCode) const
 210 {
 211     if(U_FAILURE(errorCode)) {
 212         return 0;
 213     }
 214
 215     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 216         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 217         return 0;
 218     }
 219
 220     // nothing to do?
 221     if(isEmpty()) {
 222         return u_terminateChars(dest, destCapacity, 0, &errorCode);
 223     }
 224
 225     // get the converter
 226     UBool isDefaultConverter;
 227     if(cnv==0) {
 228         isDefaultConverter=TRUE;
 229         cnv=u_getDefaultConverter(&errorCode);
 230         if(U_FAILURE(errorCode)) {
 231             return 0;
 232         }
 233     } else {
 234         isDefaultConverter=FALSE;
 235         ucnv_resetFromUnicode(cnv);
 236     }
 237
 238     // convert
 239     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
 240
 241     // release the converter
 242     if(isDefaultConverter) {
 243         u_releaseDefaultConverter(cnv);
 244     }
 245
 246     return len;
 247 }
 248
 249 int32_t
 250 UnicodeString::doExtract(int32_t start, int32_t length,
 251                          char *dest, int32_t destCapacity,
 252                          UConverter *cnv,
 253                          UErrorCode &errorCode) const
 254 {
 255     if(U_FAILURE(errorCode)) {
 256         if(destCapacity!=0) {
 257             *dest=0;
 258         }
 259         return 0;
 260     }
 261
 262     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
 263     char *originalDest=dest;
 264     const char *destLimit;
 265
 266     if(destCapacity==0) {
 267         destLimit=dest=0;
 268     } else if(destCapacity==-1) {
 269         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
 270         destLimit=(char*)U_MAX_PTR(dest);
 271         // for NUL-termination, translate into highest int32_t
 272         destCapacity=0x7fffffff;
 273     } else {
 274         destLimit=dest+destCapacity;
 275     }
 276
 277     // perform the conversion
 278     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 279     length=(int32_t)(dest-originalDest);
 280
 281     // if an overflow occurs, then get the preflighting length
 282     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 283         char buffer[1024];
 284
 285         destLimit=buffer+sizeof(buffer);
 286         do {
 287             dest=buffer;
 288             errorCode=U_ZERO_ERROR;
 289             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 290             length+=(int32_t)(dest-buffer);
 291         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
 292     }
 293
 294     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
 295 }
 296
 297 void
 298 UnicodeString::doCodepageCreate(const char *codepageData,
 299                                 int32_t dataLength,
 300                                 const char *codepage)
 301 {
 302     // if there's nothing to convert, do nothing
 303     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 304         return;
 305     }
 306     if(dataLength == -1) {
 307         dataLength = (int32_t)uprv_strlen(codepageData);
 308     }
 309
 310     UErrorCode status = U_ZERO_ERROR;
 311
 312     // create the converter
 313     // if the codepage is the default, use our cache
 314     // if it is an empty string, then use the "invariant character" conversion
 315     UConverter *converter;
 316     if (codepage == 0) {
 317         const char *defaultName = ucnv_getDefaultName();
 318         if(UCNV_FAST_IS_UTF8(defaultName)) {
 319             setToUTF8(StringPiece(codepageData, dataLength));
 320             return;
 321         }
 322         converter = u_getDefaultConverter(&status);
 323     } else if(*codepage == 0) {
 324         // use the "invariant characters" conversion
 325         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
 326             u_charsToUChars(codepageData, getArrayStart(), dataLength);
 327             setLength(dataLength);
 328         } else {
 329             setToBogus();
 330         }
 331         return;
 332     } else {
 333         converter = ucnv_open(codepage, &status);
 334     }
 335
 336     // if we failed, set the appropriate flags and return
 337     if(U_FAILURE(status)) {
 338         setToBogus();
 339         return;
 340     }
 341
 342     // perform the conversion
 343     doCodepageCreate(codepageData, dataLength, converter, status);
 344     if(U_FAILURE(status)) {
 345         setToBogus();
 346     }
 347
 348     // close the converter
 349     if(codepage == 0) {
 350         u_releaseDefaultConverter(converter);
 351     } else {
 352         ucnv_close(converter);
 353     }
 354 }
 355
 356 void
 357 UnicodeString::doCodepageCreate(const char *codepageData,
 358                                 int32_t dataLength,
 359                                 UConverter *converter,
 360                                 UErrorCode &status)
 361 {
 362     if(U_FAILURE(status)) {
 363         return;
 364     }
 365
 366     // set up the conversion parameters
 367     const char *mySource     = codepageData;
 368     const char *mySourceEnd  = mySource + dataLength;
 369     UChar *array, *myTarget;
 370
 371     // estimate the size needed:
 372     int32_t arraySize;
 373     if(dataLength <= US_STACKBUF_SIZE) {
 374         // try to use the stack buffer
 375         arraySize = US_STACKBUF_SIZE;
 376     } else {
 377         // 1.25 UChar's per source byte should cover most cases
 378         arraySize = dataLength + (dataLength >> 2);
 379     }
 380
 381     // we do not care about the current contents
 382     UBool doCopyArray = FALSE;
 383     for(;;) {
 384         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
 385             setToBogus();
 386             break;
 387         }
 388
 389         // perform the conversion
 390         array = getArrayStart();
 391         myTarget = array + length();
 392         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
 393             &mySource, mySourceEnd, 0, TRUE, &status);
 394
 395         // update the conversion parameters
 396         setLength((int32_t)(myTarget - array));
 397
 398         // allocate more space and copy data, if needed
 399         if(status == U_BUFFER_OVERFLOW_ERROR) {
 400             // reset the error code
 401             status = U_ZERO_ERROR;
 402
 403             // keep the previous conversion results
 404             doCopyArray = TRUE;
 405
 406             // estimate the new size needed, larger than before
 407             // try 2 UChar's per remaining source byte
 408             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
 409         } else {
 410             break;
 411         }
 412     }
 413 }
 414
 415 U_NAMESPACE_END
 416
 417 #endif