icuSources/common/unistr_cnv.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2014, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  unistr_cnv.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:2
  12 *
  13 *   created on: 2004aug19
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Character conversion functions moved here from unistr.cpp
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION
  22
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/ucnv.h"
  29 #include "ucnv_imp.h"
  30 #include "putilimp.h"
  31 #include "ustr_cnv.h"
  32 #include "ustr_imp.h"
  33
  34 U_NAMESPACE_BEGIN
  35
  36 //========================================
  37 // Constructors
  38 //========================================
  39
  40 #if !U_CHARSET_IS_UTF8
  41
  42 UnicodeString::UnicodeString(const char *codepageData) {
  43     fUnion.fFields.fLengthAndFlags = kShortString;
  44     if(codepageData != 0) {
  45         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
  46     }
  47 }
  48
  49 UnicodeString::UnicodeString(const char *codepageData,
  50                              int32_t dataLength) {
  51     fUnion.fFields.fLengthAndFlags = kShortString;
  52     if(codepageData != 0) {
  53         doCodepageCreate(codepageData, dataLength, 0);
  54     }
  55 }
  56
  57 // else see unistr.cpp
  58 #endif
  59
  60 UnicodeString::UnicodeString(const char *codepageData,
  61                              const char *codepage) {
  62     fUnion.fFields.fLengthAndFlags = kShortString;
  63     if(codepageData != 0) {
  64         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
  65     }
  66 }
  67
  68 UnicodeString::UnicodeString(const char *codepageData,
  69                              int32_t dataLength,
  70                              const char *codepage) {
  71     fUnion.fFields.fLengthAndFlags = kShortString;
  72     if(codepageData != 0) {
  73         doCodepageCreate(codepageData, dataLength, codepage);
  74     }
  75 }
  76
  77 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
  78                              UConverter *cnv,
  79                              UErrorCode &errorCode) {
  80     fUnion.fFields.fLengthAndFlags = kShortString;
  81     if(U_SUCCESS(errorCode)) {
  82         // check arguments
  83         if(src==NULL) {
  84             // treat as an empty string, do nothing more
  85         } else if(srcLength<-1) {
  86             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  87         } else {
  88             // get input length
  89             if(srcLength==-1) {
  90                 srcLength=(int32_t)uprv_strlen(src);
  91             }
  92             if(srcLength>0) {
  93                 if(cnv!=0) {
  94                     // use the provided converter
  95                     ucnv_resetToUnicode(cnv);
  96                     doCodepageCreate(src, srcLength, cnv, errorCode);
  97                 } else {
  98                     // use the default converter
  99                     cnv=u_getDefaultConverter(&errorCode);
 100                     doCodepageCreate(src, srcLength, cnv, errorCode);
 101                     u_releaseDefaultConverter(cnv);
 102                 }
 103             }
 104         }
 105
 106         if(U_FAILURE(errorCode)) {
 107             setToBogus();
 108         }
 109     }
 110 }
 111
 112 //========================================
 113 // Codeset conversion
 114 //========================================
 115
 116 #if !U_CHARSET_IS_UTF8
 117
 118 int32_t
 119 UnicodeString::extract(int32_t start,
 120                        int32_t length,
 121                        char *target,
 122                        uint32_t dstSize) const {
 123     return extract(start, length, target, dstSize, 0);
 124 }
 125
 126 // else see unistr.cpp
 127 #endif
 128
 129 int32_t
 130 UnicodeString::extract(int32_t start,
 131                        int32_t length,
 132                        char *target,
 133                        uint32_t dstSize,
 134                        const char *codepage) const
 135 {
 136     // if the arguments are illegal, then do nothing
 137     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 138         return 0;
 139     }
 140
 141     // pin the indices to legal values
 142     pinIndices(start, length);
 143
 144     // We need to cast dstSize to int32_t for all subsequent code.
 145     // I don't know why the API was defined with uint32_t but we are stuck with it.
 146     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
 147     // as a limit in some functions, it may wrap around and yield a pointer
 148     // that compares less-than target.
 149     int32_t capacity;
 150     if(dstSize < 0x7fffffff) {
 151         // Assume that the capacity is real and a limit pointer won't wrap around.
 152         capacity = (int32_t)dstSize;
 153     } else {
 154         // Pin the capacity so that a limit pointer does not wrap around.
 155         char *targetLimit = (char *)U_MAX_PTR(target);
 156         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
 157         // greater than target and does not wrap around the top of the address space.
 158         capacity = (int32_t)(targetLimit - target);
 159     }
 160
 161     // create the converter
 162     UConverter *converter;
 163     UErrorCode status = U_ZERO_ERROR;
 164
 165     // just write the NUL if the string length is 0
 166     if(length == 0) {
 167         return u_terminateChars(target, capacity, 0, &status);
 168     }
 169
 170     // if the codepage is the default, use our cache
 171     // if it is an empty string, then use the "invariant character" conversion
 172     if (codepage == 0) {
 173         const char *defaultName = ucnv_getDefaultName();
 174         if(UCNV_FAST_IS_UTF8(defaultName)) {
 175             return toUTF8(start, length, target, capacity);
 176         }
 177         converter = u_getDefaultConverter(&status);
 178     } else if (*codepage == 0) {
 179         // use the "invariant characters" conversion
 180         int32_t destLength;
 181         if(length <= capacity) {
 182             destLength = length;
 183         } else {
 184             destLength = capacity;
 185         }
 186         u_UCharsToChars(getArrayStart() + start, target, destLength);
 187         return u_terminateChars(target, capacity, length, &status);
 188     } else {
 189         converter = ucnv_open(codepage, &status);
 190     }
 191
 192     length = doExtract(start, length, target, capacity, converter, status);
 193
 194     // close the converter
 195     if (codepage == 0) {
 196         u_releaseDefaultConverter(converter);
 197     } else {
 198         ucnv_close(converter);
 199     }
 200
 201     return length;
 202 }
 203
 204 int32_t
 205 UnicodeString::extract(char *dest, int32_t destCapacity,
 206                        UConverter *cnv,
 207                        UErrorCode &errorCode) const
 208 {
 209     if(U_FAILURE(errorCode)) {
 210         return 0;
 211     }
 212
 213     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 214         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 215         return 0;
 216     }
 217
 218     // nothing to do?
 219     if(isEmpty()) {
 220         return u_terminateChars(dest, destCapacity, 0, &errorCode);
 221     }
 222
 223     // get the converter
 224     UBool isDefaultConverter;
 225     if(cnv==0) {
 226         isDefaultConverter=TRUE;
 227         cnv=u_getDefaultConverter(&errorCode);
 228         if(U_FAILURE(errorCode)) {
 229             return 0;
 230         }
 231     } else {
 232         isDefaultConverter=FALSE;
 233         ucnv_resetFromUnicode(cnv);
 234     }
 235
 236     // convert
 237     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
 238
 239     // release the converter
 240     if(isDefaultConverter) {
 241         u_releaseDefaultConverter(cnv);
 242     }
 243
 244     return len;
 245 }
 246
 247 int32_t
 248 UnicodeString::doExtract(int32_t start, int32_t length,
 249                          char *dest, int32_t destCapacity,
 250                          UConverter *cnv,
 251                          UErrorCode &errorCode) const
 252 {
 253     if(U_FAILURE(errorCode)) {
 254         if(destCapacity!=0) {
 255             *dest=0;
 256         }
 257         return 0;
 258     }
 259
 260     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
 261     char *originalDest=dest;
 262     const char *destLimit;
 263
 264     if(destCapacity==0) {
 265         destLimit=dest=0;
 266     } else if(destCapacity==-1) {
 267         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
 268         destLimit=(char*)U_MAX_PTR(dest);
 269         // for NUL-termination, translate into highest int32_t
 270         destCapacity=0x7fffffff;
 271     } else {
 272         destLimit=dest+destCapacity;
 273     }
 274
 275     // perform the conversion
 276     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 277     length=(int32_t)(dest-originalDest);
 278
 279     // if an overflow occurs, then get the preflighting length
 280     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 281         char buffer[1024];
 282
 283         destLimit=buffer+sizeof(buffer);
 284         do {
 285             dest=buffer;
 286             errorCode=U_ZERO_ERROR;
 287             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 288             length+=(int32_t)(dest-buffer);
 289         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
 290     }
 291
 292     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
 293 }
 294
 295 void
 296 UnicodeString::doCodepageCreate(const char *codepageData,
 297                                 int32_t dataLength,
 298                                 const char *codepage)
 299 {
 300     // if there's nothing to convert, do nothing
 301     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 302         return;
 303     }
 304     if(dataLength == -1) {
 305         dataLength = (int32_t)uprv_strlen(codepageData);
 306     }
 307
 308     UErrorCode status = U_ZERO_ERROR;
 309
 310     // create the converter
 311     // if the codepage is the default, use our cache
 312     // if it is an empty string, then use the "invariant character" conversion
 313     UConverter *converter;
 314     if (codepage == 0) {
 315         const char *defaultName = ucnv_getDefaultName();
 316         if(UCNV_FAST_IS_UTF8(defaultName)) {
 317             setToUTF8(StringPiece(codepageData, dataLength));
 318             return;
 319         }
 320         converter = u_getDefaultConverter(&status);
 321     } else if(*codepage == 0) {
 322         // use the "invariant characters" conversion
 323         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
 324             u_charsToUChars(codepageData, getArrayStart(), dataLength);
 325             setLength(dataLength);
 326         } else {
 327             setToBogus();
 328         }
 329         return;
 330     } else {
 331         converter = ucnv_open(codepage, &status);
 332     }
 333
 334     // if we failed, set the appropriate flags and return
 335     if(U_FAILURE(status)) {
 336         setToBogus();
 337         return;
 338     }
 339
 340     // perform the conversion
 341     doCodepageCreate(codepageData, dataLength, converter, status);
 342     if(U_FAILURE(status)) {
 343         setToBogus();
 344     }
 345
 346     // close the converter
 347     if(codepage == 0) {
 348         u_releaseDefaultConverter(converter);
 349     } else {
 350         ucnv_close(converter);
 351     }
 352 }
 353
 354 void
 355 UnicodeString::doCodepageCreate(const char *codepageData,
 356                                 int32_t dataLength,
 357                                 UConverter *converter,
 358                                 UErrorCode &status)
 359 {
 360     if(U_FAILURE(status)) {
 361         return;
 362     }
 363
 364     // set up the conversion parameters
 365     const char *mySource     = codepageData;
 366     const char *mySourceEnd  = mySource + dataLength;
 367     UChar *array, *myTarget;
 368
 369     // estimate the size needed:
 370     int32_t arraySize;
 371     if(dataLength <= US_STACKBUF_SIZE) {
 372         // try to use the stack buffer
 373         arraySize = US_STACKBUF_SIZE;
 374     } else {
 375         // 1.25 UChar's per source byte should cover most cases
 376         arraySize = dataLength + (dataLength >> 2);
 377     }
 378
 379     // we do not care about the current contents
 380     UBool doCopyArray = FALSE;
 381     for(;;) {
 382         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
 383             setToBogus();
 384             break;
 385         }
 386
 387         // perform the conversion
 388         array = getArrayStart();
 389         myTarget = array + length();
 390         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
 391             &mySource, mySourceEnd, 0, TRUE, &status);
 392
 393         // update the conversion parameters
 394         setLength((int32_t)(myTarget - array));
 395
 396         // allocate more space and copy data, if needed
 397         if(status == U_BUFFER_OVERFLOW_ERROR) {
 398             // reset the error code
 399             status = U_ZERO_ERROR;
 400
 401             // keep the previous conversion results
 402             doCopyArray = TRUE;
 403
 404             // estimate the new size needed, larger than before
 405             // try 2 UChar's per remaining source byte
 406             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
 407         } else {
 408             break;
 409         }
 410     }
 411 }
 412
 413 U_NAMESPACE_END
 414
 415 #endif