icuSources/common/unistr_cnv.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2010, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  unistr_cnv.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:2
  12 *
  13 *   created on: 2004aug19
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Character conversion functions moved here from unistr.cpp
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION
  22
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/ucnv.h"
  29 #include "ucnv_imp.h"
  30 #include "putilimp.h"
  31 #include "ustr_cnv.h"
  32 #include "ustr_imp.h"
  33
  34 U_NAMESPACE_BEGIN
  35
  36 //========================================
  37 // Constructors
  38 //========================================
  39
  40 #if !U_CHARSET_IS_UTF8
  41
  42 UnicodeString::UnicodeString(const char *codepageData)
  43   : fShortLength(0),
  44     fFlags(kShortString)
  45 {
  46     if(codepageData != 0) {
  47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
  48     }
  49 }
  50
  51 UnicodeString::UnicodeString(const char *codepageData,
  52                              int32_t dataLength)
  53   : fShortLength(0),
  54     fFlags(kShortString)
  55 {
  56     if(codepageData != 0) {
  57         doCodepageCreate(codepageData, dataLength, 0);
  58     }
  59 }
  60
  61 // else see unistr.cpp
  62 #endif
  63
  64 UnicodeString::UnicodeString(const char *codepageData,
  65                              const char *codepage)
  66   : fShortLength(0),
  67     fFlags(kShortString)
  68 {
  69     if(codepageData != 0) {
  70         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
  71     }
  72 }
  73
  74 UnicodeString::UnicodeString(const char *codepageData,
  75                              int32_t dataLength,
  76                              const char *codepage)
  77   : fShortLength(0),
  78     fFlags(kShortString)
  79 {
  80     if(codepageData != 0) {
  81         doCodepageCreate(codepageData, dataLength, codepage);
  82     }
  83 }
  84
  85 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
  86                              UConverter *cnv,
  87                              UErrorCode &errorCode)
  88   : fShortLength(0),
  89     fFlags(kShortString)
  90 {
  91     if(U_SUCCESS(errorCode)) {
  92         // check arguments
  93         if(src==NULL) {
  94             // treat as an empty string, do nothing more
  95         } else if(srcLength<-1) {
  96             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  97         } else {
  98             // get input length
  99             if(srcLength==-1) {
 100                 srcLength=(int32_t)uprv_strlen(src);
 101             }
 102             if(srcLength>0) {
 103                 if(cnv!=0) {
 104                     // use the provided converter
 105                     ucnv_resetToUnicode(cnv);
 106                     doCodepageCreate(src, srcLength, cnv, errorCode);
 107                 } else {
 108                     // use the default converter
 109                     cnv=u_getDefaultConverter(&errorCode);
 110                     doCodepageCreate(src, srcLength, cnv, errorCode);
 111                     u_releaseDefaultConverter(cnv);
 112                 }
 113             }
 114         }
 115
 116         if(U_FAILURE(errorCode)) {
 117             setToBogus();
 118         }
 119     }
 120 }
 121
 122 //========================================
 123 // Codeset conversion
 124 //========================================
 125
 126 #if !U_CHARSET_IS_UTF8
 127
 128 int32_t
 129 UnicodeString::extract(int32_t start,
 130                        int32_t length,
 131                        char *target,
 132                        uint32_t dstSize) const {
 133     return extract(start, length, target, dstSize, 0);
 134 }
 135
 136 // else see unistr.cpp
 137 #endif
 138
 139 int32_t
 140 UnicodeString::extract(int32_t start,
 141                        int32_t length,
 142                        char *target,
 143                        uint32_t dstSize,
 144                        const char *codepage) const
 145 {
 146     // if the arguments are illegal, then do nothing
 147     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 148         return 0;
 149     }
 150
 151     // pin the indices to legal values
 152     pinIndices(start, length);
 153
 154     // We need to cast dstSize to int32_t for all subsequent code.
 155     // I don't know why the API was defined with uint32_t but we are stuck with it.
 156     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
 157     // as a limit in some functions, it may wrap around and yield a pointer
 158     // that compares less-than target.
 159     int32_t capacity;
 160     if(dstSize < 0x7fffffff) {
 161         // Assume that the capacity is real and a limit pointer won't wrap around.
 162         capacity = (int32_t)dstSize;
 163     } else {
 164         // Pin the capacity so that a limit pointer does not wrap around.
 165         char *targetLimit = (char *)U_MAX_PTR(target);
 166         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
 167         // greater than target and does not wrap around the top of the address space.
 168         capacity = (int32_t)(targetLimit - target);
 169     }
 170
 171     // create the converter
 172     UConverter *converter;
 173     UErrorCode status = U_ZERO_ERROR;
 174
 175     // just write the NUL if the string length is 0
 176     if(length == 0) {
 177         return u_terminateChars(target, capacity, 0, &status);
 178     }
 179
 180     // if the codepage is the default, use our cache
 181     // if it is an empty string, then use the "invariant character" conversion
 182     if (codepage == 0) {
 183         const char *defaultName = ucnv_getDefaultName();
 184         if(UCNV_FAST_IS_UTF8(defaultName)) {
 185             return toUTF8(start, length, target, capacity);
 186         }
 187         converter = u_getDefaultConverter(&status);
 188     } else if (*codepage == 0) {
 189         // use the "invariant characters" conversion
 190         int32_t destLength;
 191         if(length <= capacity) {
 192             destLength = length;
 193         } else {
 194             destLength = capacity;
 195         }
 196         u_UCharsToChars(getArrayStart() + start, target, destLength);
 197         return u_terminateChars(target, capacity, length, &status);
 198     } else {
 199         converter = ucnv_open(codepage, &status);
 200     }
 201
 202     length = doExtract(start, length, target, capacity, converter, status);
 203
 204     // close the converter
 205     if (codepage == 0) {
 206         u_releaseDefaultConverter(converter);
 207     } else {
 208         ucnv_close(converter);
 209     }
 210
 211     return length;
 212 }
 213
 214 int32_t
 215 UnicodeString::extract(char *dest, int32_t destCapacity,
 216                        UConverter *cnv,
 217                        UErrorCode &errorCode) const
 218 {
 219     if(U_FAILURE(errorCode)) {
 220         return 0;
 221     }
 222
 223     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 224         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 225         return 0;
 226     }
 227
 228     // nothing to do?
 229     if(isEmpty()) {
 230         return u_terminateChars(dest, destCapacity, 0, &errorCode);
 231     }
 232
 233     // get the converter
 234     UBool isDefaultConverter;
 235     if(cnv==0) {
 236         isDefaultConverter=TRUE;
 237         cnv=u_getDefaultConverter(&errorCode);
 238         if(U_FAILURE(errorCode)) {
 239             return 0;
 240         }
 241     } else {
 242         isDefaultConverter=FALSE;
 243         ucnv_resetFromUnicode(cnv);
 244     }
 245
 246     // convert
 247     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
 248
 249     // release the converter
 250     if(isDefaultConverter) {
 251         u_releaseDefaultConverter(cnv);
 252     }
 253
 254     return len;
 255 }
 256
 257 int32_t
 258 UnicodeString::doExtract(int32_t start, int32_t length,
 259                          char *dest, int32_t destCapacity,
 260                          UConverter *cnv,
 261                          UErrorCode &errorCode) const
 262 {
 263     if(U_FAILURE(errorCode)) {
 264         if(destCapacity!=0) {
 265             *dest=0;
 266         }
 267         return 0;
 268     }
 269
 270     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
 271     char *originalDest=dest;
 272     const char *destLimit;
 273
 274     if(destCapacity==0) {
 275         destLimit=dest=0;
 276     } else if(destCapacity==-1) {
 277         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
 278         destLimit=(char*)U_MAX_PTR(dest);
 279         // for NUL-termination, translate into highest int32_t
 280         destCapacity=0x7fffffff;
 281     } else {
 282         destLimit=dest+destCapacity;
 283     }
 284
 285     // perform the conversion
 286     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 287     length=(int32_t)(dest-originalDest);
 288
 289     // if an overflow occurs, then get the preflighting length
 290     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 291         char buffer[1024];
 292
 293         destLimit=buffer+sizeof(buffer);
 294         do {
 295             dest=buffer;
 296             errorCode=U_ZERO_ERROR;
 297             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 298             length+=(int32_t)(dest-buffer);
 299         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
 300     }
 301
 302     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
 303 }
 304
 305 void
 306 UnicodeString::doCodepageCreate(const char *codepageData,
 307                                 int32_t dataLength,
 308                                 const char *codepage)
 309 {
 310     // if there's nothing to convert, do nothing
 311     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 312         return;
 313     }
 314     if(dataLength == -1) {
 315         dataLength = (int32_t)uprv_strlen(codepageData);
 316     }
 317
 318     UErrorCode status = U_ZERO_ERROR;
 319
 320     // create the converter
 321     // if the codepage is the default, use our cache
 322     // if it is an empty string, then use the "invariant character" conversion
 323     UConverter *converter;
 324     if (codepage == 0) {
 325         const char *defaultName = ucnv_getDefaultName();
 326         if(UCNV_FAST_IS_UTF8(defaultName)) {
 327             setToUTF8(StringPiece(codepageData, dataLength));
 328             return;
 329         }
 330         converter = u_getDefaultConverter(&status);
 331     } else if(*codepage == 0) {
 332         // use the "invariant characters" conversion
 333         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
 334             u_charsToUChars(codepageData, getArrayStart(), dataLength);
 335             setLength(dataLength);
 336         } else {
 337             setToBogus();
 338         }
 339         return;
 340     } else {
 341         converter = ucnv_open(codepage, &status);
 342     }
 343
 344     // if we failed, set the appropriate flags and return
 345     if(U_FAILURE(status)) {
 346         setToBogus();
 347         return;
 348     }
 349
 350     // perform the conversion
 351     doCodepageCreate(codepageData, dataLength, converter, status);
 352     if(U_FAILURE(status)) {
 353         setToBogus();
 354     }
 355
 356     // close the converter
 357     if(codepage == 0) {
 358         u_releaseDefaultConverter(converter);
 359     } else {
 360         ucnv_close(converter);
 361     }
 362 }
 363
 364 void
 365 UnicodeString::doCodepageCreate(const char *codepageData,
 366                                 int32_t dataLength,
 367                                 UConverter *converter,
 368                                 UErrorCode &status)
 369 {
 370     if(U_FAILURE(status)) {
 371         return;
 372     }
 373
 374     // set up the conversion parameters
 375     const char *mySource     = codepageData;
 376     const char *mySourceEnd  = mySource + dataLength;
 377     UChar *array, *myTarget;
 378
 379     // estimate the size needed:
 380     int32_t arraySize;
 381     if(dataLength <= US_STACKBUF_SIZE) {
 382         // try to use the stack buffer
 383         arraySize = US_STACKBUF_SIZE;
 384     } else {
 385         // 1.25 UChar's per source byte should cover most cases
 386         arraySize = dataLength + (dataLength >> 2);
 387     }
 388
 389     // we do not care about the current contents
 390     UBool doCopyArray = FALSE;
 391     for(;;) {
 392         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
 393             setToBogus();
 394             break;
 395         }
 396
 397         // perform the conversion
 398         array = getArrayStart();
 399         myTarget = array + length();
 400         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
 401             &mySource, mySourceEnd, 0, TRUE, &status);
 402
 403         // update the conversion parameters
 404         setLength((int32_t)(myTarget - array));
 405
 406         // allocate more space and copy data, if needed
 407         if(status == U_BUFFER_OVERFLOW_ERROR) {
 408             // reset the error code
 409             status = U_ZERO_ERROR;
 410
 411             // keep the previous conversion results
 412             doCopyArray = TRUE;
 413
 414             // estimate the new size needed, larger than before
 415             // try 2 UChar's per remaining source byte
 416             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
 417         } else {
 418             break;
 419         }
 420     }
 421 }
 422
 423 U_NAMESPACE_END
 424
 425 #endif