icuSources/common/unistr_cnv.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2007, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  unistr_cnv.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:2
  12 *
  13 *   created on: 2004aug19
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Character conversion functions moved here from unistr.cpp
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION
  22
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/ucnv.h"
  29 #include "putilimp.h"
  30 #include "ustr_cnv.h"
  31 #include "ustr_imp.h"
  32
  33 U_NAMESPACE_BEGIN
  34
  35 //========================================
  36 // Constructors
  37 //========================================
  38
  39 UnicodeString::UnicodeString(const char *codepageData,
  40                              const char *codepage)
  41   : fShortLength(0),
  42     fFlags(kShortString)
  43 {
  44     if(codepageData != 0) {
  45         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
  46     }
  47 }
  48
  49
  50 UnicodeString::UnicodeString(const char *codepageData,
  51                              int32_t dataLength,
  52                              const char *codepage)
  53   : fShortLength(0),
  54     fFlags(kShortString)
  55 {
  56     if(codepageData != 0) {
  57         doCodepageCreate(codepageData, dataLength, codepage);
  58     }
  59 }
  60
  61 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
  62                              UConverter *cnv,
  63                              UErrorCode &errorCode)
  64   : fShortLength(0),
  65     fFlags(kShortString)
  66 {
  67     if(U_SUCCESS(errorCode)) {
  68         // check arguments
  69         if(src==NULL) {
  70             // treat as an empty string, do nothing more
  71         } else if(srcLength<-1) {
  72             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  73         } else {
  74             // get input length
  75             if(srcLength==-1) {
  76                 srcLength=(int32_t)uprv_strlen(src);
  77             }
  78             if(srcLength>0) {
  79                 if(cnv!=0) {
  80                     // use the provided converter
  81                     ucnv_resetToUnicode(cnv);
  82                     doCodepageCreate(src, srcLength, cnv, errorCode);
  83                 } else {
  84                     // use the default converter
  85                     cnv=u_getDefaultConverter(&errorCode);
  86                     doCodepageCreate(src, srcLength, cnv, errorCode);
  87                     u_releaseDefaultConverter(cnv);
  88                 }
  89             }
  90         }
  91
  92         if(U_FAILURE(errorCode)) {
  93             setToBogus();
  94         }
  95     }
  96 }
  97
  98 //========================================
  99 // Codeset conversion
 100 //========================================
 101 int32_t
 102 UnicodeString::extract(int32_t start,
 103                        int32_t length,
 104                        char *target,
 105                        uint32_t dstSize,
 106                        const char *codepage) const
 107 {
 108     // if the arguments are illegal, then do nothing
 109     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 110         return 0;
 111     }
 112
 113     // pin the indices to legal values
 114     pinIndices(start, length);
 115
 116     // create the converter
 117     UConverter *converter;
 118     UErrorCode status = U_ZERO_ERROR;
 119
 120     // just write the NUL if the string length is 0
 121     if(length == 0) {
 122         if(dstSize >= 0x80000000) {
 123             // careful: dstSize is unsigned! (0xffffffff means "unlimited")
 124             // make sure that the NUL-termination works (takes int32_t)
 125             dstSize=0x7fffffff;
 126         }
 127         return u_terminateChars(target, dstSize, 0, &status);
 128     }
 129
 130     // if the codepage is the default, use our cache
 131     // if it is an empty string, then use the "invariant character" conversion
 132     if (codepage == 0) {
 133         converter = u_getDefaultConverter(&status);
 134     } else if (*codepage == 0) {
 135         // use the "invariant characters" conversion
 136         int32_t destLength;
 137         // careful: dstSize is unsigned! (0xffffffff means "unlimited")
 138         if(dstSize >= 0x80000000) {
 139             destLength = length;
 140             // make sure that the NUL-termination works (takes int32_t)
 141             dstSize=0x7fffffff;
 142         } else if(length <= (int32_t)dstSize) {
 143             destLength = length;
 144         } else {
 145             destLength = (int32_t)dstSize;
 146         }
 147         u_UCharsToChars(getArrayStart() + start, target, destLength);
 148         return u_terminateChars(target, (int32_t)dstSize, length, &status);
 149     } else {
 150         converter = ucnv_open(codepage, &status);
 151     }
 152
 153     length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
 154
 155     // close the converter
 156     if (codepage == 0) {
 157         u_releaseDefaultConverter(converter);
 158     } else {
 159         ucnv_close(converter);
 160     }
 161
 162     return length;
 163 }
 164
 165 int32_t
 166 UnicodeString::extract(char *dest, int32_t destCapacity,
 167                        UConverter *cnv,
 168                        UErrorCode &errorCode) const
 169 {
 170     if(U_FAILURE(errorCode)) {
 171         return 0;
 172     }
 173
 174     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 175         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 176         return 0;
 177     }
 178
 179     // nothing to do?
 180     if(isEmpty()) {
 181         return u_terminateChars(dest, destCapacity, 0, &errorCode);
 182     }
 183
 184     // get the converter
 185     UBool isDefaultConverter;
 186     if(cnv==0) {
 187         isDefaultConverter=TRUE;
 188         cnv=u_getDefaultConverter(&errorCode);
 189         if(U_FAILURE(errorCode)) {
 190             return 0;
 191         }
 192     } else {
 193         isDefaultConverter=FALSE;
 194         ucnv_resetFromUnicode(cnv);
 195     }
 196
 197     // convert
 198     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
 199
 200     // release the converter
 201     if(isDefaultConverter) {
 202         u_releaseDefaultConverter(cnv);
 203     }
 204
 205     return len;
 206 }
 207
 208 int32_t
 209 UnicodeString::doExtract(int32_t start, int32_t length,
 210                          char *dest, int32_t destCapacity,
 211                          UConverter *cnv,
 212                          UErrorCode &errorCode) const
 213 {
 214     if(U_FAILURE(errorCode)) {
 215         if(destCapacity!=0) {
 216             *dest=0;
 217         }
 218         return 0;
 219     }
 220
 221     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
 222     char *originalDest=dest;
 223     const char *destLimit;
 224
 225     if(destCapacity==0) {
 226         destLimit=dest=0;
 227     } else if(destCapacity==-1) {
 228         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
 229         destLimit=(char*)U_MAX_PTR(dest);
 230         // for NUL-termination, translate into highest int32_t
 231         destCapacity=0x7fffffff;
 232     } else {
 233         destLimit=dest+destCapacity;
 234     }
 235
 236     // perform the conversion
 237     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 238     length=(int32_t)(dest-originalDest);
 239
 240     // if an overflow occurs, then get the preflighting length
 241     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 242         char buffer[1024];
 243
 244         destLimit=buffer+sizeof(buffer);
 245         do {
 246             dest=buffer;
 247             errorCode=U_ZERO_ERROR;
 248             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 249             length+=(int32_t)(dest-buffer);
 250         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
 251     }
 252
 253     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
 254 }
 255
 256 void
 257 UnicodeString::doCodepageCreate(const char *codepageData,
 258                                 int32_t dataLength,
 259                                 const char *codepage)
 260 {
 261     // if there's nothing to convert, do nothing
 262     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 263         return;
 264     }
 265     if(dataLength == -1) {
 266         dataLength = (int32_t)uprv_strlen(codepageData);
 267     }
 268
 269     UErrorCode status = U_ZERO_ERROR;
 270
 271     // create the converter
 272     // if the codepage is the default, use our cache
 273     // if it is an empty string, then use the "invariant character" conversion
 274     UConverter *converter = (codepage == 0 ?
 275                              u_getDefaultConverter(&status) :
 276                              *codepage == 0 ?
 277                                0 :
 278                                ucnv_open(codepage, &status));
 279
 280     // if we failed, set the appropriate flags and return
 281     if(U_FAILURE(status)) {
 282         setToBogus();
 283         return;
 284     }
 285
 286     // perform the conversion
 287     if(converter == 0) {
 288         // use the "invariant characters" conversion
 289         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
 290             u_charsToUChars(codepageData, getArrayStart(), dataLength);
 291             setLength(dataLength);
 292         } else {
 293             setToBogus();
 294         }
 295         return;
 296     }
 297
 298     // convert using the real converter
 299     doCodepageCreate(codepageData, dataLength, converter, status);
 300     if(U_FAILURE(status)) {
 301         setToBogus();
 302     }
 303
 304     // close the converter
 305     if(codepage == 0) {
 306         u_releaseDefaultConverter(converter);
 307     } else {
 308         ucnv_close(converter);
 309     }
 310 }
 311
 312 void
 313 UnicodeString::doCodepageCreate(const char *codepageData,
 314                                 int32_t dataLength,
 315                                 UConverter *converter,
 316                                 UErrorCode &status)
 317 {
 318     if(U_FAILURE(status)) {
 319         return;
 320     }
 321
 322     // set up the conversion parameters
 323     const char *mySource     = codepageData;
 324     const char *mySourceEnd  = mySource + dataLength;
 325     UChar *array, *myTarget;
 326
 327     // estimate the size needed:
 328     int32_t arraySize;
 329     if(dataLength <= US_STACKBUF_SIZE) {
 330         // try to use the stack buffer
 331         arraySize = US_STACKBUF_SIZE;
 332     } else {
 333         // 1.25 UChar's per source byte should cover most cases
 334         arraySize = dataLength + (dataLength >> 2);
 335     }
 336
 337     // we do not care about the current contents
 338     UBool doCopyArray = FALSE;
 339     for(;;) {
 340         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
 341             setToBogus();
 342             break;
 343         }
 344
 345         // perform the conversion
 346         array = getArrayStart();
 347         myTarget = array + length();
 348         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
 349             &mySource, mySourceEnd, 0, TRUE, &status);
 350
 351         // update the conversion parameters
 352         setLength((int32_t)(myTarget - array));
 353
 354         // allocate more space and copy data, if needed
 355         if(status == U_BUFFER_OVERFLOW_ERROR) {
 356             // reset the error code
 357             status = U_ZERO_ERROR;
 358
 359             // keep the previous conversion results
 360             doCopyArray = TRUE;
 361
 362             // estimate the new size needed, larger than before
 363             // try 2 UChar's per remaining source byte
 364             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
 365         } else {
 366             break;
 367         }
 368     }
 369 }
 370
 371 U_NAMESPACE_END
 372
 373 #endif