icuSources/common/unistr_cnv.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 1999-2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  unistr_cnv.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:2
  12 *
  13 *   created on: 2004aug19
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Character conversion functions moved here from unistr.cpp
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION
  22
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/ucnv.h"
  29 #include "putilimp.h"
  30 #include "ustr_cnv.h"
  31 #include "ustr_imp.h"
  32
  33 U_NAMESPACE_BEGIN
  34
  35 //========================================
  36 // Constructors
  37 //========================================
  38
  39 UnicodeString::UnicodeString(const char *codepageData,
  40                              const char *codepage)
  41   : fLength(0),
  42     fCapacity(US_STACKBUF_SIZE),
  43     fArray(fStackBuffer),
  44     fFlags(kShortString)
  45 {
  46   if(codepageData != 0) {
  47     doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
  48   }
  49 }
  50
  51
  52 UnicodeString::UnicodeString(const char *codepageData,
  53                              int32_t dataLength,
  54                              const char *codepage)
  55   : fLength(0),
  56     fCapacity(US_STACKBUF_SIZE),
  57     fArray(fStackBuffer),
  58     fFlags(kShortString)
  59 {
  60   if(codepageData != 0) {
  61     doCodepageCreate(codepageData, dataLength, codepage);
  62   }
  63 }
  64
  65 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
  66                              UConverter *cnv,
  67                              UErrorCode &errorCode)
  68   : fLength(0),
  69     fCapacity(US_STACKBUF_SIZE),
  70     fArray(fStackBuffer),
  71     fFlags(kShortString)
  72 {
  73   if(U_SUCCESS(errorCode)) {
  74     // check arguments
  75     if(src==NULL) {
  76       // treat as an empty string, do nothing more
  77     } else if(srcLength<-1) {
  78       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  79     } else {
  80       // get input length
  81       if(srcLength==-1) {
  82         srcLength=(int32_t)uprv_strlen(src);
  83       }
  84       if(srcLength>0) {
  85         if(cnv!=0) {
  86           // use the provided converter
  87           ucnv_resetToUnicode(cnv);
  88           doCodepageCreate(src, srcLength, cnv, errorCode);
  89         } else {
  90           // use the default converter
  91           cnv=u_getDefaultConverter(&errorCode);
  92           doCodepageCreate(src, srcLength, cnv, errorCode);
  93           u_releaseDefaultConverter(cnv);
  94         }
  95       }
  96     }
  97
  98     if(U_FAILURE(errorCode)) {
  99       setToBogus();
 100     }
 101   }
 102 }
 103
 104 //========================================
 105 // Codeset conversion
 106 //========================================
 107 int32_t
 108 UnicodeString::extract(int32_t start,
 109                        int32_t length,
 110                        char *target,
 111                        uint32_t dstSize,
 112                        const char *codepage) const
 113 {
 114   // if the arguments are illegal, then do nothing
 115   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 116     return 0;
 117   }
 118
 119   // pin the indices to legal values
 120   pinIndices(start, length);
 121
 122   // create the converter
 123   UConverter *converter;
 124   UErrorCode status = U_ZERO_ERROR;
 125
 126   // just write the NUL if the string length is 0
 127   if(length == 0) {
 128       if(dstSize >= 0x80000000) {
 129           // careful: dstSize is unsigned! (0xffffffff means "unlimited")
 130           // make sure that the NUL-termination works (takes int32_t)
 131           dstSize=0x7fffffff;
 132       }
 133       return u_terminateChars(target, dstSize, 0, &status);
 134   }
 135
 136   // if the codepage is the default, use our cache
 137   // if it is an empty string, then use the "invariant character" conversion
 138   if (codepage == 0) {
 139     converter = u_getDefaultConverter(&status);
 140   } else if (*codepage == 0) {
 141     // use the "invariant characters" conversion
 142     int32_t destLength;
 143     // careful: dstSize is unsigned! (0xffffffff means "unlimited")
 144     if(dstSize >= 0x80000000) {
 145       destLength = length;
 146       // make sure that the NUL-termination works (takes int32_t)
 147       dstSize=0x7fffffff;
 148     } else if(length <= (int32_t)dstSize) {
 149       destLength = length;
 150     } else {
 151       destLength = (int32_t)dstSize;
 152     }
 153     u_UCharsToChars(getArrayStart() + start, target, destLength);
 154     return u_terminateChars(target, (int32_t)dstSize, length, &status);
 155   } else {
 156     converter = ucnv_open(codepage, &status);
 157   }
 158
 159   length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
 160
 161   // close the converter
 162   if (codepage == 0) {
 163     u_releaseDefaultConverter(converter);
 164   } else {
 165     ucnv_close(converter);
 166   }
 167
 168   return length;
 169 }
 170
 171 int32_t
 172 UnicodeString::extract(char *dest, int32_t destCapacity,
 173                        UConverter *cnv,
 174                        UErrorCode &errorCode) const {
 175   if(U_FAILURE(errorCode)) {
 176     return 0;
 177   }
 178
 179   if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 180     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 181     return 0;
 182   }
 183
 184   // nothing to do?
 185   if(fLength<=0) {
 186     return u_terminateChars(dest, destCapacity, 0, &errorCode);
 187   }
 188
 189   // get the converter
 190   UBool isDefaultConverter;
 191   if(cnv==0) {
 192     isDefaultConverter=TRUE;
 193     cnv=u_getDefaultConverter(&errorCode);
 194     if(U_FAILURE(errorCode)) {
 195       return 0;
 196     }
 197   } else {
 198     isDefaultConverter=FALSE;
 199     ucnv_resetFromUnicode(cnv);
 200   }
 201
 202   // convert
 203   int32_t length=doExtract(0, fLength, dest, destCapacity, cnv, errorCode);
 204
 205   // release the converter
 206   if(isDefaultConverter) {
 207     u_releaseDefaultConverter(cnv);
 208   }
 209
 210   return length;
 211 }
 212
 213 int32_t
 214 UnicodeString::doExtract(int32_t start, int32_t length,
 215                          char *dest, int32_t destCapacity,
 216                          UConverter *cnv,
 217                          UErrorCode &errorCode) const {
 218   if(U_FAILURE(errorCode)) {
 219     if(destCapacity!=0) {
 220       *dest=0;
 221     }
 222     return 0;
 223   }
 224
 225   const UChar *src=fArray+start, *srcLimit=src+length;
 226   char *originalDest=dest;
 227   const char *destLimit;
 228
 229   if(destCapacity==0) {
 230     destLimit=dest=0;
 231   } else if(destCapacity==-1) {
 232     // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
 233     destLimit=(char*)U_MAX_PTR(dest);
 234     // for NUL-termination, translate into highest int32_t
 235     destCapacity=0x7fffffff;
 236   } else {
 237     destLimit=dest+destCapacity;
 238   }
 239
 240   // perform the conversion
 241   ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 242   length=(int32_t)(dest-originalDest);
 243
 244   // if an overflow occurs, then get the preflighting length
 245   if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 246     char buffer[1024];
 247
 248     destLimit=buffer+sizeof(buffer);
 249     do {
 250       dest=buffer;
 251       errorCode=U_ZERO_ERROR;
 252       ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
 253       length+=(int32_t)(dest-buffer);
 254     } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
 255   }
 256
 257   return u_terminateChars(originalDest, destCapacity, length, &errorCode);
 258 }
 259
 260 void
 261 UnicodeString::doCodepageCreate(const char *codepageData,
 262                 int32_t dataLength,
 263                 const char *codepage)
 264 {
 265   // if there's nothing to convert, do nothing
 266   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 267     return;
 268   }
 269   if(dataLength == -1) {
 270     dataLength = uprv_strlen(codepageData);
 271   }
 272
 273   UErrorCode status = U_ZERO_ERROR;
 274
 275   // create the converter
 276   // if the codepage is the default, use our cache
 277   // if it is an empty string, then use the "invariant character" conversion
 278   UConverter *converter = (codepage == 0 ?
 279                              u_getDefaultConverter(&status) :
 280                              *codepage == 0 ?
 281                                0 :
 282                                ucnv_open(codepage, &status));
 283
 284   // if we failed, set the appropriate flags and return
 285   if(U_FAILURE(status)) {
 286     setToBogus();
 287     return;
 288   }
 289
 290   // perform the conversion
 291   if(converter == 0) {
 292     // use the "invariant characters" conversion
 293     if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
 294       u_charsToUChars(codepageData, getArrayStart(), dataLength);
 295       fLength = dataLength;
 296     } else {
 297       setToBogus();
 298     }
 299     return;
 300   }
 301
 302   // convert using the real converter
 303   doCodepageCreate(codepageData, dataLength, converter, status);
 304   if(U_FAILURE(status)) {
 305     setToBogus();
 306   }
 307
 308   // close the converter
 309   if(codepage == 0) {
 310     u_releaseDefaultConverter(converter);
 311   } else {
 312     ucnv_close(converter);
 313   }
 314 }
 315
 316 void
 317 UnicodeString::doCodepageCreate(const char *codepageData,
 318                                 int32_t dataLength,
 319                                 UConverter *converter,
 320                                 UErrorCode &status) {
 321   if(U_FAILURE(status)) {
 322     return;
 323   }
 324
 325   // set up the conversion parameters
 326   const char *mySource     = codepageData;
 327   const char *mySourceEnd  = mySource + dataLength;
 328   UChar *myTarget;
 329
 330   // estimate the size needed:
 331   // 1.25 UChar's per source byte should cover most cases
 332   int32_t arraySize = dataLength + (dataLength >> 2);
 333
 334   // we do not care about the current contents
 335   UBool doCopyArray = FALSE;
 336   for(;;) {
 337     if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
 338       setToBogus();
 339       break;
 340     }
 341
 342     // perform the conversion
 343     myTarget = fArray + fLength;
 344     ucnv_toUnicode(converter, &myTarget,  fArray + fCapacity,
 345            &mySource, mySourceEnd, 0, TRUE, &status);
 346
 347     // update the conversion parameters
 348     fLength = (int32_t)(myTarget - fArray);
 349
 350     // allocate more space and copy data, if needed
 351     if(status == U_BUFFER_OVERFLOW_ERROR) {
 352       // reset the error code
 353       status = U_ZERO_ERROR;
 354
 355       // keep the previous conversion results
 356       doCopyArray = TRUE;
 357
 358       // estimate the new size needed, larger than before
 359       // try 2 UChar's per remaining source byte
 360       arraySize = (int32_t)(fLength + 2 * (mySourceEnd - mySource));
 361     } else {
 362       break;
 363     }
 364   }
 365 }
 366
 367 U_NAMESPACE_END
 368
 369 #endif