ICU-57131.0.1.tar.gz

[apple/icu.git] / icuSources / common / unistr_cnv.cpp
diff --git a/icuSources/common/unistr_cnv.cpp b/icuSources/common/unistr_cnv.cpp

index adc0dda6d51af949716709144af748576481dd9b..38998ffd026c4da0fbd19cb8d3ab21540854b08a 100644 (file)
--- a/icuSources/common/unistr_cnv.cpp
+++ b/icuSources/common/unistr_cnv.cpp
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 1999-2006, International Business Machines
+*   Copyright (C) 1999-2014, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -26,6 +26,7 @@
  #include "unicode/ustring.h"
  #include "unicode/unistr.h"
  #include "unicode/ucnv.h"
+#include "ucnv_imp.h"
  #include "putilimp.h"
  #include "ustr_cnv.h"
  #include "ustr_imp.h"
@@ -36,27 +37,38 @@ U_NAMESPACE_BEGIN
  // Constructors
  //========================================
  
+#if !U_CHARSET_IS_UTF8
+
+UnicodeString::UnicodeString(const char *codepageData) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
+    if(codepageData != 0) {
+        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
+    }
+}
+
  UnicodeString::UnicodeString(const char *codepageData,
-                             const char *codepage)
-  : fLength(0),
-    fCapacity(US_STACKBUF_SIZE),
-    fArray(fStackBuffer),
-    fFlags(kShortString)
-{
+                             int32_t dataLength) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
      if(codepageData != 0) {
-        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
+        doCodepageCreate(codepageData, dataLength, 0);
      }
  }
  
+// else (U_CHARSET_IS_UTF8 is set): see unistr.cpp
+#endif
+
+UnicodeString::UnicodeString(const char *codepageData,
+                             const char *codepage) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
+    if(codepageData != 0) {
+        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
+    }
+}
  
  UnicodeString::UnicodeString(const char *codepageData,
                               int32_t dataLength,
-                             const char *codepage)
-  : fLength(0),
-    fCapacity(US_STACKBUF_SIZE),
-    fArray(fStackBuffer),
-    fFlags(kShortString)
-{
+                             const char *codepage) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
      if(codepageData != 0) {
          doCodepageCreate(codepageData, dataLength, codepage);
      }
@@ -64,12 +76,8 @@ UnicodeString::UnicodeString(const char *codepageData,
  
  UnicodeString::UnicodeString(const char *src, int32_t srcLength,
                               UConverter *cnv,
-                             UErrorCode &errorCode)
-  : fLength(0),
-    fCapacity(US_STACKBUF_SIZE),
-    fArray(fStackBuffer),
-    fFlags(kShortString)
-{
+                             UErrorCode &errorCode) {
+    fUnion.fFields.fLengthAndFlags = kShortString;
      if(U_SUCCESS(errorCode)) {
          // check arguments
          if(src==NULL) {
@@ -104,6 +112,20 @@ UnicodeString::UnicodeString(const char *src, int32_t srcLength,
  //========================================
  // Codeset conversion
  //========================================
+
+#if !U_CHARSET_IS_UTF8
+
+int32_t
+UnicodeString::extract(int32_t start,
+                       int32_t length,
+                       char *target,
+                       uint32_t dstSize) const {
+    return extract(start, length, target, dstSize, 0);
+}
+
+// else (U_CHARSET_IS_UTF8 is set): see unistr.cpp
+#endif
+
  int32_t
  UnicodeString::extract(int32_t start,
                         int32_t length,
@@ -119,44 +141,55 @@ UnicodeString::extract(int32_t start,
      // pin the indices to legal values
      pinIndices(start, length);
  
+    // We need to cast dstSize to int32_t for all subsequent code.
+    // I don't know why the API was defined with uint32_t but we are stuck with it.
+    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
+    // as a limit in some functions, it may wrap around and yield a pointer
+    // that compares less-than target.
+    int32_t capacity;
+    if(dstSize < 0x7fffffff) {
+        // Assume that the capacity is real and a limit pointer won't wrap around.
+        capacity = (int32_t)dstSize;
+    } else {
+        // Pin the capacity so that a limit pointer does not wrap around.
+        char *targetLimit = (char *)U_MAX_PTR(target);
+        // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
+        // greater than target and does not wrap around the top of the address space.
+        capacity = (int32_t)(targetLimit - target);
+    }
+
      // create the converter
      UConverter *converter;
      UErrorCode status = U_ZERO_ERROR;
  
      // just write the NUL if the string length is 0
      if(length == 0) {
-        if(dstSize >= 0x80000000) {  
-            // careful: dstSize is unsigned! (0xffffffff means "unlimited")
-            // make sure that the NUL-termination works (takes int32_t)
-            dstSize=0x7fffffff;
-        }
-        return u_terminateChars(target, dstSize, 0, &status);
+        return u_terminateChars(target, capacity, 0, &status);
      }
  
      // if the codepage is the default, use our cache
      // if it is an empty string, then use the "invariant character" conversion
      if (codepage == 0) {
+        const char *defaultName = ucnv_getDefaultName();
+        if(UCNV_FAST_IS_UTF8(defaultName)) {
+            return toUTF8(start, length, target, capacity);
+        }
          converter = u_getDefaultConverter(&status);
      } else if (*codepage == 0) {
          // use the "invariant characters" conversion
          int32_t destLength;
-        // careful: dstSize is unsigned! (0xffffffff means "unlimited")
-        if(dstSize >= 0x80000000) {
-            destLength = length;
-            // make sure that the NUL-termination works (takes int32_t)
-            dstSize=0x7fffffff;
-        } else if(length <= (int32_t)dstSize) {
+        if(length <= capacity) {
              destLength = length;
          } else {
-            destLength = (int32_t)dstSize;
+            destLength = capacity;
          }
          u_UCharsToChars(getArrayStart() + start, target, destLength);
-        return u_terminateChars(target, (int32_t)dstSize, length, &status);
+        return u_terminateChars(target, capacity, length, &status);
      } else {
          converter = ucnv_open(codepage, &status);
      }
  
-    length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
+    length = doExtract(start, length, target, capacity, converter, status);
  
      // close the converter
      if (codepage == 0) {
@@ -183,7 +216,7 @@ UnicodeString::extract(char *dest, int32_t destCapacity,
      }
  
      // nothing to do?
-    if(fLength<=0) {
+    if(isEmpty()) {
          return u_terminateChars(dest, destCapacity, 0, &errorCode);
      }
  
@@ -201,14 +234,14 @@ UnicodeString::extract(char *dest, int32_t destCapacity,
      }
  
      // convert
-    int32_t length=doExtract(0, fLength, dest, destCapacity, cnv, errorCode);
+    int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
  
      // release the converter
      if(isDefaultConverter) {
          u_releaseDefaultConverter(cnv);
      }
  
-    return length;
+    return len;
  }
  
  int32_t
@@ -224,7 +257,7 @@ UnicodeString::doExtract(int32_t start, int32_t length,
          return 0;
      }
  
-    const UChar *src=fArray+start, *srcLimit=src+length;
+    const UChar *src=getArrayStart()+start, *srcLimit=src+length;
      char *originalDest=dest;
      const char *destLimit;
  
@@ -277,31 +310,34 @@ UnicodeString::doCodepageCreate(const char *codepageData,
      // create the converter
      // if the codepage is the default, use our cache
      // if it is an empty string, then use the "invariant character" conversion
-    UConverter *converter = (codepage == 0 ?
-                             u_getDefaultConverter(&status) :
-                             *codepage == 0 ?
-                               0 :
-                               ucnv_open(codepage, &status));
-
-    // if we failed, set the appropriate flags and return
-    if(U_FAILURE(status)) {
-        setToBogus();
-        return;
-    }
-
-    // perform the conversion
-    if(converter == 0) {
+    UConverter *converter;
+    if (codepage == 0) {
+        const char *defaultName = ucnv_getDefaultName();
+        if(UCNV_FAST_IS_UTF8(defaultName)) {
+            setToUTF8(StringPiece(codepageData, dataLength));
+            return;
+        }
+        converter = u_getDefaultConverter(&status);
+    } else if(*codepage == 0) {
          // use the "invariant characters" conversion
          if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
              u_charsToUChars(codepageData, getArrayStart(), dataLength);
-            fLength = dataLength;
+            setLength(dataLength);
          } else {
              setToBogus();
          }
          return;
+    } else {
+        converter = ucnv_open(codepage, &status);
+    }
+
+    // if we failed, set the appropriate flags and return
+    if(U_FAILURE(status)) {
+        setToBogus();
+        return;
      }
  
-    // convert using the real converter
+    // perform the conversion
      doCodepageCreate(codepageData, dataLength, converter, status);
      if(U_FAILURE(status)) {
          setToBogus();
@@ -328,11 +364,17 @@ UnicodeString::doCodepageCreate(const char *codepageData,
      // set up the conversion parameters
      const char *mySource     = codepageData;
      const char *mySourceEnd  = mySource + dataLength;
-    UChar *myTarget;
+    UChar *array, *myTarget;
  
      // estimate the size needed:
-    // 1.25 UChar's per source byte should cover most cases
-    int32_t arraySize = dataLength + (dataLength >> 2);
+    int32_t arraySize;
+    if(dataLength <= US_STACKBUF_SIZE) {
+        // try to use the stack buffer
+        arraySize = US_STACKBUF_SIZE;
+    } else {
+        // 1.25 UChar's per source byte should cover most cases
+        arraySize = dataLength + (dataLength >> 2);
+    }
  
      // we do not care about the current contents
      UBool doCopyArray = FALSE;
@@ -343,12 +385,13 @@ UnicodeString::doCodepageCreate(const char *codepageData,
          }
  
          // perform the conversion
-        myTarget = fArray + fLength;
-        ucnv_toUnicode(converter, &myTarget,  fArray + fCapacity,
+        array = getArrayStart();
+        myTarget = array + length();
+        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
              &mySource, mySourceEnd, 0, TRUE, &status);
  
          // update the conversion parameters
-        fLength = (int32_t)(myTarget - fArray);
+        setLength((int32_t)(myTarget - array));
  
          // allocate more space and copy data, if needed
          if(status == U_BUFFER_OVERFLOW_ERROR) {
@@ -360,7 +403,7 @@ UnicodeString::doCodepageCreate(const char *codepageData,
  
              // estimate the new size needed, larger than before
              // try 2 UChar's per remaining source byte
-            arraySize = (int32_t)(fLength + 2 * (mySourceEnd - mySource));
+            arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
          } else {
              break;
          }