X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..b331163bffd790ced0e88b73f44f86d49ccc48a5:/icuSources/common/unistr.cpp diff --git a/icuSources/common/unistr.cpp b/icuSources/common/unistr.cpp index 1b3e9da5..9997aa7b 100644 --- a/icuSources/common/unistr.cpp +++ b/icuSources/common/unistr.cpp @@ -1,7 +1,7 @@ /* ****************************************************************************** -* Copyright (C) 1999-2005, International Business Machines Corporation and * -* others. All Rights Reserved. * +* Copyright (C) 1999-2014, International Business Machines Corporation and +* others. All Rights Reserved. ****************************************************************************** * * File unistr.cpp @@ -19,23 +19,23 @@ */ #include "unicode/utypes.h" +#include "unicode/appendable.h" #include "unicode/putil.h" #include "cstring.h" #include "cmemory.h" #include "unicode/ustring.h" #include "unicode/unistr.h" -#include "uhash.h" +#include "unicode/utf.h" +#include "unicode/utf16.h" +#include "uelement.h" #include "ustr_imp.h" #include "umutex.h" +#include "uassert.h" #if 0 -#if U_IOSTREAM_SOURCE >= 199711 #include using namespace std; -#elif U_IOSTREAM_SOURCE >= 198506 -#include -#endif //DEBUGGING void @@ -90,7 +90,7 @@ us_arrayCopy(const UChar *src, int32_t srcStart, U_CDECL_BEGIN static UChar U_CALLCONV UnicodeString_charAt(int32_t offset, void *context) { - return ((UnicodeString*) context)->charAt(offset); + return ((icu::UnicodeString*) context)->charAt(offset); } U_CDECL_END @@ -100,7 +100,7 @@ U_NAMESPACE_BEGIN due to how AIX works with multiple definitions of virtual functions. */ Replaceable::~Replaceable() {} -Replaceable::Replaceable() {} + UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) UnicodeString U_EXPORT2 @@ -117,28 +117,24 @@ operator+ (const UnicodeString &s1, const UnicodeString &s2) { //======================================== void -UnicodeString::addRef() -{ umtx_atomic_inc((int32_t *)fArray - 1);} +UnicodeString::addRef() { + umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1); +} int32_t -UnicodeString::removeRef() -{ return umtx_atomic_dec((int32_t *)fArray - 1);} +UnicodeString::removeRef() { + return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1); +} int32_t -UnicodeString::refCount() const -{ - umtx_lock(NULL); - // Note: without the lock to force a memory barrier, we might see a very - // stale value on some multi-processor systems. - int32_t count = *((int32_t *)fArray - 1); - umtx_unlock(NULL); - return count; - } +UnicodeString::refCount() const { + return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1)); +} void UnicodeString::releaseArray() { - if((fFlags & kRefCounted) && removeRef() == 0) { - uprv_free((int32_t *)fArray - 1); + if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) { + uprv_free((int32_t *)fUnion.fFields.fArray - 1); } } @@ -147,41 +143,34 @@ UnicodeString::releaseArray() { //======================================== // Constructors //======================================== -UnicodeString::UnicodeString() - : fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{} - -UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) - : fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(0), - fFlags(0) -{ + +// The default constructor is inline in unistr.h. + +UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) { + fUnion.fFields.fLengthAndFlags = 0; if(count <= 0 || (uint32_t)c > 0x10ffff) { // just allocate and do not do anything else allocate(capacity); } else { // count > 0, allocate and fill the new string with count c's - int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount; + int32_t unitCount = U16_LENGTH(c), length = count * unitCount; if(capacity < length) { capacity = length; } if(allocate(capacity)) { + UChar *array = getArrayStart(); int32_t i = 0; // fill the new string with c if(unitCount == 1) { // fill with length UChars while(i < length) { - fArray[i++] = (UChar)c; + array[i++] = (UChar)c; } } else { // get the code units for c - UChar units[UTF_MAX_CHAR_LENGTH]; - UTF_APPEND_CHAR_UNSAFE(units, i, c); + UChar units[U16_MAX_LENGTH]; + U16_APPEND_UNSAFE(units, i, c); // now it must be i==unitCount i = 0; @@ -191,113 +180,88 @@ UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) while(i < length) { int32_t unitIdx = 0; while(unitIdx < unitCount) { - fArray[i++]=units[unitIdx++]; + array[i++]=units[unitIdx++]; } } } } - fLength = length; + setLength(length); } } -UnicodeString::UnicodeString(UChar ch) - : fLength(1), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ - fStackBuffer[0] = ch; +UnicodeString::UnicodeString(UChar ch) { + fUnion.fFields.fLengthAndFlags = kLength1 | kShortString; + fUnion.fStackFields.fBuffer[0] = ch; } -UnicodeString::UnicodeString(UChar32 ch) - : fLength(1), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ +UnicodeString::UnicodeString(UChar32 ch) { + fUnion.fFields.fLengthAndFlags = kShortString; int32_t i = 0; UBool isError = FALSE; - U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError); - fLength = i; + U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError); + // We test isError so that the compiler does not complain that we don't. + // If isError then i==0 which is what we want anyway. + if(!isError) { + setShortLength(i); + } } -UnicodeString::UnicodeString(const UChar *text) - : fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ +UnicodeString::UnicodeString(const UChar *text) { + fUnion.fFields.fLengthAndFlags = kShortString; doReplace(0, 0, text, 0, -1); } UnicodeString::UnicodeString(const UChar *text, - int32_t textLength) - : fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ + int32_t textLength) { + fUnion.fFields.fLengthAndFlags = kShortString; doReplace(0, 0, text, 0, textLength); } UnicodeString::UnicodeString(UBool isTerminated, const UChar *text, - int32_t textLength) - : fLength(textLength), - fCapacity(isTerminated ? textLength + 1 : textLength), - fArray((UChar *)text), - fFlags(kReadonlyAlias) -{ + int32_t textLength) { + fUnion.fFields.fLengthAndFlags = kReadonlyAlias; if(text == NULL) { // treat as an empty string, do not alias - fLength = 0; - fCapacity = US_STACKBUF_SIZE; - fArray = fStackBuffer; - fFlags = kShortString; + setToEmpty(); } else if(textLength < -1 || (textLength == -1 && !isTerminated) || (textLength >= 0 && isTerminated && text[textLength] != 0) ) { setToBogus(); - } else if(textLength == -1) { - // text is terminated, or else it would have failed the above test - fLength = u_strlen(text); - fCapacity = fLength + 1; + } else { + if(textLength == -1) { + // text is terminated, or else it would have failed the above test + textLength = u_strlen(text); + } + setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); } } UnicodeString::UnicodeString(UChar *buff, int32_t buffLength, - int32_t buffCapacity) - : fLength(buffLength), - fCapacity(buffCapacity), - fArray(buff), - fFlags(kWritableAlias) -{ + int32_t buffCapacity) { + fUnion.fFields.fLengthAndFlags = kWritableAlias; if(buff == NULL) { // treat as an empty string, do not alias - fLength = 0; - fCapacity = US_STACKBUF_SIZE; - fArray = fStackBuffer; - fFlags = kShortString; + setToEmpty(); } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { setToBogus(); - } else if(buffLength == -1) { - // fLength = u_strlen(buff); but do not look beyond buffCapacity - const UChar *p = buff, *limit = buff + buffCapacity; - while(p != limit && *p != 0) { - ++p; + } else { + if(buffLength == -1) { + // fLength = u_strlen(buff); but do not look beyond buffCapacity + const UChar *p = buff, *limit = buff + buffCapacity; + while(p != limit && *p != 0) { + ++p; + } + buffLength = (int32_t)(p - buff); } - fLength = (int32_t)(p - buff); + setArray(buff, buffLength, buffCapacity); } } -UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) - : fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ +UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) { + fUnion.fFields.fLengthAndFlags = kShortString; if(src==NULL) { // treat as an empty string } else { @@ -306,43 +270,52 @@ UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) } if(cloneArrayIfNeeded(length, length, FALSE)) { u_charsToUChars(src, getArrayStart(), length); - fLength = length; + setLength(length); } else { setToBogus(); } } } -UnicodeString::UnicodeString(const UnicodeString& that) - : Replaceable(), - fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ +#if U_CHARSET_IS_UTF8 + +UnicodeString::UnicodeString(const char *codepageData) { + fUnion.fFields.fLengthAndFlags = kShortString; + if(codepageData != 0) { + setToUTF8(codepageData); + } +} + +UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) { + fUnion.fFields.fLengthAndFlags = kShortString; + // if there's nothing to convert, do nothing + if(codepageData == 0 || dataLength == 0 || dataLength < -1) { + return; + } + if(dataLength == -1) { + dataLength = (int32_t)uprv_strlen(codepageData); + } + setToUTF8(StringPiece(codepageData, dataLength)); +} + +// else see unistr_cnv.cpp +#endif + +UnicodeString::UnicodeString(const UnicodeString& that) { + fUnion.fFields.fLengthAndFlags = kShortString; copyFrom(that); } UnicodeString::UnicodeString(const UnicodeString& that, - int32_t srcStart) - : Replaceable(), - fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ + int32_t srcStart) { + fUnion.fFields.fLengthAndFlags = kShortString; setTo(that, srcStart); } UnicodeString::UnicodeString(const UnicodeString& that, int32_t srcStart, - int32_t srcLength) - : Replaceable(), - fLength(0), - fCapacity(US_STACKBUF_SIZE), - fArray(fStackBuffer), - fFlags(kShortString) -{ + int32_t srcLength) { + fUnion.fFields.fLengthAndFlags = kShortString; setTo(that, srcStart, srcLength); } @@ -365,27 +338,26 @@ UnicodeString::clone() const { UBool UnicodeString::allocate(int32_t capacity) { if(capacity <= US_STACKBUF_SIZE) { - fArray = fStackBuffer; - fCapacity = US_STACKBUF_SIZE; - fFlags = kShortString; + fUnion.fFields.fLengthAndFlags = kShortString; } else { // count bytes for the refCounter and the string capacity, and // round up to a multiple of 16; then divide by 4 and allocate int32_t's // to be safely aligned for the refCount - int32_t words = (int32_t)(((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2); + // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer() + int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2); int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); if(array != 0) { // set initial refCount and point behind the refCount *array++ = 1; // have fArray point to the first UChar - fArray = (UChar *)array; - fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); - fFlags = kLongString; + fUnion.fFields.fArray = (UChar *)array; + fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); + fUnion.fFields.fLengthAndFlags = kLongString; } else { - fLength = 0; - fCapacity = 0; - fFlags = kIsBogus; + fUnion.fFields.fLengthAndFlags = kIsBogus; + fUnion.fFields.fArray = 0; + fUnion.fFields.fCapacity = 0; return FALSE; } } @@ -400,6 +372,47 @@ UnicodeString::~UnicodeString() releaseArray(); } +//======================================== +// Factory methods +//======================================== + +UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) { + UnicodeString result; + result.setToUTF8(utf8); + return result; +} + +UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { + UnicodeString result; + int32_t capacity; + // Most UTF-32 strings will be BMP-only and result in a same-length + // UTF-16 string. We overestimate the capacity just slightly, + // just in case there are a few supplementary characters. + if(length <= US_STACKBUF_SIZE) { + capacity = US_STACKBUF_SIZE; + } else { + capacity = length + (length >> 4) + 4; + } + do { + UChar *utf16 = result.getBuffer(capacity); + int32_t length16; + UErrorCode errorCode = U_ZERO_ERROR; + u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, + utf32, length, + 0xfffd, // Substitution character. + NULL, // Don't care about number of substitutions. + &errorCode); + result.releaseBuffer(length16); + if(errorCode == U_BUFFER_OVERFLOW_ERROR) { + capacity = length16 + 1; // +1 for the terminating NUL. + continue; + } else if(U_FAILURE(errorCode)) { + result.setToBogus(); + } + break; + } while(TRUE); + return result; +} //======================================== // Assignment @@ -418,12 +431,12 @@ UnicodeString::fastCopyFrom(const UnicodeString &src) { UnicodeString & UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { // if assigning to ourselves, do nothing - if(this == 0 || this == &src) { + if(this == &src) { return *this; } // is the right side bogus? - if(&src == 0 || src.isBogus()) { + if(src.isBogus()) { setToBogus(); return *this; } @@ -431,59 +444,60 @@ UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { // delete the current contents releaseArray(); - // we always copy the length - fLength = src.fLength; - if(fLength == 0) { + if(src.isEmpty()) { // empty string - use the stack buffer - fArray = fStackBuffer; - fCapacity = US_STACKBUF_SIZE; - fFlags = kShortString; + setToEmpty(); return *this; } // fLength>0 and not an "open" src.getBuffer(minCapacity) - switch(src.fFlags) { + fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; + switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) { case kShortString: // short string using the stack buffer, do the same - fArray = fStackBuffer; - fCapacity = US_STACKBUF_SIZE; - fFlags = kShortString; - uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR); + uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, + getShortLength() * U_SIZEOF_UCHAR); break; case kLongString: // src uses a refCounted string buffer, use that buffer with refCount - // src is const, use a cast - we don't really change it + // src is const, use a cast - we don't actually change it ((UnicodeString &)src).addRef(); // copy all fields, share the reference-counted buffer - fArray = src.fArray; - fCapacity = src.fCapacity; - fFlags = src.fFlags; + fUnion.fFields.fArray = src.fUnion.fFields.fArray; + fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; + if(!hasShortLength()) { + fUnion.fFields.fLength = src.fUnion.fFields.fLength; + } break; case kReadonlyAlias: if(fastCopy) { // src is a readonly alias, do the same // -> maintain the readonly alias as such - fArray = src.fArray; - fCapacity = src.fCapacity; - fFlags = src.fFlags; + fUnion.fFields.fArray = src.fUnion.fFields.fArray; + fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; + if(!hasShortLength()) { + fUnion.fFields.fLength = src.fUnion.fFields.fLength; + } break; } // else if(!fastCopy) fall through to case kWritableAlias // -> allocate a new buffer and copy the contents - case kWritableAlias: + case kWritableAlias: { // src is a writable alias; we make a copy of that instead - if(allocate(fLength)) { - uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR); + int32_t srcLength = src.length(); + if(allocate(srcLength)) { + uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR); + setLength(srcLength); break; } // if there is not enough memory, then fall through to setting to bogus + } default: // if src is bogus, set ourselves to bogus - // do not call setToBogus() here because fArray and fFlags are not consistent here - fArray = 0; - fLength = 0; - fCapacity = 0; - fFlags = kIsBogus; + // do not call setToBogus() here because fArray and flags are not consistent here + fUnion.fFields.fLengthAndFlags = kIsBogus; + fUnion.fFields.fArray = 0; + fUnion.fFields.fCapacity = 0; break; } @@ -495,17 +509,25 @@ UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { //======================================== UnicodeString UnicodeString::unescape() const { - UnicodeString result; - for (int32_t i=0; i> 15 | 1); @@ -640,33 +669,72 @@ UnicodeString::getChar32At(int32_t offset) const { return char32At(offset); } +UChar32 +UnicodeString::char32At(int32_t offset) const +{ + int32_t len = length(); + if((uint32_t)offset < (uint32_t)len) { + const UChar *array = getArrayStart(); + UChar32 c; + U16_GET(array, 0, offset, len, c); + return c; + } else { + return kInvalidUChar; + } +} + +int32_t +UnicodeString::getChar32Start(int32_t offset) const { + if((uint32_t)offset < (uint32_t)length()) { + const UChar *array = getArrayStart(); + U16_SET_CP_START(array, 0, offset); + return offset; + } else { + return 0; + } +} + +int32_t +UnicodeString::getChar32Limit(int32_t offset) const { + int32_t len = length(); + if((uint32_t)offset < (uint32_t)len) { + const UChar *array = getArrayStart(); + U16_SET_CP_LIMIT(array, 0, offset, len); + return offset; + } else { + return len; + } +} + int32_t UnicodeString::countChar32(int32_t start, int32_t length) const { pinIndices(start, length); // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL - return u_countChar32(fArray+start, length); + return u_countChar32(getArrayStart()+start, length); } UBool UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { pinIndices(start, length); // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL - return u_strHasMoreChar32Than(fArray+start, length, number); + return u_strHasMoreChar32Than(getArrayStart()+start, length, number); } int32_t UnicodeString::moveIndex32(int32_t index, int32_t delta) const { // pin index + int32_t len = length(); if(index<0) { index=0; - } else if(index>fLength) { - index=fLength; + } else if(index>len) { + index=len; } + const UChar *array = getArrayStart(); if(delta>0) { - UTF_FWD_N(fArray, index, fLength, delta); + U16_FWD_N(array, index, len, delta); } else { - UTF_BACK_N(fArray, 0, index, -delta); + U16_BACK_N(array, 0, index, -delta); } return index; @@ -682,26 +750,29 @@ UnicodeString::doExtract(int32_t start, pinIndices(start, length); // do not copy anything if we alias dst itself - if(fArray + start != dst + dstStart) { - us_arrayCopy(getArrayStart(), start, dst, dstStart, length); + const UChar *array = getArrayStart(); + if(array + start != dst + dstStart) { + us_arrayCopy(array, start, dst, dstStart, length); } } int32_t UnicodeString::extract(UChar *dest, int32_t destCapacity, UErrorCode &errorCode) const { + int32_t len = length(); if(U_SUCCESS(errorCode)) { if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; } else { - if(fLength>0 && fLength<=destCapacity && fArray!=dest) { - uprv_memcpy(dest, fArray, fLength*U_SIZEOF_UCHAR); + const UChar *array = getArrayStart(); + if(len>0 && len<=destCapacity && array!=dest) { + uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR); } - return u_terminateUChars(dest, destCapacity, fLength, &errorCode); + return u_terminateUChars(dest, destCapacity, len, &errorCode); } } - return fLength; + return len; } int32_t @@ -726,6 +797,46 @@ UnicodeString::extract(int32_t start, return u_terminateChars(target, targetCapacity, length, &status); } +UnicodeString +UnicodeString::tempSubString(int32_t start, int32_t len) const { + pinIndices(start, len); + const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer + if(array==NULL) { + array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string + len=-2; // bogus result string + } + return UnicodeString(FALSE, array + start, len); +} + +int32_t +UnicodeString::toUTF8(int32_t start, int32_t len, + char *target, int32_t capacity) const { + pinIndices(start, len); + int32_t length8; + UErrorCode errorCode = U_ZERO_ERROR; + u_strToUTF8WithSub(target, capacity, &length8, + getBuffer() + start, len, + 0xFFFD, // Standard substitution character. + NULL, // Don't care about number of substitutions. + &errorCode); + return length8; +} + +#if U_CHARSET_IS_UTF8 + +int32_t +UnicodeString::extract(int32_t start, int32_t len, + char *target, uint32_t dstSize) const { + // if the arguments are illegal, then do nothing + if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { + return 0; + } + return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); +} + +// else see unistr_cnv.cpp +#endif + void UnicodeString::extractBetween(int32_t start, int32_t limit, @@ -735,6 +846,66 @@ UnicodeString::extractBetween(int32_t start, doExtract(start, limit - start, target); } +// When converting from UTF-16 to UTF-8, the result will have at most 3 times +// as many bytes as the source has UChars. +// The "worst cases" are writing systems like Indic, Thai and CJK with +// 3:1 bytes:UChars. +void +UnicodeString::toUTF8(ByteSink &sink) const { + int32_t length16 = length(); + if(length16 != 0) { + char stackBuffer[1024]; + int32_t capacity = (int32_t)sizeof(stackBuffer); + UBool utf8IsOwned = FALSE; + char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, + 3*length16, + stackBuffer, capacity, + &capacity); + int32_t length8 = 0; + UErrorCode errorCode = U_ZERO_ERROR; + u_strToUTF8WithSub(utf8, capacity, &length8, + getBuffer(), length16, + 0xFFFD, // Standard substitution character. + NULL, // Don't care about number of substitutions. + &errorCode); + if(errorCode == U_BUFFER_OVERFLOW_ERROR) { + utf8 = (char *)uprv_malloc(length8); + if(utf8 != NULL) { + utf8IsOwned = TRUE; + errorCode = U_ZERO_ERROR; + u_strToUTF8WithSub(utf8, length8, &length8, + getBuffer(), length16, + 0xFFFD, // Standard substitution character. + NULL, // Don't care about number of substitutions. + &errorCode); + } else { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + } + if(U_SUCCESS(errorCode)) { + sink.Append(utf8, length8); + sink.Flush(); + } + if(utf8IsOwned) { + uprv_free(utf8); + } + } +} + +int32_t +UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { + int32_t length32=0; + if(U_SUCCESS(errorCode)) { + // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. + u_strToUTF32WithSub(utf32, capacity, &length32, + getBuffer(), length(), + 0xfffd, // Substitution character. + NULL, // Don't care about number of substitutions. + &errorCode); + } + return length32; +} + int32_t UnicodeString::indexOf(const UChar *srcChars, int32_t srcStart, @@ -755,11 +926,12 @@ UnicodeString::indexOf(const UChar *srcChars, pinIndices(start, length); // find the first occurrence of the substring - const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength); + const UChar *array = getArrayStart(); + const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); if(match == NULL) { return -1; } else { - return (int32_t)(match - fArray); + return (int32_t)(match - array); } } @@ -772,11 +944,12 @@ UnicodeString::doIndexOf(UChar c, pinIndices(start, length); // find the first occurrence of c - const UChar *match = u_memchr(fArray + start, c, length); + const UChar *array = getArrayStart(); + const UChar *match = u_memchr(array + start, c, length); if(match == NULL) { return -1; } else { - return (int32_t)(match - fArray); + return (int32_t)(match - array); } } @@ -788,11 +961,12 @@ UnicodeString::doIndexOf(UChar32 c, pinIndices(start, length); // find the first occurrence of c - const UChar *match = u_memchr32(fArray + start, c, length); + const UChar *array = getArrayStart(); + const UChar *match = u_memchr32(array + start, c, length); if(match == NULL) { return -1; } else { - return (int32_t)(match - fArray); + return (int32_t)(match - array); } } @@ -816,11 +990,12 @@ UnicodeString::lastIndexOf(const UChar *srcChars, pinIndices(start, length); // find the last occurrence of the substring - const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength); + const UChar *array = getArrayStart(); + const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); if(match == NULL) { return -1; } else { - return (int32_t)(match - fArray); + return (int32_t)(match - array); } } @@ -837,11 +1012,12 @@ UnicodeString::doLastIndexOf(UChar c, pinIndices(start, length); // find the last occurrence of c - const UChar *match = u_memrchr(fArray + start, c, length); + const UChar *array = getArrayStart(); + const UChar *match = u_memrchr(array + start, c, length); if(match == NULL) { return -1; } else { - return (int32_t)(match - fArray); + return (int32_t)(match - array); } } @@ -853,11 +1029,12 @@ UnicodeString::doLastIndexOf(UChar32 c, pinIndices(start, length); // find the last occurrence of c - const UChar *match = u_memrchr32(fArray + start, c, length); + const UChar *array = getArrayStart(); + const UChar *match = u_memrchr32(array + start, c, length); if(match == NULL) { return -1; } else { - return (int32_t)(match - fArray); + return (int32_t)(match - array); } } @@ -909,19 +1086,54 @@ UnicodeString::setToBogus() { releaseArray(); - fArray = 0; - fCapacity = fLength = 0; - fFlags = kIsBogus; + fUnion.fFields.fLengthAndFlags = kIsBogus; + fUnion.fFields.fArray = 0; + fUnion.fFields.fCapacity = 0; } // turn a bogus string into an empty one void UnicodeString::unBogus() { - if(fFlags & kIsBogus) { - fArray = fStackBuffer; - fLength = 0; - fCapacity = US_STACKBUF_SIZE; - fFlags = kShortString; + if(fUnion.fFields.fLengthAndFlags & kIsBogus) { + setToEmpty(); + } +} + +const UChar * +UnicodeString::getTerminatedBuffer() { + if(!isWritable()) { + return 0; + } + UChar *array = getArrayStart(); + int32_t len = length(); + if(len < getCapacity()) { + if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) { + // If len 0) { + int32_t len = length(); + if(cloneArrayIfNeeded() && len > 0) { if(offset < 0) { offset = 0; - } else if(offset >= fLength) { - offset = fLength - 1; + } else if(offset >= len) { + offset = len - 1; } - fArray[offset] = c; + getArrayStart()[offset] = c; } return *this; } +UnicodeString& +UnicodeString::replace(int32_t start, + int32_t _length, + UChar32 srcChar) { + UChar buffer[U16_MAX_LENGTH]; + int32_t count = 0; + UBool isError = FALSE; + U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); + // We test isError so that the compiler does not complain that we don't. + // If isError (srcChar is not a valid code point) then count==0 which means + // we remove the source segment rather than replacing it with srcChar. + return doReplace(start, _length, buffer, 0, isError ? 0 : count); +} + +UnicodeString& +UnicodeString::append(UChar32 srcChar) { + UChar buffer[U16_MAX_LENGTH]; + int32_t _length = 0; + UBool isError = FALSE; + U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); + // We test isError so that the compiler does not complain that we don't. + // If isError then _length==0 which turns the doReplace() into a no-op anyway. + return isError ? *this : doReplace(length(), 0, buffer, 0, _length); +} + UnicodeString& UnicodeString::doReplace( int32_t start, int32_t length, @@ -1054,10 +1304,32 @@ UnicodeString::doReplace(int32_t start, int32_t srcStart, int32_t srcLength) { - if(isBogus()) { + if(!isWritable()) { return *this; } + int32_t oldLength = this->length(); + + // optimize (read-only alias).remove(0, start) and .remove(start, end) + if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) { + if(start == 0) { + // remove prefix by adjusting the array pointer + pinIndex(length); + fUnion.fFields.fArray += length; + fUnion.fFields.fCapacity -= length; + setLength(oldLength - length); + return *this; + } else { + pinIndex(start); + if(length >= (oldLength - start)) { + // remove suffix by reducing the length (like truncate()) + setLength(start); + fUnion.fFields.fCapacity = start; // not NUL-terminated any more + return *this; + } + } + } + if(srcChars == 0) { srcStart = srcLength = 0; } else if(srcLength < 0) { @@ -1065,21 +1337,57 @@ UnicodeString::doReplace(int32_t start, srcLength = u_strlen(srcChars + srcStart); } - int32_t *bufferToDelete = 0; + // calculate the size of the string after the replace + int32_t newLength; - // the following may change fArray but will not copy the current contents; - // therefore we need to keep the current fArray - UChar *oldArray = fArray; - int32_t oldLength = fLength; + // optimize append() onto a large-enough, owned string + if(start >= oldLength) { + if(srcLength == 0) { + return *this; + } + newLength = oldLength + srcLength; + if(newLength <= getCapacity() && isBufferWritable()) { + UChar *oldArray = getArrayStart(); + // Do not copy characters when + // UChar *buffer=str.getAppendBuffer(...); + // is followed by + // str.append(buffer, length); + // or + // str.appendString(buffer, length) + // or similar. + if(srcChars + srcStart != oldArray + start || start > oldLength) { + us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength); + } + setLength(newLength); + return *this; + } else { + // pin the indices to legal values + start = oldLength; + length = 0; + } + } else { + // pin the indices to legal values + pinIndices(start, length); - // pin the indices to legal values - pinIndices(start, length); + newLength = oldLength - length + srcLength; + } - // calculate the size of the string after the replace - int32_t newSize = oldLength - length + srcLength; + // the following may change fArray but will not copy the current contents; + // therefore we need to keep the current fArray + UChar oldStackBuffer[US_STACKBUF_SIZE]; + UChar *oldArray; + if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { + // copy the stack buffer contents because it will be overwritten with + // fUnion.fFields values + u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength); + oldArray = oldStackBuffer; + } else { + oldArray = getArrayStart(); + } // clone our array and allocate a bigger array if needed - if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize, + int32_t *bufferToDelete = 0; + if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize, FALSE, &bufferToDelete) ) { return *this; @@ -1087,23 +1395,24 @@ UnicodeString::doReplace(int32_t start, // now do the replace - if(fArray != oldArray) { + UChar *newArray = getArrayStart(); + if(newArray != oldArray) { // if fArray changed, then we need to copy everything except what will change - us_arrayCopy(oldArray, 0, fArray, 0, start); + us_arrayCopy(oldArray, 0, newArray, 0, start); us_arrayCopy(oldArray, start + length, - fArray, start + srcLength, + newArray, start + srcLength, oldLength - (start + length)); } else if(length != srcLength) { // fArray did not change; copy only the portion that isn't changing, leaving a hole us_arrayCopy(oldArray, start + length, - fArray, start + srcLength, + newArray, start + srcLength, oldLength - (start + length)); } // now fill in the hole with the new string - us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength); + us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); - fLength = newSize; + setLength(newLength); // delayed delete in case srcChars == fArray when we started, and // to keep oldArray alive for the above operations @@ -1133,9 +1442,12 @@ UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { return; // Nothing to do; avoid bogus malloc call } UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); - extractBetween(start, limit, text, 0); - insert(dest, text, 0, limit - start); - uprv_free(text); + // Check to make sure text is not null. + if (text != NULL) { + extractBetween(start, limit, text, 0); + insert(dest, text, 0, limit - start); + uprv_free(text); + } } /** @@ -1156,35 +1468,40 @@ UBool UnicodeString::hasMetaData() const { } UnicodeString& -UnicodeString::doReverse(int32_t start, - int32_t length) -{ - if(fLength <= 1 || !cloneArrayIfNeeded()) { +UnicodeString::doReverse(int32_t start, int32_t length) { + if(length <= 1 || !cloneArrayIfNeeded()) { return *this; } // pin the indices to legal values pinIndices(start, length); + if(length <= 1) { // pinIndices() might have shrunk the length + return *this; + } UChar *left = getArrayStart() + start; - UChar *right = getArrayStart() + start + length; + UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) UChar swap; UBool hasSupplementary = FALSE; - while(left < --right) { - hasSupplementary |= (UBool)UTF_IS_LEAD(swap = *left); - hasSupplementary |= (UBool)UTF_IS_LEAD(*left++ = *right); - *right = swap; - } + // Before the loop we know left=2. + do { + hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); + hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); + *right-- = swap; + } while(left < right); + // Make sure to test the middle code unit of an odd-length string. + // Redundant if the length is even. + hasSupplementary |= (UBool)U16_IS_LEAD(*left); /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ if(hasSupplementary) { UChar swap2; left = getArrayStart() + start; - right = getArrayStart() + start + length - 1; // -1 so that we can look at *(left+1) if left= targetLength || !cloneArrayIfNeeded(targetLength)) { + int32_t oldLength = length(); + if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { return FALSE; } else { // move contents up by padding width - int32_t start = targetLength - fLength; - us_arrayCopy(fArray, 0, fArray, start, fLength); + UChar *array = getArrayStart(); + int32_t start = targetLength - oldLength; + us_arrayCopy(array, 0, array, start, oldLength); // fill in padding character while(--start >= 0) { - fArray[start] = padChar; + array[start] = padChar; } - fLength = targetLength; + setLength(targetLength); return TRUE; } } @@ -1220,15 +1539,17 @@ UBool UnicodeString::padTrailing(int32_t targetLength, UChar padChar) { - if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { + int32_t oldLength = length(); + if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { return FALSE; } else { // fill in padding character + UChar *array = getArrayStart(); int32_t length = targetLength; - while(--length >= fLength) { - fArray[length] = padChar; + while(--length >= oldLength) { + array[length] = padChar; } - fLength = targetLength; + setLength(targetLength); return TRUE; } } @@ -1241,7 +1562,7 @@ UnicodeString::doHashCode() const { /* Delegate hash computation to uhash. This makes UnicodeString * hashing consistent with UChar* hashing. */ - int32_t hashCode = uhash_hashUCharsN(getArrayStart(), fLength); + int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); if (hashCode == kInvalidHashCode) { hashCode = kEmptyHashCode; } @@ -1255,9 +1576,9 @@ UnicodeString::doHashCode() const UChar * UnicodeString::getBuffer(int32_t minCapacity) { if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { - fFlags|=kOpenGetBuffer; - fLength=0; - return fArray; + fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer; + setZeroLength(); + return getArrayStart(); } else { return 0; } @@ -1265,21 +1586,21 @@ UnicodeString::getBuffer(int32_t minCapacity) { void UnicodeString::releaseBuffer(int32_t newLength) { - if(fFlags&kOpenGetBuffer && newLength>=-1) { + if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) { // set the new fLength + int32_t capacity=getCapacity(); if(newLength==-1) { // the new length is the string length, capped by fCapacity - const UChar *p=fArray, *limit=fArray+fCapacity; + const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; while(pcapacity) { + newLength=capacity; } - fFlags&=~kOpenGetBuffer; + setLength(newLength); + fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer; } } @@ -1295,13 +1616,13 @@ UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, // default parameters need to be static, therefore // the defaults are -1 to have convenience defaults if(newCapacity == -1) { - newCapacity = fCapacity; + newCapacity = getCapacity(); } // while a getBuffer(minCapacity) is "open", // prevent any modifications of the string by returning FALSE here // if the string is bogus, then only an assignment or similar can revive it - if((fFlags&(kOpenGetBuffer|kIsBogus))!=0) { + if(!isWritable()) { return FALSE; } @@ -1313,62 +1634,159 @@ UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, * Return FALSE if memory could not be allocated. */ if(forceClone || - fFlags & kBufferIsReadonly || - fFlags & kRefCounted && refCount() > 1 || - newCapacity > fCapacity + fUnion.fFields.fLengthAndFlags & kBufferIsReadonly || + (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) || + newCapacity > getCapacity() ) { - // save old values - UChar *array = fArray; - uint16_t flags = fFlags; - // check growCapacity for default value and use of the stack buffer - if(growCapacity == -1) { + if(growCapacity < 0) { growCapacity = newCapacity; } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { growCapacity = US_STACKBUF_SIZE; } + // save old values + UChar oldStackBuffer[US_STACKBUF_SIZE]; + UChar *oldArray; + int32_t oldLength = length(); + int16_t flags = fUnion.fFields.fLengthAndFlags; + + if(flags&kUsingStackBuffer) { + U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ + if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { + // copy the stack buffer contents because it will be overwritten with + // fUnion.fFields values + us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength); + oldArray = oldStackBuffer; + } else { + oldArray = NULL; // no need to copy from the stack buffer to itself + } + } else { + oldArray = fUnion.fFields.fArray; + U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ + } + // allocate a new array if(allocate(growCapacity) || - newCapacity < growCapacity && allocate(newCapacity) + (newCapacity < growCapacity && allocate(newCapacity)) ) { if(doCopyArray) { // copy the contents // do not copy more than what fits - it may be smaller than before - if(fCapacity < fLength) { - fLength = fCapacity; + int32_t minLength = oldLength; + newCapacity = getCapacity(); + if(newCapacity < minLength) { + minLength = newCapacity; + } + if(oldArray != NULL) { + us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); } - us_arrayCopy(array, 0, fArray, 0, fLength); + setLength(minLength); } else { - fLength = 0; + setZeroLength(); } // release the old array if(flags & kRefCounted) { // the array is refCounted; decrement and release if 0 - int32_t *pRefCount = ((int32_t *)array - 1); + u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1); if(umtx_atomic_dec(pRefCount) == 0) { if(pBufferToDelete == 0) { - uprv_free(pRefCount); + // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t + // is defined as volatile. (Volatile has useful non-standard behavior + // with this compiler.) + uprv_free((void *)pRefCount); } else { // the caller requested to delete it himself - *pBufferToDelete = pRefCount; + *pBufferToDelete = (int32_t *)pRefCount; } } } } else { // not enough memory for growCapacity and not even for the smaller newCapacity // reset the old values for setToBogus() to release the array - fArray = array; - fFlags = flags; + if(!(flags&kUsingStackBuffer)) { + fUnion.fFields.fArray = oldArray; + } + fUnion.fFields.fLengthAndFlags = flags; setToBogus(); return FALSE; } } return TRUE; } + +// UnicodeStringAppendable ------------------------------------------------- *** + +UnicodeStringAppendable::~UnicodeStringAppendable() {} + +UBool +UnicodeStringAppendable::appendCodeUnit(UChar c) { + return str.doReplace(str.length(), 0, &c, 0, 1).isWritable(); +} + +UBool +UnicodeStringAppendable::appendCodePoint(UChar32 c) { + UChar buffer[U16_MAX_LENGTH]; + int32_t cLength = 0; + UBool isError = FALSE; + U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); + return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable(); +} + +UBool +UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { + return str.doReplace(str.length(), 0, s, 0, length).isWritable(); +} + +UBool +UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { + return str.cloneArrayIfNeeded(str.length() + appendCapacity); +} + +UChar * +UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, + int32_t desiredCapacityHint, + UChar *scratch, int32_t scratchCapacity, + int32_t *resultCapacity) { + if(minCapacity < 1 || scratchCapacity < minCapacity) { + *resultCapacity = 0; + return NULL; + } + int32_t oldLength = str.length(); + if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { + *resultCapacity = str.getCapacity() - oldLength; + return str.getArrayStart() + oldLength; + } + *resultCapacity = scratchCapacity; + return scratch; +} + U_NAMESPACE_END +U_NAMESPACE_USE + +U_CAPI int32_t U_EXPORT2 +uhash_hashUnicodeString(const UElement key) { + const UnicodeString *str = (const UnicodeString*) key.pointer; + return (str == NULL) ? 0 : str->hashCode(); +} + +// Moved here from uhash_us.cpp so that using a UVector of UnicodeString* +// does not depend on hashtable code. +U_CAPI UBool U_EXPORT2 +uhash_compareUnicodeString(const UElement key1, const UElement key2) { + const UnicodeString *str1 = (const UnicodeString*) key1.pointer; + const UnicodeString *str2 = (const UnicodeString*) key2.pointer; + if (str1 == str2) { + return TRUE; + } + if (str1 == NULL || str2 == NULL) { + return FALSE; + } + return *str1 == *str2; +} + #ifdef U_STATIC_IMPLEMENTATION /* This should never be called. It is defined here to make sure that the @@ -1378,8 +1796,6 @@ but defining it here makes sure that it is included with this object file. This makes sure that static library dependencies are kept to a minimum. */ static void uprv_UnicodeStringDummy(void) { - U_NAMESPACE_USE delete [] (new UnicodeString[2]); } #endif -