icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2016, International Business Machines Corporation and
   4 * others. All Rights Reserved.
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/appendable.h"
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf16.h"
  30 #include "uelement.h"
  31 #include "ustr_imp.h"
  32 #include "umutex.h"
  33 #include "uassert.h"
  34
  35 #if 0
  36
  37 #include <iostream>
  38 using namespace std;
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((icu::UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
  99 /* The Replaceable virtual destructor can't be defined in the header
 100    due to how AIX works with multiple definitions of virtual functions.
 101 */
 102 Replaceable::~Replaceable() {}
 103
 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef() {
 121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 122 }
 123
 124 int32_t
 125 UnicodeString::removeRef() {
 126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 127 }
 128
 129 int32_t
 130 UnicodeString::refCount() const {
 131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
 132 }
 133
 134 void
 135 UnicodeString::releaseArray() {
 136   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
 137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 138   }
 139 }
 140
 141
 142
 143 //========================================
 144 // Constructors
 145 //========================================
 146
 147 // The default constructor is inline in unistr.h.
 148
 149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
 150   fUnion.fFields.fLengthAndFlags = 0;
 151   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 152     // just allocate and do not do anything else
 153     allocate(capacity);
 154   } else {
 155     // count > 0, allocate and fill the new string with count c's
 156     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
 157     if(capacity < length) {
 158       capacity = length;
 159     }
 160     if(allocate(capacity)) {
 161       UChar *array = getArrayStart();
 162       int32_t i = 0;
 163
 164       // fill the new string with c
 165       if(unitCount == 1) {
 166         // fill with length UChars
 167         while(i < length) {
 168           array[i++] = (UChar)c;
 169         }
 170       } else {
 171         // get the code units for c
 172         UChar units[U16_MAX_LENGTH];
 173         U16_APPEND_UNSAFE(units, i, c);
 174
 175         // now it must be i==unitCount
 176         i = 0;
 177
 178         // for Unicode, unitCount can only be 1, 2, 3, or 4
 179         // 1 is handled above
 180         while(i < length) {
 181           int32_t unitIdx = 0;
 182           while(unitIdx < unitCount) {
 183             array[i++]=units[unitIdx++];
 184           }
 185         }
 186       }
 187     }
 188     setLength(length);
 189   }
 190 }
 191
 192 UnicodeString::UnicodeString(UChar ch) {
 193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
 194   fUnion.fStackFields.fBuffer[0] = ch;
 195 }
 196
 197 UnicodeString::UnicodeString(UChar32 ch) {
 198   fUnion.fFields.fLengthAndFlags = kShortString;
 199   int32_t i = 0;
 200   UBool isError = FALSE;
 201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
 202   // We test isError so that the compiler does not complain that we don't.
 203   // If isError then i==0 which is what we want anyway.
 204   if(!isError) {
 205     setShortLength(i);
 206   }
 207 }
 208
 209 UnicodeString::UnicodeString(const UChar *text) {
 210   fUnion.fFields.fLengthAndFlags = kShortString;
 211   doAppend(text, 0, -1);
 212 }
 213
 214 UnicodeString::UnicodeString(const UChar *text,
 215                              int32_t textLength) {
 216   fUnion.fFields.fLengthAndFlags = kShortString;
 217   doAppend(text, 0, textLength);
 218 }
 219
 220 UnicodeString::UnicodeString(UBool isTerminated,
 221                              const UChar *text,
 222                              int32_t textLength) {
 223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
 224   if(text == NULL) {
 225     // treat as an empty string, do not alias
 226     setToEmpty();
 227   } else if(textLength < -1 ||
 228             (textLength == -1 && !isTerminated) ||
 229             (textLength >= 0 && isTerminated && text[textLength] != 0)
 230   ) {
 231     setToBogus();
 232   } else {
 233     if(textLength == -1) {
 234       // text is terminated, or else it would have failed the above test
 235       textLength = u_strlen(text);
 236     }
 237     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 238   }
 239 }
 240
 241 UnicodeString::UnicodeString(UChar *buff,
 242                              int32_t buffLength,
 243                              int32_t buffCapacity) {
 244   fUnion.fFields.fLengthAndFlags = kWritableAlias;
 245   if(buff == NULL) {
 246     // treat as an empty string, do not alias
 247     setToEmpty();
 248   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 249     setToBogus();
 250   } else {
 251     if(buffLength == -1) {
 252       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 253       const UChar *p = buff, *limit = buff + buffCapacity;
 254       while(p != limit && *p != 0) {
 255         ++p;
 256       }
 257       buffLength = (int32_t)(p - buff);
 258     }
 259     setArray(buff, buffLength, buffCapacity);
 260   }
 261 }
 262
 263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
 264   fUnion.fFields.fLengthAndFlags = kShortString;
 265   if(src==NULL) {
 266     // treat as an empty string
 267   } else {
 268     if(length<0) {
 269       length=(int32_t)uprv_strlen(src);
 270     }
 271     if(cloneArrayIfNeeded(length, length, FALSE)) {
 272       u_charsToUChars(src, getArrayStart(), length);
 273       setLength(length);
 274     } else {
 275       setToBogus();
 276     }
 277   }
 278 }
 279
 280 #if U_CHARSET_IS_UTF8
 281
 282 UnicodeString::UnicodeString(const char *codepageData) {
 283   fUnion.fFields.fLengthAndFlags = kShortString;
 284   if(codepageData != 0) {
 285     setToUTF8(codepageData);
 286   }
 287 }
 288
 289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
 290   fUnion.fFields.fLengthAndFlags = kShortString;
 291   // if there's nothing to convert, do nothing
 292   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 293     return;
 294   }
 295   if(dataLength == -1) {
 296     dataLength = (int32_t)uprv_strlen(codepageData);
 297   }
 298   setToUTF8(StringPiece(codepageData, dataLength));
 299 }
 300
 301 // else see unistr_cnv.cpp
 302 #endif
 303
 304 UnicodeString::UnicodeString(const UnicodeString& that) {
 305   fUnion.fFields.fLengthAndFlags = kShortString;
 306   copyFrom(that);
 307 }
 308
 309 #if U_HAVE_RVALUE_REFERENCES
 310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
 311   fUnion.fFields.fLengthAndFlags = kShortString;
 312   moveFrom(src);
 313 }
 314 #endif
 315
 316 UnicodeString::UnicodeString(const UnicodeString& that,
 317                              int32_t srcStart) {
 318   fUnion.fFields.fLengthAndFlags = kShortString;
 319   setTo(that, srcStart);
 320 }
 321
 322 UnicodeString::UnicodeString(const UnicodeString& that,
 323                              int32_t srcStart,
 324                              int32_t srcLength) {
 325   fUnion.fFields.fLengthAndFlags = kShortString;
 326   setTo(that, srcStart, srcLength);
 327 }
 328
 329 // Replaceable base class clone() default implementation, does not clone
 330 Replaceable *
 331 Replaceable::clone() const {
 332   return NULL;
 333 }
 334
 335 // UnicodeString overrides clone() with a real implementation
 336 Replaceable *
 337 UnicodeString::clone() const {
 338   return new UnicodeString(*this);
 339 }
 340
 341 //========================================
 342 // array allocation
 343 //========================================
 344
 345 UBool
 346 UnicodeString::allocate(int32_t capacity) {
 347   if(capacity <= US_STACKBUF_SIZE) {
 348     fUnion.fFields.fLengthAndFlags = kShortString;
 349   } else {
 350     // count bytes for the refCounter and the string capacity, and
 351     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 352     // to be safely aligned for the refCount
 353     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
 354     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 355     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 356     if(array != 0) {
 357       // set initial refCount and point behind the refCount
 358       *array++ = 1;
 359
 360       // have fArray point to the first UChar
 361       fUnion.fFields.fArray = (UChar *)array;
 362       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 363       fUnion.fFields.fLengthAndFlags = kLongString;
 364     } else {
 365       fUnion.fFields.fLengthAndFlags = kIsBogus;
 366       fUnion.fFields.fArray = 0;
 367       fUnion.fFields.fCapacity = 0;
 368       return FALSE;
 369     }
 370   }
 371   return TRUE;
 372 }
 373
 374 //========================================
 375 // Destructor
 376 //========================================
 377
 378 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 379 static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
 380 static u_atomic_int32_t beyondCount(0);
 381
 382 U_CAPI void unistr_printLengths() {
 383   int32_t i;
 384   for(i = 0; i <= 59; ++i) {
 385     printf("%2d,  %9d\n", i, (int32_t)finalLengthCounts[i]);
 386   }
 387   int32_t beyond = beyondCount;
 388   for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
 389     beyond += finalLengthCounts[i];
 390   }
 391   printf(">59, %9d\n", beyond);
 392 }
 393 #endif
 394
 395 UnicodeString::~UnicodeString()
 396 {
 397 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 398   // Count lengths of strings at the end of their lifetime.
 399   // Useful for discussion of a desirable stack buffer size.
 400   // Count the contents length, not the optional NUL terminator nor further capacity.
 401   // Ignore open-buffer strings and strings which alias external storage.
 402   if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
 403     if(hasShortLength()) {
 404       umtx_atomic_inc(finalLengthCounts + getShortLength());
 405     } else {
 406       umtx_atomic_inc(&beyondCount);
 407     }
 408   }
 409 #endif
 410
 411   releaseArray();
 412 }
 413
 414 //========================================
 415 // Factory methods
 416 //========================================
 417
 418 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
 419   UnicodeString result;
 420   result.setToUTF8(utf8);
 421   return result;
 422 }
 423
 424 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 425   UnicodeString result;
 426   int32_t capacity;
 427   // Most UTF-32 strings will be BMP-only and result in a same-length
 428   // UTF-16 string. We overestimate the capacity just slightly,
 429   // just in case there are a few supplementary characters.
 430   if(length <= US_STACKBUF_SIZE) {
 431     capacity = US_STACKBUF_SIZE;
 432   } else {
 433     capacity = length + (length >> 4) + 4;
 434   }
 435   do {
 436     UChar *utf16 = result.getBuffer(capacity);
 437     int32_t length16;
 438     UErrorCode errorCode = U_ZERO_ERROR;
 439     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 440         utf32, length,
 441         0xfffd,  // Substitution character.
 442         NULL,    // Don't care about number of substitutions.
 443         &errorCode);
 444     result.releaseBuffer(length16);
 445     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 446       capacity = length16 + 1;  // +1 for the terminating NUL.
 447       continue;
 448     } else if(U_FAILURE(errorCode)) {
 449       result.setToBogus();
 450     }
 451     break;
 452   } while(TRUE);
 453   return result;
 454 }
 455
 456 //========================================
 457 // Assignment
 458 //========================================
 459
 460 UnicodeString &
 461 UnicodeString::operator=(const UnicodeString &src) {
 462   return copyFrom(src);
 463 }
 464
 465 UnicodeString &
 466 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 467   return copyFrom(src, TRUE);
 468 }
 469
 470 UnicodeString &
 471 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 472   // if assigning to ourselves, do nothing
 473   if(this == &src) {
 474     return *this;
 475   }
 476
 477   // is the right side bogus?
 478   if(src.isBogus()) {
 479     setToBogus();
 480     return *this;
 481   }
 482
 483   // delete the current contents
 484   releaseArray();
 485
 486   if(src.isEmpty()) {
 487     // empty string - use the stack buffer
 488     setToEmpty();
 489     return *this;
 490   }
 491
 492   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 493   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 494   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
 495   case kShortString:
 496     // short string using the stack buffer, do the same
 497     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 498                 getShortLength() * U_SIZEOF_UCHAR);
 499     break;
 500   case kLongString:
 501     // src uses a refCounted string buffer, use that buffer with refCount
 502     // src is const, use a cast - we don't actually change it
 503     ((UnicodeString &)src).addRef();
 504     // copy all fields, share the reference-counted buffer
 505     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 506     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 507     if(!hasShortLength()) {
 508       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 509     }
 510     break;
 511   case kReadonlyAlias:
 512     if(fastCopy) {
 513       // src is a readonly alias, do the same
 514       // -> maintain the readonly alias as such
 515       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 516       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 517       if(!hasShortLength()) {
 518         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 519       }
 520       break;
 521     }
 522     // else if(!fastCopy) fall through to case kWritableAlias
 523     // -> allocate a new buffer and copy the contents
 524     U_FALLTHROUGH;
 525   case kWritableAlias: {
 526     // src is a writable alias; we make a copy of that instead
 527     int32_t srcLength = src.length();
 528     if(allocate(srcLength)) {
 529       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
 530       setLength(srcLength);
 531       break;
 532     }
 533     // if there is not enough memory, then fall through to setting to bogus
 534     U_FALLTHROUGH;
 535   }
 536   default:
 537     // if src is bogus, set ourselves to bogus
 538     // do not call setToBogus() here because fArray and flags are not consistent here
 539     fUnion.fFields.fLengthAndFlags = kIsBogus;
 540     fUnion.fFields.fArray = 0;
 541     fUnion.fFields.fCapacity = 0;
 542     break;
 543   }
 544
 545   return *this;
 546 }
 547
 548 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
 549   // No explicit check for self move assignment, consistent with standard library.
 550   // Self move assignment causes no crash nor leak but might make the object bogus.
 551   releaseArray();
 552   copyFieldsFrom(src, TRUE);
 553   return *this;
 554 }
 555
 556 // Same as moveFrom() except without memory management.
 557 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
 558   int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 559   if(lengthAndFlags & kUsingStackBuffer) {
 560     // Short string using the stack buffer, copy the contents.
 561     // Check for self assignment to prevent "overlap in memcpy" warnings,
 562     // although it should be harmless to copy a buffer to itself exactly.
 563     if(this != &src) {
 564       uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 565                   getShortLength() * U_SIZEOF_UCHAR);
 566     }
 567   } else {
 568     // In all other cases, copy all fields.
 569     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 570     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 571     if(!hasShortLength()) {
 572       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 573     }
 574     if(setSrcToBogus) {
 575       // Set src to bogus without releasing any memory.
 576       src.fUnion.fFields.fLengthAndFlags = kIsBogus;
 577       src.fUnion.fFields.fArray = NULL;
 578       src.fUnion.fFields.fCapacity = 0;
 579     }
 580   }
 581 }
 582
 583 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
 584   UnicodeString temp;  // Empty short string: Known not to need releaseArray().
 585   // Copy fields without resetting source values in between.
 586   temp.copyFieldsFrom(*this, FALSE);
 587   this->copyFieldsFrom(other, FALSE);
 588   other.copyFieldsFrom(temp, FALSE);
 589   // Set temp to an empty string so that other's memory is not released twice.
 590   temp.fUnion.fFields.fLengthAndFlags = kShortString;
 591 }
 592
 593 //========================================
 594 // Miscellaneous operations
 595 //========================================
 596
 597 UnicodeString UnicodeString::unescape() const {
 598     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 599     if (result.isBogus()) {
 600         return result;
 601     }
 602     const UChar *array = getBuffer();
 603     int32_t len = length();
 604     int32_t prev = 0;
 605     for (int32_t i=0;;) {
 606         if (i == len) {
 607             result.append(array, prev, len - prev);
 608             break;
 609         }
 610         if (array[i++] == 0x5C /*'\\'*/) {
 611             result.append(array, prev, (i - 1) - prev);
 612             UChar32 c = unescapeAt(i); // advances i
 613             if (c < 0) {
 614                 result.remove(); // return empty string
 615                 break; // invalid escape sequence
 616             }
 617             result.append(c);
 618             prev = i;
 619         }
 620     }
 621     return result;
 622 }
 623
 624 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 625     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 626 }
 627
 628 //========================================
 629 // Read-only implementation
 630 //========================================
 631 UBool
 632 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
 633   // Requires: this & text not bogus and have same lengths.
 634   // Byte-wise comparison works for equality regardless of endianness.
 635   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
 636 }
 637
 638 int8_t
 639 UnicodeString::doCompare( int32_t start,
 640               int32_t length,
 641               const UChar *srcChars,
 642               int32_t srcStart,
 643               int32_t srcLength) const
 644 {
 645   // compare illegal string values
 646   if(isBogus()) {
 647     return -1;
 648   }
 649
 650   // pin indices to legal values
 651   pinIndices(start, length);
 652
 653   if(srcChars == NULL) {
 654     // treat const UChar *srcChars==NULL as an empty string
 655     return length == 0 ? 0 : 1;
 656   }
 657
 658   // get the correct pointer
 659   const UChar *chars = getArrayStart();
 660
 661   chars += start;
 662   srcChars += srcStart;
 663
 664   int32_t minLength;
 665   int8_t lengthResult;
 666
 667   // get the srcLength if necessary
 668   if(srcLength < 0) {
 669     srcLength = u_strlen(srcChars + srcStart);
 670   }
 671
 672   // are we comparing different lengths?
 673   if(length != srcLength) {
 674     if(length < srcLength) {
 675       minLength = length;
 676       lengthResult = -1;
 677     } else {
 678       minLength = srcLength;
 679       lengthResult = 1;
 680     }
 681   } else {
 682     minLength = length;
 683     lengthResult = 0;
 684   }
 685
 686   /*
 687    * note that uprv_memcmp() returns an int but we return an int8_t;
 688    * we need to take care not to truncate the result -
 689    * one way to do this is to right-shift the value to
 690    * move the sign bit into the lower 8 bits and making sure that this
 691    * does not become 0 itself
 692    */
 693
 694   if(minLength > 0 && chars != srcChars) {
 695     int32_t result;
 696
 697 #   if U_IS_BIG_ENDIAN
 698       // big-endian: byte comparison works
 699       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 700       if(result != 0) {
 701         return (int8_t)(result >> 15 | 1);
 702       }
 703 #   else
 704       // little-endian: compare UChar units
 705       do {
 706         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 707         if(result != 0) {
 708           return (int8_t)(result >> 15 | 1);
 709         }
 710       } while(--minLength > 0);
 711 #   endif
 712   }
 713   return lengthResult;
 714 }
 715
 716 /* String compare in code point order - doCompare() compares in code unit order. */
 717 int8_t
 718 UnicodeString::doCompareCodePointOrder(int32_t start,
 719                                        int32_t length,
 720                                        const UChar *srcChars,
 721                                        int32_t srcStart,
 722                                        int32_t srcLength) const
 723 {
 724   // compare illegal string values
 725   // treat const UChar *srcChars==NULL as an empty string
 726   if(isBogus()) {
 727     return -1;
 728   }
 729
 730   // pin indices to legal values
 731   pinIndices(start, length);
 732
 733   if(srcChars == NULL) {
 734     srcStart = srcLength = 0;
 735   }
 736
 737   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 738   /* translate the 32-bit result into an 8-bit one */
 739   if(diff!=0) {
 740     return (int8_t)(diff >> 15 | 1);
 741   } else {
 742     return 0;
 743   }
 744 }
 745
 746 int32_t
 747 UnicodeString::getLength() const {
 748     return length();
 749 }
 750
 751 UChar
 752 UnicodeString::getCharAt(int32_t offset) const {
 753   return charAt(offset);
 754 }
 755
 756 UChar32
 757 UnicodeString::getChar32At(int32_t offset) const {
 758   return char32At(offset);
 759 }
 760
 761 UChar32
 762 UnicodeString::char32At(int32_t offset) const
 763 {
 764   int32_t len = length();
 765   if((uint32_t)offset < (uint32_t)len) {
 766     const UChar *array = getArrayStart();
 767     UChar32 c;
 768     U16_GET(array, 0, offset, len, c);
 769     return c;
 770   } else {
 771     return kInvalidUChar;
 772   }
 773 }
 774
 775 int32_t
 776 UnicodeString::getChar32Start(int32_t offset) const {
 777   if((uint32_t)offset < (uint32_t)length()) {
 778     const UChar *array = getArrayStart();
 779     U16_SET_CP_START(array, 0, offset);
 780     return offset;
 781   } else {
 782     return 0;
 783   }
 784 }
 785
 786 int32_t
 787 UnicodeString::getChar32Limit(int32_t offset) const {
 788   int32_t len = length();
 789   if((uint32_t)offset < (uint32_t)len) {
 790     const UChar *array = getArrayStart();
 791     U16_SET_CP_LIMIT(array, 0, offset, len);
 792     return offset;
 793   } else {
 794     return len;
 795   }
 796 }
 797
 798 int32_t
 799 UnicodeString::countChar32(int32_t start, int32_t length) const {
 800   pinIndices(start, length);
 801   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 802   return u_countChar32(getArrayStart()+start, length);
 803 }
 804
 805 UBool
 806 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 807   pinIndices(start, length);
 808   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 809   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 810 }
 811
 812 int32_t
 813 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 814   // pin index
 815   int32_t len = length();
 816   if(index<0) {
 817     index=0;
 818   } else if(index>len) {
 819     index=len;
 820   }
 821
 822   const UChar *array = getArrayStart();
 823   if(delta>0) {
 824     U16_FWD_N(array, index, len, delta);
 825   } else {
 826     U16_BACK_N(array, 0, index, -delta);
 827   }
 828
 829   return index;
 830 }
 831
 832 void
 833 UnicodeString::doExtract(int32_t start,
 834              int32_t length,
 835              UChar *dst,
 836              int32_t dstStart) const
 837 {
 838   // pin indices to legal values
 839   pinIndices(start, length);
 840
 841   // do not copy anything if we alias dst itself
 842   const UChar *array = getArrayStart();
 843   if(array + start != dst + dstStart) {
 844     us_arrayCopy(array, start, dst, dstStart, length);
 845   }
 846 }
 847
 848 int32_t
 849 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 850                        UErrorCode &errorCode) const {
 851   int32_t len = length();
 852   if(U_SUCCESS(errorCode)) {
 853     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 854       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 855     } else {
 856       const UChar *array = getArrayStart();
 857       if(len>0 && len<=destCapacity && array!=dest) {
 858         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
 859       }
 860       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 861     }
 862   }
 863
 864   return len;
 865 }
 866
 867 int32_t
 868 UnicodeString::extract(int32_t start,
 869                        int32_t length,
 870                        char *target,
 871                        int32_t targetCapacity,
 872                        enum EInvariant) const
 873 {
 874   // if the arguments are illegal, then do nothing
 875   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 876     return 0;
 877   }
 878
 879   // pin the indices to legal values
 880   pinIndices(start, length);
 881
 882   if(length <= targetCapacity) {
 883     u_UCharsToChars(getArrayStart() + start, target, length);
 884   }
 885   UErrorCode status = U_ZERO_ERROR;
 886   return u_terminateChars(target, targetCapacity, length, &status);
 887 }
 888
 889 UnicodeString
 890 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 891   pinIndices(start, len);
 892   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 893   if(array==NULL) {
 894     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
 895     len=-2;  // bogus result string
 896   }
 897   return UnicodeString(FALSE, array + start, len);
 898 }
 899
 900 int32_t
 901 UnicodeString::toUTF8(int32_t start, int32_t len,
 902                       char *target, int32_t capacity) const {
 903   pinIndices(start, len);
 904   int32_t length8;
 905   UErrorCode errorCode = U_ZERO_ERROR;
 906   u_strToUTF8WithSub(target, capacity, &length8,
 907                      getBuffer() + start, len,
 908                      0xFFFD,  // Standard substitution character.
 909                      NULL,    // Don't care about number of substitutions.
 910                      &errorCode);
 911   return length8;
 912 }
 913
 914 #if U_CHARSET_IS_UTF8
 915
 916 int32_t
 917 UnicodeString::extract(int32_t start, int32_t len,
 918                        char *target, uint32_t dstSize) const {
 919   // if the arguments are illegal, then do nothing
 920   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 921     return 0;
 922   }
 923   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 924 }
 925
 926 // else see unistr_cnv.cpp
 927 #endif
 928
 929 void
 930 UnicodeString::extractBetween(int32_t start,
 931                   int32_t limit,
 932                   UnicodeString& target) const {
 933   pinIndex(start);
 934   pinIndex(limit);
 935   doExtract(start, limit - start, target);
 936 }
 937
 938 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 939 // as many bytes as the source has UChars.
 940 // The "worst cases" are writing systems like Indic, Thai and CJK with
 941 // 3:1 bytes:UChars.
 942 void
 943 UnicodeString::toUTF8(ByteSink &sink) const {
 944   int32_t length16 = length();
 945   if(length16 != 0) {
 946     char stackBuffer[1024];
 947     int32_t capacity = (int32_t)sizeof(stackBuffer);
 948     UBool utf8IsOwned = FALSE;
 949     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 950                                       3*length16,
 951                                       stackBuffer, capacity,
 952                                       &capacity);
 953     int32_t length8 = 0;
 954     UErrorCode errorCode = U_ZERO_ERROR;
 955     u_strToUTF8WithSub(utf8, capacity, &length8,
 956                        getBuffer(), length16,
 957                        0xFFFD,  // Standard substitution character.
 958                        NULL,    // Don't care about number of substitutions.
 959                        &errorCode);
 960     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 961       utf8 = (char *)uprv_malloc(length8);
 962       if(utf8 != NULL) {
 963         utf8IsOwned = TRUE;
 964         errorCode = U_ZERO_ERROR;
 965         u_strToUTF8WithSub(utf8, length8, &length8,
 966                            getBuffer(), length16,
 967                            0xFFFD,  // Standard substitution character.
 968                            NULL,    // Don't care about number of substitutions.
 969                            &errorCode);
 970       } else {
 971         errorCode = U_MEMORY_ALLOCATION_ERROR;
 972       }
 973     }
 974     if(U_SUCCESS(errorCode)) {
 975       sink.Append(utf8, length8);
 976       sink.Flush();
 977     }
 978     if(utf8IsOwned) {
 979       uprv_free(utf8);
 980     }
 981   }
 982 }
 983
 984 int32_t
 985 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
 986   int32_t length32=0;
 987   if(U_SUCCESS(errorCode)) {
 988     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
 989     u_strToUTF32WithSub(utf32, capacity, &length32,
 990         getBuffer(), length(),
 991         0xfffd,  // Substitution character.
 992         NULL,    // Don't care about number of substitutions.
 993         &errorCode);
 994   }
 995   return length32;
 996 }
 997
 998 int32_t
 999 UnicodeString::indexOf(const UChar *srcChars,
1000                int32_t srcStart,
1001                int32_t srcLength,
1002                int32_t start,
1003                int32_t length) const
1004 {
1005   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1006     return -1;
1007   }
1008
1009   // UnicodeString does not find empty substrings
1010   if(srcLength < 0 && srcChars[srcStart] == 0) {
1011     return -1;
1012   }
1013
1014   // get the indices within bounds
1015   pinIndices(start, length);
1016
1017   // find the first occurrence of the substring
1018   const UChar *array = getArrayStart();
1019   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1020   if(match == NULL) {
1021     return -1;
1022   } else {
1023     return (int32_t)(match - array);
1024   }
1025 }
1026
1027 int32_t
1028 UnicodeString::doIndexOf(UChar c,
1029              int32_t start,
1030              int32_t length) const
1031 {
1032   // pin indices
1033   pinIndices(start, length);
1034
1035   // find the first occurrence of c
1036   const UChar *array = getArrayStart();
1037   const UChar *match = u_memchr(array + start, c, length);
1038   if(match == NULL) {
1039     return -1;
1040   } else {
1041     return (int32_t)(match - array);
1042   }
1043 }
1044
1045 int32_t
1046 UnicodeString::doIndexOf(UChar32 c,
1047                          int32_t start,
1048                          int32_t length) const {
1049   // pin indices
1050   pinIndices(start, length);
1051
1052   // find the first occurrence of c
1053   const UChar *array = getArrayStart();
1054   const UChar *match = u_memchr32(array + start, c, length);
1055   if(match == NULL) {
1056     return -1;
1057   } else {
1058     return (int32_t)(match - array);
1059   }
1060 }
1061
1062 int32_t
1063 UnicodeString::lastIndexOf(const UChar *srcChars,
1064                int32_t srcStart,
1065                int32_t srcLength,
1066                int32_t start,
1067                int32_t length) const
1068 {
1069   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1070     return -1;
1071   }
1072
1073   // UnicodeString does not find empty substrings
1074   if(srcLength < 0 && srcChars[srcStart] == 0) {
1075     return -1;
1076   }
1077
1078   // get the indices within bounds
1079   pinIndices(start, length);
1080
1081   // find the last occurrence of the substring
1082   const UChar *array = getArrayStart();
1083   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1084   if(match == NULL) {
1085     return -1;
1086   } else {
1087     return (int32_t)(match - array);
1088   }
1089 }
1090
1091 int32_t
1092 UnicodeString::doLastIndexOf(UChar c,
1093                  int32_t start,
1094                  int32_t length) const
1095 {
1096   if(isBogus()) {
1097     return -1;
1098   }
1099
1100   // pin indices
1101   pinIndices(start, length);
1102
1103   // find the last occurrence of c
1104   const UChar *array = getArrayStart();
1105   const UChar *match = u_memrchr(array + start, c, length);
1106   if(match == NULL) {
1107     return -1;
1108   } else {
1109     return (int32_t)(match - array);
1110   }
1111 }
1112
1113 int32_t
1114 UnicodeString::doLastIndexOf(UChar32 c,
1115                              int32_t start,
1116                              int32_t length) const {
1117   // pin indices
1118   pinIndices(start, length);
1119
1120   // find the last occurrence of c
1121   const UChar *array = getArrayStart();
1122   const UChar *match = u_memrchr32(array + start, c, length);
1123   if(match == NULL) {
1124     return -1;
1125   } else {
1126     return (int32_t)(match - array);
1127   }
1128 }
1129
1130 //========================================
1131 // Write implementation
1132 //========================================
1133
1134 UnicodeString&
1135 UnicodeString::findAndReplace(int32_t start,
1136                   int32_t length,
1137                   const UnicodeString& oldText,
1138                   int32_t oldStart,
1139                   int32_t oldLength,
1140                   const UnicodeString& newText,
1141                   int32_t newStart,
1142                   int32_t newLength)
1143 {
1144   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1145     return *this;
1146   }
1147
1148   pinIndices(start, length);
1149   oldText.pinIndices(oldStart, oldLength);
1150   newText.pinIndices(newStart, newLength);
1151
1152   if(oldLength == 0) {
1153     return *this;
1154   }
1155
1156   while(length > 0 && length >= oldLength) {
1157     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1158     if(pos < 0) {
1159       // no more oldText's here: done
1160       break;
1161     } else {
1162       // we found oldText, replace it by newText and go beyond it
1163       replace(pos, oldLength, newText, newStart, newLength);
1164       length -= pos + oldLength - start;
1165       start = pos + newLength;
1166     }
1167   }
1168
1169   return *this;
1170 }
1171
1172
1173 void
1174 UnicodeString::setToBogus()
1175 {
1176   releaseArray();
1177
1178   fUnion.fFields.fLengthAndFlags = kIsBogus;
1179   fUnion.fFields.fArray = 0;
1180   fUnion.fFields.fCapacity = 0;
1181 }
1182
1183 // turn a bogus string into an empty one
1184 void
1185 UnicodeString::unBogus() {
1186   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1187     setToEmpty();
1188   }
1189 }
1190
1191 const UChar *
1192 UnicodeString::getTerminatedBuffer() {
1193   if(!isWritable()) {
1194     return 0;
1195   }
1196   UChar *array = getArrayStart();
1197   int32_t len = length();
1198   if(len < getCapacity()) {
1199     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1200       // If len<capacity on a read-only alias, then array[len] is
1201       // either the original NUL (if constructed with (TRUE, s, length))
1202       // or one of the original string contents characters (if later truncated),
1203       // therefore we can assume that array[len] is initialized memory.
1204       if(array[len] == 0) {
1205         return array;
1206       }
1207     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1208       // kRefCounted: Do not write the NUL if the buffer is shared.
1209       // That is mostly safe, except when the length of one copy was modified
1210       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1211       // Then the NUL would be written into the middle of another copy's string.
1212
1213       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1214       // Do not test if there is a NUL already because it might be uninitialized memory.
1215       // (That would be safe, but tools like valgrind & Purify would complain.)
1216       array[len] = 0;
1217       return array;
1218     }
1219   }
1220   if(cloneArrayIfNeeded(len+1)) {
1221     array = getArrayStart();
1222     array[len] = 0;
1223     return array;
1224   } else {
1225     return NULL;
1226   }
1227 }
1228
1229 // setTo() analogous to the readonly-aliasing constructor with the same signature
1230 UnicodeString &
1231 UnicodeString::setTo(UBool isTerminated,
1232                      const UChar *text,
1233                      int32_t textLength)
1234 {
1235   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1236     // do not modify a string that has an "open" getBuffer(minCapacity)
1237     return *this;
1238   }
1239
1240   if(text == NULL) {
1241     // treat as an empty string, do not alias
1242     releaseArray();
1243     setToEmpty();
1244     return *this;
1245   }
1246
1247   if( textLength < -1 ||
1248       (textLength == -1 && !isTerminated) ||
1249       (textLength >= 0 && isTerminated && text[textLength] != 0)
1250   ) {
1251     setToBogus();
1252     return *this;
1253   }
1254
1255   releaseArray();
1256
1257   if(textLength == -1) {
1258     // text is terminated, or else it would have failed the above test
1259     textLength = u_strlen(text);
1260   }
1261   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1262   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1263   return *this;
1264 }
1265
1266 // setTo() analogous to the writable-aliasing constructor with the same signature
1267 UnicodeString &
1268 UnicodeString::setTo(UChar *buffer,
1269                      int32_t buffLength,
1270                      int32_t buffCapacity) {
1271   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1272     // do not modify a string that has an "open" getBuffer(minCapacity)
1273     return *this;
1274   }
1275
1276   if(buffer == NULL) {
1277     // treat as an empty string, do not alias
1278     releaseArray();
1279     setToEmpty();
1280     return *this;
1281   }
1282
1283   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1284     setToBogus();
1285     return *this;
1286   } else if(buffLength == -1) {
1287     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1288     const UChar *p = buffer, *limit = buffer + buffCapacity;
1289     while(p != limit && *p != 0) {
1290       ++p;
1291     }
1292     buffLength = (int32_t)(p - buffer);
1293   }
1294
1295   releaseArray();
1296
1297   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1298   setArray(buffer, buffLength, buffCapacity);
1299   return *this;
1300 }
1301
1302 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1303   unBogus();
1304   int32_t length = utf8.length();
1305   int32_t capacity;
1306   // The UTF-16 string will be at most as long as the UTF-8 string.
1307   if(length <= US_STACKBUF_SIZE) {
1308     capacity = US_STACKBUF_SIZE;
1309   } else {
1310     capacity = length + 1;  // +1 for the terminating NUL.
1311   }
1312   UChar *utf16 = getBuffer(capacity);
1313   int32_t length16;
1314   UErrorCode errorCode = U_ZERO_ERROR;
1315   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1316       utf8.data(), length,
1317       0xfffd,  // Substitution character.
1318       NULL,    // Don't care about number of substitutions.
1319       &errorCode);
1320   releaseBuffer(length16);
1321   if(U_FAILURE(errorCode)) {
1322     setToBogus();
1323   }
1324   return *this;
1325 }
1326
1327 UnicodeString&
1328 UnicodeString::setCharAt(int32_t offset,
1329              UChar c)
1330 {
1331   int32_t len = length();
1332   if(cloneArrayIfNeeded() && len > 0) {
1333     if(offset < 0) {
1334       offset = 0;
1335     } else if(offset >= len) {
1336       offset = len - 1;
1337     }
1338
1339     getArrayStart()[offset] = c;
1340   }
1341   return *this;
1342 }
1343
1344 UnicodeString&
1345 UnicodeString::replace(int32_t start,
1346                int32_t _length,
1347                UChar32 srcChar) {
1348   UChar buffer[U16_MAX_LENGTH];
1349   int32_t count = 0;
1350   UBool isError = FALSE;
1351   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1352   // We test isError so that the compiler does not complain that we don't.
1353   // If isError (srcChar is not a valid code point) then count==0 which means
1354   // we remove the source segment rather than replacing it with srcChar.
1355   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1356 }
1357
1358 UnicodeString&
1359 UnicodeString::append(UChar32 srcChar) {
1360   UChar buffer[U16_MAX_LENGTH];
1361   int32_t _length = 0;
1362   UBool isError = FALSE;
1363   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1364   // We test isError so that the compiler does not complain that we don't.
1365   // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1366   return isError ? *this : doAppend(buffer, 0, _length);
1367 }
1368
1369 UnicodeString&
1370 UnicodeString::doReplace( int32_t start,
1371               int32_t length,
1372               const UnicodeString& src,
1373               int32_t srcStart,
1374               int32_t srcLength)
1375 {
1376   // pin the indices to legal values
1377   src.pinIndices(srcStart, srcLength);
1378
1379   // get the characters from src
1380   // and replace the range in ourselves with them
1381   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1382 }
1383
1384 UnicodeString&
1385 UnicodeString::doReplace(int32_t start,
1386              int32_t length,
1387              const UChar *srcChars,
1388              int32_t srcStart,
1389              int32_t srcLength)
1390 {
1391   if(!isWritable()) {
1392     return *this;
1393   }
1394
1395   int32_t oldLength = this->length();
1396
1397   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1398   if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1399     if(start == 0) {
1400       // remove prefix by adjusting the array pointer
1401       pinIndex(length);
1402       fUnion.fFields.fArray += length;
1403       fUnion.fFields.fCapacity -= length;
1404       setLength(oldLength - length);
1405       return *this;
1406     } else {
1407       pinIndex(start);
1408       if(length >= (oldLength - start)) {
1409         // remove suffix by reducing the length (like truncate())
1410         setLength(start);
1411         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1412         return *this;
1413       }
1414     }
1415   }
1416
1417   if(start == oldLength) {
1418     return doAppend(srcChars, srcStart, srcLength);
1419   }
1420
1421   if(srcChars == 0) {
1422     srcStart = srcLength = 0;
1423   } else if(srcLength < 0) {
1424     // get the srcLength if necessary
1425     srcLength = u_strlen(srcChars + srcStart);
1426   }
1427
1428   // pin the indices to legal values
1429   pinIndices(start, length);
1430
1431   // calculate the size of the string after the replace
1432   int32_t newLength = oldLength - length + srcLength;
1433
1434   // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1435   // therefore we need to keep the current fArray
1436   UChar oldStackBuffer[US_STACKBUF_SIZE];
1437   UChar *oldArray;
1438   if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1439     // copy the stack buffer contents because it will be overwritten with
1440     // fUnion.fFields values
1441     u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1442     oldArray = oldStackBuffer;
1443   } else {
1444     oldArray = getArrayStart();
1445   }
1446
1447   // clone our array and allocate a bigger array if needed
1448   int32_t *bufferToDelete = 0;
1449   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1450                          FALSE, &bufferToDelete)
1451   ) {
1452     return *this;
1453   }
1454
1455   // now do the replace
1456
1457   UChar *newArray = getArrayStart();
1458   if(newArray != oldArray) {
1459     // if fArray changed, then we need to copy everything except what will change
1460     us_arrayCopy(oldArray, 0, newArray, 0, start);
1461     us_arrayCopy(oldArray, start + length,
1462                  newArray, start + srcLength,
1463                  oldLength - (start + length));
1464   } else if(length != srcLength) {
1465     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1466     us_arrayCopy(oldArray, start + length,
1467                  newArray, start + srcLength,
1468                  oldLength - (start + length));
1469   }
1470
1471   // now fill in the hole with the new string
1472   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1473
1474   setLength(newLength);
1475
1476   // delayed delete in case srcChars == fArray when we started, and
1477   // to keep oldArray alive for the above operations
1478   if (bufferToDelete) {
1479     uprv_free(bufferToDelete);
1480   }
1481
1482   return *this;
1483 }
1484
1485 // Versions of doReplace() only for append() variants.
1486 // doReplace() and doAppend() optimize for different cases.
1487
1488 UnicodeString&
1489 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1490   if(srcLength == 0) {
1491     return *this;
1492   }
1493
1494   // pin the indices to legal values
1495   src.pinIndices(srcStart, srcLength);
1496   return doAppend(src.getArrayStart(), srcStart, srcLength);
1497 }
1498
1499 UnicodeString&
1500 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1501   if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1502     return *this;
1503   }
1504
1505   if(srcLength < 0) {
1506     // get the srcLength if necessary
1507     if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
1508       return *this;
1509     }
1510   }
1511
1512   int32_t oldLength = length();
1513   int32_t newLength = oldLength + srcLength;
1514   // optimize append() onto a large-enough, owned string
1515   if((newLength <= getCapacity() && isBufferWritable()) ||
1516       cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize)) {
1517     UChar *newArray = getArrayStart();
1518     // Do not copy characters when
1519     //   UChar *buffer=str.getAppendBuffer(...);
1520     // is followed by
1521     //   str.append(buffer, length);
1522     // or
1523     //   str.appendString(buffer, length)
1524     // or similar.
1525     if(srcChars + srcStart != newArray + oldLength) {
1526       us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
1527     }
1528     setLength(newLength);
1529   }
1530   return *this;
1531 }
1532
/**
 * Replaceable API
 *
 * Forwards start, limit and text unchanged to replaceBetween().
 */
void
UnicodeString::handleReplaceBetween(int32_t start,
                                    int32_t limit,
                                    const UnicodeString& text) {
    replaceBetween(start, limit, text);
}
1542
1543 /**
1544  * Replaceable API
1545  */
1546 void
1547 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1548     if (limit <= start) {
1549         return; // Nothing to do; avoid bogus malloc call
1550     }
1551     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1552     // Check to make sure text is not null.
1553     if (text != NULL) {
1554             extractBetween(start, limit, text, 0);
1555             insert(dest, text, 0, limit - start);
1556             uprv_free(text);
1557     }
1558 }
1559
/**
 * Replaceable API
 *
 * Implementation for the Replaceable class itself: always returns TRUE.
 *
 * NOTE: This is for the Replaceable class.  There is no rep.cpp,
 * so we implement this function here.
 */
UBool Replaceable::hasMetaData() const {
    return TRUE;
}
1569
/**
 * Replaceable API
 *
 * Implementation for UnicodeString: always returns FALSE.
 */
UBool UnicodeString::hasMetaData() const {
    return FALSE;
}
1576
1577 UnicodeString&
1578 UnicodeString::doReverse(int32_t start, int32_t length) {
1579   if(length <= 1 || !cloneArrayIfNeeded()) {
1580     return *this;
1581   }
1582
1583   // pin the indices to legal values
1584   pinIndices(start, length);
1585   if(length <= 1) {  // pinIndices() might have shrunk the length
1586     return *this;
1587   }
1588
1589   UChar *left = getArrayStart() + start;
1590   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1591   UChar swap;
1592   UBool hasSupplementary = FALSE;
1593
1594   // Before the loop we know left<right because length>=2.
1595   do {
1596     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1597     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1598     *right-- = swap;
1599   } while(left < right);
1600   // Make sure to test the middle code unit of an odd-length string.
1601   // Redundant if the length is even.
1602   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1603
1604   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1605   if(hasSupplementary) {
1606     UChar swap2;
1607
1608     left = getArrayStart() + start;
1609     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1610     while(left < right) {
1611       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1612         *left++ = swap2;
1613         *left++ = swap;
1614       } else {
1615         ++left;
1616       }
1617     }
1618   }
1619
1620   return *this;
1621 }
1622
1623 UBool
1624 UnicodeString::padLeading(int32_t targetLength,
1625                           UChar padChar)
1626 {
1627   int32_t oldLength = length();
1628   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1629     return FALSE;
1630   } else {
1631     // move contents up by padding width
1632     UChar *array = getArrayStart();
1633     int32_t start = targetLength - oldLength;
1634     us_arrayCopy(array, 0, array, start, oldLength);
1635
1636     // fill in padding character
1637     while(--start >= 0) {
1638       array[start] = padChar;
1639     }
1640     setLength(targetLength);
1641     return TRUE;
1642   }
1643 }
1644
1645 UBool
1646 UnicodeString::padTrailing(int32_t targetLength,
1647                            UChar padChar)
1648 {
1649   int32_t oldLength = length();
1650   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1651     return FALSE;
1652   } else {
1653     // fill in padding character
1654     UChar *array = getArrayStart();
1655     int32_t length = targetLength;
1656     while(--length >= oldLength) {
1657       array[length] = padChar;
1658     }
1659     setLength(targetLength);
1660     return TRUE;
1661   }
1662 }
1663
1664 //========================================
1665 // Hashing
1666 //========================================
1667 int32_t
1668 UnicodeString::doHashCode() const
1669 {
1670     /* Delegate hash computation to uhash.  This makes UnicodeString
1671      * hashing consistent with UChar* hashing.  */
1672     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1673     if (hashCode == kInvalidHashCode) {
1674         hashCode = kEmptyHashCode;
1675     }
1676     return hashCode;
1677 }
1678
1679 //========================================
1680 // External Buffer
1681 //========================================
1682
1683 UChar *
1684 UnicodeString::getBuffer(int32_t minCapacity) {
1685   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1686     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1687     setZeroLength();
1688     return getArrayStart();
1689   } else {
1690     return 0;
1691   }
1692 }
1693
1694 void
1695 UnicodeString::releaseBuffer(int32_t newLength) {
1696   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1697     // set the new fLength
1698     int32_t capacity=getCapacity();
1699     if(newLength==-1) {
1700       // the new length is the string length, capped by fCapacity
1701       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1702       while(p<limit && *p!=0) {
1703         ++p;
1704       }
1705       newLength=(int32_t)(p-array);
1706     } else if(newLength>capacity) {
1707       newLength=capacity;
1708     }
1709     setLength(newLength);
1710     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1711   }
1712 }
1713
1714 //========================================
1715 // Miscellaneous
1716 //========================================
// Make sure that this string has its own writable array with room for at
// least newCapacity code units, allocating a new array when necessary.
//
//   newCapacity     - required capacity; -1 means the current capacity.
//   growCapacity    - capacity to try first when allocating; a negative value
//                     means newCapacity. It is capped to US_STACKBUF_SIZE when
//                     newCapacity itself fits into the stack buffer.
//   doCopyArray     - if true, the old contents are copied into a newly
//                     allocated array (as much as fits); if false, the length
//                     is set to zero after allocating.
//   pBufferToDelete - if not 0 and the old refcounted buffer's count drops to
//                     zero, the buffer is handed to the caller through this
//                     pointer instead of being freed here.
//   forceClone      - allocate a new array even if the current one would do.
//
// Returns TRUE if the string is usable with the requested capacity.
// Returns FALSE if the string is not writable, or if no memory could be
// allocated; in the latter case the string is set to bogus.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
     (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    // (needed both for copying the contents and for restoring the fields
    // if the allocation fails)
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    int32_t oldLength = length();
    int16_t flags = fUnion.fFields.fLengthAndFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = NULL; // no need to copy from the stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the generous growCapacity first, then fall back to the minimum)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = oldLength;
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
        }
        if(oldArray != NULL) {
          us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
        }
        setLength(minLength);
      } else {
        setZeroLength();
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is stored immediately before the array)
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
              // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
              // is defined as volatile. (Volatile has useful non-standard behavior
              //   with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fUnion.fFields.fLengthAndFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1825
1826 // UnicodeStringAppendable ------------------------------------------------- ***
1827
// Out-of-line destructor definition; the body is intentionally empty.
UnicodeStringAppendable::~UnicodeStringAppendable() {}
1829
1830 UBool
1831 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1832   return str.doAppend(&c, 0, 1).isWritable();
1833 }
1834
1835 UBool
1836 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1837   UChar buffer[U16_MAX_LENGTH];
1838   int32_t cLength = 0;
1839   UBool isError = FALSE;
1840   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1841   return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1842 }
1843
1844 UBool
1845 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1846   return str.doAppend(s, 0, length).isWritable();
1847 }
1848
1849 UBool
1850 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1851   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1852 }
1853
1854 UChar *
1855 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1856                                          int32_t desiredCapacityHint,
1857                                          UChar *scratch, int32_t scratchCapacity,
1858                                          int32_t *resultCapacity) {
1859   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1860     *resultCapacity = 0;
1861     return NULL;
1862   }
1863   int32_t oldLength = str.length();
1864   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1865     *resultCapacity = str.getCapacity() - oldLength;
1866     return str.getArrayStart() + oldLength;
1867   }
1868   *resultCapacity = scratchCapacity;
1869   return scratch;
1870 }
1871
1872 U_NAMESPACE_END
1873
1874 U_NAMESPACE_USE
1875
1876 U_CAPI int32_t U_EXPORT2
1877 uhash_hashUnicodeString(const UElement key) {
1878     const UnicodeString *str = (const UnicodeString*) key.pointer;
1879     return (str == NULL) ? 0 : str->hashCode();
1880 }
1881
1882 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1883 // does not depend on hashtable code.
1884 U_CAPI UBool U_EXPORT2
1885 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1886     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1887     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1888     if (str1 == str2) {
1889         return TRUE;
1890     }
1891     if (str1 == NULL || str2 == NULL) {
1892         return FALSE;
1893     }
1894     return *str1 == *str2;
1895 }
1896
1897 #ifdef U_STATIC_IMPLEMENTATION
/*
This should never be called. It is defined here to make sure that the
virtual vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file.
This makes sure that static library dependencies are kept to a minimum.

The function body only allocates an array of two UnicodeString objects and
immediately deletes it again with delete [].
Compiled only when U_STATIC_IMPLEMENTATION is defined.
*/
static void uprv_UnicodeStringDummy(void) {
    delete [] (new UnicodeString[2]);
}
1908 #endif