icuSources/common/unistr.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 * Copyright (C) 1999-2016, International Business Machines Corporation and
   6 * others. All Rights Reserved.
   7 ******************************************************************************
   8 *
   9 * File unistr.cpp
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   09/25/98    stephen     Creation.
  15 *   04/20/99    stephen     Overhauled per 4/16 code review.
  16 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  17 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  18 *                           Replaceable.
  19 *   06/25/01    grhoten     Removed the dependency on iostream
  20 ******************************************************************************
  21 */
  22
  23 #include "unicode/utypes.h"
  24 #include "unicode/appendable.h"
  25 #include "unicode/putil.h"
  26 #include "cstring.h"
  27 #include "cmemory.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/unistr.h"
  30 #include "unicode/utf.h"
  31 #include "unicode/utf16.h"
  32 #include "uelement.h"
  33 #include "ustr_imp.h"
  34 #include "umutex.h"
  35 #include "uassert.h"
  36
  37 #if 0
  38
  39 #include <iostream>
  40 using namespace std;
  41
  42 //DEBUGGING
  43 void
  44 print(const UnicodeString& s,
  45       const char *name)
  46 {
  47   UChar c;
  48   cout << name << ":|";
  49   for(int i = 0; i < s.length(); ++i) {
  50     c = s[i];
  51     if(c>= 0x007E || c < 0x0020)
  52       cout << "[0x" << hex << s[i] << "]";
  53     else
  54       cout << (char) s[i];
  55   }
  56   cout << '|' << endl;
  57 }
  58
  59 void
  60 print(const UChar *s,
  61       int32_t len,
  62       const char *name)
  63 {
  64   UChar c;
  65   cout << name << ":|";
  66   for(int i = 0; i < len; ++i) {
  67     c = s[i];
  68     if(c>= 0x007E || c < 0x0020)
  69       cout << "[0x" << hex << s[i] << "]";
  70     else
  71       cout << (char) s[i];
  72   }
  73   cout << '|' << endl;
  74 }
  75 // END DEBUGGING
  76 #endif
  77
  78 // Local function definitions for now
  79
  80 // need to copy areas that may overlap
  81 static
  82 inline void
  83 us_arrayCopy(const UChar *src, int32_t srcStart,
  84          UChar *dst, int32_t dstStart, int32_t count)
  85 {
  86   if(count>0) {
  87     uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
  88   }
  89 }
  90
  91 // u_unescapeAt() callback to get a UChar from a UnicodeString
  92 U_CDECL_BEGIN
  93 static UChar U_CALLCONV
  94 UnicodeString_charAt(int32_t offset, void *context) {
  95     return ((icu::UnicodeString*) context)->charAt(offset);
  96 }
  97 U_CDECL_END
  98
  99 U_NAMESPACE_BEGIN
 100
/* The Replaceable virtual destructor cannot be defined in the header
   because of how AIX works with multiple definitions of virtual functions.
   It is therefore defined here, with an empty body.
*/
Replaceable::~Replaceable() {}
 105
// Expands the RTTI implementation boilerplate for UnicodeString
// (the macro is not defined in this section of the file).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 107
 108 UnicodeString U_EXPORT2
 109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 110     return
 111         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 112             append(s1).
 113                 append(s2);
 114 }
 115
 116 //========================================
 117 // Reference Counting functions, put at top of file so that optimizing compilers
 118 //                               have a chance to automatically inline.
 119 //========================================
 120
 121 void
 122 UnicodeString::addRef() {
 123   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 124 }
 125
 126 int32_t
 127 UnicodeString::removeRef() {
 128   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 129 }
 130
 131 int32_t
 132 UnicodeString::refCount() const {
 133   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
 134 }
 135
 136 void
 137 UnicodeString::releaseArray() {
 138   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
 139     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 140   }
 141 }
 142
 143
 144
 145 //========================================
 146 // Constructors
 147 //========================================
 148
 149 // The default constructor is inline in unistr.h.
 150
 151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
 152   fUnion.fFields.fLengthAndFlags = 0;
 153   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 154     // just allocate and do not do anything else
 155     allocate(capacity);
 156   } else if(c <= 0xffff) {
 157     int32_t length = count;
 158     if(capacity < length) {
 159       capacity = length;
 160     }
 161     if(allocate(capacity)) {
 162       UChar *array = getArrayStart();
 163       UChar unit = (UChar)c;
 164       for(int32_t i = 0; i < length; ++i) {
 165         array[i] = unit;
 166       }
 167       setLength(length);
 168     }
 169   } else {  // supplementary code point, write surrogate pairs
 170     if(count > (INT32_MAX / 2)) {
 171       // We would get more than 2G UChars.
 172       allocate(capacity);
 173       return;
 174     }
 175     int32_t length = count * 2;
 176     if(capacity < length) {
 177       capacity = length;
 178     }
 179     if(allocate(capacity)) {
 180       UChar *array = getArrayStart();
 181       UChar lead = U16_LEAD(c);
 182       UChar trail = U16_TRAIL(c);
 183       for(int32_t i = 0; i < length; i += 2) {
 184         array[i] = lead;
 185         array[i + 1] = trail;
 186       }
 187       setLength(length);
 188     }
 189   }
 190 }
 191
 192 UnicodeString::UnicodeString(UChar ch) {
 193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
 194   fUnion.fStackFields.fBuffer[0] = ch;
 195 }
 196
 197 UnicodeString::UnicodeString(UChar32 ch) {
 198   fUnion.fFields.fLengthAndFlags = kShortString;
 199   int32_t i = 0;
 200   UBool isError = FALSE;
 201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
 202   // We test isError so that the compiler does not complain that we don't.
 203   // If isError then i==0 which is what we want anyway.
 204   if(!isError) {
 205     setShortLength(i);
 206   }
 207 }
 208
 209 UnicodeString::UnicodeString(const UChar *text) {
 210   fUnion.fFields.fLengthAndFlags = kShortString;
 211   doAppend(text, 0, -1);
 212 }
 213
 214 UnicodeString::UnicodeString(const UChar *text,
 215                              int32_t textLength) {
 216   fUnion.fFields.fLengthAndFlags = kShortString;
 217   doAppend(text, 0, textLength);
 218 }
 219
 220 UnicodeString::UnicodeString(UBool isTerminated,
 221                              ConstChar16Ptr textPtr,
 222                              int32_t textLength) {
 223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
 224   const UChar *text = textPtr;
 225   if(text == NULL) {
 226     // treat as an empty string, do not alias
 227     setToEmpty();
 228   } else if(textLength < -1 ||
 229             (textLength == -1 && !isTerminated) ||
 230             (textLength >= 0 && isTerminated && text[textLength] != 0)
 231   ) {
 232     setToBogus();
 233   } else {
 234     if(textLength == -1) {
 235       // text is terminated, or else it would have failed the above test
 236       textLength = u_strlen(text);
 237     }
 238     setArray(const_cast<UChar *>(text), textLength,
 239              isTerminated ? textLength + 1 : textLength);
 240   }
 241 }
 242
 243 UnicodeString::UnicodeString(UChar *buff,
 244                              int32_t buffLength,
 245                              int32_t buffCapacity) {
 246   fUnion.fFields.fLengthAndFlags = kWritableAlias;
 247   if(buff == NULL) {
 248     // treat as an empty string, do not alias
 249     setToEmpty();
 250   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 251     setToBogus();
 252   } else {
 253     if(buffLength == -1) {
 254       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 255       const UChar *p = buff, *limit = buff + buffCapacity;
 256       while(p != limit && *p != 0) {
 257         ++p;
 258       }
 259       buffLength = (int32_t)(p - buff);
 260     }
 261     setArray(buff, buffLength, buffCapacity);
 262   }
 263 }
 264
 265 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
 266   fUnion.fFields.fLengthAndFlags = kShortString;
 267   if(src==NULL) {
 268     // treat as an empty string
 269   } else {
 270     if(length<0) {
 271       length=(int32_t)uprv_strlen(src);
 272     }
 273     if(cloneArrayIfNeeded(length, length, FALSE)) {
 274       u_charsToUChars(src, getArrayStart(), length);
 275       setLength(length);
 276     } else {
 277       setToBogus();
 278     }
 279   }
 280 }
 281
 282 #if U_CHARSET_IS_UTF8
 283
 284 UnicodeString::UnicodeString(const char *codepageData) {
 285   fUnion.fFields.fLengthAndFlags = kShortString;
 286   if(codepageData != 0) {
 287     setToUTF8(codepageData);
 288   }
 289 }
 290
 291 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
 292   fUnion.fFields.fLengthAndFlags = kShortString;
 293   // if there's nothing to convert, do nothing
 294   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 295     return;
 296   }
 297   if(dataLength == -1) {
 298     dataLength = (int32_t)uprv_strlen(codepageData);
 299   }
 300   setToUTF8(StringPiece(codepageData, dataLength));
 301 }
 302
 303 // else see unistr_cnv.cpp
 304 #endif
 305
 306 UnicodeString::UnicodeString(const UnicodeString& that) {
 307   fUnion.fFields.fLengthAndFlags = kShortString;
 308   copyFrom(that);
 309 }
 310
 311 #if U_HAVE_RVALUE_REFERENCES
 312 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
 313   fUnion.fFields.fLengthAndFlags = kShortString;
 314   moveFrom(src);
 315 }
 316 #endif
 317
 318 UnicodeString::UnicodeString(const UnicodeString& that,
 319                              int32_t srcStart) {
 320   fUnion.fFields.fLengthAndFlags = kShortString;
 321   setTo(that, srcStart);
 322 }
 323
 324 UnicodeString::UnicodeString(const UnicodeString& that,
 325                              int32_t srcStart,
 326                              int32_t srcLength) {
 327   fUnion.fFields.fLengthAndFlags = kShortString;
 328   setTo(that, srcStart, srcLength);
 329 }
 330
 331 // Replaceable base class clone() default implementation, does not clone
 332 Replaceable *
 333 Replaceable::clone() const {
 334   return NULL;
 335 }
 336
 337 // UnicodeString overrides clone() with a real implementation
 338 Replaceable *
 339 UnicodeString::clone() const {
 340   return new UnicodeString(*this);
 341 }
 342
 343 //========================================
 344 // array allocation
 345 //========================================
 346
 347 namespace {
 348
 349 const int32_t kGrowSize = 128;
 350
 351 // The number of bytes for one int32_t reference counter and capacity UChars
 352 // must fit into a 32-bit size_t (at least when on a 32-bit platform).
 353 // We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(),
 354 // and round up to a multiple of 16 bytes.
 355 // This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
 356 // (With more complicated checks we could go up to 0x7ffffffd without rounding up,
 357 // but that does not seem worth it.)
 358 const int32_t kMaxCapacity = 0x7ffffff5;
 359
 360 int32_t getGrowCapacity(int32_t newLength) {
 361   int32_t growSize = (newLength >> 2) + kGrowSize;
 362   if(growSize <= (kMaxCapacity - newLength)) {
 363     return newLength + growSize;
 364   } else {
 365     return kMaxCapacity;
 366   }
 367 }
 368
 369 }  // namespace
 370
 371 UBool
 372 UnicodeString::allocate(int32_t capacity) {
 373   if(capacity <= US_STACKBUF_SIZE) {
 374     fUnion.fFields.fLengthAndFlags = kShortString;
 375     return TRUE;
 376   }
 377   if(capacity <= kMaxCapacity) {
 378     ++capacity;  // for the NUL
 379     // Switch to size_t which is unsigned so that we can allocate up to 4GB.
 380     // Reference counter + UChars.
 381     size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
 382     // Round up to a multiple of 16.
 383     numBytes = (numBytes + 15) & ~15;
 384     int32_t *array = (int32_t *) uprv_malloc(numBytes);
 385     if(array != NULL) {
 386       // set initial refCount and point behind the refCount
 387       *array++ = 1;
 388       numBytes -= sizeof(int32_t);
 389
 390       // have fArray point to the first UChar
 391       fUnion.fFields.fArray = (UChar *)array;
 392       fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
 393       fUnion.fFields.fLengthAndFlags = kLongString;
 394       return TRUE;
 395     }
 396   }
 397   fUnion.fFields.fLengthAndFlags = kIsBogus;
 398   fUnion.fFields.fArray = 0;
 399   fUnion.fFields.fCapacity = 0;
 400   return FALSE;
 401 }
 402
 403 //========================================
 404 // Destructor
 405 //========================================
 406
 407 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 408 static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
 409 static u_atomic_int32_t beyondCount(0);
 410
 411 U_CAPI void unistr_printLengths() {
 412   int32_t i;
 413   for(i = 0; i <= 59; ++i) {
 414     printf("%2d,  %9d\n", i, (int32_t)finalLengthCounts[i]);
 415   }
 416   int32_t beyond = beyondCount;
 417   for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
 418     beyond += finalLengthCounts[i];
 419   }
 420   printf(">59, %9d\n", beyond);
 421 }
 422 #endif
 423
 424 UnicodeString::~UnicodeString()
 425 {
 426 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 427   // Count lengths of strings at the end of their lifetime.
 428   // Useful for discussion of a desirable stack buffer size.
 429   // Count the contents length, not the optional NUL terminator nor further capacity.
 430   // Ignore open-buffer strings and strings which alias external storage.
 431   if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
 432     if(hasShortLength()) {
 433       umtx_atomic_inc(finalLengthCounts + getShortLength());
 434     } else {
 435       umtx_atomic_inc(&beyondCount);
 436     }
 437   }
 438 #endif
 439
 440   releaseArray();
 441 }
 442
 443 //========================================
 444 // Factory methods
 445 //========================================
 446
 447 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
 448   UnicodeString result;
 449   result.setToUTF8(utf8);
 450   return result;
 451 }
 452
 453 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 454   UnicodeString result;
 455   int32_t capacity;
 456   // Most UTF-32 strings will be BMP-only and result in a same-length
 457   // UTF-16 string. We overestimate the capacity just slightly,
 458   // just in case there are a few supplementary characters.
 459   if(length <= US_STACKBUF_SIZE) {
 460     capacity = US_STACKBUF_SIZE;
 461   } else {
 462     capacity = length + (length >> 4) + 4;
 463   }
 464   do {
 465     UChar *utf16 = result.getBuffer(capacity);
 466     int32_t length16;
 467     UErrorCode errorCode = U_ZERO_ERROR;
 468     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 469         utf32, length,
 470         0xfffd,  // Substitution character.
 471         NULL,    // Don't care about number of substitutions.
 472         &errorCode);
 473     result.releaseBuffer(length16);
 474     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 475       capacity = length16 + 1;  // +1 for the terminating NUL.
 476       continue;
 477     } else if(U_FAILURE(errorCode)) {
 478       result.setToBogus();
 479     }
 480     break;
 481   } while(TRUE);
 482   return result;
 483 }
 484
 485 //========================================
 486 // Assignment
 487 //========================================
 488
 489 UnicodeString &
 490 UnicodeString::operator=(const UnicodeString &src) {
 491   return copyFrom(src);
 492 }
 493
 494 UnicodeString &
 495 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 496   return copyFrom(src, TRUE);
 497 }
 498
 499 UnicodeString &
 500 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 501   // if assigning to ourselves, do nothing
 502   if(this == &src) {
 503     return *this;
 504   }
 505
 506   // is the right side bogus?
 507   if(src.isBogus()) {
 508     setToBogus();
 509     return *this;
 510   }
 511
 512   // delete the current contents
 513   releaseArray();
 514
 515   if(src.isEmpty()) {
 516     // empty string - use the stack buffer
 517     setToEmpty();
 518     return *this;
 519   }
 520
 521   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 522   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 523   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
 524   case kShortString:
 525     // short string using the stack buffer, do the same
 526     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 527                 getShortLength() * U_SIZEOF_UCHAR);
 528     break;
 529   case kLongString:
 530     // src uses a refCounted string buffer, use that buffer with refCount
 531     // src is const, use a cast - we don't actually change it
 532     ((UnicodeString &)src).addRef();
 533     // copy all fields, share the reference-counted buffer
 534     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 535     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 536     if(!hasShortLength()) {
 537       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 538     }
 539     break;
 540   case kReadonlyAlias:
 541     if(fastCopy) {
 542       // src is a readonly alias, do the same
 543       // -> maintain the readonly alias as such
 544       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 545       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 546       if(!hasShortLength()) {
 547         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 548       }
 549       break;
 550     }
 551     // else if(!fastCopy) fall through to case kWritableAlias
 552     // -> allocate a new buffer and copy the contents
 553     U_FALLTHROUGH;
 554   case kWritableAlias: {
 555     // src is a writable alias; we make a copy of that instead
 556     int32_t srcLength = src.length();
 557     if(allocate(srcLength)) {
 558       u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
 559       setLength(srcLength);
 560       break;
 561     }
 562     // if there is not enough memory, then fall through to setting to bogus
 563     U_FALLTHROUGH;
 564   }
 565   default:
 566     // if src is bogus, set ourselves to bogus
 567     // do not call setToBogus() here because fArray and flags are not consistent here
 568     fUnion.fFields.fLengthAndFlags = kIsBogus;
 569     fUnion.fFields.fArray = 0;
 570     fUnion.fFields.fCapacity = 0;
 571     break;
 572   }
 573
 574   return *this;
 575 }
 576
 577 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
 578   // No explicit check for self move assignment, consistent with standard library.
 579   // Self move assignment causes no crash nor leak but might make the object bogus.
 580   releaseArray();
 581   copyFieldsFrom(src, TRUE);
 582   return *this;
 583 }
 584
 585 // Same as moveFrom() except without memory management.
 586 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
 587   int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 588   if(lengthAndFlags & kUsingStackBuffer) {
 589     // Short string using the stack buffer, copy the contents.
 590     // Check for self assignment to prevent "overlap in memcpy" warnings,
 591     // although it should be harmless to copy a buffer to itself exactly.
 592     if(this != &src) {
 593       uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 594                   getShortLength() * U_SIZEOF_UCHAR);
 595     }
 596   } else {
 597     // In all other cases, copy all fields.
 598     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 599     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 600     if(!hasShortLength()) {
 601       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 602     }
 603     if(setSrcToBogus) {
 604       // Set src to bogus without releasing any memory.
 605       src.fUnion.fFields.fLengthAndFlags = kIsBogus;
 606       src.fUnion.fFields.fArray = NULL;
 607       src.fUnion.fFields.fCapacity = 0;
 608     }
 609   }
 610 }
 611
 612 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
 613   UnicodeString temp;  // Empty short string: Known not to need releaseArray().
 614   // Copy fields without resetting source values in between.
 615   temp.copyFieldsFrom(*this, FALSE);
 616   this->copyFieldsFrom(other, FALSE);
 617   other.copyFieldsFrom(temp, FALSE);
 618   // Set temp to an empty string so that other's memory is not released twice.
 619   temp.fUnion.fFields.fLengthAndFlags = kShortString;
 620 }
 621
 622 //========================================
 623 // Miscellaneous operations
 624 //========================================
 625
 626 UnicodeString UnicodeString::unescape() const {
 627     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 628     if (result.isBogus()) {
 629         return result;
 630     }
 631     const UChar *array = getBuffer();
 632     int32_t len = length();
 633     int32_t prev = 0;
 634     for (int32_t i=0;;) {
 635         if (i == len) {
 636             result.append(array, prev, len - prev);
 637             break;
 638         }
 639         if (array[i++] == 0x5C /*'\\'*/) {
 640             result.append(array, prev, (i - 1) - prev);
 641             UChar32 c = unescapeAt(i); // advances i
 642             if (c < 0) {
 643                 result.remove(); // return empty string
 644                 break; // invalid escape sequence
 645             }
 646             result.append(c);
 647             prev = i;
 648         }
 649     }
 650     return result;
 651 }
 652
 653 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 654     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 655 }
 656
 657 //========================================
 658 // Read-only implementation
 659 //========================================
 660 UBool
 661 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
 662   // Requires: this & text not bogus and have same lengths.
 663   // Byte-wise comparison works for equality regardless of endianness.
 664   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
 665 }
 666
 667 int8_t
 668 UnicodeString::doCompare( int32_t start,
 669               int32_t length,
 670               const UChar *srcChars,
 671               int32_t srcStart,
 672               int32_t srcLength) const
 673 {
 674   // compare illegal string values
 675   if(isBogus()) {
 676     return -1;
 677   }
 678
 679   // pin indices to legal values
 680   pinIndices(start, length);
 681
 682   if(srcChars == NULL) {
 683     // treat const UChar *srcChars==NULL as an empty string
 684     return length == 0 ? 0 : 1;
 685   }
 686
 687   // get the correct pointer
 688   const UChar *chars = getArrayStart();
 689
 690   chars += start;
 691   srcChars += srcStart;
 692
 693   int32_t minLength;
 694   int8_t lengthResult;
 695
 696   // get the srcLength if necessary
 697   if(srcLength < 0) {
 698     srcLength = u_strlen(srcChars + srcStart);
 699   }
 700
 701   // are we comparing different lengths?
 702   if(length != srcLength) {
 703     if(length < srcLength) {
 704       minLength = length;
 705       lengthResult = -1;
 706     } else {
 707       minLength = srcLength;
 708       lengthResult = 1;
 709     }
 710   } else {
 711     minLength = length;
 712     lengthResult = 0;
 713   }
 714
 715   /*
 716    * note that uprv_memcmp() returns an int but we return an int8_t;
 717    * we need to take care not to truncate the result -
 718    * one way to do this is to right-shift the value to
 719    * move the sign bit into the lower 8 bits and making sure that this
 720    * does not become 0 itself
 721    */
 722
 723   if(minLength > 0 && chars != srcChars) {
 724     int32_t result;
 725
 726 #   if U_IS_BIG_ENDIAN
 727       // big-endian: byte comparison works
 728       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 729       if(result != 0) {
 730         return (int8_t)(result >> 15 | 1);
 731       }
 732 #   else
 733       // little-endian: compare UChar units
 734       do {
 735         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 736         if(result != 0) {
 737           return (int8_t)(result >> 15 | 1);
 738         }
 739       } while(--minLength > 0);
 740 #   endif
 741   }
 742   return lengthResult;
 743 }
 744
 745 /* String compare in code point order - doCompare() compares in code unit order. */
 746 int8_t
 747 UnicodeString::doCompareCodePointOrder(int32_t start,
 748                                        int32_t length,
 749                                        const UChar *srcChars,
 750                                        int32_t srcStart,
 751                                        int32_t srcLength) const
 752 {
 753   // compare illegal string values
 754   // treat const UChar *srcChars==NULL as an empty string
 755   if(isBogus()) {
 756     return -1;
 757   }
 758
 759   // pin indices to legal values
 760   pinIndices(start, length);
 761
 762   if(srcChars == NULL) {
 763     srcStart = srcLength = 0;
 764   }
 765
 766   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 767   /* translate the 32-bit result into an 8-bit one */
 768   if(diff!=0) {
 769     return (int8_t)(diff >> 15 | 1);
 770   } else {
 771     return 0;
 772   }
 773 }
 774
 775 int32_t
 776 UnicodeString::getLength() const {
 777     return length();
 778 }
 779
 780 UChar
 781 UnicodeString::getCharAt(int32_t offset) const {
 782   return charAt(offset);
 783 }
 784
 785 UChar32
 786 UnicodeString::getChar32At(int32_t offset) const {
 787   return char32At(offset);
 788 }
 789
 790 UChar32
 791 UnicodeString::char32At(int32_t offset) const
 792 {
 793   int32_t len = length();
 794   if((uint32_t)offset < (uint32_t)len) {
 795     const UChar *array = getArrayStart();
 796     UChar32 c;
 797     U16_GET(array, 0, offset, len, c);
 798     return c;
 799   } else {
 800     return kInvalidUChar;
 801   }
 802 }
 803
 804 int32_t
 805 UnicodeString::getChar32Start(int32_t offset) const {
 806   if((uint32_t)offset < (uint32_t)length()) {
 807     const UChar *array = getArrayStart();
 808     U16_SET_CP_START(array, 0, offset);
 809     return offset;
 810   } else {
 811     return 0;
 812   }
 813 }
 814
 815 int32_t
 816 UnicodeString::getChar32Limit(int32_t offset) const {
 817   int32_t len = length();
 818   if((uint32_t)offset < (uint32_t)len) {
 819     const UChar *array = getArrayStart();
 820     U16_SET_CP_LIMIT(array, 0, offset, len);
 821     return offset;
 822   } else {
 823     return len;
 824   }
 825 }
 826
 827 int32_t
 828 UnicodeString::countChar32(int32_t start, int32_t length) const {
 829   pinIndices(start, length);
 830   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 831   return u_countChar32(getArrayStart()+start, length);
 832 }
 833
 834 UBool
 835 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 836   pinIndices(start, length);
 837   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 838   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 839 }
 840
 841 int32_t
 842 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 843   // pin index
 844   int32_t len = length();
 845   if(index<0) {
 846     index=0;
 847   } else if(index>len) {
 848     index=len;
 849   }
 850
 851   const UChar *array = getArrayStart();
 852   if(delta>0) {
 853     U16_FWD_N(array, index, len, delta);
 854   } else {
 855     U16_BACK_N(array, 0, index, -delta);
 856   }
 857
 858   return index;
 859 }
 860
 861 void
 862 UnicodeString::doExtract(int32_t start,
 863              int32_t length,
 864              UChar *dst,
 865              int32_t dstStart) const
 866 {
 867   // pin indices to legal values
 868   pinIndices(start, length);
 869
 870   // do not copy anything if we alias dst itself
 871   const UChar *array = getArrayStart();
 872   if(array + start != dst + dstStart) {
 873     us_arrayCopy(array, start, dst, dstStart, length);
 874   }
 875 }
 876
 877 int32_t
 878 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
 879                        UErrorCode &errorCode) const {
 880   int32_t len = length();
 881   if(U_SUCCESS(errorCode)) {
 882     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 883       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 884     } else {
 885       const UChar *array = getArrayStart();
 886       if(len>0 && len<=destCapacity && array!=dest) {
 887         u_memcpy(dest, array, len);
 888       }
 889       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 890     }
 891   }
 892
 893   return len;
 894 }
 895
 896 int32_t
 897 UnicodeString::extract(int32_t start,
 898                        int32_t length,
 899                        char *target,
 900                        int32_t targetCapacity,
 901                        enum EInvariant) const
 902 {
 903   // if the arguments are illegal, then do nothing
 904   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 905     return 0;
 906   }
 907
 908   // pin the indices to legal values
 909   pinIndices(start, length);
 910
 911   if(length <= targetCapacity) {
 912     u_UCharsToChars(getArrayStart() + start, target, length);
 913   }
 914   UErrorCode status = U_ZERO_ERROR;
 915   return u_terminateChars(target, targetCapacity, length, &status);
 916 }
 917
 918 UnicodeString
 919 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 920   pinIndices(start, len);
 921   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 922   if(array==NULL) {
 923     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
 924     len=-2;  // bogus result string
 925   }
 926   return UnicodeString(FALSE, array + start, len);
 927 }
 928
 929 int32_t
 930 UnicodeString::toUTF8(int32_t start, int32_t len,
 931                       char *target, int32_t capacity) const {
 932   pinIndices(start, len);
 933   int32_t length8;
 934   UErrorCode errorCode = U_ZERO_ERROR;
 935   u_strToUTF8WithSub(target, capacity, &length8,
 936                      getBuffer() + start, len,
 937                      0xFFFD,  // Standard substitution character.
 938                      NULL,    // Don't care about number of substitutions.
 939                      &errorCode);
 940   return length8;
 941 }
 942
 943 #if U_CHARSET_IS_UTF8
 944
 945 int32_t
 946 UnicodeString::extract(int32_t start, int32_t len,
 947                        char *target, uint32_t dstSize) const {
 948   // if the arguments are illegal, then do nothing
 949   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 950     return 0;
 951   }
 952   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 953 }
 954
 955 // else see unistr_cnv.cpp
 956 #endif
 957
 958 void
 959 UnicodeString::extractBetween(int32_t start,
 960                   int32_t limit,
 961                   UnicodeString& target) const {
 962   pinIndex(start);
 963   pinIndex(limit);
 964   doExtract(start, limit - start, target);
 965 }
 966
 967 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 968 // as many bytes as the source has UChars.
 969 // The "worst cases" are writing systems like Indic, Thai and CJK with
 970 // 3:1 bytes:UChars.
 971 void
 972 UnicodeString::toUTF8(ByteSink &sink) const {
 973   int32_t length16 = length();
 974   if(length16 != 0) {
 975     char stackBuffer[1024];
 976     int32_t capacity = (int32_t)sizeof(stackBuffer);
 977     UBool utf8IsOwned = FALSE;
 978     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 979                                       3*length16,
 980                                       stackBuffer, capacity,
 981                                       &capacity);
 982     int32_t length8 = 0;
 983     UErrorCode errorCode = U_ZERO_ERROR;
 984     u_strToUTF8WithSub(utf8, capacity, &length8,
 985                        getBuffer(), length16,
 986                        0xFFFD,  // Standard substitution character.
 987                        NULL,    // Don't care about number of substitutions.
 988                        &errorCode);
 989     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 990       utf8 = (char *)uprv_malloc(length8);
 991       if(utf8 != NULL) {
 992         utf8IsOwned = TRUE;
 993         errorCode = U_ZERO_ERROR;
 994         u_strToUTF8WithSub(utf8, length8, &length8,
 995                            getBuffer(), length16,
 996                            0xFFFD,  // Standard substitution character.
 997                            NULL,    // Don't care about number of substitutions.
 998                            &errorCode);
 999       } else {
1000         errorCode = U_MEMORY_ALLOCATION_ERROR;
1001       }
1002     }
1003     if(U_SUCCESS(errorCode)) {
1004       sink.Append(utf8, length8);
1005       sink.Flush();
1006     }
1007     if(utf8IsOwned) {
1008       uprv_free(utf8);
1009     }
1010   }
1011 }
1012
1013 int32_t
1014 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1015   int32_t length32=0;
1016   if(U_SUCCESS(errorCode)) {
1017     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1018     u_strToUTF32WithSub(utf32, capacity, &length32,
1019         getBuffer(), length(),
1020         0xfffd,  // Substitution character.
1021         NULL,    // Don't care about number of substitutions.
1022         &errorCode);
1023   }
1024   return length32;
1025 }
1026
1027 int32_t
1028 UnicodeString::indexOf(const UChar *srcChars,
1029                int32_t srcStart,
1030                int32_t srcLength,
1031                int32_t start,
1032                int32_t length) const
1033 {
1034   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1035     return -1;
1036   }
1037
1038   // UnicodeString does not find empty substrings
1039   if(srcLength < 0 && srcChars[srcStart] == 0) {
1040     return -1;
1041   }
1042
1043   // get the indices within bounds
1044   pinIndices(start, length);
1045
1046   // find the first occurrence of the substring
1047   const UChar *array = getArrayStart();
1048   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1049   if(match == NULL) {
1050     return -1;
1051   } else {
1052     return (int32_t)(match - array);
1053   }
1054 }
1055
1056 int32_t
1057 UnicodeString::doIndexOf(UChar c,
1058              int32_t start,
1059              int32_t length) const
1060 {
1061   // pin indices
1062   pinIndices(start, length);
1063
1064   // find the first occurrence of c
1065   const UChar *array = getArrayStart();
1066   const UChar *match = u_memchr(array + start, c, length);
1067   if(match == NULL) {
1068     return -1;
1069   } else {
1070     return (int32_t)(match - array);
1071   }
1072 }
1073
1074 int32_t
1075 UnicodeString::doIndexOf(UChar32 c,
1076                          int32_t start,
1077                          int32_t length) const {
1078   // pin indices
1079   pinIndices(start, length);
1080
1081   // find the first occurrence of c
1082   const UChar *array = getArrayStart();
1083   const UChar *match = u_memchr32(array + start, c, length);
1084   if(match == NULL) {
1085     return -1;
1086   } else {
1087     return (int32_t)(match - array);
1088   }
1089 }
1090
1091 int32_t
1092 UnicodeString::lastIndexOf(const UChar *srcChars,
1093                int32_t srcStart,
1094                int32_t srcLength,
1095                int32_t start,
1096                int32_t length) const
1097 {
1098   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1099     return -1;
1100   }
1101
1102   // UnicodeString does not find empty substrings
1103   if(srcLength < 0 && srcChars[srcStart] == 0) {
1104     return -1;
1105   }
1106
1107   // get the indices within bounds
1108   pinIndices(start, length);
1109
1110   // find the last occurrence of the substring
1111   const UChar *array = getArrayStart();
1112   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1113   if(match == NULL) {
1114     return -1;
1115   } else {
1116     return (int32_t)(match - array);
1117   }
1118 }
1119
1120 int32_t
1121 UnicodeString::doLastIndexOf(UChar c,
1122                  int32_t start,
1123                  int32_t length) const
1124 {
1125   if(isBogus()) {
1126     return -1;
1127   }
1128
1129   // pin indices
1130   pinIndices(start, length);
1131
1132   // find the last occurrence of c
1133   const UChar *array = getArrayStart();
1134   const UChar *match = u_memrchr(array + start, c, length);
1135   if(match == NULL) {
1136     return -1;
1137   } else {
1138     return (int32_t)(match - array);
1139   }
1140 }
1141
1142 int32_t
1143 UnicodeString::doLastIndexOf(UChar32 c,
1144                              int32_t start,
1145                              int32_t length) const {
1146   // pin indices
1147   pinIndices(start, length);
1148
1149   // find the last occurrence of c
1150   const UChar *array = getArrayStart();
1151   const UChar *match = u_memrchr32(array + start, c, length);
1152   if(match == NULL) {
1153     return -1;
1154   } else {
1155     return (int32_t)(match - array);
1156   }
1157 }
1158
1159 //========================================
1160 // Write implementation
1161 //========================================
1162
1163 UnicodeString&
1164 UnicodeString::findAndReplace(int32_t start,
1165                   int32_t length,
1166                   const UnicodeString& oldText,
1167                   int32_t oldStart,
1168                   int32_t oldLength,
1169                   const UnicodeString& newText,
1170                   int32_t newStart,
1171                   int32_t newLength)
1172 {
1173   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1174     return *this;
1175   }
1176
1177   pinIndices(start, length);
1178   oldText.pinIndices(oldStart, oldLength);
1179   newText.pinIndices(newStart, newLength);
1180
1181   if(oldLength == 0) {
1182     return *this;
1183   }
1184
1185   while(length > 0 && length >= oldLength) {
1186     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1187     if(pos < 0) {
1188       // no more oldText's here: done
1189       break;
1190     } else {
1191       // we found oldText, replace it by newText and go beyond it
1192       replace(pos, oldLength, newText, newStart, newLength);
1193       length -= pos + oldLength - start;
1194       start = pos + newLength;
1195     }
1196   }
1197
1198   return *this;
1199 }
1200
1201
1202 void
1203 UnicodeString::setToBogus()
1204 {
1205   releaseArray();
1206
1207   fUnion.fFields.fLengthAndFlags = kIsBogus;
1208   fUnion.fFields.fArray = 0;
1209   fUnion.fFields.fCapacity = 0;
1210 }
1211
1212 // turn a bogus string into an empty one
1213 void
1214 UnicodeString::unBogus() {
1215   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1216     setToEmpty();
1217   }
1218 }
1219
1220 const char16_t *
1221 UnicodeString::getTerminatedBuffer() {
1222   if(!isWritable()) {
1223     return nullptr;
1224   }
1225   UChar *array = getArrayStart();
1226   int32_t len = length();
1227   if(len < getCapacity()) {
1228     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1229       // If len<capacity on a read-only alias, then array[len] is
1230       // either the original NUL (if constructed with (TRUE, s, length))
1231       // or one of the original string contents characters (if later truncated),
1232       // therefore we can assume that array[len] is initialized memory.
1233       if(array[len] == 0) {
1234         return array;
1235       }
1236     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1237       // kRefCounted: Do not write the NUL if the buffer is shared.
1238       // That is mostly safe, except when the length of one copy was modified
1239       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1240       // Then the NUL would be written into the middle of another copy's string.
1241
1242       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1243       // Do not test if there is a NUL already because it might be uninitialized memory.
1244       // (That would be safe, but tools like valgrind & Purify would complain.)
1245       array[len] = 0;
1246       return array;
1247     }
1248   }
1249   if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1250     array = getArrayStart();
1251     array[len] = 0;
1252     return array;
1253   } else {
1254     return nullptr;
1255   }
1256 }
1257
1258 // setTo() analogous to the readonly-aliasing constructor with the same signature
1259 UnicodeString &
1260 UnicodeString::setTo(UBool isTerminated,
1261                      ConstChar16Ptr textPtr,
1262                      int32_t textLength)
1263 {
1264   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1265     // do not modify a string that has an "open" getBuffer(minCapacity)
1266     return *this;
1267   }
1268
1269   const UChar *text = textPtr;
1270   if(text == NULL) {
1271     // treat as an empty string, do not alias
1272     releaseArray();
1273     setToEmpty();
1274     return *this;
1275   }
1276
1277   if( textLength < -1 ||
1278       (textLength == -1 && !isTerminated) ||
1279       (textLength >= 0 && isTerminated && text[textLength] != 0)
1280   ) {
1281     setToBogus();
1282     return *this;
1283   }
1284
1285   releaseArray();
1286
1287   if(textLength == -1) {
1288     // text is terminated, or else it would have failed the above test
1289     textLength = u_strlen(text);
1290   }
1291   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1292   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1293   return *this;
1294 }
1295
1296 // setTo() analogous to the writable-aliasing constructor with the same signature
1297 UnicodeString &
1298 UnicodeString::setTo(UChar *buffer,
1299                      int32_t buffLength,
1300                      int32_t buffCapacity) {
1301   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1302     // do not modify a string that has an "open" getBuffer(minCapacity)
1303     return *this;
1304   }
1305
1306   if(buffer == NULL) {
1307     // treat as an empty string, do not alias
1308     releaseArray();
1309     setToEmpty();
1310     return *this;
1311   }
1312
1313   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1314     setToBogus();
1315     return *this;
1316   } else if(buffLength == -1) {
1317     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1318     const UChar *p = buffer, *limit = buffer + buffCapacity;
1319     while(p != limit && *p != 0) {
1320       ++p;
1321     }
1322     buffLength = (int32_t)(p - buffer);
1323   }
1324
1325   releaseArray();
1326
1327   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1328   setArray(buffer, buffLength, buffCapacity);
1329   return *this;
1330 }
1331
1332 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1333   unBogus();
1334   int32_t length = utf8.length();
1335   int32_t capacity;
1336   // The UTF-16 string will be at most as long as the UTF-8 string.
1337   if(length <= US_STACKBUF_SIZE) {
1338     capacity = US_STACKBUF_SIZE;
1339   } else {
1340     capacity = length + 1;  // +1 for the terminating NUL.
1341   }
1342   UChar *utf16 = getBuffer(capacity);
1343   int32_t length16;
1344   UErrorCode errorCode = U_ZERO_ERROR;
1345   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1346       utf8.data(), length,
1347       0xfffd,  // Substitution character.
1348       NULL,    // Don't care about number of substitutions.
1349       &errorCode);
1350   releaseBuffer(length16);
1351   if(U_FAILURE(errorCode)) {
1352     setToBogus();
1353   }
1354   return *this;
1355 }
1356
1357 UnicodeString&
1358 UnicodeString::setCharAt(int32_t offset,
1359              UChar c)
1360 {
1361   int32_t len = length();
1362   if(cloneArrayIfNeeded() && len > 0) {
1363     if(offset < 0) {
1364       offset = 0;
1365     } else if(offset >= len) {
1366       offset = len - 1;
1367     }
1368
1369     getArrayStart()[offset] = c;
1370   }
1371   return *this;
1372 }
1373
1374 UnicodeString&
1375 UnicodeString::replace(int32_t start,
1376                int32_t _length,
1377                UChar32 srcChar) {
1378   UChar buffer[U16_MAX_LENGTH];
1379   int32_t count = 0;
1380   UBool isError = FALSE;
1381   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1382   // We test isError so that the compiler does not complain that we don't.
1383   // If isError (srcChar is not a valid code point) then count==0 which means
1384   // we remove the source segment rather than replacing it with srcChar.
1385   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1386 }
1387
1388 UnicodeString&
1389 UnicodeString::append(UChar32 srcChar) {
1390   UChar buffer[U16_MAX_LENGTH];
1391   int32_t _length = 0;
1392   UBool isError = FALSE;
1393   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1394   // We test isError so that the compiler does not complain that we don't.
1395   // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1396   return isError ? *this : doAppend(buffer, 0, _length);
1397 }
1398
1399 UnicodeString&
1400 UnicodeString::doReplace( int32_t start,
1401               int32_t length,
1402               const UnicodeString& src,
1403               int32_t srcStart,
1404               int32_t srcLength)
1405 {
1406   // pin the indices to legal values
1407   src.pinIndices(srcStart, srcLength);
1408
1409   // get the characters from src
1410   // and replace the range in ourselves with them
1411   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1412 }
1413
1414 UnicodeString&
1415 UnicodeString::doReplace(int32_t start,
1416              int32_t length,
1417              const UChar *srcChars,
1418              int32_t srcStart,
1419              int32_t srcLength)
1420 {
1421   if(!isWritable()) {
1422     return *this;
1423   }
1424
1425   int32_t oldLength = this->length();
1426
1427   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1428   if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1429     if(start == 0) {
1430       // remove prefix by adjusting the array pointer
1431       pinIndex(length);
1432       fUnion.fFields.fArray += length;
1433       fUnion.fFields.fCapacity -= length;
1434       setLength(oldLength - length);
1435       return *this;
1436     } else {
1437       pinIndex(start);
1438       if(length >= (oldLength - start)) {
1439         // remove suffix by reducing the length (like truncate())
1440         setLength(start);
1441         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1442         return *this;
1443       }
1444     }
1445   }
1446
1447   if(start == oldLength) {
1448     return doAppend(srcChars, srcStart, srcLength);
1449   }
1450
1451   if(srcChars == 0) {
1452     srcStart = srcLength = 0;
1453   } else if(srcLength < 0) {
1454     // get the srcLength if necessary
1455     srcLength = u_strlen(srcChars + srcStart);
1456   }
1457
1458   // pin the indices to legal values
1459   pinIndices(start, length);
1460
1461   // Calculate the size of the string after the replace.
1462   // Avoid int32_t overflow.
1463   int32_t newLength = oldLength - length;
1464   if(srcLength > (INT32_MAX - newLength)) {
1465     setToBogus();
1466     return *this;
1467   }
1468   newLength += srcLength;
1469
1470   // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1471   // therefore we need to keep the current fArray
1472   UChar oldStackBuffer[US_STACKBUF_SIZE];
1473   UChar *oldArray;
1474   if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1475     // copy the stack buffer contents because it will be overwritten with
1476     // fUnion.fFields values
1477     u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1478     oldArray = oldStackBuffer;
1479   } else {
1480     oldArray = getArrayStart();
1481   }
1482
1483   // clone our array and allocate a bigger array if needed
1484   int32_t *bufferToDelete = 0;
1485   if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
1486                          FALSE, &bufferToDelete)
1487   ) {
1488     return *this;
1489   }
1490
1491   // now do the replace
1492
1493   UChar *newArray = getArrayStart();
1494   if(newArray != oldArray) {
1495     // if fArray changed, then we need to copy everything except what will change
1496     us_arrayCopy(oldArray, 0, newArray, 0, start);
1497     us_arrayCopy(oldArray, start + length,
1498                  newArray, start + srcLength,
1499                  oldLength - (start + length));
1500   } else if(length != srcLength) {
1501     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1502     us_arrayCopy(oldArray, start + length,
1503                  newArray, start + srcLength,
1504                  oldLength - (start + length));
1505   }
1506
1507   // now fill in the hole with the new string
1508   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1509
1510   setLength(newLength);
1511
1512   // delayed delete in case srcChars == fArray when we started, and
1513   // to keep oldArray alive for the above operations
1514   if (bufferToDelete) {
1515     uprv_free(bufferToDelete);
1516   }
1517
1518   return *this;
1519 }
1520
1521 // Versions of doReplace() only for append() variants.
1522 // doReplace() and doAppend() optimize for different cases.
1523
1524 UnicodeString&
1525 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1526   if(srcLength == 0) {
1527     return *this;
1528   }
1529
1530   // pin the indices to legal values
1531   src.pinIndices(srcStart, srcLength);
1532   return doAppend(src.getArrayStart(), srcStart, srcLength);
1533 }
1534
1535 UnicodeString&
1536 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1537   if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1538     return *this;
1539   }
1540
1541   if(srcLength < 0) {
1542     // get the srcLength if necessary
1543     if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
1544       return *this;
1545     }
1546   }
1547
1548   int32_t oldLength = length();
1549   int32_t newLength = oldLength + srcLength;
1550   // optimize append() onto a large-enough, owned string
1551   if((newLength <= getCapacity() && isBufferWritable()) ||
1552       cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1553     UChar *newArray = getArrayStart();
1554     // Do not copy characters when
1555     //   UChar *buffer=str.getAppendBuffer(...);
1556     // is followed by
1557     //   str.append(buffer, length);
1558     // or
1559     //   str.appendString(buffer, length)
1560     // or similar.
1561     if(srcChars + srcStart != newArray + oldLength) {
1562       us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
1563     }
1564     setLength(newLength);
1565   }
1566   return *this;
1567 }
1568
1569 /**
1570  * Replaceable API
1571  */
1572 void
1573 UnicodeString::handleReplaceBetween(int32_t start,
1574                                     int32_t limit,
1575                                     const UnicodeString& text) {
1576     replaceBetween(start, limit, text);
1577 }
1578
1579 /**
1580  * Replaceable API
1581  */
1582 void
1583 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1584     if (limit <= start) {
1585         return; // Nothing to do; avoid bogus malloc call
1586     }
1587     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1588     // Check to make sure text is not null.
1589     if (text != NULL) {
1590             extractBetween(start, limit, text, 0);
1591             insert(dest, text, 0, limit - start);
1592             uprv_free(text);
1593     }
1594 }
1595
1596 /**
1597  * Replaceable API
1598  *
1599  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1600  * so we implement this function here.
1601  */
1602 UBool Replaceable::hasMetaData() const {
1603     return TRUE;
1604 }
1605
1606 /**
1607  * Replaceable API
1608  */
1609 UBool UnicodeString::hasMetaData() const {
1610     return FALSE;
1611 }
1612
1613 UnicodeString&
1614 UnicodeString::doReverse(int32_t start, int32_t length) {
1615   if(length <= 1 || !cloneArrayIfNeeded()) {
1616     return *this;
1617   }
1618
1619   // pin the indices to legal values
1620   pinIndices(start, length);
1621   if(length <= 1) {  // pinIndices() might have shrunk the length
1622     return *this;
1623   }
1624
1625   UChar *left = getArrayStart() + start;
1626   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1627   UChar swap;
1628   UBool hasSupplementary = FALSE;
1629
1630   // Before the loop we know left<right because length>=2.
1631   do {
1632     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1633     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1634     *right-- = swap;
1635   } while(left < right);
1636   // Make sure to test the middle code unit of an odd-length string.
1637   // Redundant if the length is even.
1638   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1639
1640   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1641   if(hasSupplementary) {
1642     UChar swap2;
1643
1644     left = getArrayStart() + start;
1645     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1646     while(left < right) {
1647       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1648         *left++ = swap2;
1649         *left++ = swap;
1650       } else {
1651         ++left;
1652       }
1653     }
1654   }
1655
1656   return *this;
1657 }
1658
1659 UBool
1660 UnicodeString::padLeading(int32_t targetLength,
1661                           UChar padChar)
1662 {
1663   int32_t oldLength = length();
1664   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1665     return FALSE;
1666   } else {
1667     // move contents up by padding width
1668     UChar *array = getArrayStart();
1669     int32_t start = targetLength - oldLength;
1670     us_arrayCopy(array, 0, array, start, oldLength);
1671
1672     // fill in padding character
1673     while(--start >= 0) {
1674       array[start] = padChar;
1675     }
1676     setLength(targetLength);
1677     return TRUE;
1678   }
1679 }
1680
1681 UBool
1682 UnicodeString::padTrailing(int32_t targetLength,
1683                            UChar padChar)
1684 {
1685   int32_t oldLength = length();
1686   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1687     return FALSE;
1688   } else {
1689     // fill in padding character
1690     UChar *array = getArrayStart();
1691     int32_t length = targetLength;
1692     while(--length >= oldLength) {
1693       array[length] = padChar;
1694     }
1695     setLength(targetLength);
1696     return TRUE;
1697   }
1698 }
1699
1700 //========================================
1701 // Hashing
1702 //========================================
1703 int32_t
1704 UnicodeString::doHashCode() const
1705 {
1706     /* Delegate hash computation to uhash.  This makes UnicodeString
1707      * hashing consistent with UChar* hashing.  */
1708     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1709     if (hashCode == kInvalidHashCode) {
1710         hashCode = kEmptyHashCode;
1711     }
1712     return hashCode;
1713 }
1714
1715 //========================================
1716 // External Buffer
1717 //========================================
1718
1719 char16_t *
1720 UnicodeString::getBuffer(int32_t minCapacity) {
1721   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1722     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1723     setZeroLength();
1724     return getArrayStart();
1725   } else {
1726     return nullptr;
1727   }
1728 }
1729
1730 void
1731 UnicodeString::releaseBuffer(int32_t newLength) {
1732   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1733     // set the new fLength
1734     int32_t capacity=getCapacity();
1735     if(newLength==-1) {
1736       // the new length is the string length, capped by fCapacity
1737       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1738       while(p<limit && *p!=0) {
1739         ++p;
1740       }
1741       newLength=(int32_t)(p-array);
1742     } else if(newLength>capacity) {
1743       newLength=capacity;
1744     }
1745     setLength(newLength);
1746     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1747   }
1748 }
1749
1750 //========================================
1751 // Miscellaneous
1752 //========================================
// Makes sure that this string has its own writable array with room for at
// least newCapacity code units, allocating a new array when necessary.
//
//   newCapacity      required capacity; -1 means "the current capacity"
//   growCapacity     preferred capacity to try first; a negative value means
//                    "same as newCapacity"; it is limited to US_STACKBUF_SIZE
//                    when newCapacity itself fits into the stack buffer
//   doCopyArray      if true, the old contents (as much as fits) are copied
//                    into the new array; otherwise the length becomes zero
//   pBufferToDelete  if not 0 and the old refcounted array loses its last
//                    reference here, the pointer to release is stored there
//                    instead of being freed, so that the caller can free it later
//   forceClone       if true, a new array is allocated unconditionally
//
// Returns FALSE if the string is not writable (see isWritable()), or if no
// memory could be allocated, in which case the string is set to bogus.
// Returns TRUE otherwise, including when no new array was needed.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
     (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    // (the array pointer, length and flags are needed after allocate()
    // for copying the contents, releasing the old array, and error recovery)
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    int32_t oldLength = length();
    int16_t flags = fUnion.fFields.fLengthAndFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = NULL; // no need to copy from the stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the preferred growCapacity first, then fall back to the smaller newCapacity)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = oldLength;
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
        }
        if(oldArray != NULL) {
          us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
        }
        setLength(minLength);
      } else {
        setZeroLength();
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is stored directly in front of the array)
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
              // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
              // is defined as volatile. (Volatile has useful non-standard behavior
              //   with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fUnion.fFields.fLengthAndFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1861
1862 // UnicodeStringAppendable ------------------------------------------------- ***
1863
1864 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1865
1866 UBool
1867 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1868   return str.doAppend(&c, 0, 1).isWritable();
1869 }
1870
1871 UBool
1872 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1873   UChar buffer[U16_MAX_LENGTH];
1874   int32_t cLength = 0;
1875   UBool isError = FALSE;
1876   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1877   return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1878 }
1879
1880 UBool
1881 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1882   return str.doAppend(s, 0, length).isWritable();
1883 }
1884
1885 UBool
1886 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1887   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1888 }
1889
1890 UChar *
1891 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1892                                          int32_t desiredCapacityHint,
1893                                          UChar *scratch, int32_t scratchCapacity,
1894                                          int32_t *resultCapacity) {
1895   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1896     *resultCapacity = 0;
1897     return NULL;
1898   }
1899   int32_t oldLength = str.length();
1900   if(minCapacity <= (kMaxCapacity - oldLength) &&
1901       desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1902       str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1903     *resultCapacity = str.getCapacity() - oldLength;
1904     return str.getArrayStart() + oldLength;
1905   }
1906   *resultCapacity = scratchCapacity;
1907   return scratch;
1908 }
1909
1910 U_NAMESPACE_END
1911
1912 U_NAMESPACE_USE
1913
1914 U_CAPI int32_t U_EXPORT2
1915 uhash_hashUnicodeString(const UElement key) {
1916     const UnicodeString *str = (const UnicodeString*) key.pointer;
1917     return (str == NULL) ? 0 : str->hashCode();
1918 }
1919
1920 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1921 // does not depend on hashtable code.
1922 U_CAPI UBool U_EXPORT2
1923 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1924     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1925     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1926     if (str1 == str2) {
1927         return TRUE;
1928     }
1929     if (str1 == NULL || str2 == NULL) {
1930         return FALSE;
1931     }
1932     return *str1 == *str2;
1933 }
1934
#ifdef U_STATIC_IMPLEMENTATION
/*
This function should never be called. It only exists so that the
virtual vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file,
which keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *dummyArray = new UnicodeString[2];
    delete [] dummyArray;
}
#endif