icuSources/common/unistr.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 * Copyright (C) 1999-2016, International Business Machines Corporation and
   6 * others. All Rights Reserved.
   7 ******************************************************************************
   8 *
   9 * File unistr.cpp
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   09/25/98    stephen     Creation.
  15 *   04/20/99    stephen     Overhauled per 4/16 code review.
  16 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  17 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  18 *                           Replaceable.
  19 *   06/25/01    grhoten     Removed the dependency on iostream
  20 ******************************************************************************
  21 */
  22
  23 #include "unicode/utypes.h"
  24 #include "unicode/appendable.h"
  25 #include "unicode/putil.h"
  26 #include "cstring.h"
  27 #include "cmemory.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/unistr.h"
  30 #include "unicode/utf.h"
  31 #include "unicode/utf16.h"
  32 #include "uelement.h"
  33 #include "ustr_imp.h"
  34 #include "umutex.h"
  35 #include "uassert.h"
  36
  37 #if 0
  38
  39 #include <iostream>
  40 using namespace std;
  41
  42 //DEBUGGING
  43 void
  44 print(const UnicodeString& s,
  45       const char *name)
  46 {
  47   UChar c;
  48   cout << name << ":|";
  49   for(int i = 0; i < s.length(); ++i) {
  50     c = s[i];
  51     if(c>= 0x007E || c < 0x0020)
  52       cout << "[0x" << hex << s[i] << "]";
  53     else
  54       cout << (char) s[i];
  55   }
  56   cout << '|' << endl;
  57 }
  58
  59 void
  60 print(const UChar *s,
  61       int32_t len,
  62       const char *name)
  63 {
  64   UChar c;
  65   cout << name << ":|";
  66   for(int i = 0; i < len; ++i) {
  67     c = s[i];
  68     if(c>= 0x007E || c < 0x0020)
  69       cout << "[0x" << hex << s[i] << "]";
  70     else
  71       cout << (char) s[i];
  72   }
  73   cout << '|' << endl;
  74 }
  75 // END DEBUGGING
  76 #endif
  77
  78 // Local function definitions for now
  79
  80 // need to copy areas that may overlap
  81 static
  82 inline void
  83 us_arrayCopy(const UChar *src, int32_t srcStart,
  84          UChar *dst, int32_t dstStart, int32_t count)
  85 {
  86   if(count>0) {
  87     uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
  88   }
  89 }
  90
  91 // u_unescapeAt() callback to get a UChar from a UnicodeString
  92 U_CDECL_BEGIN
  93 static UChar U_CALLCONV
  94 UnicodeString_charAt(int32_t offset, void *context) {
  95     return ((icu::UnicodeString*) context)->charAt(offset);
  96 }
  97 U_CDECL_END
  98
  99 U_NAMESPACE_BEGIN
 100
 101 /* The Replaceable virtual destructor can't be defined in the header
 102    due to how AIX works with multiple definitions of virtual functions.
 103 */
 104 Replaceable::~Replaceable() {}
 105
 106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 107
 108 UnicodeString U_EXPORT2
 109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 110     return
 111         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 112             append(s1).
 113                 append(s2);
 114 }
 115
 116 //========================================
 117 // Reference Counting functions, put at top of file so that optimizing compilers
 118 //                               have a chance to automatically inline.
 119 //========================================
 120
 121 void
 122 UnicodeString::addRef() {
 123   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 124 }
 125
 126 int32_t
 127 UnicodeString::removeRef() {
 128   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 129 }
 130
 131 int32_t
 132 UnicodeString::refCount() const {
 133   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
 134 }
 135
 136 void
 137 UnicodeString::releaseArray() {
 138   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
 139     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 140   }
 141 }
 142
 143
 144
 145 //========================================
 146 // Constructors
 147 //========================================
 148
 149 // The default constructor is inline in unistr.h.
 150
 151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
 152   fUnion.fFields.fLengthAndFlags = 0;
 153   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 154     // just allocate and do not do anything else
 155     allocate(capacity);
 156   } else if(c <= 0xffff) {
 157     int32_t length = count;
 158     if(capacity < length) {
 159       capacity = length;
 160     }
 161     if(allocate(capacity)) {
 162       UChar *array = getArrayStart();
 163       UChar unit = (UChar)c;
 164       for(int32_t i = 0; i < length; ++i) {
 165         array[i] = unit;
 166       }
 167       setLength(length);
 168     }
 169   } else {  // supplementary code point, write surrogate pairs
 170     if(count > (INT32_MAX / 2)) {
 171       // We would get more than 2G UChars.
 172       allocate(capacity);
 173       return;
 174     }
 175     int32_t length = count * 2;
 176     if(capacity < length) {
 177       capacity = length;
 178     }
 179     if(allocate(capacity)) {
 180       UChar *array = getArrayStart();
 181       UChar lead = U16_LEAD(c);
 182       UChar trail = U16_TRAIL(c);
 183       for(int32_t i = 0; i < length; i += 2) {
 184         array[i] = lead;
 185         array[i + 1] = trail;
 186       }
 187       setLength(length);
 188     }
 189   }
 190 }
 191
 192 UnicodeString::UnicodeString(UChar ch) {
 193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
 194   fUnion.fStackFields.fBuffer[0] = ch;
 195 }
 196
 197 UnicodeString::UnicodeString(UChar32 ch) {
 198   fUnion.fFields.fLengthAndFlags = kShortString;
 199   int32_t i = 0;
 200   UBool isError = FALSE;
 201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
 202   // We test isError so that the compiler does not complain that we don't.
 203   // If isError then i==0 which is what we want anyway.
 204   if(!isError) {
 205     setShortLength(i);
 206   }
 207 }
 208
 209 UnicodeString::UnicodeString(const UChar *text) {
 210   fUnion.fFields.fLengthAndFlags = kShortString;
 211   doAppend(text, 0, -1);
 212 }
 213
 214 UnicodeString::UnicodeString(const UChar *text,
 215                              int32_t textLength) {
 216   fUnion.fFields.fLengthAndFlags = kShortString;
 217   doAppend(text, 0, textLength);
 218 }
 219
 220 UnicodeString::UnicodeString(UBool isTerminated,
 221                              ConstChar16Ptr textPtr,
 222                              int32_t textLength) {
 223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
 224   const UChar *text = textPtr;
 225   if(text == NULL) {
 226     // treat as an empty string, do not alias
 227     setToEmpty();
 228   } else if(textLength < -1 ||
 229             (textLength == -1 && !isTerminated) ||
 230             (textLength >= 0 && isTerminated && text[textLength] != 0)
 231   ) {
 232     setToBogus();
 233   } else {
 234     if(textLength == -1) {
 235       // text is terminated, or else it would have failed the above test
 236       textLength = u_strlen(text);
 237     }
 238     setArray(const_cast<UChar *>(text), textLength,
 239              isTerminated ? textLength + 1 : textLength);
 240   }
 241 }
 242
 243 UnicodeString::UnicodeString(UChar *buff,
 244                              int32_t buffLength,
 245                              int32_t buffCapacity) {
 246   fUnion.fFields.fLengthAndFlags = kWritableAlias;
 247   if(buff == NULL) {
 248     // treat as an empty string, do not alias
 249     setToEmpty();
 250   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 251     setToBogus();
 252   } else {
 253     if(buffLength == -1) {
 254       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 255       const UChar *p = buff, *limit = buff + buffCapacity;
 256       while(p != limit && *p != 0) {
 257         ++p;
 258       }
 259       buffLength = (int32_t)(p - buff);
 260     }
 261     setArray(buff, buffLength, buffCapacity);
 262   }
 263 }
 264
 265 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
 266   fUnion.fFields.fLengthAndFlags = kShortString;
 267   if(src==NULL) {
 268     // treat as an empty string
 269   } else {
 270     if(length<0) {
 271       length=(int32_t)uprv_strlen(src);
 272     }
 273     if(cloneArrayIfNeeded(length, length, FALSE)) {
 274       u_charsToUChars(src, getArrayStart(), length);
 275       setLength(length);
 276     } else {
 277       setToBogus();
 278     }
 279   }
 280 }
 281
 282 #if U_CHARSET_IS_UTF8
 283
 284 UnicodeString::UnicodeString(const char *codepageData) {
 285   fUnion.fFields.fLengthAndFlags = kShortString;
 286   if(codepageData != 0) {
 287     setToUTF8(codepageData);
 288   }
 289 }
 290
 291 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
 292   fUnion.fFields.fLengthAndFlags = kShortString;
 293   // if there's nothing to convert, do nothing
 294   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 295     return;
 296   }
 297   if(dataLength == -1) {
 298     dataLength = (int32_t)uprv_strlen(codepageData);
 299   }
 300   setToUTF8(StringPiece(codepageData, dataLength));
 301 }
 302
 303 // else see unistr_cnv.cpp
 304 #endif
 305
 306 UnicodeString::UnicodeString(const UnicodeString& that) {
 307   fUnion.fFields.fLengthAndFlags = kShortString;
 308   copyFrom(that);
 309 }
 310
 311 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
 312   copyFieldsFrom(src, TRUE);
 313 }
 314
 315 UnicodeString::UnicodeString(const UnicodeString& that,
 316                              int32_t srcStart) {
 317   fUnion.fFields.fLengthAndFlags = kShortString;
 318   setTo(that, srcStart);
 319 }
 320
 321 UnicodeString::UnicodeString(const UnicodeString& that,
 322                              int32_t srcStart,
 323                              int32_t srcLength) {
 324   fUnion.fFields.fLengthAndFlags = kShortString;
 325   setTo(that, srcStart, srcLength);
 326 }
 327
 328 // Replaceable base class clone() default implementation, does not clone
 329 Replaceable *
 330 Replaceable::clone() const {
 331   return NULL;
 332 }
 333
 334 // UnicodeString overrides clone() with a real implementation
 335 Replaceable *
 336 UnicodeString::clone() const {
 337   return new UnicodeString(*this);
 338 }
 339
 340 //========================================
 341 // array allocation
 342 //========================================
 343
 namespace {

 // Fixed part of the extra room added when a buffer grows.
 const int32_t kGrowSize = 128;

 // Upper bound for a heap buffer's capacity in UChars.
 // One int32_t reference counter plus capacity UChars plus one NUL
 // terminator (kept so getTerminatedBuffer() need not reallocate), rounded
 // up to a multiple of 16 bytes, must still fit a 32-bit size_t:
 //   (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
 // (Without the rounding 0x7ffffffd would be possible, at the price of more
 // complicated checks.)
 const int32_t kMaxCapacity = 0x7ffffff5;

 // Returns the capacity to allocate for a string of newLength units:
 // newLength plus a quarter of it plus kGrowSize, capped at kMaxCapacity.
 int32_t getGrowCapacity(int32_t newLength) {
   const int32_t headroom = kMaxCapacity - newLength;
   const int32_t growSize = (newLength >> 2) + kGrowSize;
   return (growSize <= headroom) ? newLength + growSize : kMaxCapacity;
 }

 }  // namespace
 367
 368 UBool
 369 UnicodeString::allocate(int32_t capacity) {
 370   if(capacity <= US_STACKBUF_SIZE) {
 371     fUnion.fFields.fLengthAndFlags = kShortString;
 372     return TRUE;
 373   }
 374   if(capacity <= kMaxCapacity) {
 375     ++capacity;  // for the NUL
 376     // Switch to size_t which is unsigned so that we can allocate up to 4GB.
 377     // Reference counter + UChars.
 378     size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
 379     // Round up to a multiple of 16.
 380     numBytes = (numBytes + 15) & ~15;
 381     int32_t *array = (int32_t *) uprv_malloc(numBytes);
 382     if(array != NULL) {
 383       // set initial refCount and point behind the refCount
 384       *array++ = 1;
 385       numBytes -= sizeof(int32_t);
 386
 387       // have fArray point to the first UChar
 388       fUnion.fFields.fArray = (UChar *)array;
 389       fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
 390       fUnion.fFields.fLengthAndFlags = kLongString;
 391       return TRUE;
 392     }
 393   }
 394   fUnion.fFields.fLengthAndFlags = kIsBogus;
 395   fUnion.fFields.fArray = 0;
 396   fUnion.fFields.fCapacity = 0;
 397   return FALSE;
 398 }
 399
 400 //========================================
 401 // Destructor
 402 //========================================
 403
 404 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 405 static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
 406 static u_atomic_int32_t beyondCount(0);
 407
 408 U_CAPI void unistr_printLengths() {
 409   int32_t i;
 410   for(i = 0; i <= 59; ++i) {
 411     printf("%2d,  %9d\n", i, (int32_t)finalLengthCounts[i]);
 412   }
 413   int32_t beyond = beyondCount;
 414   for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
 415     beyond += finalLengthCounts[i];
 416   }
 417   printf(">59, %9d\n", beyond);
 418 }
 419 #endif
 420
 421 UnicodeString::~UnicodeString()
 422 {
 423 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 424   // Count lengths of strings at the end of their lifetime.
 425   // Useful for discussion of a desirable stack buffer size.
 426   // Count the contents length, not the optional NUL terminator nor further capacity.
 427   // Ignore open-buffer strings and strings which alias external storage.
 428   if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
 429     if(hasShortLength()) {
 430       umtx_atomic_inc(finalLengthCounts + getShortLength());
 431     } else {
 432       umtx_atomic_inc(&beyondCount);
 433     }
 434   }
 435 #endif
 436
 437   releaseArray();
 438 }
 439
 440 //========================================
 441 // Factory methods
 442 //========================================
 443
 444 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
 445   UnicodeString result;
 446   result.setToUTF8(utf8);
 447   return result;
 448 }
 449
 450 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 451   UnicodeString result;
 452   int32_t capacity;
 453   // Most UTF-32 strings will be BMP-only and result in a same-length
 454   // UTF-16 string. We overestimate the capacity just slightly,
 455   // just in case there are a few supplementary characters.
 456   if(length <= US_STACKBUF_SIZE) {
 457     capacity = US_STACKBUF_SIZE;
 458   } else {
 459     capacity = length + (length >> 4) + 4;
 460   }
 461   do {
 462     UChar *utf16 = result.getBuffer(capacity);
 463     int32_t length16;
 464     UErrorCode errorCode = U_ZERO_ERROR;
 465     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 466         utf32, length,
 467         0xfffd,  // Substitution character.
 468         NULL,    // Don't care about number of substitutions.
 469         &errorCode);
 470     result.releaseBuffer(length16);
 471     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 472       capacity = length16 + 1;  // +1 for the terminating NUL.
 473       continue;
 474     } else if(U_FAILURE(errorCode)) {
 475       result.setToBogus();
 476     }
 477     break;
 478   } while(TRUE);
 479   return result;
 480 }
 481
 482 //========================================
 483 // Assignment
 484 //========================================
 485
 486 UnicodeString &
 487 UnicodeString::operator=(const UnicodeString &src) {
 488   return copyFrom(src);
 489 }
 490
 491 UnicodeString &
 492 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 493   return copyFrom(src, TRUE);
 494 }
 495
 496 UnicodeString &
 497 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 498   // if assigning to ourselves, do nothing
 499   if(this == &src) {
 500     return *this;
 501   }
 502
 503   // is the right side bogus?
 504   if(src.isBogus()) {
 505     setToBogus();
 506     return *this;
 507   }
 508
 509   // delete the current contents
 510   releaseArray();
 511
 512   if(src.isEmpty()) {
 513     // empty string - use the stack buffer
 514     setToEmpty();
 515     return *this;
 516   }
 517
 518   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 519   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 520   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
 521   case kShortString:
 522     // short string using the stack buffer, do the same
 523     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 524                 getShortLength() * U_SIZEOF_UCHAR);
 525     break;
 526   case kLongString:
 527     // src uses a refCounted string buffer, use that buffer with refCount
 528     // src is const, use a cast - we don't actually change it
 529     ((UnicodeString &)src).addRef();
 530     // copy all fields, share the reference-counted buffer
 531     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 532     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 533     if(!hasShortLength()) {
 534       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 535     }
 536     break;
 537   case kReadonlyAlias:
 538     if(fastCopy) {
 539       // src is a readonly alias, do the same
 540       // -> maintain the readonly alias as such
 541       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 542       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 543       if(!hasShortLength()) {
 544         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 545       }
 546       break;
 547     }
 548     // else if(!fastCopy) fall through to case kWritableAlias
 549     // -> allocate a new buffer and copy the contents
 550     U_FALLTHROUGH;
 551   case kWritableAlias: {
 552     // src is a writable alias; we make a copy of that instead
 553     int32_t srcLength = src.length();
 554     if(allocate(srcLength)) {
 555       u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
 556       setLength(srcLength);
 557       break;
 558     }
 559     // if there is not enough memory, then fall through to setting to bogus
 560     U_FALLTHROUGH;
 561   }
 562   default:
 563     // if src is bogus, set ourselves to bogus
 564     // do not call setToBogus() here because fArray and flags are not consistent here
 565     fUnion.fFields.fLengthAndFlags = kIsBogus;
 566     fUnion.fFields.fArray = 0;
 567     fUnion.fFields.fCapacity = 0;
 568     break;
 569   }
 570
 571   return *this;
 572 }
 573
 574 UnicodeString &UnicodeString::operator=(UnicodeString &&src) U_NOEXCEPT {
 575   // No explicit check for self move assignment, consistent with standard library.
 576   // Self move assignment causes no crash nor leak but might make the object bogus.
 577   releaseArray();
 578   copyFieldsFrom(src, TRUE);
 579   return *this;
 580 }
 581
 582 // Same as move assignment except without memory management.
 583 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
 584   int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 585   if(lengthAndFlags & kUsingStackBuffer) {
 586     // Short string using the stack buffer, copy the contents.
 587     // Check for self assignment to prevent "overlap in memcpy" warnings,
 588     // although it should be harmless to copy a buffer to itself exactly.
 589     if(this != &src) {
 590       uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 591                   getShortLength() * U_SIZEOF_UCHAR);
 592     }
 593   } else {
 594     // In all other cases, copy all fields.
 595     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 596     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 597     if(!hasShortLength()) {
 598       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 599     }
 600     if(setSrcToBogus) {
 601       // Set src to bogus without releasing any memory.
 602       src.fUnion.fFields.fLengthAndFlags = kIsBogus;
 603       src.fUnion.fFields.fArray = NULL;
 604       src.fUnion.fFields.fCapacity = 0;
 605     }
 606   }
 607 }
 608
 609 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
 610   UnicodeString temp;  // Empty short string: Known not to need releaseArray().
 611   // Copy fields without resetting source values in between.
 612   temp.copyFieldsFrom(*this, FALSE);
 613   this->copyFieldsFrom(other, FALSE);
 614   other.copyFieldsFrom(temp, FALSE);
 615   // Set temp to an empty string so that other's memory is not released twice.
 616   temp.fUnion.fFields.fLengthAndFlags = kShortString;
 617 }
 618
 619 //========================================
 620 // Miscellaneous operations
 621 //========================================
 622
 623 UnicodeString UnicodeString::unescape() const {
 624     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 625     if (result.isBogus()) {
 626         return result;
 627     }
 628     const UChar *array = getBuffer();
 629     int32_t len = length();
 630     int32_t prev = 0;
 631     for (int32_t i=0;;) {
 632         if (i == len) {
 633             result.append(array, prev, len - prev);
 634             break;
 635         }
 636         if (array[i++] == 0x5C /*'\\'*/) {
 637             result.append(array, prev, (i - 1) - prev);
 638             UChar32 c = unescapeAt(i); // advances i
 639             if (c < 0) {
 640                 result.remove(); // return empty string
 641                 break; // invalid escape sequence
 642             }
 643             result.append(c);
 644             prev = i;
 645         }
 646     }
 647     return result;
 648 }
 649
 650 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 651     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 652 }
 653
 654 //========================================
 655 // Read-only implementation
 656 //========================================
 657 UBool
 658 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
 659   // Requires: this & text not bogus and have same lengths.
 660   // Byte-wise comparison works for equality regardless of endianness.
 661   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
 662 }
 663
 664 int8_t
 665 UnicodeString::doCompare( int32_t start,
 666               int32_t length,
 667               const UChar *srcChars,
 668               int32_t srcStart,
 669               int32_t srcLength) const
 670 {
 671   // compare illegal string values
 672   if(isBogus()) {
 673     return -1;
 674   }
 675
 676   // pin indices to legal values
 677   pinIndices(start, length);
 678
 679   if(srcChars == NULL) {
 680     // treat const UChar *srcChars==NULL as an empty string
 681     return length == 0 ? 0 : 1;
 682   }
 683
 684   // get the correct pointer
 685   const UChar *chars = getArrayStart();
 686
 687   chars += start;
 688   srcChars += srcStart;
 689
 690   int32_t minLength;
 691   int8_t lengthResult;
 692
 693   // get the srcLength if necessary
 694   if(srcLength < 0) {
 695     srcLength = u_strlen(srcChars + srcStart);
 696   }
 697
 698   // are we comparing different lengths?
 699   if(length != srcLength) {
 700     if(length < srcLength) {
 701       minLength = length;
 702       lengthResult = -1;
 703     } else {
 704       minLength = srcLength;
 705       lengthResult = 1;
 706     }
 707   } else {
 708     minLength = length;
 709     lengthResult = 0;
 710   }
 711
 712   /*
 713    * note that uprv_memcmp() returns an int but we return an int8_t;
 714    * we need to take care not to truncate the result -
 715    * one way to do this is to right-shift the value to
 716    * move the sign bit into the lower 8 bits and making sure that this
 717    * does not become 0 itself
 718    */
 719
 720   if(minLength > 0 && chars != srcChars) {
 721     int32_t result;
 722
 723 #   if U_IS_BIG_ENDIAN
 724       // big-endian: byte comparison works
 725       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 726       if(result != 0) {
 727         return (int8_t)(result >> 15 | 1);
 728       }
 729 #   else
 730       // little-endian: compare UChar units
 731       do {
 732         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 733         if(result != 0) {
 734           return (int8_t)(result >> 15 | 1);
 735         }
 736       } while(--minLength > 0);
 737 #   endif
 738   }
 739   return lengthResult;
 740 }
 741
 742 /* String compare in code point order - doCompare() compares in code unit order. */
 743 int8_t
 744 UnicodeString::doCompareCodePointOrder(int32_t start,
 745                                        int32_t length,
 746                                        const UChar *srcChars,
 747                                        int32_t srcStart,
 748                                        int32_t srcLength) const
 749 {
 750   // compare illegal string values
 751   // treat const UChar *srcChars==NULL as an empty string
 752   if(isBogus()) {
 753     return -1;
 754   }
 755
 756   // pin indices to legal values
 757   pinIndices(start, length);
 758
 759   if(srcChars == NULL) {
 760     srcStart = srcLength = 0;
 761   }
 762
 763   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 764   /* translate the 32-bit result into an 8-bit one */
 765   if(diff!=0) {
 766     return (int8_t)(diff >> 15 | 1);
 767   } else {
 768     return 0;
 769   }
 770 }
 771
 772 int32_t
 773 UnicodeString::getLength() const {
 774     return length();
 775 }
 776
 777 UChar
 778 UnicodeString::getCharAt(int32_t offset) const {
 779   return charAt(offset);
 780 }
 781
 782 UChar32
 783 UnicodeString::getChar32At(int32_t offset) const {
 784   return char32At(offset);
 785 }
 786
 787 UChar32
 788 UnicodeString::char32At(int32_t offset) const
 789 {
 790   int32_t len = length();
 791   if((uint32_t)offset < (uint32_t)len) {
 792     const UChar *array = getArrayStart();
 793     UChar32 c;
 794     U16_GET(array, 0, offset, len, c);
 795     return c;
 796   } else {
 797     return kInvalidUChar;
 798   }
 799 }
 800
 801 int32_t
 802 UnicodeString::getChar32Start(int32_t offset) const {
 803   if((uint32_t)offset < (uint32_t)length()) {
 804     const UChar *array = getArrayStart();
 805     U16_SET_CP_START(array, 0, offset);
 806     return offset;
 807   } else {
 808     return 0;
 809   }
 810 }
 811
 812 int32_t
 813 UnicodeString::getChar32Limit(int32_t offset) const {
 814   int32_t len = length();
 815   if((uint32_t)offset < (uint32_t)len) {
 816     const UChar *array = getArrayStart();
 817     U16_SET_CP_LIMIT(array, 0, offset, len);
 818     return offset;
 819   } else {
 820     return len;
 821   }
 822 }
 823
 824 int32_t
 825 UnicodeString::countChar32(int32_t start, int32_t length) const {
 826   pinIndices(start, length);
 827   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 828   return u_countChar32(getArrayStart()+start, length);
 829 }
 830
 831 UBool
 832 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 833   pinIndices(start, length);
 834   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 835   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 836 }
 837
 838 int32_t
 839 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 840   // pin index
 841   int32_t len = length();
 842   if(index<0) {
 843     index=0;
 844   } else if(index>len) {
 845     index=len;
 846   }
 847
 848   const UChar *array = getArrayStart();
 849   if(delta>0) {
 850     U16_FWD_N(array, index, len, delta);
 851   } else {
 852     U16_BACK_N(array, 0, index, -delta);
 853   }
 854
 855   return index;
 856 }
 857
 858 void
 859 UnicodeString::doExtract(int32_t start,
 860              int32_t length,
 861              UChar *dst,
 862              int32_t dstStart) const
 863 {
 864   // pin indices to legal values
 865   pinIndices(start, length);
 866
 867   // do not copy anything if we alias dst itself
 868   const UChar *array = getArrayStart();
 869   if(array + start != dst + dstStart) {
 870     us_arrayCopy(array, start, dst, dstStart, length);
 871   }
 872 }
 873
 874 int32_t
 875 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
 876                        UErrorCode &errorCode) const {
 877   int32_t len = length();
 878   if(U_SUCCESS(errorCode)) {
 879     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 880       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 881     } else {
 882       const UChar *array = getArrayStart();
 883       if(len>0 && len<=destCapacity && array!=dest) {
 884         u_memcpy(dest, array, len);
 885       }
 886       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 887     }
 888   }
 889
 890   return len;
 891 }
 892
 893 int32_t
 894 UnicodeString::extract(int32_t start,
 895                        int32_t length,
 896                        char *target,
 897                        int32_t targetCapacity,
 898                        enum EInvariant) const
 899 {
 900   // if the arguments are illegal, then do nothing
 901   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 902     return 0;
 903   }
 904
 905   // pin the indices to legal values
 906   pinIndices(start, length);
 907
 908   if(length <= targetCapacity) {
 909     u_UCharsToChars(getArrayStart() + start, target, length);
 910   }
 911   UErrorCode status = U_ZERO_ERROR;
 912   return u_terminateChars(target, targetCapacity, length, &status);
 913 }
 914
 915 UnicodeString
 916 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 917   pinIndices(start, len);
 918   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 919   if(array==NULL) {
 920     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
 921     len=-2;  // bogus result string
 922   }
 923   return UnicodeString(FALSE, array + start, len);
 924 }
 925
 926 int32_t
 927 UnicodeString::toUTF8(int32_t start, int32_t len,
 928                       char *target, int32_t capacity) const {
 929   pinIndices(start, len);
 930   int32_t length8;
 931   UErrorCode errorCode = U_ZERO_ERROR;
 932   u_strToUTF8WithSub(target, capacity, &length8,
 933                      getBuffer() + start, len,
 934                      0xFFFD,  // Standard substitution character.
 935                      NULL,    // Don't care about number of substitutions.
 936                      &errorCode);
 937   return length8;
 938 }
 939
 940 #if U_CHARSET_IS_UTF8
 941
 942 int32_t
 943 UnicodeString::extract(int32_t start, int32_t len,
 944                        char *target, uint32_t dstSize) const {
 945   // if the arguments are illegal, then do nothing
 946   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 947     return 0;
 948   }
 949   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 950 }
 951
 952 // else see unistr_cnv.cpp
 953 #endif
 954
 955 void
 956 UnicodeString::extractBetween(int32_t start,
 957                   int32_t limit,
 958                   UnicodeString& target) const {
 959   pinIndex(start);
 960   pinIndex(limit);
 961   doExtract(start, limit - start, target);
 962 }
 963
 964 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 965 // as many bytes as the source has UChars.
 966 // The "worst cases" are writing systems like Indic, Thai and CJK with
 967 // 3:1 bytes:UChars.
 968 void
 969 UnicodeString::toUTF8(ByteSink &sink) const {
 970   int32_t length16 = length();
 971   if(length16 != 0) {
 972     char stackBuffer[1024];
 973     int32_t capacity = (int32_t)sizeof(stackBuffer);
 974     UBool utf8IsOwned = FALSE;
 975     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 976                                       3*length16,
 977                                       stackBuffer, capacity,
 978                                       &capacity);
 979     int32_t length8 = 0;
 980     UErrorCode errorCode = U_ZERO_ERROR;
 981     u_strToUTF8WithSub(utf8, capacity, &length8,
 982                        getBuffer(), length16,
 983                        0xFFFD,  // Standard substitution character.
 984                        NULL,    // Don't care about number of substitutions.
 985                        &errorCode);
 986     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 987       utf8 = (char *)uprv_malloc(length8);
 988       if(utf8 != NULL) {
 989         utf8IsOwned = TRUE;
 990         errorCode = U_ZERO_ERROR;
 991         u_strToUTF8WithSub(utf8, length8, &length8,
 992                            getBuffer(), length16,
 993                            0xFFFD,  // Standard substitution character.
 994                            NULL,    // Don't care about number of substitutions.
 995                            &errorCode);
 996       } else {
 997         errorCode = U_MEMORY_ALLOCATION_ERROR;
 998       }
 999     }
1000     if(U_SUCCESS(errorCode)) {
1001       sink.Append(utf8, length8);
1002       sink.Flush();
1003     }
1004     if(utf8IsOwned) {
1005       uprv_free(utf8);
1006     }
1007   }
1008 }
1009
1010 int32_t
1011 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1012   int32_t length32=0;
1013   if(U_SUCCESS(errorCode)) {
1014     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1015     u_strToUTF32WithSub(utf32, capacity, &length32,
1016         getBuffer(), length(),
1017         0xfffd,  // Substitution character.
1018         NULL,    // Don't care about number of substitutions.
1019         &errorCode);
1020   }
1021   return length32;
1022 }
1023
1024 int32_t
1025 UnicodeString::indexOf(const UChar *srcChars,
1026                int32_t srcStart,
1027                int32_t srcLength,
1028                int32_t start,
1029                int32_t length) const
1030 {
1031   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1032     return -1;
1033   }
1034
1035   // UnicodeString does not find empty substrings
1036   if(srcLength < 0 && srcChars[srcStart] == 0) {
1037     return -1;
1038   }
1039
1040   // get the indices within bounds
1041   pinIndices(start, length);
1042
1043   // find the first occurrence of the substring
1044   const UChar *array = getArrayStart();
1045   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1046   if(match == NULL) {
1047     return -1;
1048   } else {
1049     return (int32_t)(match - array);
1050   }
1051 }
1052
1053 int32_t
1054 UnicodeString::doIndexOf(UChar c,
1055              int32_t start,
1056              int32_t length) const
1057 {
1058   // pin indices
1059   pinIndices(start, length);
1060
1061   // find the first occurrence of c
1062   const UChar *array = getArrayStart();
1063   const UChar *match = u_memchr(array + start, c, length);
1064   if(match == NULL) {
1065     return -1;
1066   } else {
1067     return (int32_t)(match - array);
1068   }
1069 }
1070
1071 int32_t
1072 UnicodeString::doIndexOf(UChar32 c,
1073                          int32_t start,
1074                          int32_t length) const {
1075   // pin indices
1076   pinIndices(start, length);
1077
1078   // find the first occurrence of c
1079   const UChar *array = getArrayStart();
1080   const UChar *match = u_memchr32(array + start, c, length);
1081   if(match == NULL) {
1082     return -1;
1083   } else {
1084     return (int32_t)(match - array);
1085   }
1086 }
1087
1088 int32_t
1089 UnicodeString::lastIndexOf(const UChar *srcChars,
1090                int32_t srcStart,
1091                int32_t srcLength,
1092                int32_t start,
1093                int32_t length) const
1094 {
1095   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1096     return -1;
1097   }
1098
1099   // UnicodeString does not find empty substrings
1100   if(srcLength < 0 && srcChars[srcStart] == 0) {
1101     return -1;
1102   }
1103
1104   // get the indices within bounds
1105   pinIndices(start, length);
1106
1107   // find the last occurrence of the substring
1108   const UChar *array = getArrayStart();
1109   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1110   if(match == NULL) {
1111     return -1;
1112   } else {
1113     return (int32_t)(match - array);
1114   }
1115 }
1116
1117 int32_t
1118 UnicodeString::doLastIndexOf(UChar c,
1119                  int32_t start,
1120                  int32_t length) const
1121 {
1122   if(isBogus()) {
1123     return -1;
1124   }
1125
1126   // pin indices
1127   pinIndices(start, length);
1128
1129   // find the last occurrence of c
1130   const UChar *array = getArrayStart();
1131   const UChar *match = u_memrchr(array + start, c, length);
1132   if(match == NULL) {
1133     return -1;
1134   } else {
1135     return (int32_t)(match - array);
1136   }
1137 }
1138
1139 int32_t
1140 UnicodeString::doLastIndexOf(UChar32 c,
1141                              int32_t start,
1142                              int32_t length) const {
1143   // pin indices
1144   pinIndices(start, length);
1145
1146   // find the last occurrence of c
1147   const UChar *array = getArrayStart();
1148   const UChar *match = u_memrchr32(array + start, c, length);
1149   if(match == NULL) {
1150     return -1;
1151   } else {
1152     return (int32_t)(match - array);
1153   }
1154 }
1155
1156 //========================================
1157 // Write implementation
1158 //========================================
1159
1160 UnicodeString&
1161 UnicodeString::findAndReplace(int32_t start,
1162                   int32_t length,
1163                   const UnicodeString& oldText,
1164                   int32_t oldStart,
1165                   int32_t oldLength,
1166                   const UnicodeString& newText,
1167                   int32_t newStart,
1168                   int32_t newLength)
1169 {
1170   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1171     return *this;
1172   }
1173
1174   pinIndices(start, length);
1175   oldText.pinIndices(oldStart, oldLength);
1176   newText.pinIndices(newStart, newLength);
1177
1178   if(oldLength == 0) {
1179     return *this;
1180   }
1181
1182   while(length > 0 && length >= oldLength) {
1183     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1184     if(pos < 0) {
1185       // no more oldText's here: done
1186       break;
1187     } else {
1188       // we found oldText, replace it by newText and go beyond it
1189       replace(pos, oldLength, newText, newStart, newLength);
1190       length -= pos + oldLength - start;
1191       start = pos + newLength;
1192     }
1193   }
1194
1195   return *this;
1196 }
1197
1198
1199 void
1200 UnicodeString::setToBogus()
1201 {
1202   releaseArray();
1203
1204   fUnion.fFields.fLengthAndFlags = kIsBogus;
1205   fUnion.fFields.fArray = 0;
1206   fUnion.fFields.fCapacity = 0;
1207 }
1208
1209 // turn a bogus string into an empty one
1210 void
1211 UnicodeString::unBogus() {
1212   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1213     setToEmpty();
1214   }
1215 }
1216
// Returns a pointer to this string's contents such that array[length()] is
// a NUL code unit, or nullptr if the string is not writable or if the
// buffer could not be cloned/grown to make room for the terminator.
// Non-const: the terminator may be written in place, or the buffer may be
// reallocated via cloneArrayIfNeeded().
const char16_t *
UnicodeString::getTerminatedBuffer() {
  if(!isWritable()) {
    return nullptr;
  }
  UChar *array = getArrayStart();
  int32_t len = length();
  if(len < getCapacity()) {
    // The current buffer has room for a code unit at array[len].
    if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
      // If len<capacity on a read-only alias, then array[len] is
      // either the original NUL (if constructed with (TRUE, s, length))
      // or one of the original string contents characters (if later truncated),
      // therefore we can assume that array[len] is initialized memory.
      if(array[len] == 0) {
        return array;
      }
      // Not terminated and we must not write into a read-only buffer:
      // fall through to the cloning path below.
    } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
      // kRefCounted: Do not write the NUL if the buffer is shared.
      // That is mostly safe, except when the length of one copy was modified
      // without copy-on-write, e.g., via truncate(newLength) or remove(void).
      // Then the NUL would be written into the middle of another copy's string.

      // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
      // Do not test if there is a NUL already because it might be uninitialized memory.
      // (That would be safe, but tools like valgrind & Purify would complain.)
      array[len] = 0;
      return array;
    }
  }
  // Make sure there is a writable buffer with room for len+1 code units
  // (the len<INT32_MAX test keeps len+1 from overflowing), then terminate it.
  if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
    array = getArrayStart();
    array[len] = 0;
    return array;
  } else {
    return nullptr;
  }
}
1254
1255 // setTo() analogous to the readonly-aliasing constructor with the same signature
1256 UnicodeString &
1257 UnicodeString::setTo(UBool isTerminated,
1258                      ConstChar16Ptr textPtr,
1259                      int32_t textLength)
1260 {
1261   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1262     // do not modify a string that has an "open" getBuffer(minCapacity)
1263     return *this;
1264   }
1265
1266   const UChar *text = textPtr;
1267   if(text == NULL) {
1268     // treat as an empty string, do not alias
1269     releaseArray();
1270     setToEmpty();
1271     return *this;
1272   }
1273
1274   if( textLength < -1 ||
1275       (textLength == -1 && !isTerminated) ||
1276       (textLength >= 0 && isTerminated && text[textLength] != 0)
1277   ) {
1278     setToBogus();
1279     return *this;
1280   }
1281
1282   releaseArray();
1283
1284   if(textLength == -1) {
1285     // text is terminated, or else it would have failed the above test
1286     textLength = u_strlen(text);
1287   }
1288   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1289   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1290   return *this;
1291 }
1292
1293 // setTo() analogous to the writable-aliasing constructor with the same signature
1294 UnicodeString &
1295 UnicodeString::setTo(UChar *buffer,
1296                      int32_t buffLength,
1297                      int32_t buffCapacity) {
1298   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1299     // do not modify a string that has an "open" getBuffer(minCapacity)
1300     return *this;
1301   }
1302
1303   if(buffer == NULL) {
1304     // treat as an empty string, do not alias
1305     releaseArray();
1306     setToEmpty();
1307     return *this;
1308   }
1309
1310   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1311     setToBogus();
1312     return *this;
1313   } else if(buffLength == -1) {
1314     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1315     const UChar *p = buffer, *limit = buffer + buffCapacity;
1316     while(p != limit && *p != 0) {
1317       ++p;
1318     }
1319     buffLength = (int32_t)(p - buffer);
1320   }
1321
1322   releaseArray();
1323
1324   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1325   setArray(buffer, buffLength, buffCapacity);
1326   return *this;
1327 }
1328
1329 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1330   unBogus();
1331   int32_t length = utf8.length();
1332   int32_t capacity;
1333   // The UTF-16 string will be at most as long as the UTF-8 string.
1334   if(length <= US_STACKBUF_SIZE) {
1335     capacity = US_STACKBUF_SIZE;
1336   } else {
1337     capacity = length + 1;  // +1 for the terminating NUL.
1338   }
1339   UChar *utf16 = getBuffer(capacity);
1340   int32_t length16;
1341   UErrorCode errorCode = U_ZERO_ERROR;
1342   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1343       utf8.data(), length,
1344       0xfffd,  // Substitution character.
1345       NULL,    // Don't care about number of substitutions.
1346       &errorCode);
1347   releaseBuffer(length16);
1348   if(U_FAILURE(errorCode)) {
1349     setToBogus();
1350   }
1351   return *this;
1352 }
1353
1354 UnicodeString&
1355 UnicodeString::setCharAt(int32_t offset,
1356              UChar c)
1357 {
1358   int32_t len = length();
1359   if(cloneArrayIfNeeded() && len > 0) {
1360     if(offset < 0) {
1361       offset = 0;
1362     } else if(offset >= len) {
1363       offset = len - 1;
1364     }
1365
1366     getArrayStart()[offset] = c;
1367   }
1368   return *this;
1369 }
1370
1371 UnicodeString&
1372 UnicodeString::replace(int32_t start,
1373                int32_t _length,
1374                UChar32 srcChar) {
1375   UChar buffer[U16_MAX_LENGTH];
1376   int32_t count = 0;
1377   UBool isError = FALSE;
1378   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1379   // We test isError so that the compiler does not complain that we don't.
1380   // If isError (srcChar is not a valid code point) then count==0 which means
1381   // we remove the source segment rather than replacing it with srcChar.
1382   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1383 }
1384
1385 UnicodeString&
1386 UnicodeString::append(UChar32 srcChar) {
1387   UChar buffer[U16_MAX_LENGTH];
1388   int32_t _length = 0;
1389   UBool isError = FALSE;
1390   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1391   // We test isError so that the compiler does not complain that we don't.
1392   // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1393   return isError ? *this : doAppend(buffer, 0, _length);
1394 }
1395
1396 UnicodeString&
1397 UnicodeString::doReplace( int32_t start,
1398               int32_t length,
1399               const UnicodeString& src,
1400               int32_t srcStart,
1401               int32_t srcLength)
1402 {
1403   // pin the indices to legal values
1404   src.pinIndices(srcStart, srcLength);
1405
1406   // get the characters from src
1407   // and replace the range in ourselves with them
1408   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1409 }
1410
// Replaces the range [start, start+length) of this string with srcLength
// code units taken from srcChars + srcStart. Returns *this.
// - Does nothing if this string is not writable.
// - Removing a prefix or suffix from a read-only alias only adjusts the
//   array pointer/length/capacity without copying.
// - start == length() is forwarded to doAppend().
// - A NULL srcChars is treated as an empty replacement; a negative srcLength
//   means that srcChars + srcStart is NUL-terminated.
// - Sets this string to bogus if the new length would overflow int32_t or
//   if a temporary copy of an overlapping source cannot be made.
UnicodeString&
UnicodeString::doReplace(int32_t start,
             int32_t length,
             const UChar *srcChars,
             int32_t srcStart,
             int32_t srcLength)
{
  if(!isWritable()) {
    return *this;
  }

  int32_t oldLength = this->length();

  // optimize (read-only alias).remove(0, start) and .remove(start, end)
  if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
    if(start == 0) {
      // remove prefix by adjusting the array pointer
      pinIndex(length);
      fUnion.fFields.fArray += length;
      fUnion.fFields.fCapacity -= length;
      setLength(oldLength - length);
      return *this;
    } else {
      pinIndex(start);
      if(length >= (oldLength - start)) {
        // remove suffix by reducing the length (like truncate())
        setLength(start);
        fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
        return *this;
      }
    }
  }

  // Replacing at the very end is an append; use the code path optimized for it.
  if(start == oldLength) {
    return doAppend(srcChars, srcStart, srcLength);
  }

  if(srcChars == 0) {
    srcLength = 0;
  } else {
    // Perform all remaining operations relative to srcChars + srcStart.
    // From this point forward, do not use srcStart.
    srcChars += srcStart;
    if (srcLength < 0) {
      // get the srcLength if necessary
      srcLength = u_strlen(srcChars);
    }
  }

  // pin the indices to legal values
  pinIndices(start, length);

  // Calculate the size of the string after the replace.
  // Avoid int32_t overflow.
  int32_t newLength = oldLength - length;
  if(srcLength > (INT32_MAX - newLength)) {
    setToBogus();
    return *this;
  }
  newLength += srcLength;

  // Check for insertion into ourself
  const UChar *oldArray = getArrayStart();
  if (isBufferWritable() &&
      oldArray < srcChars + srcLength &&
      srcChars < oldArray + oldLength) {
    // The source overlaps our own writable buffer, so moving our contents
    // around below could clobber it before it is copied in.
    // Copy into a new UnicodeString and start over
    UnicodeString copy(srcChars, srcLength);
    if (copy.isBogus()) {
      setToBogus();
      return *this;
    }
    return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
  }

  // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
  // therefore we need to keep the current fArray
  UChar oldStackBuffer[US_STACKBUF_SIZE];
  if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
    // copy the stack buffer contents because it will be overwritten with
    // fUnion.fFields values
    u_memcpy(oldStackBuffer, oldArray, oldLength);
    oldArray = oldStackBuffer;
  }

  // clone our array and allocate a bigger array if needed
  // (bufferToDelete receives the old heap buffer, if any, so that it stays
  // valid until the copying below is finished)
  int32_t *bufferToDelete = 0;
  if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
                         FALSE, &bufferToDelete)
  ) {
    return *this;
  }

  // now do the replace

  UChar *newArray = getArrayStart();
  if(newArray != oldArray) {
    // if fArray changed, then we need to copy everything except what will change
    us_arrayCopy(oldArray, 0, newArray, 0, start);
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  } else if(length != srcLength) {
    // fArray did not change; copy only the portion that isn't changing, leaving a hole
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  }

  // now fill in the hole with the new string
  us_arrayCopy(srcChars, 0, newArray, start, srcLength);

  setLength(newLength);

  // delayed delete in case srcChars == fArray when we started, and
  // to keep oldArray alive for the above operations
  if (bufferToDelete) {
    uprv_free(bufferToDelete);
  }

  return *this;
}
1533
1534 // Versions of doReplace() only for append() variants.
1535 // doReplace() and doAppend() optimize for different cases.
1536
1537 UnicodeString&
1538 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1539   if(srcLength == 0) {
1540     return *this;
1541   }
1542
1543   // pin the indices to legal values
1544   src.pinIndices(srcStart, srcLength);
1545   return doAppend(src.getArrayStart(), srcStart, srcLength);
1546 }
1547
1548 UnicodeString&
1549 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1550   if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1551     return *this;
1552   }
1553
1554   // Perform all remaining operations relative to srcChars + srcStart.
1555   // From this point forward, do not use srcStart.
1556   srcChars += srcStart;
1557
1558   if(srcLength < 0) {
1559     // get the srcLength if necessary
1560     if((srcLength = u_strlen(srcChars)) == 0) {
1561       return *this;
1562     }
1563   }
1564
1565   int32_t oldLength = length();
1566   int32_t newLength = oldLength + srcLength;
1567
1568   // Check for append onto ourself
1569   const UChar* oldArray = getArrayStart();
1570   if (isBufferWritable() &&
1571       oldArray < srcChars + srcLength &&
1572       srcChars < oldArray + oldLength) {
1573     // Copy into a new UnicodeString and start over
1574     UnicodeString copy(srcChars, srcLength);
1575     if (copy.isBogus()) {
1576       setToBogus();
1577       return *this;
1578     }
1579     return doAppend(copy.getArrayStart(), 0, srcLength);
1580   }
1581
1582   // optimize append() onto a large-enough, owned string
1583   if((newLength <= getCapacity() && isBufferWritable()) ||
1584       cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1585     UChar *newArray = getArrayStart();
1586     // Do not copy characters when
1587     //   UChar *buffer=str.getAppendBuffer(...);
1588     // is followed by
1589     //   str.append(buffer, length);
1590     // or
1591     //   str.appendString(buffer, length)
1592     // or similar.
1593     if(srcChars != newArray + oldLength) {
1594       us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
1595     }
1596     setLength(newLength);
1597   }
1598   return *this;
1599 }
1600
1601 /**
1602  * Replaceable API
1603  */
1604 void
1605 UnicodeString::handleReplaceBetween(int32_t start,
1606                                     int32_t limit,
1607                                     const UnicodeString& text) {
1608     replaceBetween(start, limit, text);
1609 }
1610
1611 /**
1612  * Replaceable API
1613  */
1614 void
1615 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1616     if (limit <= start) {
1617         return; // Nothing to do; avoid bogus malloc call
1618     }
1619     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1620     // Check to make sure text is not null.
1621     if (text != NULL) {
1622             extractBetween(start, limit, text, 0);
1623             insert(dest, text, 0, limit - start);
1624             uprv_free(text);
1625     }
1626 }
1627
/**
 * Replaceable API
 *
 * Implementation for the Replaceable base class: always returns TRUE.
 *
 * NOTE: This is for the Replaceable class.  There is no rep.cpp,
 * so we implement this function here.
 */
UBool Replaceable::hasMetaData() const {
    return TRUE;
}
1637
/**
 * Replaceable API
 *
 * UnicodeString implementation: always returns FALSE.
 */
UBool UnicodeString::hasMetaData() const {
    return FALSE;
}
1644
// Reverses the code units in the pinned range [start, start+length) in place.
// Surrogate pairs that end up as trail-then-lead after the reversal are
// swapped back into lead-then-trail order in a second pass.
// Does nothing if the range has fewer than two code units or if
// cloneArrayIfNeeded() fails. Returns *this.
UnicodeString&
UnicodeString::doReverse(int32_t start, int32_t length) {
  if(length <= 1 || !cloneArrayIfNeeded()) {
    return *this;
  }

  // pin the indices to legal values
  pinIndices(start, length);
  if(length <= 1) {  // pinIndices() might have shrunk the length
    return *this;
  }

  UChar *left = getArrayStart() + start;
  UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
  UChar swap;
  UBool hasSupplementary = FALSE;

  // First pass: swap code units from both ends towards the middle and
  // remember whether any lead surrogate was seen.
  // Before the loop we know left<right because length>=2.
  do {
    hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
    hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
    *right-- = swap;
  } while(left < right);
  // Make sure to test the middle code unit of an odd-length string.
  // Redundant if the length is even.
  hasSupplementary |= (UBool)U16_IS_LEAD(*left);

  /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
  if(hasSupplementary) {
    UChar swap2;

    left = getArrayStart() + start;
    right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
    while(left < right) {
      // a trail surrogate directly followed by a lead surrogate is a reversed pair
      if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
        *left++ = swap2;
        *left++ = swap;
      } else {
        ++left;
      }
    }
  }

  return *this;
}
1690
1691 UBool
1692 UnicodeString::padLeading(int32_t targetLength,
1693                           UChar padChar)
1694 {
1695   int32_t oldLength = length();
1696   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1697     return FALSE;
1698   } else {
1699     // move contents up by padding width
1700     UChar *array = getArrayStart();
1701     int32_t start = targetLength - oldLength;
1702     us_arrayCopy(array, 0, array, start, oldLength);
1703
1704     // fill in padding character
1705     while(--start >= 0) {
1706       array[start] = padChar;
1707     }
1708     setLength(targetLength);
1709     return TRUE;
1710   }
1711 }
1712
1713 UBool
1714 UnicodeString::padTrailing(int32_t targetLength,
1715                            UChar padChar)
1716 {
1717   int32_t oldLength = length();
1718   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1719     return FALSE;
1720   } else {
1721     // fill in padding character
1722     UChar *array = getArrayStart();
1723     int32_t length = targetLength;
1724     while(--length >= oldLength) {
1725       array[length] = padChar;
1726     }
1727     setLength(targetLength);
1728     return TRUE;
1729   }
1730 }
1731
1732 //========================================
1733 // Hashing
1734 //========================================
1735 int32_t
1736 UnicodeString::doHashCode() const
1737 {
1738     /* Delegate hash computation to uhash.  This makes UnicodeString
1739      * hashing consistent with UChar* hashing.  */
1740     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1741     if (hashCode == kInvalidHashCode) {
1742         hashCode = kEmptyHashCode;
1743     }
1744     return hashCode;
1745 }
1746
1747 //========================================
1748 // External Buffer
1749 //========================================
1750
1751 char16_t *
1752 UnicodeString::getBuffer(int32_t minCapacity) {
1753   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1754     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1755     setZeroLength();
1756     return getArrayStart();
1757   } else {
1758     return nullptr;
1759   }
1760 }
1761
1762 void
1763 UnicodeString::releaseBuffer(int32_t newLength) {
1764   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1765     // set the new fLength
1766     int32_t capacity=getCapacity();
1767     if(newLength==-1) {
1768       // the new length is the string length, capped by fCapacity
1769       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1770       while(p<limit && *p!=0) {
1771         ++p;
1772       }
1773       newLength=(int32_t)(p-array);
1774     } else if(newLength>capacity) {
1775       newLength=capacity;
1776     }
1777     setLength(newLength);
1778     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1779   }
1780 }
1781
1782 //========================================
1783 // Miscellaneous
1784 //========================================
// Makes sure that this string has a buffer that may be written to and that
// holds at least newCapacity code units, reallocating if necessary.
//
// newCapacity      required capacity; -1 means the current capacity
// growCapacity     capacity to try to allocate first; a negative value means
//                  newCapacity. If that allocation fails and newCapacity is
//                  smaller, newCapacity is tried as well.
// doCopyArray      if TRUE, the old contents are copied into a new buffer;
//                  otherwise a new buffer starts with length zero
// pBufferToDelete  if not NULL and an old ref-counted buffer's count drops
//                  to zero, the buffer is handed to the caller here instead
//                  of being freed
// forceClone       if TRUE, allocate a new buffer even if the current one
//                  would do
//
// Returns FALSE if the string is not writable (bogus or open getBuffer())
// or if memory could not be allocated; in the latter case the string is
// set to bogus. Returns TRUE otherwise.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
     (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    int32_t oldLength = length();
    int16_t flags = fUnion.fFields.fLengthAndFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = NULL; // no need to copy from the stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = oldLength;
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
        }
        if(oldArray != NULL) {
          us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
        }
        setLength(minLength);
      } else {
        setZeroLength();
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is stored directly in front of the array)
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
              // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
              // is defined as volatile. (Volatile has useful non-standard behavior
              //   with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it themselves
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fUnion.fFields.fLengthAndFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1893
1894 // UnicodeStringAppendable ------------------------------------------------- ***
1895
1896 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1897
1898 UBool
1899 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1900   return str.doAppend(&c, 0, 1).isWritable();
1901 }
1902
1903 UBool
1904 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1905   UChar buffer[U16_MAX_LENGTH];
1906   int32_t cLength = 0;
1907   UBool isError = FALSE;
1908   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1909   return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1910 }
1911
1912 UBool
1913 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1914   return str.doAppend(s, 0, length).isWritable();
1915 }
1916
1917 UBool
1918 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1919   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1920 }
1921
1922 UChar *
1923 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1924                                          int32_t desiredCapacityHint,
1925                                          UChar *scratch, int32_t scratchCapacity,
1926                                          int32_t *resultCapacity) {
1927   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1928     *resultCapacity = 0;
1929     return NULL;
1930   }
1931   int32_t oldLength = str.length();
1932   if(minCapacity <= (kMaxCapacity - oldLength) &&
1933       desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1934       str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1935     *resultCapacity = str.getCapacity() - oldLength;
1936     return str.getArrayStart() + oldLength;
1937   }
1938   *resultCapacity = scratchCapacity;
1939   return scratch;
1940 }
1941
1942 U_NAMESPACE_END
1943
1944 U_NAMESPACE_USE
1945
1946 U_CAPI int32_t U_EXPORT2
1947 uhash_hashUnicodeString(const UElement key) {
1948     const UnicodeString *str = (const UnicodeString*) key.pointer;
1949     return (str == NULL) ? 0 : str->hashCode();
1950 }
1951
1952 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1953 // does not depend on hashtable code.
1954 U_CAPI UBool U_EXPORT2
1955 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1956     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1957     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1958     if (str1 == str2) {
1959         return TRUE;
1960     }
1961     if (str1 == NULL || str2 == NULL) {
1962         return FALSE;
1963     }
1964     return *str1 == *str2;
1965 }
1966
1967 #ifdef U_STATIC_IMPLEMENTATION
1968 /*
1969 This should never be called. It is defined here to make sure that the
1970 virtual vector deleting destructor is defined within unistr.cpp.
1971 The vector deleting destructor is already a part of UObject,
1972 but defining it here makes sure that it is included with this object file.
1973 This makes sure that static library dependencies are kept to a minimum.
1974 */
1975 static void uprv_UnicodeStringDummy(void) {
1976     delete [] (new UnicodeString[2]);
1977 }
1978 #endif