icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2012, International Business Machines Corporation and
   4 * others. All Rights Reserved.
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/appendable.h"
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf16.h"
  30 #include "uelement.h"
  31 #include "ustr_imp.h"
  32 #include "umutex.h"
  33 #include "uassert.h"
  34
  35 #if 0
  36
  37 #include <iostream>
  38 using namespace std;
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((icu::UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
/* The Replaceable virtual destructor can't be defined in the header
   due to how AIX works with multiple definitions of virtual functions.
   It is therefore defined out of line here; the body is intentionally empty.
*/
Replaceable::~Replaceable() {}
 103
// Instantiates the UObject RTTI boilerplate for UnicodeString.
// NOTE(review): presumably this defines the class-ID accessors -- confirm
// against the macro definition, which is not visible in this file.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef()
 121 {  umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);}
 122
 123 int32_t
 124 UnicodeString::removeRef()
 125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);}
 126
 127 int32_t
 128 UnicodeString::refCount() const
 129 {
 130     umtx_lock(NULL);
 131     // Note: without the lock to force a memory barrier, we might see a very
 132     //       stale value on some multi-processor systems.
 133     int32_t  count = *((int32_t *)fUnion.fFields.fArray - 1);
 134     umtx_unlock(NULL);
 135     return count;
 136  }
 137
 138 void
 139 UnicodeString::releaseArray() {
 140   if((fFlags & kRefCounted) && removeRef() == 0) {
 141     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 142   }
 143 }
 144
 145
 146
 147 //========================================
 148 // Constructors
 149 //========================================
 150
 151 // The default constructor is inline in unistr.h.
 152
 153 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
 154   : fShortLength(0),
 155     fFlags(0)
 156 {
 157   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 158     // just allocate and do not do anything else
 159     allocate(capacity);
 160   } else {
 161     // count > 0, allocate and fill the new string with count c's
 162     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
 163     if(capacity < length) {
 164       capacity = length;
 165     }
 166     if(allocate(capacity)) {
 167       UChar *array = getArrayStart();
 168       int32_t i = 0;
 169
 170       // fill the new string with c
 171       if(unitCount == 1) {
 172         // fill with length UChars
 173         while(i < length) {
 174           array[i++] = (UChar)c;
 175         }
 176       } else {
 177         // get the code units for c
 178         UChar units[U16_MAX_LENGTH];
 179         U16_APPEND_UNSAFE(units, i, c);
 180
 181         // now it must be i==unitCount
 182         i = 0;
 183
 184         // for Unicode, unitCount can only be 1, 2, 3, or 4
 185         // 1 is handled above
 186         while(i < length) {
 187           int32_t unitIdx = 0;
 188           while(unitIdx < unitCount) {
 189             array[i++]=units[unitIdx++];
 190           }
 191         }
 192       }
 193     }
 194     setLength(length);
 195   }
 196 }
 197
// Constructs a string of length 1 whose only code unit is ch,
// stored in the internal stack buffer.
UnicodeString::UnicodeString(UChar ch)
  : fShortLength(1),
    fFlags(kShortString)
{
  fUnion.fStackBuffer[0] = ch;
}
 204
 205 UnicodeString::UnicodeString(UChar32 ch)
 206   : fShortLength(0),
 207     fFlags(kShortString)
 208 {
 209   int32_t i = 0;
 210   UBool isError = FALSE;
 211   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
 212   // We test isError so that the compiler does not complain that we don't.
 213   // If isError then i==0 which is what we want anyway.
 214   if(!isError) {
 215     fShortLength = (int8_t)i;
 216   }
 217 }
 218
// Constructs a string from text by replacing the (empty) contents via
// doReplace(). The source length is passed as -1.
// NOTE(review): -1 presumably means "NUL-terminated" -- confirm in doReplace().
UnicodeString::UnicodeString(const UChar *text)
  : fShortLength(0),
    fFlags(kShortString)
{
  doReplace(0, 0, text, 0, -1);
}
 225
// Constructs a string from textLength code units of text by replacing the
// (empty) contents via doReplace().
UnicodeString::UnicodeString(const UChar *text,
                             int32_t textLength)
  : fShortLength(0),
    fFlags(kShortString)
{
  doReplace(0, 0, text, 0, textLength);
}
 233
 234 UnicodeString::UnicodeString(UBool isTerminated,
 235                              const UChar *text,
 236                              int32_t textLength)
 237   : fShortLength(0),
 238     fFlags(kReadonlyAlias)
 239 {
 240   if(text == NULL) {
 241     // treat as an empty string, do not alias
 242     setToEmpty();
 243   } else if(textLength < -1 ||
 244             (textLength == -1 && !isTerminated) ||
 245             (textLength >= 0 && isTerminated && text[textLength] != 0)
 246   ) {
 247     setToBogus();
 248   } else {
 249     if(textLength == -1) {
 250       // text is terminated, or else it would have failed the above test
 251       textLength = u_strlen(text);
 252     }
 253     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 254   }
 255 }
 256
 257 UnicodeString::UnicodeString(UChar *buff,
 258                              int32_t buffLength,
 259                              int32_t buffCapacity)
 260   : fShortLength(0),
 261     fFlags(kWritableAlias)
 262 {
 263   if(buff == NULL) {
 264     // treat as an empty string, do not alias
 265     setToEmpty();
 266   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 267     setToBogus();
 268   } else {
 269     if(buffLength == -1) {
 270       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 271       const UChar *p = buff, *limit = buff + buffCapacity;
 272       while(p != limit && *p != 0) {
 273         ++p;
 274       }
 275       buffLength = (int32_t)(p - buff);
 276     }
 277     setArray(buff, buffLength, buffCapacity);
 278   }
 279 }
 280
 281 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
 282   : fShortLength(0),
 283     fFlags(kShortString)
 284 {
 285   if(src==NULL) {
 286     // treat as an empty string
 287   } else {
 288     if(length<0) {
 289       length=(int32_t)uprv_strlen(src);
 290     }
 291     if(cloneArrayIfNeeded(length, length, FALSE)) {
 292       u_charsToUChars(src, getArrayStart(), length);
 293       setLength(length);
 294     } else {
 295       setToBogus();
 296     }
 297   }
 298 }
 299
 300 #if U_CHARSET_IS_UTF8
 301
 302 UnicodeString::UnicodeString(const char *codepageData)
 303   : fShortLength(0),
 304     fFlags(kShortString) {
 305   if(codepageData != 0) {
 306     setToUTF8(codepageData);
 307   }
 308 }
 309
 310 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
 311   : fShortLength(0),
 312     fFlags(kShortString) {
 313   // if there's nothing to convert, do nothing
 314   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 315     return;
 316   }
 317   if(dataLength == -1) {
 318     dataLength = (int32_t)uprv_strlen(codepageData);
 319   }
 320   setToUTF8(StringPiece(codepageData, dataLength));
 321 }
 322
 323 // else see unistr_cnv.cpp
 324 #endif
 325
// Copy constructor: starts as an empty short string and then takes over the
// contents of that via copyFrom().
UnicodeString::UnicodeString(const UnicodeString& that)
  : Replaceable(),
    fShortLength(0),
    fFlags(kShortString)
{
  copyFrom(that);
}
 333
// Substring constructor: starts as an empty short string and then is set via
// setTo(that, srcStart).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart)
  : Replaceable(),
    fShortLength(0),
    fFlags(kShortString)
{
  setTo(that, srcStart);
}
 342
// Substring constructor: starts as an empty short string and then is set via
// setTo(that, srcStart, srcLength).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength)
  : Replaceable(),
    fShortLength(0),
    fFlags(kShortString)
{
  setTo(that, srcStart, srcLength);
}
 352
// Replaceable base class clone() default implementation: it does not clone
// anything and always returns NULL. Subclasses that can be cloned override it.
Replaceable *
Replaceable::clone() const {
  return NULL;
}
 358
// UnicodeString overrides clone() with a real implementation:
// returns a new heap-allocated copy of this string, made with the copy
// constructor. The caller receives the raw pointer returned by new.
Replaceable *
UnicodeString::clone() const {
  return new UnicodeString(*this);
}
 364
 365 //========================================
 366 // array allocation
 367 //========================================
 368
 369 UBool
 370 UnicodeString::allocate(int32_t capacity) {
 371   if(capacity <= US_STACKBUF_SIZE) {
 372     fFlags = kShortString;
 373   } else {
 374     // count bytes for the refCounter and the string capacity, and
 375     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 376     // to be safely aligned for the refCount
 377     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
 378     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 379     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 380     if(array != 0) {
 381       // set initial refCount and point behind the refCount
 382       *array++ = 1;
 383
 384       // have fArray point to the first UChar
 385       fUnion.fFields.fArray = (UChar *)array;
 386       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 387       fFlags = kLongString;
 388     } else {
 389       fShortLength = 0;
 390       fUnion.fFields.fArray = 0;
 391       fUnion.fFields.fCapacity = 0;
 392       fFlags = kIsBogus;
 393       return FALSE;
 394     }
 395   }
 396   return TRUE;
 397 }
 398
 399 //========================================
 400 // Destructor
 401 //========================================
// Releases this object's reference to its array; releaseArray() frees the
// array only if it is ref-counted and this was the last reference.
UnicodeString::~UnicodeString()
{
  releaseArray();
}
 406
 407 //========================================
 408 // Factory methods
 409 //========================================
 410
// Factory: returns a new string whose contents are set from the UTF-8 bytes
// in utf8 via setToUTF8().
UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
  UnicodeString result;
  result.setToUTF8(utf8);
  return result;
}
 416
 417 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 418   UnicodeString result;
 419   int32_t capacity;
 420   // Most UTF-32 strings will be BMP-only and result in a same-length
 421   // UTF-16 string. We overestimate the capacity just slightly,
 422   // just in case there are a few supplementary characters.
 423   if(length <= US_STACKBUF_SIZE) {
 424     capacity = US_STACKBUF_SIZE;
 425   } else {
 426     capacity = length + (length >> 4) + 4;
 427   }
 428   do {
 429     UChar *utf16 = result.getBuffer(capacity);
 430     int32_t length16;
 431     UErrorCode errorCode = U_ZERO_ERROR;
 432     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 433         utf32, length,
 434         0xfffd,  // Substitution character.
 435         NULL,    // Don't care about number of substitutions.
 436         &errorCode);
 437     result.releaseBuffer(length16);
 438     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 439       capacity = length16 + 1;  // +1 for the terminating NUL.
 440       continue;
 441     } else if(U_FAILURE(errorCode)) {
 442       result.setToBogus();
 443     }
 444     break;
 445   } while(TRUE);
 446   return result;
 447 }
 448
 449 //========================================
 450 // Assignment
 451 //========================================
 452
// Assignment: delegates to copyFrom() with its default fastCopy argument.
UnicodeString &
UnicodeString::operator=(const UnicodeString &src) {
  return copyFrom(src);
}
 457
// Like assignment, but calls copyFrom() with fastCopy=TRUE, which lets a
// read-only alias in src stay an alias instead of being copied.
UnicodeString &
UnicodeString::fastCopyFrom(const UnicodeString &src) {
  return copyFrom(src, TRUE);
}
 462
 463 UnicodeString &
 464 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 465   // if assigning to ourselves, do nothing
 466   if(this == 0 || this == &src) {
 467     return *this;
 468   }
 469
 470   // is the right side bogus?
 471   if(&src == 0 || src.isBogus()) {
 472     setToBogus();
 473     return *this;
 474   }
 475
 476   // delete the current contents
 477   releaseArray();
 478
 479   if(src.isEmpty()) {
 480     // empty string - use the stack buffer
 481     setToEmpty();
 482     return *this;
 483   }
 484
 485   // we always copy the length
 486   int32_t srcLength = src.length();
 487   setLength(srcLength);
 488
 489   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 490   switch(src.fFlags) {
 491   case kShortString:
 492     // short string using the stack buffer, do the same
 493     fFlags = kShortString;
 494     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
 495     break;
 496   case kLongString:
 497     // src uses a refCounted string buffer, use that buffer with refCount
 498     // src is const, use a cast - we don't really change it
 499     ((UnicodeString &)src).addRef();
 500     // copy all fields, share the reference-counted buffer
 501     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 502     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 503     fFlags = src.fFlags;
 504     break;
 505   case kReadonlyAlias:
 506     if(fastCopy) {
 507       // src is a readonly alias, do the same
 508       // -> maintain the readonly alias as such
 509       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 510       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 511       fFlags = src.fFlags;
 512       break;
 513     }
 514     // else if(!fastCopy) fall through to case kWritableAlias
 515     // -> allocate a new buffer and copy the contents
 516   case kWritableAlias:
 517     // src is a writable alias; we make a copy of that instead
 518     if(allocate(srcLength)) {
 519       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
 520       break;
 521     }
 522     // if there is not enough memory, then fall through to setting to bogus
 523   default:
 524     // if src is bogus, set ourselves to bogus
 525     // do not call setToBogus() here because fArray and fFlags are not consistent here
 526     fShortLength = 0;
 527     fUnion.fFields.fArray = 0;
 528     fUnion.fFields.fCapacity = 0;
 529     fFlags = kIsBogus;
 530     break;
 531   }
 532
 533   return *this;
 534 }
 535
 536 //========================================
 537 // Miscellaneous operations
 538 //========================================
 539
 540 UnicodeString UnicodeString::unescape() const {
 541     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 542     const UChar *array = getBuffer();
 543     int32_t len = length();
 544     int32_t prev = 0;
 545     for (int32_t i=0;;) {
 546         if (i == len) {
 547             result.append(array, prev, len - prev);
 548             break;
 549         }
 550         if (array[i++] == 0x5C /*'\\'*/) {
 551             result.append(array, prev, (i - 1) - prev);
 552             UChar32 c = unescapeAt(i); // advances i
 553             if (c < 0) {
 554                 result.remove(); // return empty string
 555                 break; // invalid escape sequence
 556             }
 557             result.append(c);
 558             prev = i;
 559         }
 560     }
 561     return result;
 562 }
 563
// Decodes one escape sequence via u_unescapeAt(), which reads this string's
// code units through the UnicodeString_charAt callback. offset is passed by
// address so that u_unescapeAt() can update it; the function's return value
// is returned unchanged (unescape() treats a negative value as invalid).
UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
    return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
}
 567
 568 //========================================
 569 // Read-only implementation
 570 //========================================
// Returns whether the first len code units of this string and of text are
// identical, using a byte-wise comparison of the two arrays.
UBool
UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
  // Requires: this & text not bogus and have same lengths.
  // Byte-wise comparison works for equality regardless of endianness.
  return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
}
 577
 578 int8_t
 579 UnicodeString::doCompare( int32_t start,
 580               int32_t length,
 581               const UChar *srcChars,
 582               int32_t srcStart,
 583               int32_t srcLength) const
 584 {
 585   // compare illegal string values
 586   if(isBogus()) {
 587     return -1;
 588   }
 589
 590   // pin indices to legal values
 591   pinIndices(start, length);
 592
 593   if(srcChars == NULL) {
 594     // treat const UChar *srcChars==NULL as an empty string
 595     return length == 0 ? 0 : 1;
 596   }
 597
 598   // get the correct pointer
 599   const UChar *chars = getArrayStart();
 600
 601   chars += start;
 602   srcChars += srcStart;
 603
 604   int32_t minLength;
 605   int8_t lengthResult;
 606
 607   // get the srcLength if necessary
 608   if(srcLength < 0) {
 609     srcLength = u_strlen(srcChars + srcStart);
 610   }
 611
 612   // are we comparing different lengths?
 613   if(length != srcLength) {
 614     if(length < srcLength) {
 615       minLength = length;
 616       lengthResult = -1;
 617     } else {
 618       minLength = srcLength;
 619       lengthResult = 1;
 620     }
 621   } else {
 622     minLength = length;
 623     lengthResult = 0;
 624   }
 625
 626   /*
 627    * note that uprv_memcmp() returns an int but we return an int8_t;
 628    * we need to take care not to truncate the result -
 629    * one way to do this is to right-shift the value to
 630    * move the sign bit into the lower 8 bits and making sure that this
 631    * does not become 0 itself
 632    */
 633
 634   if(minLength > 0 && chars != srcChars) {
 635     int32_t result;
 636
 637 #   if U_IS_BIG_ENDIAN
 638       // big-endian: byte comparison works
 639       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 640       if(result != 0) {
 641         return (int8_t)(result >> 15 | 1);
 642       }
 643 #   else
 644       // little-endian: compare UChar units
 645       do {
 646         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 647         if(result != 0) {
 648           return (int8_t)(result >> 15 | 1);
 649         }
 650       } while(--minLength > 0);
 651 #   endif
 652   }
 653   return lengthResult;
 654 }
 655
 656 /* String compare in code point order - doCompare() compares in code unit order. */
 657 int8_t
 658 UnicodeString::doCompareCodePointOrder(int32_t start,
 659                                        int32_t length,
 660                                        const UChar *srcChars,
 661                                        int32_t srcStart,
 662                                        int32_t srcLength) const
 663 {
 664   // compare illegal string values
 665   // treat const UChar *srcChars==NULL as an empty string
 666   if(isBogus()) {
 667     return -1;
 668   }
 669
 670   // pin indices to legal values
 671   pinIndices(start, length);
 672
 673   if(srcChars == NULL) {
 674     srcStart = srcLength = 0;
 675   }
 676
 677   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 678   /* translate the 32-bit result into an 8-bit one */
 679   if(diff!=0) {
 680     return (int8_t)(diff >> 15 | 1);
 681   } else {
 682     return 0;
 683   }
 684 }
 685
// Named accessor that simply forwards to length().
int32_t
UnicodeString::getLength() const {
    return length();
}
 690
// Named accessor that simply forwards to charAt(offset).
UChar
UnicodeString::getCharAt(int32_t offset) const {
  return charAt(offset);
}
 695
// Named accessor that simply forwards to char32At(offset).
UChar32
UnicodeString::getChar32At(int32_t offset) const {
  return char32At(offset);
}
 700
 701 UChar32
 702 UnicodeString::char32At(int32_t offset) const
 703 {
 704   int32_t len = length();
 705   if((uint32_t)offset < (uint32_t)len) {
 706     const UChar *array = getArrayStart();
 707     UChar32 c;
 708     U16_GET(array, 0, offset, len, c);
 709     return c;
 710   } else {
 711     return kInvalidUChar;
 712   }
 713 }
 714
 715 int32_t
 716 UnicodeString::getChar32Start(int32_t offset) const {
 717   if((uint32_t)offset < (uint32_t)length()) {
 718     const UChar *array = getArrayStart();
 719     U16_SET_CP_START(array, 0, offset);
 720     return offset;
 721   } else {
 722     return 0;
 723   }
 724 }
 725
 726 int32_t
 727 UnicodeString::getChar32Limit(int32_t offset) const {
 728   int32_t len = length();
 729   if((uint32_t)offset < (uint32_t)len) {
 730     const UChar *array = getArrayStart();
 731     U16_SET_CP_LIMIT(array, 0, offset, len);
 732     return offset;
 733   } else {
 734     return len;
 735   }
 736 }
 737
 738 int32_t
 739 UnicodeString::countChar32(int32_t start, int32_t length) const {
 740   pinIndices(start, length);
 741   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 742   return u_countChar32(getArrayStart()+start, length);
 743 }
 744
 745 UBool
 746 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 747   pinIndices(start, length);
 748   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 749   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 750 }
 751
 752 int32_t
 753 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 754   // pin index
 755   int32_t len = length();
 756   if(index<0) {
 757     index=0;
 758   } else if(index>len) {
 759     index=len;
 760   }
 761
 762   const UChar *array = getArrayStart();
 763   if(delta>0) {
 764     U16_FWD_N(array, index, len, delta);
 765   } else {
 766     U16_BACK_N(array, 0, index, -delta);
 767   }
 768
 769   return index;
 770 }
 771
 772 void
 773 UnicodeString::doExtract(int32_t start,
 774              int32_t length,
 775              UChar *dst,
 776              int32_t dstStart) const
 777 {
 778   // pin indices to legal values
 779   pinIndices(start, length);
 780
 781   // do not copy anything if we alias dst itself
 782   const UChar *array = getArrayStart();
 783   if(array + start != dst + dstStart) {
 784     us_arrayCopy(array, start, dst, dstStart, length);
 785   }
 786 }
 787
 788 int32_t
 789 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 790                        UErrorCode &errorCode) const {
 791   int32_t len = length();
 792   if(U_SUCCESS(errorCode)) {
 793     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 794       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 795     } else {
 796       const UChar *array = getArrayStart();
 797       if(len>0 && len<=destCapacity && array!=dest) {
 798         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
 799       }
 800       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 801     }
 802   }
 803
 804   return len;
 805 }
 806
 807 int32_t
 808 UnicodeString::extract(int32_t start,
 809                        int32_t length,
 810                        char *target,
 811                        int32_t targetCapacity,
 812                        enum EInvariant) const
 813 {
 814   // if the arguments are illegal, then do nothing
 815   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 816     return 0;
 817   }
 818
 819   // pin the indices to legal values
 820   pinIndices(start, length);
 821
 822   if(length <= targetCapacity) {
 823     u_UCharsToChars(getArrayStart() + start, target, length);
 824   }
 825   UErrorCode status = U_ZERO_ERROR;
 826   return u_terminateChars(target, targetCapacity, length, &status);
 827 }
 828
 829 UnicodeString
 830 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 831   pinIndices(start, len);
 832   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 833   if(array==NULL) {
 834     array=fUnion.fStackBuffer;  // anything not NULL because that would make an empty string
 835     len=-2;  // bogus result string
 836   }
 837   return UnicodeString(FALSE, array + start, len);
 838 }
 839
 840 int32_t
 841 UnicodeString::toUTF8(int32_t start, int32_t len,
 842                       char *target, int32_t capacity) const {
 843   pinIndices(start, len);
 844   int32_t length8;
 845   UErrorCode errorCode = U_ZERO_ERROR;
 846   u_strToUTF8WithSub(target, capacity, &length8,
 847                      getBuffer() + start, len,
 848                      0xFFFD,  // Standard substitution character.
 849                      NULL,    // Don't care about number of substitutions.
 850                      &errorCode);
 851   return length8;
 852 }
 853
 854 #if U_CHARSET_IS_UTF8
 855
 856 int32_t
 857 UnicodeString::extract(int32_t start, int32_t len,
 858                        char *target, uint32_t dstSize) const {
 859   // if the arguments are illegal, then do nothing
 860   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 861     return 0;
 862   }
 863   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 864 }
 865
 866 // else see unistr_cnv.cpp
 867 #endif
 868
// Extracts the code units between start and limit into target.
// Both indices are pinned with pinIndex() first; the range is then handed to
// doExtract() as a start index and a length of limit - start.
void
UnicodeString::extractBetween(int32_t start,
                  int32_t limit,
                  UnicodeString& target) const {
  pinIndex(start);
  pinIndex(limit);
  doExtract(start, limit - start, target);
}
 877
 878 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 879 // as many bytes as the source has UChars.
 880 // The "worst cases" are writing systems like Indic, Thai and CJK with
 881 // 3:1 bytes:UChars.
 882 void
 883 UnicodeString::toUTF8(ByteSink &sink) const {
 884   int32_t length16 = length();
 885   if(length16 != 0) {
 886     char stackBuffer[1024];
 887     int32_t capacity = (int32_t)sizeof(stackBuffer);
 888     UBool utf8IsOwned = FALSE;
 889     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 890                                       3*length16,
 891                                       stackBuffer, capacity,
 892                                       &capacity);
 893     int32_t length8 = 0;
 894     UErrorCode errorCode = U_ZERO_ERROR;
 895     u_strToUTF8WithSub(utf8, capacity, &length8,
 896                        getBuffer(), length16,
 897                        0xFFFD,  // Standard substitution character.
 898                        NULL,    // Don't care about number of substitutions.
 899                        &errorCode);
 900     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 901       utf8 = (char *)uprv_malloc(length8);
 902       if(utf8 != NULL) {
 903         utf8IsOwned = TRUE;
 904         errorCode = U_ZERO_ERROR;
 905         u_strToUTF8WithSub(utf8, length8, &length8,
 906                            getBuffer(), length16,
 907                            0xFFFD,  // Standard substitution character.
 908                            NULL,    // Don't care about number of substitutions.
 909                            &errorCode);
 910       } else {
 911         errorCode = U_MEMORY_ALLOCATION_ERROR;
 912       }
 913     }
 914     if(U_SUCCESS(errorCode)) {
 915       sink.Append(utf8, length8);
 916       sink.Flush();
 917     }
 918     if(utf8IsOwned) {
 919       uprv_free(utf8);
 920     }
 921   }
 922 }
 923
 924 int32_t
 925 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
 926   int32_t length32=0;
 927   if(U_SUCCESS(errorCode)) {
 928     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
 929     u_strToUTF32WithSub(utf32, capacity, &length32,
 930         getBuffer(), length(),
 931         0xfffd,  // Substitution character.
 932         NULL,    // Don't care about number of substitutions.
 933         &errorCode);
 934   }
 935   return length32;
 936 }
 937
 938 int32_t
 939 UnicodeString::indexOf(const UChar *srcChars,
 940                int32_t srcStart,
 941                int32_t srcLength,
 942                int32_t start,
 943                int32_t length) const
 944 {
 945   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 946     return -1;
 947   }
 948
 949   // UnicodeString does not find empty substrings
 950   if(srcLength < 0 && srcChars[srcStart] == 0) {
 951     return -1;
 952   }
 953
 954   // get the indices within bounds
 955   pinIndices(start, length);
 956
 957   // find the first occurrence of the substring
 958   const UChar *array = getArrayStart();
 959   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
 960   if(match == NULL) {
 961     return -1;
 962   } else {
 963     return (int32_t)(match - array);
 964   }
 965 }
 966
 967 int32_t
 968 UnicodeString::doIndexOf(UChar c,
 969              int32_t start,
 970              int32_t length) const
 971 {
 972   // pin indices
 973   pinIndices(start, length);
 974
 975   // find the first occurrence of c
 976   const UChar *array = getArrayStart();
 977   const UChar *match = u_memchr(array + start, c, length);
 978   if(match == NULL) {
 979     return -1;
 980   } else {
 981     return (int32_t)(match - array);
 982   }
 983 }
 984
 985 int32_t
 986 UnicodeString::doIndexOf(UChar32 c,
 987                          int32_t start,
 988                          int32_t length) const {
 989   // pin indices
 990   pinIndices(start, length);
 991
 992   // find the first occurrence of c
 993   const UChar *array = getArrayStart();
 994   const UChar *match = u_memchr32(array + start, c, length);
 995   if(match == NULL) {
 996     return -1;
 997   } else {
 998     return (int32_t)(match - array);
 999   }
1000 }
1001
1002 int32_t
1003 UnicodeString::lastIndexOf(const UChar *srcChars,
1004                int32_t srcStart,
1005                int32_t srcLength,
1006                int32_t start,
1007                int32_t length) const
1008 {
1009   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1010     return -1;
1011   }
1012
1013   // UnicodeString does not find empty substrings
1014   if(srcLength < 0 && srcChars[srcStart] == 0) {
1015     return -1;
1016   }
1017
1018   // get the indices within bounds
1019   pinIndices(start, length);
1020
1021   // find the last occurrence of the substring
1022   const UChar *array = getArrayStart();
1023   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1024   if(match == NULL) {
1025     return -1;
1026   } else {
1027     return (int32_t)(match - array);
1028   }
1029 }
1030
1031 int32_t
1032 UnicodeString::doLastIndexOf(UChar c,
1033                  int32_t start,
1034                  int32_t length) const
1035 {
1036   if(isBogus()) {
1037     return -1;
1038   }
1039
1040   // pin indices
1041   pinIndices(start, length);
1042
1043   // find the last occurrence of c
1044   const UChar *array = getArrayStart();
1045   const UChar *match = u_memrchr(array + start, c, length);
1046   if(match == NULL) {
1047     return -1;
1048   } else {
1049     return (int32_t)(match - array);
1050   }
1051 }
1052
1053 int32_t
1054 UnicodeString::doLastIndexOf(UChar32 c,
1055                              int32_t start,
1056                              int32_t length) const {
1057   // pin indices
1058   pinIndices(start, length);
1059
1060   // find the last occurrence of c
1061   const UChar *array = getArrayStart();
1062   const UChar *match = u_memrchr32(array + start, c, length);
1063   if(match == NULL) {
1064     return -1;
1065   } else {
1066     return (int32_t)(match - array);
1067   }
1068 }
1069
1070 //========================================
1071 // Write implementation
1072 //========================================
1073
1074 UnicodeString&
1075 UnicodeString::findAndReplace(int32_t start,
1076                   int32_t length,
1077                   const UnicodeString& oldText,
1078                   int32_t oldStart,
1079                   int32_t oldLength,
1080                   const UnicodeString& newText,
1081                   int32_t newStart,
1082                   int32_t newLength)
1083 {
1084   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1085     return *this;
1086   }
1087
1088   pinIndices(start, length);
1089   oldText.pinIndices(oldStart, oldLength);
1090   newText.pinIndices(newStart, newLength);
1091
1092   if(oldLength == 0) {
1093     return *this;
1094   }
1095
1096   while(length > 0 && length >= oldLength) {
1097     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1098     if(pos < 0) {
1099       // no more oldText's here: done
1100       break;
1101     } else {
1102       // we found oldText, replace it by newText and go beyond it
1103       replace(pos, oldLength, newText, newStart, newLength);
1104       length -= pos + oldLength - start;
1105       start = pos + newLength;
1106     }
1107   }
1108
1109   return *this;
1110 }
1111
1112
1113 void
1114 UnicodeString::setToBogus()
1115 {
1116   releaseArray();
1117
1118   fShortLength = 0;
1119   fUnion.fFields.fArray = 0;
1120   fUnion.fFields.fCapacity = 0;
1121   fFlags = kIsBogus;
1122 }
1123
1124 // turn a bogus string into an empty one
1125 void
1126 UnicodeString::unBogus() {
1127   if(fFlags & kIsBogus) {
1128     setToEmpty();
1129   }
1130 }
1131
1132 // setTo() analogous to the readonly-aliasing constructor with the same signature
1133 UnicodeString &
1134 UnicodeString::setTo(UBool isTerminated,
1135                      const UChar *text,
1136                      int32_t textLength)
1137 {
1138   if(fFlags & kOpenGetBuffer) {
1139     // do not modify a string that has an "open" getBuffer(minCapacity)
1140     return *this;
1141   }
1142
1143   if(text == NULL) {
1144     // treat as an empty string, do not alias
1145     releaseArray();
1146     setToEmpty();
1147     return *this;
1148   }
1149
1150   if( textLength < -1 ||
1151       (textLength == -1 && !isTerminated) ||
1152       (textLength >= 0 && isTerminated && text[textLength] != 0)
1153   ) {
1154     setToBogus();
1155     return *this;
1156   }
1157
1158   releaseArray();
1159
1160   if(textLength == -1) {
1161     // text is terminated, or else it would have failed the above test
1162     textLength = u_strlen(text);
1163   }
1164   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1165
1166   fFlags = kReadonlyAlias;
1167   return *this;
1168 }
1169
1170 // setTo() analogous to the writable-aliasing constructor with the same signature
1171 UnicodeString &
1172 UnicodeString::setTo(UChar *buffer,
1173                      int32_t buffLength,
1174                      int32_t buffCapacity) {
1175   if(fFlags & kOpenGetBuffer) {
1176     // do not modify a string that has an "open" getBuffer(minCapacity)
1177     return *this;
1178   }
1179
1180   if(buffer == NULL) {
1181     // treat as an empty string, do not alias
1182     releaseArray();
1183     setToEmpty();
1184     return *this;
1185   }
1186
1187   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1188     setToBogus();
1189     return *this;
1190   } else if(buffLength == -1) {
1191     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1192     const UChar *p = buffer, *limit = buffer + buffCapacity;
1193     while(p != limit && *p != 0) {
1194       ++p;
1195     }
1196     buffLength = (int32_t)(p - buffer);
1197   }
1198
1199   releaseArray();
1200
1201   setArray(buffer, buffLength, buffCapacity);
1202   fFlags = kWritableAlias;
1203   return *this;
1204 }
1205
1206 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1207   unBogus();
1208   int32_t length = utf8.length();
1209   int32_t capacity;
1210   // The UTF-16 string will be at most as long as the UTF-8 string.
1211   if(length <= US_STACKBUF_SIZE) {
1212     capacity = US_STACKBUF_SIZE;
1213   } else {
1214     capacity = length + 1;  // +1 for the terminating NUL.
1215   }
1216   UChar *utf16 = getBuffer(capacity);
1217   int32_t length16;
1218   UErrorCode errorCode = U_ZERO_ERROR;
1219   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1220       utf8.data(), length,
1221       0xfffd,  // Substitution character.
1222       NULL,    // Don't care about number of substitutions.
1223       &errorCode);
1224   releaseBuffer(length16);
1225   if(U_FAILURE(errorCode)) {
1226     setToBogus();
1227   }
1228   return *this;
1229 }
1230
1231 UnicodeString&
1232 UnicodeString::setCharAt(int32_t offset,
1233              UChar c)
1234 {
1235   int32_t len = length();
1236   if(cloneArrayIfNeeded() && len > 0) {
1237     if(offset < 0) {
1238       offset = 0;
1239     } else if(offset >= len) {
1240       offset = len - 1;
1241     }
1242
1243     getArrayStart()[offset] = c;
1244   }
1245   return *this;
1246 }
1247
1248 UnicodeString&
1249 UnicodeString::replace(int32_t start,
1250                int32_t _length,
1251                UChar32 srcChar) {
1252   UChar buffer[U16_MAX_LENGTH];
1253   int32_t count = 0;
1254   UBool isError = FALSE;
1255   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1256   // We test isError so that the compiler does not complain that we don't.
1257   // If isError (srcChar is not a valid code point) then count==0 which means
1258   // we remove the source segment rather than replacing it with srcChar.
1259   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1260 }
1261
1262 UnicodeString&
1263 UnicodeString::append(UChar32 srcChar) {
1264   UChar buffer[U16_MAX_LENGTH];
1265   int32_t _length = 0;
1266   UBool isError = FALSE;
1267   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1268   // We test isError so that the compiler does not complain that we don't.
1269   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1270   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
1271 }
1272
1273 UnicodeString&
1274 UnicodeString::doReplace( int32_t start,
1275               int32_t length,
1276               const UnicodeString& src,
1277               int32_t srcStart,
1278               int32_t srcLength)
1279 {
1280   if(!src.isBogus()) {
1281     // pin the indices to legal values
1282     src.pinIndices(srcStart, srcLength);
1283
1284     // get the characters from src
1285     // and replace the range in ourselves with them
1286     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1287   } else {
1288     // remove the range
1289     return doReplace(start, length, 0, 0, 0);
1290   }
1291 }
1292
1293 UnicodeString&
1294 UnicodeString::doReplace(int32_t start,
1295              int32_t length,
1296              const UChar *srcChars,
1297              int32_t srcStart,
1298              int32_t srcLength)
1299 {
1300   if(!isWritable()) {
1301     return *this;
1302   }
1303
1304   int32_t oldLength = this->length();
1305
1306   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1307   if((fFlags&kBufferIsReadonly) && srcLength == 0) {
1308     if(start == 0) {
1309       // remove prefix by adjusting the array pointer
1310       pinIndex(length);
1311       fUnion.fFields.fArray += length;
1312       fUnion.fFields.fCapacity -= length;
1313       setLength(oldLength - length);
1314       return *this;
1315     } else {
1316       pinIndex(start);
1317       if(length >= (oldLength - start)) {
1318         // remove suffix by reducing the length (like truncate())
1319         setLength(start);
1320         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1321         return *this;
1322       }
1323     }
1324   }
1325
1326   if(srcChars == 0) {
1327     srcStart = srcLength = 0;
1328   } else if(srcLength < 0) {
1329     // get the srcLength if necessary
1330     srcLength = u_strlen(srcChars + srcStart);
1331   }
1332
1333   // calculate the size of the string after the replace
1334   int32_t newLength;
1335
1336   // optimize append() onto a large-enough, owned string
1337   if(start >= oldLength) {
1338     if(srcLength == 0) {
1339       return *this;
1340     }
1341     newLength = oldLength + srcLength;
1342     if(newLength <= getCapacity() && isBufferWritable()) {
1343       UChar *oldArray = getArrayStart();
1344       // Do not copy characters when
1345       //   UChar *buffer=str.getAppendBuffer(...);
1346       // is followed by
1347       //   str.append(buffer, length);
1348       // or
1349       //   str.appendString(buffer, length)
1350       // or similar.
1351       if(srcChars + srcStart != oldArray + start || start > oldLength) {
1352         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1353       }
1354       setLength(newLength);
1355       return *this;
1356     } else {
1357       // pin the indices to legal values
1358       start = oldLength;
1359       length = 0;
1360     }
1361   } else {
1362     // pin the indices to legal values
1363     pinIndices(start, length);
1364
1365     newLength = oldLength - length + srcLength;
1366   }
1367
1368   // the following may change fArray but will not copy the current contents;
1369   // therefore we need to keep the current fArray
1370   UChar oldStackBuffer[US_STACKBUF_SIZE];
1371   UChar *oldArray;
1372   if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1373     // copy the stack buffer contents because it will be overwritten with
1374     // fUnion.fFields values
1375     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
1376     oldArray = oldStackBuffer;
1377   } else {
1378     oldArray = getArrayStart();
1379   }
1380
1381   // clone our array and allocate a bigger array if needed
1382   int32_t *bufferToDelete = 0;
1383   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1384                          FALSE, &bufferToDelete)
1385   ) {
1386     return *this;
1387   }
1388
1389   // now do the replace
1390
1391   UChar *newArray = getArrayStart();
1392   if(newArray != oldArray) {
1393     // if fArray changed, then we need to copy everything except what will change
1394     us_arrayCopy(oldArray, 0, newArray, 0, start);
1395     us_arrayCopy(oldArray, start + length,
1396                  newArray, start + srcLength,
1397                  oldLength - (start + length));
1398   } else if(length != srcLength) {
1399     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1400     us_arrayCopy(oldArray, start + length,
1401                  newArray, start + srcLength,
1402                  oldLength - (start + length));
1403   }
1404
1405   // now fill in the hole with the new string
1406   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1407
1408   setLength(newLength);
1409
1410   // delayed delete in case srcChars == fArray when we started, and
1411   // to keep oldArray alive for the above operations
1412   if (bufferToDelete) {
1413     uprv_free(bufferToDelete);
1414   }
1415
1416   return *this;
1417 }
1418
1419 /**
1420  * Replaceable API
1421  */
1422 void
1423 UnicodeString::handleReplaceBetween(int32_t start,
1424                                     int32_t limit,
1425                                     const UnicodeString& text) {
1426     replaceBetween(start, limit, text);
1427 }
1428
1429 /**
1430  * Replaceable API
1431  */
1432 void
1433 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1434     if (limit <= start) {
1435         return; // Nothing to do; avoid bogus malloc call
1436     }
1437     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1438     // Check to make sure text is not null.
1439     if (text != NULL) {
1440             extractBetween(start, limit, text, 0);
1441             insert(dest, text, 0, limit - start);
1442             uprv_free(text);
1443     }
1444 }
1445
1446 /**
1447  * Replaceable API
1448  *
1449  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1450  * so we implement this function here.
1451  */
1452 UBool Replaceable::hasMetaData() const {
1453     return TRUE;
1454 }
1455
1456 /**
1457  * Replaceable API
1458  */
1459 UBool UnicodeString::hasMetaData() const {
1460     return FALSE;
1461 }
1462
1463 UnicodeString&
1464 UnicodeString::doReverse(int32_t start, int32_t length) {
1465   if(length <= 1 || !cloneArrayIfNeeded()) {
1466     return *this;
1467   }
1468
1469   // pin the indices to legal values
1470   pinIndices(start, length);
1471   if(length <= 1) {  // pinIndices() might have shrunk the length
1472     return *this;
1473   }
1474
1475   UChar *left = getArrayStart() + start;
1476   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1477   UChar swap;
1478   UBool hasSupplementary = FALSE;
1479
1480   // Before the loop we know left<right because length>=2.
1481   do {
1482     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1483     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1484     *right-- = swap;
1485   } while(left < right);
1486   // Make sure to test the middle code unit of an odd-length string.
1487   // Redundant if the length is even.
1488   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1489
1490   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1491   if(hasSupplementary) {
1492     UChar swap2;
1493
1494     left = getArrayStart() + start;
1495     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1496     while(left < right) {
1497       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1498         *left++ = swap2;
1499         *left++ = swap;
1500       } else {
1501         ++left;
1502       }
1503     }
1504   }
1505
1506   return *this;
1507 }
1508
1509 UBool
1510 UnicodeString::padLeading(int32_t targetLength,
1511                           UChar padChar)
1512 {
1513   int32_t oldLength = length();
1514   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1515     return FALSE;
1516   } else {
1517     // move contents up by padding width
1518     UChar *array = getArrayStart();
1519     int32_t start = targetLength - oldLength;
1520     us_arrayCopy(array, 0, array, start, oldLength);
1521
1522     // fill in padding character
1523     while(--start >= 0) {
1524       array[start] = padChar;
1525     }
1526     setLength(targetLength);
1527     return TRUE;
1528   }
1529 }
1530
1531 UBool
1532 UnicodeString::padTrailing(int32_t targetLength,
1533                            UChar padChar)
1534 {
1535   int32_t oldLength = length();
1536   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1537     return FALSE;
1538   } else {
1539     // fill in padding character
1540     UChar *array = getArrayStart();
1541     int32_t length = targetLength;
1542     while(--length >= oldLength) {
1543       array[length] = padChar;
1544     }
1545     setLength(targetLength);
1546     return TRUE;
1547   }
1548 }
1549
1550 //========================================
1551 // Hashing
1552 //========================================
1553 int32_t
1554 UnicodeString::doHashCode() const
1555 {
1556     /* Delegate hash computation to uhash.  This makes UnicodeString
1557      * hashing consistent with UChar* hashing.  */
1558     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1559     if (hashCode == kInvalidHashCode) {
1560         hashCode = kEmptyHashCode;
1561     }
1562     return hashCode;
1563 }
1564
1565 //========================================
1566 // External Buffer
1567 //========================================
1568
1569 UChar *
1570 UnicodeString::getBuffer(int32_t minCapacity) {
1571   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1572     fFlags|=kOpenGetBuffer;
1573     fShortLength=0;
1574     return getArrayStart();
1575   } else {
1576     return 0;
1577   }
1578 }
1579
1580 void
1581 UnicodeString::releaseBuffer(int32_t newLength) {
1582   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1583     // set the new fLength
1584     int32_t capacity=getCapacity();
1585     if(newLength==-1) {
1586       // the new length is the string length, capped by fCapacity
1587       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1588       while(p<limit && *p!=0) {
1589         ++p;
1590       }
1591       newLength=(int32_t)(p-array);
1592     } else if(newLength>capacity) {
1593       newLength=capacity;
1594     }
1595     setLength(newLength);
1596     fFlags&=~kOpenGetBuffer;
1597   }
1598 }
1599
1600 //========================================
1601 // Miscellaneous
1602 //========================================
1603 UBool
1604 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1605                                   int32_t growCapacity,
1606                                   UBool doCopyArray,
1607                                   int32_t **pBufferToDelete,
1608                                   UBool forceClone) {
1609   // default parameters need to be static, therefore
1610   // the defaults are -1 to have convenience defaults
1611   if(newCapacity == -1) {
1612     newCapacity = getCapacity();
1613   }
1614
1615   // while a getBuffer(minCapacity) is "open",
1616   // prevent any modifications of the string by returning FALSE here
1617   // if the string is bogus, then only an assignment or similar can revive it
1618   if(!isWritable()) {
1619     return FALSE;
1620   }
1621
1622   /*
1623    * We need to make a copy of the array if
1624    * the buffer is read-only, or
1625    * the buffer is refCounted (shared), and refCount>1, or
1626    * the buffer is too small.
1627    * Return FALSE if memory could not be allocated.
1628    */
1629   if(forceClone ||
1630      fFlags & kBufferIsReadonly ||
1631      (fFlags & kRefCounted && refCount() > 1) ||
1632      newCapacity > getCapacity()
1633   ) {
1634     // check growCapacity for default value and use of the stack buffer
1635     if(growCapacity < 0) {
1636       growCapacity = newCapacity;
1637     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1638       growCapacity = US_STACKBUF_SIZE;
1639     }
1640
1641     // save old values
1642     UChar oldStackBuffer[US_STACKBUF_SIZE];
1643     UChar *oldArray;
1644     uint8_t flags = fFlags;
1645
1646     if(flags&kUsingStackBuffer) {
1647       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1648       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1649         // copy the stack buffer contents because it will be overwritten with
1650         // fUnion.fFields values
1651         us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
1652         oldArray = oldStackBuffer;
1653       } else {
1654         oldArray = 0; // no need to copy from stack buffer to itself
1655       }
1656     } else {
1657       oldArray = fUnion.fFields.fArray;
1658       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1659     }
1660
1661     // allocate a new array
1662     if(allocate(growCapacity) ||
1663        (newCapacity < growCapacity && allocate(newCapacity))
1664     ) {
1665       if(doCopyArray && oldArray != 0) {
1666         // copy the contents
1667         // do not copy more than what fits - it may be smaller than before
1668         int32_t minLength = length();
1669         newCapacity = getCapacity();
1670         if(newCapacity < minLength) {
1671           minLength = newCapacity;
1672           setLength(minLength);
1673         }
1674         us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1675       } else {
1676         fShortLength = 0;
1677       }
1678
1679       // release the old array
1680       if(flags & kRefCounted) {
1681         // the array is refCounted; decrement and release if 0
1682         int32_t *pRefCount = ((int32_t *)oldArray - 1);
1683         if(umtx_atomic_dec(pRefCount) == 0) {
1684           if(pBufferToDelete == 0) {
1685             uprv_free(pRefCount);
1686           } else {
1687             // the caller requested to delete it himself
1688             *pBufferToDelete = pRefCount;
1689           }
1690         }
1691       }
1692     } else {
1693       // not enough memory for growCapacity and not even for the smaller newCapacity
1694       // reset the old values for setToBogus() to release the array
1695       if(!(flags&kUsingStackBuffer)) {
1696         fUnion.fFields.fArray = oldArray;
1697       }
1698       fFlags = flags;
1699       setToBogus();
1700       return FALSE;
1701     }
1702   }
1703   return TRUE;
1704 }
1705
1706 // UnicodeStringAppendable ------------------------------------------------- ***
1707
1708 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1709
1710 UBool
1711 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1712   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
1713 }
1714
1715 UBool
1716 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1717   UChar buffer[U16_MAX_LENGTH];
1718   int32_t cLength = 0;
1719   UBool isError = FALSE;
1720   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1721   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
1722 }
1723
1724 UBool
1725 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1726   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
1727 }
1728
1729 UBool
1730 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1731   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1732 }
1733
1734 UChar *
1735 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1736                                          int32_t desiredCapacityHint,
1737                                          UChar *scratch, int32_t scratchCapacity,
1738                                          int32_t *resultCapacity) {
1739   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1740     *resultCapacity = 0;
1741     return NULL;
1742   }
1743   int32_t oldLength = str.length();
1744   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1745     *resultCapacity = str.getCapacity() - oldLength;
1746     return str.getArrayStart() + oldLength;
1747   }
1748   *resultCapacity = scratchCapacity;
1749   return scratch;
1750 }
1751
1752 U_NAMESPACE_END
1753
1754 U_NAMESPACE_USE
1755
1756 U_CAPI int32_t U_EXPORT2
1757 uhash_hashUnicodeString(const UElement key) {
1758     const UnicodeString *str = (const UnicodeString*) key.pointer;
1759     return (str == NULL) ? 0 : str->hashCode();
1760 }
1761
1762 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1763 // does not depend on hashtable code.
1764 U_CAPI UBool U_EXPORT2
1765 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1766     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1767     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1768     if (str1 == str2) {
1769         return TRUE;
1770     }
1771     if (str1 == NULL || str2 == NULL) {
1772         return FALSE;
1773     }
1774     return *str1 == *str2;
1775 }
1776
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. It exists so that the virtual
vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file.
This keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif