icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2012, International Business Machines Corporation and
   4 * others. All Rights Reserved.
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/appendable.h"
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf16.h"
  30 #include "uelement.h"
  31 #include "ustr_imp.h"
  32 #include "umutex.h"
  33 #include "uassert.h"
  34
  35 #if 0
  36
  37 #include <iostream>
  38 using namespace std;
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((icu::UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
  99 /* The Replaceable virtual destructor can't be defined in the header
 100    due to how AIX works with multiple definitions of virtual functions.
 101 */
 102 Replaceable::~Replaceable() {}
 103 Replaceable::Replaceable() {}
 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef()
 121 {  umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);}
 122
 123 int32_t
 124 UnicodeString::removeRef()
 125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);}
 126
 127 int32_t
 128 UnicodeString::refCount() const
 129 {
 130     umtx_lock(NULL);
 131     // Note: without the lock to force a memory barrier, we might see a very
 132     //       stale value on some multi-processor systems.
 133     int32_t  count = *((int32_t *)fUnion.fFields.fArray - 1);
 134     umtx_unlock(NULL);
 135     return count;
 136  }
 137
 138 void
 139 UnicodeString::releaseArray() {
 140   if((fFlags & kRefCounted) && removeRef() == 0) {
 141     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 142   }
 143 }
 144
 145
 146
 147 //========================================
 148 // Constructors
 149 //========================================
 150 UnicodeString::UnicodeString()
 151   : fShortLength(0),
 152     fFlags(kShortString)
 153 {}
 154
 155 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
 156   : fShortLength(0),
 157     fFlags(0)
 158 {
 159   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 160     // just allocate and do not do anything else
 161     allocate(capacity);
 162   } else {
 163     // count > 0, allocate and fill the new string with count c's
 164     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
 165     if(capacity < length) {
 166       capacity = length;
 167     }
 168     if(allocate(capacity)) {
 169       UChar *array = getArrayStart();
 170       int32_t i = 0;
 171
 172       // fill the new string with c
 173       if(unitCount == 1) {
 174         // fill with length UChars
 175         while(i < length) {
 176           array[i++] = (UChar)c;
 177         }
 178       } else {
 179         // get the code units for c
 180         UChar units[U16_MAX_LENGTH];
 181         U16_APPEND_UNSAFE(units, i, c);
 182
 183         // now it must be i==unitCount
 184         i = 0;
 185
 186         // for Unicode, unitCount can only be 1, 2, 3, or 4
 187         // 1 is handled above
 188         while(i < length) {
 189           int32_t unitIdx = 0;
 190           while(unitIdx < unitCount) {
 191             array[i++]=units[unitIdx++];
 192           }
 193         }
 194       }
 195     }
 196     setLength(length);
 197   }
 198 }
 199
 200 UnicodeString::UnicodeString(UChar ch)
 201   : fShortLength(1),
 202     fFlags(kShortString)
 203 {
 204   fUnion.fStackBuffer[0] = ch;
 205 }
 206
 207 UnicodeString::UnicodeString(UChar32 ch)
 208   : fShortLength(0),
 209     fFlags(kShortString)
 210 {
 211   int32_t i = 0;
 212   UBool isError = FALSE;
 213   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
 214   // We test isError so that the compiler does not complain that we don't.
 215   // If isError then i==0 which is what we want anyway.
 216   if(!isError) {
 217     fShortLength = (int8_t)i;
 218   }
 219 }
 220
 221 UnicodeString::UnicodeString(const UChar *text)
 222   : fShortLength(0),
 223     fFlags(kShortString)
 224 {
 225   doReplace(0, 0, text, 0, -1);
 226 }
 227
 228 UnicodeString::UnicodeString(const UChar *text,
 229                              int32_t textLength)
 230   : fShortLength(0),
 231     fFlags(kShortString)
 232 {
 233   doReplace(0, 0, text, 0, textLength);
 234 }
 235
 236 UnicodeString::UnicodeString(UBool isTerminated,
 237                              const UChar *text,
 238                              int32_t textLength)
 239   : fShortLength(0),
 240     fFlags(kReadonlyAlias)
 241 {
 242   if(text == NULL) {
 243     // treat as an empty string, do not alias
 244     setToEmpty();
 245   } else if(textLength < -1 ||
 246             (textLength == -1 && !isTerminated) ||
 247             (textLength >= 0 && isTerminated && text[textLength] != 0)
 248   ) {
 249     setToBogus();
 250   } else {
 251     if(textLength == -1) {
 252       // text is terminated, or else it would have failed the above test
 253       textLength = u_strlen(text);
 254     }
 255     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 256   }
 257 }
 258
 259 UnicodeString::UnicodeString(UChar *buff,
 260                              int32_t buffLength,
 261                              int32_t buffCapacity)
 262   : fShortLength(0),
 263     fFlags(kWritableAlias)
 264 {
 265   if(buff == NULL) {
 266     // treat as an empty string, do not alias
 267     setToEmpty();
 268   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 269     setToBogus();
 270   } else {
 271     if(buffLength == -1) {
 272       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 273       const UChar *p = buff, *limit = buff + buffCapacity;
 274       while(p != limit && *p != 0) {
 275         ++p;
 276       }
 277       buffLength = (int32_t)(p - buff);
 278     }
 279     setArray(buff, buffLength, buffCapacity);
 280   }
 281 }
 282
 283 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
 284   : fShortLength(0),
 285     fFlags(kShortString)
 286 {
 287   if(src==NULL) {
 288     // treat as an empty string
 289   } else {
 290     if(length<0) {
 291       length=(int32_t)uprv_strlen(src);
 292     }
 293     if(cloneArrayIfNeeded(length, length, FALSE)) {
 294       u_charsToUChars(src, getArrayStart(), length);
 295       setLength(length);
 296     } else {
 297       setToBogus();
 298     }
 299   }
 300 }
 301
 302 #if U_CHARSET_IS_UTF8
 303
 304 UnicodeString::UnicodeString(const char *codepageData)
 305   : fShortLength(0),
 306     fFlags(kShortString) {
 307   if(codepageData != 0) {
 308     setToUTF8(codepageData);
 309   }
 310 }
 311
 312 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
 313   : fShortLength(0),
 314     fFlags(kShortString) {
 315   // if there's nothing to convert, do nothing
 316   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 317     return;
 318   }
 319   if(dataLength == -1) {
 320     dataLength = (int32_t)uprv_strlen(codepageData);
 321   }
 322   setToUTF8(StringPiece(codepageData, dataLength));
 323 }
 324
 325 // else see unistr_cnv.cpp
 326 #endif
 327
 328 UnicodeString::UnicodeString(const UnicodeString& that)
 329   : Replaceable(),
 330     fShortLength(0),
 331     fFlags(kShortString)
 332 {
 333   copyFrom(that);
 334 }
 335
 336 UnicodeString::UnicodeString(const UnicodeString& that,
 337                              int32_t srcStart)
 338   : Replaceable(),
 339     fShortLength(0),
 340     fFlags(kShortString)
 341 {
 342   setTo(that, srcStart);
 343 }
 344
 345 UnicodeString::UnicodeString(const UnicodeString& that,
 346                              int32_t srcStart,
 347                              int32_t srcLength)
 348   : Replaceable(),
 349     fShortLength(0),
 350     fFlags(kShortString)
 351 {
 352   setTo(that, srcStart, srcLength);
 353 }
 354
 355 // Replaceable base class clone() default implementation, does not clone
 356 Replaceable *
 357 Replaceable::clone() const {
 358   return NULL;
 359 }
 360
 361 // UnicodeString overrides clone() with a real implementation
 362 Replaceable *
 363 UnicodeString::clone() const {
 364   return new UnicodeString(*this);
 365 }
 366
 367 //========================================
 368 // array allocation
 369 //========================================
 370
 371 UBool
 372 UnicodeString::allocate(int32_t capacity) {
 373   if(capacity <= US_STACKBUF_SIZE) {
 374     fFlags = kShortString;
 375   } else {
 376     // count bytes for the refCounter and the string capacity, and
 377     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 378     // to be safely aligned for the refCount
 379     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
 380     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 381     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 382     if(array != 0) {
 383       // set initial refCount and point behind the refCount
 384       *array++ = 1;
 385
 386       // have fArray point to the first UChar
 387       fUnion.fFields.fArray = (UChar *)array;
 388       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 389       fFlags = kLongString;
 390     } else {
 391       fShortLength = 0;
 392       fUnion.fFields.fArray = 0;
 393       fUnion.fFields.fCapacity = 0;
 394       fFlags = kIsBogus;
 395       return FALSE;
 396     }
 397   }
 398   return TRUE;
 399 }
 400
 401 //========================================
 402 // Destructor
 403 //========================================
 404 UnicodeString::~UnicodeString()
 405 {
 406   releaseArray();
 407 }
 408
 409 //========================================
 410 // Factory methods
 411 //========================================
 412
 413 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
 414   UnicodeString result;
 415   result.setToUTF8(utf8);
 416   return result;
 417 }
 418
 419 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 420   UnicodeString result;
 421   int32_t capacity;
 422   // Most UTF-32 strings will be BMP-only and result in a same-length
 423   // UTF-16 string. We overestimate the capacity just slightly,
 424   // just in case there are a few supplementary characters.
 425   if(length <= US_STACKBUF_SIZE) {
 426     capacity = US_STACKBUF_SIZE;
 427   } else {
 428     capacity = length + (length >> 4) + 4;
 429   }
 430   do {
 431     UChar *utf16 = result.getBuffer(capacity);
 432     int32_t length16;
 433     UErrorCode errorCode = U_ZERO_ERROR;
 434     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 435         utf32, length,
 436         0xfffd,  // Substitution character.
 437         NULL,    // Don't care about number of substitutions.
 438         &errorCode);
 439     result.releaseBuffer(length16);
 440     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 441       capacity = length16 + 1;  // +1 for the terminating NUL.
 442       continue;
 443     } else if(U_FAILURE(errorCode)) {
 444       result.setToBogus();
 445     }
 446     break;
 447   } while(TRUE);
 448   return result;
 449 }
 450
 451 //========================================
 452 // Assignment
 453 //========================================
 454
 455 UnicodeString &
 456 UnicodeString::operator=(const UnicodeString &src) {
 457   return copyFrom(src);
 458 }
 459
 460 UnicodeString &
 461 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 462   return copyFrom(src, TRUE);
 463 }
 464
 465 UnicodeString &
 466 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 467   // if assigning to ourselves, do nothing
 468   if(this == 0 || this == &src) {
 469     return *this;
 470   }
 471
 472   // is the right side bogus?
 473   if(&src == 0 || src.isBogus()) {
 474     setToBogus();
 475     return *this;
 476   }
 477
 478   // delete the current contents
 479   releaseArray();
 480
 481   if(src.isEmpty()) {
 482     // empty string - use the stack buffer
 483     setToEmpty();
 484     return *this;
 485   }
 486
 487   // we always copy the length
 488   int32_t srcLength = src.length();
 489   setLength(srcLength);
 490
 491   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 492   switch(src.fFlags) {
 493   case kShortString:
 494     // short string using the stack buffer, do the same
 495     fFlags = kShortString;
 496     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
 497     break;
 498   case kLongString:
 499     // src uses a refCounted string buffer, use that buffer with refCount
 500     // src is const, use a cast - we don't really change it
 501     ((UnicodeString &)src).addRef();
 502     // copy all fields, share the reference-counted buffer
 503     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 504     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 505     fFlags = src.fFlags;
 506     break;
 507   case kReadonlyAlias:
 508     if(fastCopy) {
 509       // src is a readonly alias, do the same
 510       // -> maintain the readonly alias as such
 511       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 512       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 513       fFlags = src.fFlags;
 514       break;
 515     }
 516     // else if(!fastCopy) fall through to case kWritableAlias
 517     // -> allocate a new buffer and copy the contents
 518   case kWritableAlias:
 519     // src is a writable alias; we make a copy of that instead
 520     if(allocate(srcLength)) {
 521       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
 522       break;
 523     }
 524     // if there is not enough memory, then fall through to setting to bogus
 525   default:
 526     // if src is bogus, set ourselves to bogus
 527     // do not call setToBogus() here because fArray and fFlags are not consistent here
 528     fShortLength = 0;
 529     fUnion.fFields.fArray = 0;
 530     fUnion.fFields.fCapacity = 0;
 531     fFlags = kIsBogus;
 532     break;
 533   }
 534
 535   return *this;
 536 }
 537
 538 //========================================
 539 // Miscellaneous operations
 540 //========================================
 541
 542 UnicodeString UnicodeString::unescape() const {
 543     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 544     const UChar *array = getBuffer();
 545     int32_t len = length();
 546     int32_t prev = 0;
 547     for (int32_t i=0;;) {
 548         if (i == len) {
 549             result.append(array, prev, len - prev);
 550             break;
 551         }
 552         if (array[i++] == 0x5C /*'\\'*/) {
 553             result.append(array, prev, (i - 1) - prev);
 554             UChar32 c = unescapeAt(i); // advances i
 555             if (c < 0) {
 556                 result.remove(); // return empty string
 557                 break; // invalid escape sequence
 558             }
 559             result.append(c);
 560             prev = i;
 561         }
 562     }
 563     return result;
 564 }
 565
 566 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 567     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 568 }
 569
 570 //========================================
 571 // Read-only implementation
 572 //========================================
 573 int8_t
 574 UnicodeString::doCompare( int32_t start,
 575               int32_t length,
 576               const UChar *srcChars,
 577               int32_t srcStart,
 578               int32_t srcLength) const
 579 {
 580   // compare illegal string values
 581   if(isBogus()) {
 582     return -1;
 583   }
 584
 585   // pin indices to legal values
 586   pinIndices(start, length);
 587
 588   if(srcChars == NULL) {
 589     // treat const UChar *srcChars==NULL as an empty string
 590     return length == 0 ? 0 : 1;
 591   }
 592
 593   // get the correct pointer
 594   const UChar *chars = getArrayStart();
 595
 596   chars += start;
 597   srcChars += srcStart;
 598
 599   int32_t minLength;
 600   int8_t lengthResult;
 601
 602   // get the srcLength if necessary
 603   if(srcLength < 0) {
 604     srcLength = u_strlen(srcChars + srcStart);
 605   }
 606
 607   // are we comparing different lengths?
 608   if(length != srcLength) {
 609     if(length < srcLength) {
 610       minLength = length;
 611       lengthResult = -1;
 612     } else {
 613       minLength = srcLength;
 614       lengthResult = 1;
 615     }
 616   } else {
 617     minLength = length;
 618     lengthResult = 0;
 619   }
 620
 621   /*
 622    * note that uprv_memcmp() returns an int but we return an int8_t;
 623    * we need to take care not to truncate the result -
 624    * one way to do this is to right-shift the value to
 625    * move the sign bit into the lower 8 bits and making sure that this
 626    * does not become 0 itself
 627    */
 628
 629   if(minLength > 0 && chars != srcChars) {
 630     int32_t result;
 631
 632 #   if U_IS_BIG_ENDIAN
 633       // big-endian: byte comparison works
 634       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 635       if(result != 0) {
 636         return (int8_t)(result >> 15 | 1);
 637       }
 638 #   else
 639       // little-endian: compare UChar units
 640       do {
 641         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 642         if(result != 0) {
 643           return (int8_t)(result >> 15 | 1);
 644         }
 645       } while(--minLength > 0);
 646 #   endif
 647   }
 648   return lengthResult;
 649 }
 650
 651 /* String compare in code point order - doCompare() compares in code unit order. */
 652 int8_t
 653 UnicodeString::doCompareCodePointOrder(int32_t start,
 654                                        int32_t length,
 655                                        const UChar *srcChars,
 656                                        int32_t srcStart,
 657                                        int32_t srcLength) const
 658 {
 659   // compare illegal string values
 660   // treat const UChar *srcChars==NULL as an empty string
 661   if(isBogus()) {
 662     return -1;
 663   }
 664
 665   // pin indices to legal values
 666   pinIndices(start, length);
 667
 668   if(srcChars == NULL) {
 669     srcStart = srcLength = 0;
 670   }
 671
 672   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 673   /* translate the 32-bit result into an 8-bit one */
 674   if(diff!=0) {
 675     return (int8_t)(diff >> 15 | 1);
 676   } else {
 677     return 0;
 678   }
 679 }
 680
 681 int32_t
 682 UnicodeString::getLength() const {
 683     return length();
 684 }
 685
 686 UChar
 687 UnicodeString::getCharAt(int32_t offset) const {
 688   return charAt(offset);
 689 }
 690
 691 UChar32
 692 UnicodeString::getChar32At(int32_t offset) const {
 693   return char32At(offset);
 694 }
 695
 696 UChar32
 697 UnicodeString::char32At(int32_t offset) const
 698 {
 699   int32_t len = length();
 700   if((uint32_t)offset < (uint32_t)len) {
 701     const UChar *array = getArrayStart();
 702     UChar32 c;
 703     U16_GET(array, 0, offset, len, c);
 704     return c;
 705   } else {
 706     return kInvalidUChar;
 707   }
 708 }
 709
 710 int32_t
 711 UnicodeString::getChar32Start(int32_t offset) const {
 712   if((uint32_t)offset < (uint32_t)length()) {
 713     const UChar *array = getArrayStart();
 714     U16_SET_CP_START(array, 0, offset);
 715     return offset;
 716   } else {
 717     return 0;
 718   }
 719 }
 720
 721 int32_t
 722 UnicodeString::getChar32Limit(int32_t offset) const {
 723   int32_t len = length();
 724   if((uint32_t)offset < (uint32_t)len) {
 725     const UChar *array = getArrayStart();
 726     U16_SET_CP_LIMIT(array, 0, offset, len);
 727     return offset;
 728   } else {
 729     return len;
 730   }
 731 }
 732
 733 int32_t
 734 UnicodeString::countChar32(int32_t start, int32_t length) const {
 735   pinIndices(start, length);
 736   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 737   return u_countChar32(getArrayStart()+start, length);
 738 }
 739
 740 UBool
 741 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 742   pinIndices(start, length);
 743   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 744   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 745 }
 746
 747 int32_t
 748 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 749   // pin index
 750   int32_t len = length();
 751   if(index<0) {
 752     index=0;
 753   } else if(index>len) {
 754     index=len;
 755   }
 756
 757   const UChar *array = getArrayStart();
 758   if(delta>0) {
 759     U16_FWD_N(array, index, len, delta);
 760   } else {
 761     U16_BACK_N(array, 0, index, -delta);
 762   }
 763
 764   return index;
 765 }
 766
 767 void
 768 UnicodeString::doExtract(int32_t start,
 769              int32_t length,
 770              UChar *dst,
 771              int32_t dstStart) const
 772 {
 773   // pin indices to legal values
 774   pinIndices(start, length);
 775
 776   // do not copy anything if we alias dst itself
 777   const UChar *array = getArrayStart();
 778   if(array + start != dst + dstStart) {
 779     us_arrayCopy(array, start, dst, dstStart, length);
 780   }
 781 }
 782
 783 int32_t
 784 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 785                        UErrorCode &errorCode) const {
 786   int32_t len = length();
 787   if(U_SUCCESS(errorCode)) {
 788     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 789       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 790     } else {
 791       const UChar *array = getArrayStart();
 792       if(len>0 && len<=destCapacity && array!=dest) {
 793         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
 794       }
 795       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 796     }
 797   }
 798
 799   return len;
 800 }
 801
 802 int32_t
 803 UnicodeString::extract(int32_t start,
 804                        int32_t length,
 805                        char *target,
 806                        int32_t targetCapacity,
 807                        enum EInvariant) const
 808 {
 809   // if the arguments are illegal, then do nothing
 810   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 811     return 0;
 812   }
 813
 814   // pin the indices to legal values
 815   pinIndices(start, length);
 816
 817   if(length <= targetCapacity) {
 818     u_UCharsToChars(getArrayStart() + start, target, length);
 819   }
 820   UErrorCode status = U_ZERO_ERROR;
 821   return u_terminateChars(target, targetCapacity, length, &status);
 822 }
 823
 824 UnicodeString
 825 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 826   pinIndices(start, len);
 827   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 828   if(array==NULL) {
 829     array=fUnion.fStackBuffer;  // anything not NULL because that would make an empty string
 830     len=-2;  // bogus result string
 831   }
 832   return UnicodeString(FALSE, array + start, len);
 833 }
 834
 835 int32_t
 836 UnicodeString::toUTF8(int32_t start, int32_t len,
 837                       char *target, int32_t capacity) const {
 838   pinIndices(start, len);
 839   int32_t length8;
 840   UErrorCode errorCode = U_ZERO_ERROR;
 841   u_strToUTF8WithSub(target, capacity, &length8,
 842                      getBuffer() + start, len,
 843                      0xFFFD,  // Standard substitution character.
 844                      NULL,    // Don't care about number of substitutions.
 845                      &errorCode);
 846   return length8;
 847 }
 848
 849 #if U_CHARSET_IS_UTF8
 850
 851 int32_t
 852 UnicodeString::extract(int32_t start, int32_t len,
 853                        char *target, uint32_t dstSize) const {
 854   // if the arguments are illegal, then do nothing
 855   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 856     return 0;
 857   }
 858   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 859 }
 860
 861 // else see unistr_cnv.cpp
 862 #endif
 863
 864 void
 865 UnicodeString::extractBetween(int32_t start,
 866                   int32_t limit,
 867                   UnicodeString& target) const {
 868   pinIndex(start);
 869   pinIndex(limit);
 870   doExtract(start, limit - start, target);
 871 }
 872
 873 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 874 // as many bytes as the source has UChars.
 875 // The "worst cases" are writing systems like Indic, Thai and CJK with
 876 // 3:1 bytes:UChars.
 877 void
 878 UnicodeString::toUTF8(ByteSink &sink) const {
 879   int32_t length16 = length();
 880   if(length16 != 0) {
 881     char stackBuffer[1024];
 882     int32_t capacity = (int32_t)sizeof(stackBuffer);
 883     UBool utf8IsOwned = FALSE;
 884     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 885                                       3*length16,
 886                                       stackBuffer, capacity,
 887                                       &capacity);
 888     int32_t length8 = 0;
 889     UErrorCode errorCode = U_ZERO_ERROR;
 890     u_strToUTF8WithSub(utf8, capacity, &length8,
 891                        getBuffer(), length16,
 892                        0xFFFD,  // Standard substitution character.
 893                        NULL,    // Don't care about number of substitutions.
 894                        &errorCode);
 895     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 896       utf8 = (char *)uprv_malloc(length8);
 897       if(utf8 != NULL) {
 898         utf8IsOwned = TRUE;
 899         errorCode = U_ZERO_ERROR;
 900         u_strToUTF8WithSub(utf8, length8, &length8,
 901                            getBuffer(), length16,
 902                            0xFFFD,  // Standard substitution character.
 903                            NULL,    // Don't care about number of substitutions.
 904                            &errorCode);
 905       } else {
 906         errorCode = U_MEMORY_ALLOCATION_ERROR;
 907       }
 908     }
 909     if(U_SUCCESS(errorCode)) {
 910       sink.Append(utf8, length8);
 911       sink.Flush();
 912     }
 913     if(utf8IsOwned) {
 914       uprv_free(utf8);
 915     }
 916   }
 917 }
 918
 919 int32_t
 920 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
 921   int32_t length32=0;
 922   if(U_SUCCESS(errorCode)) {
 923     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
 924     u_strToUTF32WithSub(utf32, capacity, &length32,
 925         getBuffer(), length(),
 926         0xfffd,  // Substitution character.
 927         NULL,    // Don't care about number of substitutions.
 928         &errorCode);
 929   }
 930   return length32;
 931 }
 932
 933 int32_t
 934 UnicodeString::indexOf(const UChar *srcChars,
 935                int32_t srcStart,
 936                int32_t srcLength,
 937                int32_t start,
 938                int32_t length) const
 939 {
 940   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 941     return -1;
 942   }
 943
 944   // UnicodeString does not find empty substrings
 945   if(srcLength < 0 && srcChars[srcStart] == 0) {
 946     return -1;
 947   }
 948
 949   // get the indices within bounds
 950   pinIndices(start, length);
 951
 952   // find the first occurrence of the substring
 953   const UChar *array = getArrayStart();
 954   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
 955   if(match == NULL) {
 956     return -1;
 957   } else {
 958     return (int32_t)(match - array);
 959   }
 960 }
 961
 962 int32_t
 963 UnicodeString::doIndexOf(UChar c,
 964              int32_t start,
 965              int32_t length) const
 966 {
 967   // pin indices
 968   pinIndices(start, length);
 969
 970   // find the first occurrence of c
 971   const UChar *array = getArrayStart();
 972   const UChar *match = u_memchr(array + start, c, length);
 973   if(match == NULL) {
 974     return -1;
 975   } else {
 976     return (int32_t)(match - array);
 977   }
 978 }
 979
 980 int32_t
 981 UnicodeString::doIndexOf(UChar32 c,
 982                          int32_t start,
 983                          int32_t length) const {
 984   // pin indices
 985   pinIndices(start, length);
 986
 987   // find the first occurrence of c
 988   const UChar *array = getArrayStart();
 989   const UChar *match = u_memchr32(array + start, c, length);
 990   if(match == NULL) {
 991     return -1;
 992   } else {
 993     return (int32_t)(match - array);
 994   }
 995 }
 996
 997 int32_t
 998 UnicodeString::lastIndexOf(const UChar *srcChars,
 999                int32_t srcStart,
1000                int32_t srcLength,
1001                int32_t start,
1002                int32_t length) const
1003 {
1004   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1005     return -1;
1006   }
1007
1008   // UnicodeString does not find empty substrings
1009   if(srcLength < 0 && srcChars[srcStart] == 0) {
1010     return -1;
1011   }
1012
1013   // get the indices within bounds
1014   pinIndices(start, length);
1015
1016   // find the last occurrence of the substring
1017   const UChar *array = getArrayStart();
1018   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1019   if(match == NULL) {
1020     return -1;
1021   } else {
1022     return (int32_t)(match - array);
1023   }
1024 }
1025
1026 int32_t
1027 UnicodeString::doLastIndexOf(UChar c,
1028                  int32_t start,
1029                  int32_t length) const
1030 {
1031   if(isBogus()) {
1032     return -1;
1033   }
1034
1035   // pin indices
1036   pinIndices(start, length);
1037
1038   // find the last occurrence of c
1039   const UChar *array = getArrayStart();
1040   const UChar *match = u_memrchr(array + start, c, length);
1041   if(match == NULL) {
1042     return -1;
1043   } else {
1044     return (int32_t)(match - array);
1045   }
1046 }
1047
1048 int32_t
1049 UnicodeString::doLastIndexOf(UChar32 c,
1050                              int32_t start,
1051                              int32_t length) const {
1052   // pin indices
1053   pinIndices(start, length);
1054
1055   // find the last occurrence of c
1056   const UChar *array = getArrayStart();
1057   const UChar *match = u_memrchr32(array + start, c, length);
1058   if(match == NULL) {
1059     return -1;
1060   } else {
1061     return (int32_t)(match - array);
1062   }
1063 }
1064
1065 //========================================
1066 // Write implementation
1067 //========================================
1068
1069 UnicodeString&
1070 UnicodeString::findAndReplace(int32_t start,
1071                   int32_t length,
1072                   const UnicodeString& oldText,
1073                   int32_t oldStart,
1074                   int32_t oldLength,
1075                   const UnicodeString& newText,
1076                   int32_t newStart,
1077                   int32_t newLength)
1078 {
1079   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1080     return *this;
1081   }
1082
1083   pinIndices(start, length);
1084   oldText.pinIndices(oldStart, oldLength);
1085   newText.pinIndices(newStart, newLength);
1086
1087   if(oldLength == 0) {
1088     return *this;
1089   }
1090
1091   while(length > 0 && length >= oldLength) {
1092     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1093     if(pos < 0) {
1094       // no more oldText's here: done
1095       break;
1096     } else {
1097       // we found oldText, replace it by newText and go beyond it
1098       replace(pos, oldLength, newText, newStart, newLength);
1099       length -= pos + oldLength - start;
1100       start = pos + newLength;
1101     }
1102   }
1103
1104   return *this;
1105 }
1106
1107
1108 void
1109 UnicodeString::setToBogus()
1110 {
1111   releaseArray();
1112
1113   fShortLength = 0;
1114   fUnion.fFields.fArray = 0;
1115   fUnion.fFields.fCapacity = 0;
1116   fFlags = kIsBogus;
1117 }
1118
1119 // turn a bogus string into an empty one
1120 void
1121 UnicodeString::unBogus() {
1122   if(fFlags & kIsBogus) {
1123     setToEmpty();
1124   }
1125 }
1126
1127 // setTo() analogous to the readonly-aliasing constructor with the same signature
1128 UnicodeString &
1129 UnicodeString::setTo(UBool isTerminated,
1130                      const UChar *text,
1131                      int32_t textLength)
1132 {
1133   if(fFlags & kOpenGetBuffer) {
1134     // do not modify a string that has an "open" getBuffer(minCapacity)
1135     return *this;
1136   }
1137
1138   if(text == NULL) {
1139     // treat as an empty string, do not alias
1140     releaseArray();
1141     setToEmpty();
1142     return *this;
1143   }
1144
1145   if( textLength < -1 ||
1146       (textLength == -1 && !isTerminated) ||
1147       (textLength >= 0 && isTerminated && text[textLength] != 0)
1148   ) {
1149     setToBogus();
1150     return *this;
1151   }
1152
1153   releaseArray();
1154
1155   if(textLength == -1) {
1156     // text is terminated, or else it would have failed the above test
1157     textLength = u_strlen(text);
1158   }
1159   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1160
1161   fFlags = kReadonlyAlias;
1162   return *this;
1163 }
1164
1165 // setTo() analogous to the writable-aliasing constructor with the same signature
1166 UnicodeString &
1167 UnicodeString::setTo(UChar *buffer,
1168                      int32_t buffLength,
1169                      int32_t buffCapacity) {
1170   if(fFlags & kOpenGetBuffer) {
1171     // do not modify a string that has an "open" getBuffer(minCapacity)
1172     return *this;
1173   }
1174
1175   if(buffer == NULL) {
1176     // treat as an empty string, do not alias
1177     releaseArray();
1178     setToEmpty();
1179     return *this;
1180   }
1181
1182   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1183     setToBogus();
1184     return *this;
1185   } else if(buffLength == -1) {
1186     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1187     const UChar *p = buffer, *limit = buffer + buffCapacity;
1188     while(p != limit && *p != 0) {
1189       ++p;
1190     }
1191     buffLength = (int32_t)(p - buffer);
1192   }
1193
1194   releaseArray();
1195
1196   setArray(buffer, buffLength, buffCapacity);
1197   fFlags = kWritableAlias;
1198   return *this;
1199 }
1200
1201 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1202   unBogus();
1203   int32_t length = utf8.length();
1204   int32_t capacity;
1205   // The UTF-16 string will be at most as long as the UTF-8 string.
1206   if(length <= US_STACKBUF_SIZE) {
1207     capacity = US_STACKBUF_SIZE;
1208   } else {
1209     capacity = length + 1;  // +1 for the terminating NUL.
1210   }
1211   UChar *utf16 = getBuffer(capacity);
1212   int32_t length16;
1213   UErrorCode errorCode = U_ZERO_ERROR;
1214   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1215       utf8.data(), length,
1216       0xfffd,  // Substitution character.
1217       NULL,    // Don't care about number of substitutions.
1218       &errorCode);
1219   releaseBuffer(length16);
1220   if(U_FAILURE(errorCode)) {
1221     setToBogus();
1222   }
1223   return *this;
1224 }
1225
1226 UnicodeString&
1227 UnicodeString::setCharAt(int32_t offset,
1228              UChar c)
1229 {
1230   int32_t len = length();
1231   if(cloneArrayIfNeeded() && len > 0) {
1232     if(offset < 0) {
1233       offset = 0;
1234     } else if(offset >= len) {
1235       offset = len - 1;
1236     }
1237
1238     getArrayStart()[offset] = c;
1239   }
1240   return *this;
1241 }
1242
1243 UnicodeString&
1244 UnicodeString::replace(int32_t start,
1245                int32_t _length,
1246                UChar32 srcChar) {
1247   UChar buffer[U16_MAX_LENGTH];
1248   int32_t count = 0;
1249   UBool isError = FALSE;
1250   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1251   // We test isError so that the compiler does not complain that we don't.
1252   // If isError then count==0 which turns the doReplace() into a no-op anyway.
1253   return isError ? *this : doReplace(start, _length, buffer, 0, count);
1254 }
1255
1256 UnicodeString&
1257 UnicodeString::append(UChar32 srcChar) {
1258   UChar buffer[U16_MAX_LENGTH];
1259   int32_t _length = 0;
1260   UBool isError = FALSE;
1261   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1262   // We test isError so that the compiler does not complain that we don't.
1263   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1264   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
1265 }
1266
1267 UnicodeString&
1268 UnicodeString::doReplace( int32_t start,
1269               int32_t length,
1270               const UnicodeString& src,
1271               int32_t srcStart,
1272               int32_t srcLength)
1273 {
1274   if(!src.isBogus()) {
1275     // pin the indices to legal values
1276     src.pinIndices(srcStart, srcLength);
1277
1278     // get the characters from src
1279     // and replace the range in ourselves with them
1280     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1281   } else {
1282     // remove the range
1283     return doReplace(start, length, 0, 0, 0);
1284   }
1285 }
1286
1287 UnicodeString&
1288 UnicodeString::doReplace(int32_t start,
1289              int32_t length,
1290              const UChar *srcChars,
1291              int32_t srcStart,
1292              int32_t srcLength)
1293 {
1294   if(!isWritable()) {
1295     return *this;
1296   }
1297
1298   int32_t oldLength = this->length();
1299
1300   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1301   if((fFlags&kBufferIsReadonly) && srcLength == 0) {
1302     if(start == 0) {
1303       // remove prefix by adjusting the array pointer
1304       pinIndex(length);
1305       fUnion.fFields.fArray += length;
1306       fUnion.fFields.fCapacity -= length;
1307       setLength(oldLength - length);
1308       return *this;
1309     } else {
1310       pinIndex(start);
1311       if(length >= (oldLength - start)) {
1312         // remove suffix by reducing the length (like truncate())
1313         setLength(start);
1314         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1315         return *this;
1316       }
1317     }
1318   }
1319
1320   if(srcChars == 0) {
1321     srcStart = srcLength = 0;
1322   } else if(srcLength < 0) {
1323     // get the srcLength if necessary
1324     srcLength = u_strlen(srcChars + srcStart);
1325   }
1326
1327   // calculate the size of the string after the replace
1328   int32_t newLength;
1329
1330   // optimize append() onto a large-enough, owned string
1331   if(start >= oldLength) {
1332     if(srcLength == 0) {
1333       return *this;
1334     }
1335     newLength = oldLength + srcLength;
1336     if(newLength <= getCapacity() && isBufferWritable()) {
1337       UChar *oldArray = getArrayStart();
1338       // Do not copy characters when
1339       //   UChar *buffer=str.getAppendBuffer(...);
1340       // is followed by
1341       //   str.append(buffer, length);
1342       // or
1343       //   str.appendString(buffer, length)
1344       // or similar.
1345       if(srcChars + srcStart != oldArray + start || start > oldLength) {
1346         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1347       }
1348       setLength(newLength);
1349       return *this;
1350     } else {
1351       // pin the indices to legal values
1352       start = oldLength;
1353       length = 0;
1354     }
1355   } else {
1356     // pin the indices to legal values
1357     pinIndices(start, length);
1358
1359     newLength = oldLength - length + srcLength;
1360   }
1361
1362   // the following may change fArray but will not copy the current contents;
1363   // therefore we need to keep the current fArray
1364   UChar oldStackBuffer[US_STACKBUF_SIZE];
1365   UChar *oldArray;
1366   if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1367     // copy the stack buffer contents because it will be overwritten with
1368     // fUnion.fFields values
1369     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
1370     oldArray = oldStackBuffer;
1371   } else {
1372     oldArray = getArrayStart();
1373   }
1374
1375   // clone our array and allocate a bigger array if needed
1376   int32_t *bufferToDelete = 0;
1377   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1378                          FALSE, &bufferToDelete)
1379   ) {
1380     return *this;
1381   }
1382
1383   // now do the replace
1384
1385   UChar *newArray = getArrayStart();
1386   if(newArray != oldArray) {
1387     // if fArray changed, then we need to copy everything except what will change
1388     us_arrayCopy(oldArray, 0, newArray, 0, start);
1389     us_arrayCopy(oldArray, start + length,
1390                  newArray, start + srcLength,
1391                  oldLength - (start + length));
1392   } else if(length != srcLength) {
1393     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1394     us_arrayCopy(oldArray, start + length,
1395                  newArray, start + srcLength,
1396                  oldLength - (start + length));
1397   }
1398
1399   // now fill in the hole with the new string
1400   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1401
1402   setLength(newLength);
1403
1404   // delayed delete in case srcChars == fArray when we started, and
1405   // to keep oldArray alive for the above operations
1406   if (bufferToDelete) {
1407     uprv_free(bufferToDelete);
1408   }
1409
1410   return *this;
1411 }
1412
1413 /**
1414  * Replaceable API
1415  */
1416 void
1417 UnicodeString::handleReplaceBetween(int32_t start,
1418                                     int32_t limit,
1419                                     const UnicodeString& text) {
1420     replaceBetween(start, limit, text);
1421 }
1422
1423 /**
1424  * Replaceable API
1425  */
1426 void
1427 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1428     if (limit <= start) {
1429         return; // Nothing to do; avoid bogus malloc call
1430     }
1431     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1432     // Check to make sure text is not null.
1433     if (text != NULL) {
1434             extractBetween(start, limit, text, 0);
1435             insert(dest, text, 0, limit - start);
1436             uprv_free(text);
1437     }
1438 }
1439
1440 /**
1441  * Replaceable API
1442  *
1443  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1444  * so we implement this function here.
1445  */
1446 UBool Replaceable::hasMetaData() const {
1447     return TRUE;
1448 }
1449
1450 /**
1451  * Replaceable API
1452  */
1453 UBool UnicodeString::hasMetaData() const {
1454     return FALSE;
1455 }
1456
1457 UnicodeString&
1458 UnicodeString::doReverse(int32_t start, int32_t length) {
1459   if(length <= 1 || !cloneArrayIfNeeded()) {
1460     return *this;
1461   }
1462
1463   // pin the indices to legal values
1464   pinIndices(start, length);
1465   if(length <= 1) {  // pinIndices() might have shrunk the length
1466     return *this;
1467   }
1468
1469   UChar *left = getArrayStart() + start;
1470   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1471   UChar swap;
1472   UBool hasSupplementary = FALSE;
1473
1474   // Before the loop we know left<right because length>=2.
1475   do {
1476     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1477     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1478     *right-- = swap;
1479   } while(left < right);
1480   // Make sure to test the middle code unit of an odd-length string.
1481   // Redundant if the length is even.
1482   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1483
1484   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1485   if(hasSupplementary) {
1486     UChar swap2;
1487
1488     left = getArrayStart() + start;
1489     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1490     while(left < right) {
1491       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1492         *left++ = swap2;
1493         *left++ = swap;
1494       } else {
1495         ++left;
1496       }
1497     }
1498   }
1499
1500   return *this;
1501 }
1502
1503 UBool
1504 UnicodeString::padLeading(int32_t targetLength,
1505                           UChar padChar)
1506 {
1507   int32_t oldLength = length();
1508   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1509     return FALSE;
1510   } else {
1511     // move contents up by padding width
1512     UChar *array = getArrayStart();
1513     int32_t start = targetLength - oldLength;
1514     us_arrayCopy(array, 0, array, start, oldLength);
1515
1516     // fill in padding character
1517     while(--start >= 0) {
1518       array[start] = padChar;
1519     }
1520     setLength(targetLength);
1521     return TRUE;
1522   }
1523 }
1524
1525 UBool
1526 UnicodeString::padTrailing(int32_t targetLength,
1527                            UChar padChar)
1528 {
1529   int32_t oldLength = length();
1530   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1531     return FALSE;
1532   } else {
1533     // fill in padding character
1534     UChar *array = getArrayStart();
1535     int32_t length = targetLength;
1536     while(--length >= oldLength) {
1537       array[length] = padChar;
1538     }
1539     setLength(targetLength);
1540     return TRUE;
1541   }
1542 }
1543
1544 //========================================
1545 // Hashing
1546 //========================================
1547 int32_t
1548 UnicodeString::doHashCode() const
1549 {
1550     /* Delegate hash computation to uhash.  This makes UnicodeString
1551      * hashing consistent with UChar* hashing.  */
1552     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1553     if (hashCode == kInvalidHashCode) {
1554         hashCode = kEmptyHashCode;
1555     }
1556     return hashCode;
1557 }
1558
1559 //========================================
1560 // External Buffer
1561 //========================================
1562
1563 UChar *
1564 UnicodeString::getBuffer(int32_t minCapacity) {
1565   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1566     fFlags|=kOpenGetBuffer;
1567     fShortLength=0;
1568     return getArrayStart();
1569   } else {
1570     return 0;
1571   }
1572 }
1573
1574 void
1575 UnicodeString::releaseBuffer(int32_t newLength) {
1576   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1577     // set the new fLength
1578     int32_t capacity=getCapacity();
1579     if(newLength==-1) {
1580       // the new length is the string length, capped by fCapacity
1581       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1582       while(p<limit && *p!=0) {
1583         ++p;
1584       }
1585       newLength=(int32_t)(p-array);
1586     } else if(newLength>capacity) {
1587       newLength=capacity;
1588     }
1589     setLength(newLength);
1590     fFlags&=~kOpenGetBuffer;
1591   }
1592 }
1593
1594 //========================================
1595 // Miscellaneous
1596 //========================================
/**
 * Makes sure that this string has a buffer that it may write to and that
 * has room for at least newCapacity code units, allocating a new array
 * when necessary.
 *
 * @param newCapacity     required capacity; -1 means the current capacity
 * @param growCapacity    capacity to try to allocate first; a negative value
 *                        means newCapacity. If newCapacity fits into the stack
 *                        buffer, growCapacity is limited to US_STACKBUF_SIZE.
 * @param doCopyArray     if TRUE, the old contents are copied into a newly
 *                        allocated array (truncated if the new capacity is
 *                        smaller); otherwise the length is set to 0 when a
 *                        new array is allocated
 * @param pBufferToDelete if not 0 and the old refCounted array loses its last
 *                        reference here, the array is not freed but its
 *                        allocation pointer is stored in *pBufferToDelete for
 *                        the caller to free
 * @param forceClone      if TRUE, a new array is allocated unconditionally
 * @return TRUE on success; FALSE if the string is not writable, or if no
 *         memory could be allocated (then the string is set to bogus)
 */
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fFlags & kBufferIsReadonly ||
     (fFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    // (flags are captured here because they are consulted again after
    // allocate() has been called)
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    uint8_t flags = fFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = 0; // no need to copy from stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the larger growCapacity first, then fall back to the
    // strictly required newCapacity)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray && oldArray != 0) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = length();
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
          setLength(minLength);
        }
        us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
      } else {
        fShortLength = 0;
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is the int32_t directly in front of the array)
        int32_t *pRefCount = ((int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
            uprv_free(pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1699
1700 // UnicodeStringAppendable ------------------------------------------------- ***
1701
1702 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1703
1704 UBool
1705 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1706   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
1707 }
1708
1709 UBool
1710 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1711   UChar buffer[U16_MAX_LENGTH];
1712   int32_t cLength = 0;
1713   UBool isError = FALSE;
1714   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1715   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
1716 }
1717
1718 UBool
1719 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1720   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
1721 }
1722
1723 UBool
1724 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1725   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1726 }
1727
1728 UChar *
1729 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1730                                          int32_t desiredCapacityHint,
1731                                          UChar *scratch, int32_t scratchCapacity,
1732                                          int32_t *resultCapacity) {
1733   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1734     *resultCapacity = 0;
1735     return NULL;
1736   }
1737   int32_t oldLength = str.length();
1738   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1739     *resultCapacity = str.getCapacity() - oldLength;
1740     return str.getArrayStart() + oldLength;
1741   }
1742   *resultCapacity = scratchCapacity;
1743   return scratch;
1744 }
1745
1746 U_NAMESPACE_END
1747
1748 U_NAMESPACE_USE
1749
1750 U_CAPI int32_t U_EXPORT2
1751 uhash_hashUnicodeString(const UElement key) {
1752     const UnicodeString *str = (const UnicodeString*) key.pointer;
1753     return (str == NULL) ? 0 : str->hashCode();
1754 }
1755
1756 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1757 // does not depend on hashtable code.
1758 U_CAPI UBool U_EXPORT2
1759 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1760     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1761     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1762     if (str1 == str2) {
1763         return TRUE;
1764     }
1765     if (str1 == NULL || str2 == NULL) {
1766         return FALSE;
1767     }
1768     return *str1 == *str2;
1769 }
1770
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. It exists only so that the
virtual vector deleting destructor gets defined within unistr.cpp.
UObject already has a vector deleting destructor, but using delete []
here makes sure that one is included with this object file as well,
which keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif