icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2013, International Business Machines Corporation and
   4 * others. All Rights Reserved.
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/appendable.h"
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf16.h"
  30 #include "uelement.h"
  31 #include "ustr_imp.h"
  32 #include "umutex.h"
  33 #include "uassert.h"
  34
  35 #if 0
  36
  37 #include <iostream>
  38 using namespace std;
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((icu::UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
  99 /* The Replaceable virtual destructor can't be defined in the header
 100    due to how AIX works with multiple definitions of virtual functions.
 101 */
 102 Replaceable::~Replaceable() {}
 103
// NOTE(review): presumably expands to the class-ID based RTTI functions for
// UnicodeString; confirm against the macro definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef() {
 121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 122 }
 123
 124 int32_t
 125 UnicodeString::removeRef() {
 126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 127 }
 128
 129 int32_t
 130 UnicodeString::refCount() const {
 131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
 132 }
 133
 134 void
 135 UnicodeString::releaseArray() {
 136   if((fFlags & kRefCounted) && removeRef() == 0) {
 137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 138   }
 139 }
 140
 141
 142
 143 //========================================
 144 // Constructors
 145 //========================================
 146
 147 // The default constructor is inline in unistr.h.
 148
 149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
 150   : fShortLength(0),
 151     fFlags(0)
 152 {
 153   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 154     // just allocate and do not do anything else
 155     allocate(capacity);
 156   } else {
 157     // count > 0, allocate and fill the new string with count c's
 158     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
 159     if(capacity < length) {
 160       capacity = length;
 161     }
 162     if(allocate(capacity)) {
 163       UChar *array = getArrayStart();
 164       int32_t i = 0;
 165
 166       // fill the new string with c
 167       if(unitCount == 1) {
 168         // fill with length UChars
 169         while(i < length) {
 170           array[i++] = (UChar)c;
 171         }
 172       } else {
 173         // get the code units for c
 174         UChar units[U16_MAX_LENGTH];
 175         U16_APPEND_UNSAFE(units, i, c);
 176
 177         // now it must be i==unitCount
 178         i = 0;
 179
 180         // for Unicode, unitCount can only be 1, 2, 3, or 4
 181         // 1 is handled above
 182         while(i < length) {
 183           int32_t unitIdx = 0;
 184           while(unitIdx < unitCount) {
 185             array[i++]=units[unitIdx++];
 186           }
 187         }
 188       }
 189     }
 190     setLength(length);
 191   }
 192 }
 193
 194 UnicodeString::UnicodeString(UChar ch)
 195   : fShortLength(1),
 196     fFlags(kShortString)
 197 {
 198   fUnion.fStackBuffer[0] = ch;
 199 }
 200
 201 UnicodeString::UnicodeString(UChar32 ch)
 202   : fShortLength(0),
 203     fFlags(kShortString)
 204 {
 205   int32_t i = 0;
 206   UBool isError = FALSE;
 207   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
 208   // We test isError so that the compiler does not complain that we don't.
 209   // If isError then i==0 which is what we want anyway.
 210   if(!isError) {
 211     fShortLength = (int8_t)i;
 212   }
 213 }
 214
 215 UnicodeString::UnicodeString(const UChar *text)
 216   : fShortLength(0),
 217     fFlags(kShortString)
 218 {
 219   doReplace(0, 0, text, 0, -1);
 220 }
 221
 222 UnicodeString::UnicodeString(const UChar *text,
 223                              int32_t textLength)
 224   : fShortLength(0),
 225     fFlags(kShortString)
 226 {
 227   doReplace(0, 0, text, 0, textLength);
 228 }
 229
 230 UnicodeString::UnicodeString(UBool isTerminated,
 231                              const UChar *text,
 232                              int32_t textLength)
 233   : fShortLength(0),
 234     fFlags(kReadonlyAlias)
 235 {
 236   if(text == NULL) {
 237     // treat as an empty string, do not alias
 238     setToEmpty();
 239   } else if(textLength < -1 ||
 240             (textLength == -1 && !isTerminated) ||
 241             (textLength >= 0 && isTerminated && text[textLength] != 0)
 242   ) {
 243     setToBogus();
 244   } else {
 245     if(textLength == -1) {
 246       // text is terminated, or else it would have failed the above test
 247       textLength = u_strlen(text);
 248     }
 249     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 250   }
 251 }
 252
 253 UnicodeString::UnicodeString(UChar *buff,
 254                              int32_t buffLength,
 255                              int32_t buffCapacity)
 256   : fShortLength(0),
 257     fFlags(kWritableAlias)
 258 {
 259   if(buff == NULL) {
 260     // treat as an empty string, do not alias
 261     setToEmpty();
 262   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 263     setToBogus();
 264   } else {
 265     if(buffLength == -1) {
 266       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 267       const UChar *p = buff, *limit = buff + buffCapacity;
 268       while(p != limit && *p != 0) {
 269         ++p;
 270       }
 271       buffLength = (int32_t)(p - buff);
 272     }
 273     setArray(buff, buffLength, buffCapacity);
 274   }
 275 }
 276
 277 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
 278   : fShortLength(0),
 279     fFlags(kShortString)
 280 {
 281   if(src==NULL) {
 282     // treat as an empty string
 283   } else {
 284     if(length<0) {
 285       length=(int32_t)uprv_strlen(src);
 286     }
 287     if(cloneArrayIfNeeded(length, length, FALSE)) {
 288       u_charsToUChars(src, getArrayStart(), length);
 289       setLength(length);
 290     } else {
 291       setToBogus();
 292     }
 293   }
 294 }
 295
 296 #if U_CHARSET_IS_UTF8
 297
 298 UnicodeString::UnicodeString(const char *codepageData)
 299   : fShortLength(0),
 300     fFlags(kShortString) {
 301   if(codepageData != 0) {
 302     setToUTF8(codepageData);
 303   }
 304 }
 305
 306 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
 307   : fShortLength(0),
 308     fFlags(kShortString) {
 309   // if there's nothing to convert, do nothing
 310   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 311     return;
 312   }
 313   if(dataLength == -1) {
 314     dataLength = (int32_t)uprv_strlen(codepageData);
 315   }
 316   setToUTF8(StringPiece(codepageData, dataLength));
 317 }
 318
 319 // else see unistr_cnv.cpp
 320 #endif
 321
 322 UnicodeString::UnicodeString(const UnicodeString& that)
 323   : Replaceable(),
 324     fShortLength(0),
 325     fFlags(kShortString)
 326 {
 327   copyFrom(that);
 328 }
 329
 330 UnicodeString::UnicodeString(const UnicodeString& that,
 331                              int32_t srcStart)
 332   : Replaceable(),
 333     fShortLength(0),
 334     fFlags(kShortString)
 335 {
 336   setTo(that, srcStart);
 337 }
 338
 339 UnicodeString::UnicodeString(const UnicodeString& that,
 340                              int32_t srcStart,
 341                              int32_t srcLength)
 342   : Replaceable(),
 343     fShortLength(0),
 344     fFlags(kShortString)
 345 {
 346   setTo(that, srcStart, srcLength);
 347 }
 348
 349 // Replaceable base class clone() default implementation, does not clone
 350 Replaceable *
 351 Replaceable::clone() const {
 352   return NULL;
 353 }
 354
 355 // UnicodeString overrides clone() with a real implementation
 356 Replaceable *
 357 UnicodeString::clone() const {
 358   return new UnicodeString(*this);
 359 }
 360
 361 //========================================
 362 // array allocation
 363 //========================================
 364
 365 UBool
 366 UnicodeString::allocate(int32_t capacity) {
 367   if(capacity <= US_STACKBUF_SIZE) {
 368     fFlags = kShortString;
 369   } else {
 370     // count bytes for the refCounter and the string capacity, and
 371     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 372     // to be safely aligned for the refCount
 373     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
 374     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 375     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 376     if(array != 0) {
 377       // set initial refCount and point behind the refCount
 378       *array++ = 1;
 379
 380       // have fArray point to the first UChar
 381       fUnion.fFields.fArray = (UChar *)array;
 382       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 383       fFlags = kLongString;
 384     } else {
 385       fShortLength = 0;
 386       fUnion.fFields.fArray = 0;
 387       fUnion.fFields.fCapacity = 0;
 388       fFlags = kIsBogus;
 389       return FALSE;
 390     }
 391   }
 392   return TRUE;
 393 }
 394
 395 //========================================
 396 // Destructor
 397 //========================================
 398 UnicodeString::~UnicodeString()
 399 {
 400   releaseArray();
 401 }
 402
 403 //========================================
 404 // Factory methods
 405 //========================================
 406
 407 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
 408   UnicodeString result;
 409   result.setToUTF8(utf8);
 410   return result;
 411 }
 412
 413 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 414   UnicodeString result;
 415   int32_t capacity;
 416   // Most UTF-32 strings will be BMP-only and result in a same-length
 417   // UTF-16 string. We overestimate the capacity just slightly,
 418   // just in case there are a few supplementary characters.
 419   if(length <= US_STACKBUF_SIZE) {
 420     capacity = US_STACKBUF_SIZE;
 421   } else {
 422     capacity = length + (length >> 4) + 4;
 423   }
 424   do {
 425     UChar *utf16 = result.getBuffer(capacity);
 426     int32_t length16;
 427     UErrorCode errorCode = U_ZERO_ERROR;
 428     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 429         utf32, length,
 430         0xfffd,  // Substitution character.
 431         NULL,    // Don't care about number of substitutions.
 432         &errorCode);
 433     result.releaseBuffer(length16);
 434     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 435       capacity = length16 + 1;  // +1 for the terminating NUL.
 436       continue;
 437     } else if(U_FAILURE(errorCode)) {
 438       result.setToBogus();
 439     }
 440     break;
 441   } while(TRUE);
 442   return result;
 443 }
 444
 445 //========================================
 446 // Assignment
 447 //========================================
 448
 449 UnicodeString &
 450 UnicodeString::operator=(const UnicodeString &src) {
 451   return copyFrom(src);
 452 }
 453
 454 UnicodeString &
 455 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 456   return copyFrom(src, TRUE);
 457 }
 458
 459 UnicodeString &
 460 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 461   // if assigning to ourselves, do nothing
 462   if(this == 0 || this == &src) {
 463     return *this;
 464   }
 465
 466   // is the right side bogus?
 467   if(&src == 0 || src.isBogus()) {
 468     setToBogus();
 469     return *this;
 470   }
 471
 472   // delete the current contents
 473   releaseArray();
 474
 475   if(src.isEmpty()) {
 476     // empty string - use the stack buffer
 477     setToEmpty();
 478     return *this;
 479   }
 480
 481   // we always copy the length
 482   int32_t srcLength = src.length();
 483   setLength(srcLength);
 484
 485   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 486   switch(src.fFlags) {
 487   case kShortString:
 488     // short string using the stack buffer, do the same
 489     fFlags = kShortString;
 490     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
 491     break;
 492   case kLongString:
 493     // src uses a refCounted string buffer, use that buffer with refCount
 494     // src is const, use a cast - we don't really change it
 495     ((UnicodeString &)src).addRef();
 496     // copy all fields, share the reference-counted buffer
 497     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 498     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 499     fFlags = src.fFlags;
 500     break;
 501   case kReadonlyAlias:
 502     if(fastCopy) {
 503       // src is a readonly alias, do the same
 504       // -> maintain the readonly alias as such
 505       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 506       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 507       fFlags = src.fFlags;
 508       break;
 509     }
 510     // else if(!fastCopy) fall through to case kWritableAlias
 511     // -> allocate a new buffer and copy the contents
 512   case kWritableAlias:
 513     // src is a writable alias; we make a copy of that instead
 514     if(allocate(srcLength)) {
 515       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
 516       break;
 517     }
 518     // if there is not enough memory, then fall through to setting to bogus
 519   default:
 520     // if src is bogus, set ourselves to bogus
 521     // do not call setToBogus() here because fArray and fFlags are not consistent here
 522     fShortLength = 0;
 523     fUnion.fFields.fArray = 0;
 524     fUnion.fFields.fCapacity = 0;
 525     fFlags = kIsBogus;
 526     break;
 527   }
 528
 529   return *this;
 530 }
 531
 532 //========================================
 533 // Miscellaneous operations
 534 //========================================
 535
 536 UnicodeString UnicodeString::unescape() const {
 537     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 538     const UChar *array = getBuffer();
 539     int32_t len = length();
 540     int32_t prev = 0;
 541     for (int32_t i=0;;) {
 542         if (i == len) {
 543             result.append(array, prev, len - prev);
 544             break;
 545         }
 546         if (array[i++] == 0x5C /*'\\'*/) {
 547             result.append(array, prev, (i - 1) - prev);
 548             UChar32 c = unescapeAt(i); // advances i
 549             if (c < 0) {
 550                 result.remove(); // return empty string
 551                 break; // invalid escape sequence
 552             }
 553             result.append(c);
 554             prev = i;
 555         }
 556     }
 557     return result;
 558 }
 559
 560 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 561     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 562 }
 563
 564 //========================================
 565 // Read-only implementation
 566 //========================================
 567 UBool
 568 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
 569   // Requires: this & text not bogus and have same lengths.
 570   // Byte-wise comparison works for equality regardless of endianness.
 571   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
 572 }
 573
 574 int8_t
 575 UnicodeString::doCompare( int32_t start,
 576               int32_t length,
 577               const UChar *srcChars,
 578               int32_t srcStart,
 579               int32_t srcLength) const
 580 {
 581   // compare illegal string values
 582   if(isBogus()) {
 583     return -1;
 584   }
 585
 586   // pin indices to legal values
 587   pinIndices(start, length);
 588
 589   if(srcChars == NULL) {
 590     // treat const UChar *srcChars==NULL as an empty string
 591     return length == 0 ? 0 : 1;
 592   }
 593
 594   // get the correct pointer
 595   const UChar *chars = getArrayStart();
 596
 597   chars += start;
 598   srcChars += srcStart;
 599
 600   int32_t minLength;
 601   int8_t lengthResult;
 602
 603   // get the srcLength if necessary
 604   if(srcLength < 0) {
 605     srcLength = u_strlen(srcChars + srcStart);
 606   }
 607
 608   // are we comparing different lengths?
 609   if(length != srcLength) {
 610     if(length < srcLength) {
 611       minLength = length;
 612       lengthResult = -1;
 613     } else {
 614       minLength = srcLength;
 615       lengthResult = 1;
 616     }
 617   } else {
 618     minLength = length;
 619     lengthResult = 0;
 620   }
 621
 622   /*
 623    * note that uprv_memcmp() returns an int but we return an int8_t;
 624    * we need to take care not to truncate the result -
 625    * one way to do this is to right-shift the value to
 626    * move the sign bit into the lower 8 bits and making sure that this
 627    * does not become 0 itself
 628    */
 629
 630   if(minLength > 0 && chars != srcChars) {
 631     int32_t result;
 632
 633 #   if U_IS_BIG_ENDIAN
 634       // big-endian: byte comparison works
 635       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 636       if(result != 0) {
 637         return (int8_t)(result >> 15 | 1);
 638       }
 639 #   else
 640       // little-endian: compare UChar units
 641       do {
 642         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 643         if(result != 0) {
 644           return (int8_t)(result >> 15 | 1);
 645         }
 646       } while(--minLength > 0);
 647 #   endif
 648   }
 649   return lengthResult;
 650 }
 651
 652 /* String compare in code point order - doCompare() compares in code unit order. */
 653 int8_t
 654 UnicodeString::doCompareCodePointOrder(int32_t start,
 655                                        int32_t length,
 656                                        const UChar *srcChars,
 657                                        int32_t srcStart,
 658                                        int32_t srcLength) const
 659 {
 660   // compare illegal string values
 661   // treat const UChar *srcChars==NULL as an empty string
 662   if(isBogus()) {
 663     return -1;
 664   }
 665
 666   // pin indices to legal values
 667   pinIndices(start, length);
 668
 669   if(srcChars == NULL) {
 670     srcStart = srcLength = 0;
 671   }
 672
 673   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 674   /* translate the 32-bit result into an 8-bit one */
 675   if(diff!=0) {
 676     return (int8_t)(diff >> 15 | 1);
 677   } else {
 678     return 0;
 679   }
 680 }
 681
 682 int32_t
 683 UnicodeString::getLength() const {
 684     return length();
 685 }
 686
 687 UChar
 688 UnicodeString::getCharAt(int32_t offset) const {
 689   return charAt(offset);
 690 }
 691
 692 UChar32
 693 UnicodeString::getChar32At(int32_t offset) const {
 694   return char32At(offset);
 695 }
 696
 697 UChar32
 698 UnicodeString::char32At(int32_t offset) const
 699 {
 700   int32_t len = length();
 701   if((uint32_t)offset < (uint32_t)len) {
 702     const UChar *array = getArrayStart();
 703     UChar32 c;
 704     U16_GET(array, 0, offset, len, c);
 705     return c;
 706   } else {
 707     return kInvalidUChar;
 708   }
 709 }
 710
 711 int32_t
 712 UnicodeString::getChar32Start(int32_t offset) const {
 713   if((uint32_t)offset < (uint32_t)length()) {
 714     const UChar *array = getArrayStart();
 715     U16_SET_CP_START(array, 0, offset);
 716     return offset;
 717   } else {
 718     return 0;
 719   }
 720 }
 721
 722 int32_t
 723 UnicodeString::getChar32Limit(int32_t offset) const {
 724   int32_t len = length();
 725   if((uint32_t)offset < (uint32_t)len) {
 726     const UChar *array = getArrayStart();
 727     U16_SET_CP_LIMIT(array, 0, offset, len);
 728     return offset;
 729   } else {
 730     return len;
 731   }
 732 }
 733
 734 int32_t
 735 UnicodeString::countChar32(int32_t start, int32_t length) const {
 736   pinIndices(start, length);
 737   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 738   return u_countChar32(getArrayStart()+start, length);
 739 }
 740
 741 UBool
 742 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 743   pinIndices(start, length);
 744   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 745   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 746 }
 747
 748 int32_t
 749 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 750   // pin index
 751   int32_t len = length();
 752   if(index<0) {
 753     index=0;
 754   } else if(index>len) {
 755     index=len;
 756   }
 757
 758   const UChar *array = getArrayStart();
 759   if(delta>0) {
 760     U16_FWD_N(array, index, len, delta);
 761   } else {
 762     U16_BACK_N(array, 0, index, -delta);
 763   }
 764
 765   return index;
 766 }
 767
 768 void
 769 UnicodeString::doExtract(int32_t start,
 770              int32_t length,
 771              UChar *dst,
 772              int32_t dstStart) const
 773 {
 774   // pin indices to legal values
 775   pinIndices(start, length);
 776
 777   // do not copy anything if we alias dst itself
 778   const UChar *array = getArrayStart();
 779   if(array + start != dst + dstStart) {
 780     us_arrayCopy(array, start, dst, dstStart, length);
 781   }
 782 }
 783
 784 int32_t
 785 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 786                        UErrorCode &errorCode) const {
 787   int32_t len = length();
 788   if(U_SUCCESS(errorCode)) {
 789     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 790       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 791     } else {
 792       const UChar *array = getArrayStart();
 793       if(len>0 && len<=destCapacity && array!=dest) {
 794         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
 795       }
 796       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 797     }
 798   }
 799
 800   return len;
 801 }
 802
 803 int32_t
 804 UnicodeString::extract(int32_t start,
 805                        int32_t length,
 806                        char *target,
 807                        int32_t targetCapacity,
 808                        enum EInvariant) const
 809 {
 810   // if the arguments are illegal, then do nothing
 811   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 812     return 0;
 813   }
 814
 815   // pin the indices to legal values
 816   pinIndices(start, length);
 817
 818   if(length <= targetCapacity) {
 819     u_UCharsToChars(getArrayStart() + start, target, length);
 820   }
 821   UErrorCode status = U_ZERO_ERROR;
 822   return u_terminateChars(target, targetCapacity, length, &status);
 823 }
 824
 825 UnicodeString
 826 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 827   pinIndices(start, len);
 828   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 829   if(array==NULL) {
 830     array=fUnion.fStackBuffer;  // anything not NULL because that would make an empty string
 831     len=-2;  // bogus result string
 832   }
 833   return UnicodeString(FALSE, array + start, len);
 834 }
 835
 836 int32_t
 837 UnicodeString::toUTF8(int32_t start, int32_t len,
 838                       char *target, int32_t capacity) const {
 839   pinIndices(start, len);
 840   int32_t length8;
 841   UErrorCode errorCode = U_ZERO_ERROR;
 842   u_strToUTF8WithSub(target, capacity, &length8,
 843                      getBuffer() + start, len,
 844                      0xFFFD,  // Standard substitution character.
 845                      NULL,    // Don't care about number of substitutions.
 846                      &errorCode);
 847   return length8;
 848 }
 849
 850 #if U_CHARSET_IS_UTF8
 851
 852 int32_t
 853 UnicodeString::extract(int32_t start, int32_t len,
 854                        char *target, uint32_t dstSize) const {
 855   // if the arguments are illegal, then do nothing
 856   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 857     return 0;
 858   }
 859   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 860 }
 861
 862 // else see unistr_cnv.cpp
 863 #endif
 864
 865 void
 866 UnicodeString::extractBetween(int32_t start,
 867                   int32_t limit,
 868                   UnicodeString& target) const {
 869   pinIndex(start);
 870   pinIndex(limit);
 871   doExtract(start, limit - start, target);
 872 }
 873
 874 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 875 // as many bytes as the source has UChars.
 876 // The "worst cases" are writing systems like Indic, Thai and CJK with
 877 // 3:1 bytes:UChars.
 878 void
 879 UnicodeString::toUTF8(ByteSink &sink) const {
 880   int32_t length16 = length();
 881   if(length16 != 0) {
 882     char stackBuffer[1024];
 883     int32_t capacity = (int32_t)sizeof(stackBuffer);
 884     UBool utf8IsOwned = FALSE;
 885     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 886                                       3*length16,
 887                                       stackBuffer, capacity,
 888                                       &capacity);
 889     int32_t length8 = 0;
 890     UErrorCode errorCode = U_ZERO_ERROR;
 891     u_strToUTF8WithSub(utf8, capacity, &length8,
 892                        getBuffer(), length16,
 893                        0xFFFD,  // Standard substitution character.
 894                        NULL,    // Don't care about number of substitutions.
 895                        &errorCode);
 896     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 897       utf8 = (char *)uprv_malloc(length8);
 898       if(utf8 != NULL) {
 899         utf8IsOwned = TRUE;
 900         errorCode = U_ZERO_ERROR;
 901         u_strToUTF8WithSub(utf8, length8, &length8,
 902                            getBuffer(), length16,
 903                            0xFFFD,  // Standard substitution character.
 904                            NULL,    // Don't care about number of substitutions.
 905                            &errorCode);
 906       } else {
 907         errorCode = U_MEMORY_ALLOCATION_ERROR;
 908       }
 909     }
 910     if(U_SUCCESS(errorCode)) {
 911       sink.Append(utf8, length8);
 912       sink.Flush();
 913     }
 914     if(utf8IsOwned) {
 915       uprv_free(utf8);
 916     }
 917   }
 918 }
 919
 920 int32_t
 921 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
 922   int32_t length32=0;
 923   if(U_SUCCESS(errorCode)) {
 924     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
 925     u_strToUTF32WithSub(utf32, capacity, &length32,
 926         getBuffer(), length(),
 927         0xfffd,  // Substitution character.
 928         NULL,    // Don't care about number of substitutions.
 929         &errorCode);
 930   }
 931   return length32;
 932 }
 933
 934 int32_t
 935 UnicodeString::indexOf(const UChar *srcChars,
 936                int32_t srcStart,
 937                int32_t srcLength,
 938                int32_t start,
 939                int32_t length) const
 940 {
 941   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 942     return -1;
 943   }
 944
 945   // UnicodeString does not find empty substrings
 946   if(srcLength < 0 && srcChars[srcStart] == 0) {
 947     return -1;
 948   }
 949
 950   // get the indices within bounds
 951   pinIndices(start, length);
 952
 953   // find the first occurrence of the substring
 954   const UChar *array = getArrayStart();
 955   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
 956   if(match == NULL) {
 957     return -1;
 958   } else {
 959     return (int32_t)(match - array);
 960   }
 961 }
 962
 963 int32_t
 964 UnicodeString::doIndexOf(UChar c,
 965              int32_t start,
 966              int32_t length) const
 967 {
 968   // pin indices
 969   pinIndices(start, length);
 970
 971   // find the first occurrence of c
 972   const UChar *array = getArrayStart();
 973   const UChar *match = u_memchr(array + start, c, length);
 974   if(match == NULL) {
 975     return -1;
 976   } else {
 977     return (int32_t)(match - array);
 978   }
 979 }
 980
 981 int32_t
 982 UnicodeString::doIndexOf(UChar32 c,
 983                          int32_t start,
 984                          int32_t length) const {
 985   // pin indices
 986   pinIndices(start, length);
 987
 988   // find the first occurrence of c
 989   const UChar *array = getArrayStart();
 990   const UChar *match = u_memchr32(array + start, c, length);
 991   if(match == NULL) {
 992     return -1;
 993   } else {
 994     return (int32_t)(match - array);
 995   }
 996 }
 997
 998 int32_t
 999 UnicodeString::lastIndexOf(const UChar *srcChars,
1000                int32_t srcStart,
1001                int32_t srcLength,
1002                int32_t start,
1003                int32_t length) const
1004 {
1005   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1006     return -1;
1007   }
1008
1009   // UnicodeString does not find empty substrings
1010   if(srcLength < 0 && srcChars[srcStart] == 0) {
1011     return -1;
1012   }
1013
1014   // get the indices within bounds
1015   pinIndices(start, length);
1016
1017   // find the last occurrence of the substring
1018   const UChar *array = getArrayStart();
1019   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1020   if(match == NULL) {
1021     return -1;
1022   } else {
1023     return (int32_t)(match - array);
1024   }
1025 }
1026
1027 int32_t
1028 UnicodeString::doLastIndexOf(UChar c,
1029                  int32_t start,
1030                  int32_t length) const
1031 {
1032   if(isBogus()) {
1033     return -1;
1034   }
1035
1036   // pin indices
1037   pinIndices(start, length);
1038
1039   // find the last occurrence of c
1040   const UChar *array = getArrayStart();
1041   const UChar *match = u_memrchr(array + start, c, length);
1042   if(match == NULL) {
1043     return -1;
1044   } else {
1045     return (int32_t)(match - array);
1046   }
1047 }
1048
1049 int32_t
1050 UnicodeString::doLastIndexOf(UChar32 c,
1051                              int32_t start,
1052                              int32_t length) const {
1053   // pin indices
1054   pinIndices(start, length);
1055
1056   // find the last occurrence of c
1057   const UChar *array = getArrayStart();
1058   const UChar *match = u_memrchr32(array + start, c, length);
1059   if(match == NULL) {
1060     return -1;
1061   } else {
1062     return (int32_t)(match - array);
1063   }
1064 }
1065
1066 //========================================
1067 // Write implementation
1068 //========================================
1069
1070 UnicodeString&
1071 UnicodeString::findAndReplace(int32_t start,
1072                   int32_t length,
1073                   const UnicodeString& oldText,
1074                   int32_t oldStart,
1075                   int32_t oldLength,
1076                   const UnicodeString& newText,
1077                   int32_t newStart,
1078                   int32_t newLength)
1079 {
1080   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1081     return *this;
1082   }
1083
1084   pinIndices(start, length);
1085   oldText.pinIndices(oldStart, oldLength);
1086   newText.pinIndices(newStart, newLength);
1087
1088   if(oldLength == 0) {
1089     return *this;
1090   }
1091
1092   while(length > 0 && length >= oldLength) {
1093     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1094     if(pos < 0) {
1095       // no more oldText's here: done
1096       break;
1097     } else {
1098       // we found oldText, replace it by newText and go beyond it
1099       replace(pos, oldLength, newText, newStart, newLength);
1100       length -= pos + oldLength - start;
1101       start = pos + newLength;
1102     }
1103   }
1104
1105   return *this;
1106 }
1107
1108
1109 void
1110 UnicodeString::setToBogus()
1111 {
1112   releaseArray();
1113
1114   fShortLength = 0;
1115   fUnion.fFields.fArray = 0;
1116   fUnion.fFields.fCapacity = 0;
1117   fFlags = kIsBogus;
1118 }
1119
1120 // turn a bogus string into an empty one
1121 void
1122 UnicodeString::unBogus() {
1123   if(fFlags & kIsBogus) {
1124     setToEmpty();
1125   }
1126 }
1127
1128 const UChar *
1129 UnicodeString::getTerminatedBuffer() {
1130   if(!isWritable()) {
1131     return 0;
1132   }
1133   UChar *array = getArrayStart();
1134   int32_t len = length();
1135   if(len < getCapacity()) {
1136     if(fFlags & kBufferIsReadonly) {
1137       // If len<capacity on a read-only alias, then array[len] is
1138       // either the original NUL (if constructed with (TRUE, s, length))
1139       // or one of the original string contents characters (if later truncated),
1140       // therefore we can assume that array[len] is initialized memory.
1141       if(array[len] == 0) {
1142         return array;
1143       }
1144     } else if(((fFlags & kRefCounted) == 0 || refCount() == 1)) {
1145       // kRefCounted: Do not write the NUL if the buffer is shared.
1146       // That is mostly safe, except when the length of one copy was modified
1147       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1148       // Then the NUL would be written into the middle of another copy's string.
1149
1150       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1151       // Do not test if there is a NUL already because it might be uninitialized memory.
1152       // (That would be safe, but tools like valgrind & Purify would complain.)
1153       array[len] = 0;
1154       return array;
1155     }
1156   }
1157   if(cloneArrayIfNeeded(len+1)) {
1158     array = getArrayStart();
1159     array[len] = 0;
1160     return array;
1161   } else {
1162     return NULL;
1163   }
1164 }
1165
1166 // setTo() analogous to the readonly-aliasing constructor with the same signature
1167 UnicodeString &
1168 UnicodeString::setTo(UBool isTerminated,
1169                      const UChar *text,
1170                      int32_t textLength)
1171 {
1172   if(fFlags & kOpenGetBuffer) {
1173     // do not modify a string that has an "open" getBuffer(minCapacity)
1174     return *this;
1175   }
1176
1177   if(text == NULL) {
1178     // treat as an empty string, do not alias
1179     releaseArray();
1180     setToEmpty();
1181     return *this;
1182   }
1183
1184   if( textLength < -1 ||
1185       (textLength == -1 && !isTerminated) ||
1186       (textLength >= 0 && isTerminated && text[textLength] != 0)
1187   ) {
1188     setToBogus();
1189     return *this;
1190   }
1191
1192   releaseArray();
1193
1194   if(textLength == -1) {
1195     // text is terminated, or else it would have failed the above test
1196     textLength = u_strlen(text);
1197   }
1198   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1199
1200   fFlags = kReadonlyAlias;
1201   return *this;
1202 }
1203
1204 // setTo() analogous to the writable-aliasing constructor with the same signature
1205 UnicodeString &
1206 UnicodeString::setTo(UChar *buffer,
1207                      int32_t buffLength,
1208                      int32_t buffCapacity) {
1209   if(fFlags & kOpenGetBuffer) {
1210     // do not modify a string that has an "open" getBuffer(minCapacity)
1211     return *this;
1212   }
1213
1214   if(buffer == NULL) {
1215     // treat as an empty string, do not alias
1216     releaseArray();
1217     setToEmpty();
1218     return *this;
1219   }
1220
1221   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1222     setToBogus();
1223     return *this;
1224   } else if(buffLength == -1) {
1225     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1226     const UChar *p = buffer, *limit = buffer + buffCapacity;
1227     while(p != limit && *p != 0) {
1228       ++p;
1229     }
1230     buffLength = (int32_t)(p - buffer);
1231   }
1232
1233   releaseArray();
1234
1235   setArray(buffer, buffLength, buffCapacity);
1236   fFlags = kWritableAlias;
1237   return *this;
1238 }
1239
1240 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1241   unBogus();
1242   int32_t length = utf8.length();
1243   int32_t capacity;
1244   // The UTF-16 string will be at most as long as the UTF-8 string.
1245   if(length <= US_STACKBUF_SIZE) {
1246     capacity = US_STACKBUF_SIZE;
1247   } else {
1248     capacity = length + 1;  // +1 for the terminating NUL.
1249   }
1250   UChar *utf16 = getBuffer(capacity);
1251   int32_t length16;
1252   UErrorCode errorCode = U_ZERO_ERROR;
1253   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1254       utf8.data(), length,
1255       0xfffd,  // Substitution character.
1256       NULL,    // Don't care about number of substitutions.
1257       &errorCode);
1258   releaseBuffer(length16);
1259   if(U_FAILURE(errorCode)) {
1260     setToBogus();
1261   }
1262   return *this;
1263 }
1264
1265 UnicodeString&
1266 UnicodeString::setCharAt(int32_t offset,
1267              UChar c)
1268 {
1269   int32_t len = length();
1270   if(cloneArrayIfNeeded() && len > 0) {
1271     if(offset < 0) {
1272       offset = 0;
1273     } else if(offset >= len) {
1274       offset = len - 1;
1275     }
1276
1277     getArrayStart()[offset] = c;
1278   }
1279   return *this;
1280 }
1281
1282 UnicodeString&
1283 UnicodeString::replace(int32_t start,
1284                int32_t _length,
1285                UChar32 srcChar) {
1286   UChar buffer[U16_MAX_LENGTH];
1287   int32_t count = 0;
1288   UBool isError = FALSE;
1289   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1290   // We test isError so that the compiler does not complain that we don't.
1291   // If isError (srcChar is not a valid code point) then count==0 which means
1292   // we remove the source segment rather than replacing it with srcChar.
1293   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1294 }
1295
1296 UnicodeString&
1297 UnicodeString::append(UChar32 srcChar) {
1298   UChar buffer[U16_MAX_LENGTH];
1299   int32_t _length = 0;
1300   UBool isError = FALSE;
1301   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1302   // We test isError so that the compiler does not complain that we don't.
1303   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1304   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
1305 }
1306
1307 UnicodeString&
1308 UnicodeString::doReplace( int32_t start,
1309               int32_t length,
1310               const UnicodeString& src,
1311               int32_t srcStart,
1312               int32_t srcLength)
1313 {
1314   if(!src.isBogus()) {
1315     // pin the indices to legal values
1316     src.pinIndices(srcStart, srcLength);
1317
1318     // get the characters from src
1319     // and replace the range in ourselves with them
1320     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1321   } else {
1322     // remove the range
1323     return doReplace(start, length, 0, 0, 0);
1324   }
1325 }
1326
1327 UnicodeString&
1328 UnicodeString::doReplace(int32_t start,
1329              int32_t length,
1330              const UChar *srcChars,
1331              int32_t srcStart,
1332              int32_t srcLength)
1333 {
1334   if(!isWritable()) {
1335     return *this;
1336   }
1337
1338   int32_t oldLength = this->length();
1339
1340   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1341   if((fFlags&kBufferIsReadonly) && srcLength == 0) {
1342     if(start == 0) {
1343       // remove prefix by adjusting the array pointer
1344       pinIndex(length);
1345       fUnion.fFields.fArray += length;
1346       fUnion.fFields.fCapacity -= length;
1347       setLength(oldLength - length);
1348       return *this;
1349     } else {
1350       pinIndex(start);
1351       if(length >= (oldLength - start)) {
1352         // remove suffix by reducing the length (like truncate())
1353         setLength(start);
1354         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1355         return *this;
1356       }
1357     }
1358   }
1359
1360   if(srcChars == 0) {
1361     srcStart = srcLength = 0;
1362   } else if(srcLength < 0) {
1363     // get the srcLength if necessary
1364     srcLength = u_strlen(srcChars + srcStart);
1365   }
1366
1367   // calculate the size of the string after the replace
1368   int32_t newLength;
1369
1370   // optimize append() onto a large-enough, owned string
1371   if(start >= oldLength) {
1372     if(srcLength == 0) {
1373       return *this;
1374     }
1375     newLength = oldLength + srcLength;
1376     if(newLength <= getCapacity() && isBufferWritable()) {
1377       UChar *oldArray = getArrayStart();
1378       // Do not copy characters when
1379       //   UChar *buffer=str.getAppendBuffer(...);
1380       // is followed by
1381       //   str.append(buffer, length);
1382       // or
1383       //   str.appendString(buffer, length)
1384       // or similar.
1385       if(srcChars + srcStart != oldArray + start || start > oldLength) {
1386         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1387       }
1388       setLength(newLength);
1389       return *this;
1390     } else {
1391       // pin the indices to legal values
1392       start = oldLength;
1393       length = 0;
1394     }
1395   } else {
1396     // pin the indices to legal values
1397     pinIndices(start, length);
1398
1399     newLength = oldLength - length + srcLength;
1400   }
1401
1402   // the following may change fArray but will not copy the current contents;
1403   // therefore we need to keep the current fArray
1404   UChar oldStackBuffer[US_STACKBUF_SIZE];
1405   UChar *oldArray;
1406   if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1407     // copy the stack buffer contents because it will be overwritten with
1408     // fUnion.fFields values
1409     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
1410     oldArray = oldStackBuffer;
1411   } else {
1412     oldArray = getArrayStart();
1413   }
1414
1415   // clone our array and allocate a bigger array if needed
1416   int32_t *bufferToDelete = 0;
1417   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1418                          FALSE, &bufferToDelete)
1419   ) {
1420     return *this;
1421   }
1422
1423   // now do the replace
1424
1425   UChar *newArray = getArrayStart();
1426   if(newArray != oldArray) {
1427     // if fArray changed, then we need to copy everything except what will change
1428     us_arrayCopy(oldArray, 0, newArray, 0, start);
1429     us_arrayCopy(oldArray, start + length,
1430                  newArray, start + srcLength,
1431                  oldLength - (start + length));
1432   } else if(length != srcLength) {
1433     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1434     us_arrayCopy(oldArray, start + length,
1435                  newArray, start + srcLength,
1436                  oldLength - (start + length));
1437   }
1438
1439   // now fill in the hole with the new string
1440   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1441
1442   setLength(newLength);
1443
1444   // delayed delete in case srcChars == fArray when we started, and
1445   // to keep oldArray alive for the above operations
1446   if (bufferToDelete) {
1447     uprv_free(bufferToDelete);
1448   }
1449
1450   return *this;
1451 }
1452
1453 /**
1454  * Replaceable API
1455  */
1456 void
1457 UnicodeString::handleReplaceBetween(int32_t start,
1458                                     int32_t limit,
1459                                     const UnicodeString& text) {
1460     replaceBetween(start, limit, text);
1461 }
1462
1463 /**
1464  * Replaceable API
1465  */
1466 void
1467 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1468     if (limit <= start) {
1469         return; // Nothing to do; avoid bogus malloc call
1470     }
1471     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1472     // Check to make sure text is not null.
1473     if (text != NULL) {
1474             extractBetween(start, limit, text, 0);
1475             insert(dest, text, 0, limit - start);
1476             uprv_free(text);
1477     }
1478 }
1479
1480 /**
1481  * Replaceable API
1482  *
1483  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1484  * so we implement this function here.
1485  */
1486 UBool Replaceable::hasMetaData() const {
1487     return TRUE;
1488 }
1489
1490 /**
1491  * Replaceable API
1492  */
1493 UBool UnicodeString::hasMetaData() const {
1494     return FALSE;
1495 }
1496
1497 UnicodeString&
1498 UnicodeString::doReverse(int32_t start, int32_t length) {
1499   if(length <= 1 || !cloneArrayIfNeeded()) {
1500     return *this;
1501   }
1502
1503   // pin the indices to legal values
1504   pinIndices(start, length);
1505   if(length <= 1) {  // pinIndices() might have shrunk the length
1506     return *this;
1507   }
1508
1509   UChar *left = getArrayStart() + start;
1510   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1511   UChar swap;
1512   UBool hasSupplementary = FALSE;
1513
1514   // Before the loop we know left<right because length>=2.
1515   do {
1516     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1517     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1518     *right-- = swap;
1519   } while(left < right);
1520   // Make sure to test the middle code unit of an odd-length string.
1521   // Redundant if the length is even.
1522   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1523
1524   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1525   if(hasSupplementary) {
1526     UChar swap2;
1527
1528     left = getArrayStart() + start;
1529     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1530     while(left < right) {
1531       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1532         *left++ = swap2;
1533         *left++ = swap;
1534       } else {
1535         ++left;
1536       }
1537     }
1538   }
1539
1540   return *this;
1541 }
1542
1543 UBool
1544 UnicodeString::padLeading(int32_t targetLength,
1545                           UChar padChar)
1546 {
1547   int32_t oldLength = length();
1548   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1549     return FALSE;
1550   } else {
1551     // move contents up by padding width
1552     UChar *array = getArrayStart();
1553     int32_t start = targetLength - oldLength;
1554     us_arrayCopy(array, 0, array, start, oldLength);
1555
1556     // fill in padding character
1557     while(--start >= 0) {
1558       array[start] = padChar;
1559     }
1560     setLength(targetLength);
1561     return TRUE;
1562   }
1563 }
1564
1565 UBool
1566 UnicodeString::padTrailing(int32_t targetLength,
1567                            UChar padChar)
1568 {
1569   int32_t oldLength = length();
1570   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1571     return FALSE;
1572   } else {
1573     // fill in padding character
1574     UChar *array = getArrayStart();
1575     int32_t length = targetLength;
1576     while(--length >= oldLength) {
1577       array[length] = padChar;
1578     }
1579     setLength(targetLength);
1580     return TRUE;
1581   }
1582 }
1583
1584 //========================================
1585 // Hashing
1586 //========================================
1587 int32_t
1588 UnicodeString::doHashCode() const
1589 {
1590     /* Delegate hash computation to uhash.  This makes UnicodeString
1591      * hashing consistent with UChar* hashing.  */
1592     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1593     if (hashCode == kInvalidHashCode) {
1594         hashCode = kEmptyHashCode;
1595     }
1596     return hashCode;
1597 }
1598
1599 //========================================
1600 // External Buffer
1601 //========================================
1602
1603 UChar *
1604 UnicodeString::getBuffer(int32_t minCapacity) {
1605   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1606     fFlags|=kOpenGetBuffer;
1607     fShortLength=0;
1608     return getArrayStart();
1609   } else {
1610     return 0;
1611   }
1612 }
1613
1614 void
1615 UnicodeString::releaseBuffer(int32_t newLength) {
1616   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1617     // set the new fLength
1618     int32_t capacity=getCapacity();
1619     if(newLength==-1) {
1620       // the new length is the string length, capped by fCapacity
1621       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1622       while(p<limit && *p!=0) {
1623         ++p;
1624       }
1625       newLength=(int32_t)(p-array);
1626     } else if(newLength>capacity) {
1627       newLength=capacity;
1628     }
1629     setLength(newLength);
1630     fFlags&=~kOpenGetBuffer;
1631   }
1632 }
1633
1634 //========================================
1635 // Miscellaneous
1636 //========================================
1637 UBool
1638 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1639                                   int32_t growCapacity,
1640                                   UBool doCopyArray,
1641                                   int32_t **pBufferToDelete,
1642                                   UBool forceClone) {
1643   // default parameters need to be static, therefore
1644   // the defaults are -1 to have convenience defaults
1645   if(newCapacity == -1) {
1646     newCapacity = getCapacity();
1647   }
1648
1649   // while a getBuffer(minCapacity) is "open",
1650   // prevent any modifications of the string by returning FALSE here
1651   // if the string is bogus, then only an assignment or similar can revive it
1652   if(!isWritable()) {
1653     return FALSE;
1654   }
1655
1656   /*
1657    * We need to make a copy of the array if
1658    * the buffer is read-only, or
1659    * the buffer is refCounted (shared), and refCount>1, or
1660    * the buffer is too small.
1661    * Return FALSE if memory could not be allocated.
1662    */
1663   if(forceClone ||
1664      fFlags & kBufferIsReadonly ||
1665      (fFlags & kRefCounted && refCount() > 1) ||
1666      newCapacity > getCapacity()
1667   ) {
1668     // check growCapacity for default value and use of the stack buffer
1669     if(growCapacity < 0) {
1670       growCapacity = newCapacity;
1671     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1672       growCapacity = US_STACKBUF_SIZE;
1673     }
1674
1675     // save old values
1676     UChar oldStackBuffer[US_STACKBUF_SIZE];
1677     UChar *oldArray;
1678     uint8_t flags = fFlags;
1679
1680     if(flags&kUsingStackBuffer) {
1681       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1682       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1683         // copy the stack buffer contents because it will be overwritten with
1684         // fUnion.fFields values
1685         us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
1686         oldArray = oldStackBuffer;
1687       } else {
1688         oldArray = 0; // no need to copy from stack buffer to itself
1689       }
1690     } else {
1691       oldArray = fUnion.fFields.fArray;
1692       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1693     }
1694
1695     // allocate a new array
1696     if(allocate(growCapacity) ||
1697        (newCapacity < growCapacity && allocate(newCapacity))
1698     ) {
1699       if(doCopyArray && oldArray != 0) {
1700         // copy the contents
1701         // do not copy more than what fits - it may be smaller than before
1702         int32_t minLength = length();
1703         newCapacity = getCapacity();
1704         if(newCapacity < minLength) {
1705           minLength = newCapacity;
1706           setLength(minLength);
1707         }
1708         us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1709       } else {
1710         fShortLength = 0;
1711       }
1712
1713       // release the old array
1714       if(flags & kRefCounted) {
1715         // the array is refCounted; decrement and release if 0
1716         u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1717         if(umtx_atomic_dec(pRefCount) == 0) {
1718           if(pBufferToDelete == 0) {
1719               // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1720               // is defined as volatile. (Volatile has useful non-standard behavior
1721               //   with this compiler.)
1722             uprv_free((void *)pRefCount);
1723           } else {
1724             // the caller requested to delete it himself
1725             *pBufferToDelete = (int32_t *)pRefCount;
1726           }
1727         }
1728       }
1729     } else {
1730       // not enough memory for growCapacity and not even for the smaller newCapacity
1731       // reset the old values for setToBogus() to release the array
1732       if(!(flags&kUsingStackBuffer)) {
1733         fUnion.fFields.fArray = oldArray;
1734       }
1735       fFlags = flags;
1736       setToBogus();
1737       return FALSE;
1738     }
1739   }
1740   return TRUE;
1741 }
1742
1743 // UnicodeStringAppendable ------------------------------------------------- ***
1744
1745 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1746
1747 UBool
1748 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1749   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
1750 }
1751
1752 UBool
1753 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1754   UChar buffer[U16_MAX_LENGTH];
1755   int32_t cLength = 0;
1756   UBool isError = FALSE;
1757   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1758   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
1759 }
1760
1761 UBool
1762 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1763   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
1764 }
1765
1766 UBool
1767 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1768   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1769 }
1770
1771 UChar *
1772 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1773                                          int32_t desiredCapacityHint,
1774                                          UChar *scratch, int32_t scratchCapacity,
1775                                          int32_t *resultCapacity) {
1776   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1777     *resultCapacity = 0;
1778     return NULL;
1779   }
1780   int32_t oldLength = str.length();
1781   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1782     *resultCapacity = str.getCapacity() - oldLength;
1783     return str.getArrayStart() + oldLength;
1784   }
1785   *resultCapacity = scratchCapacity;
1786   return scratch;
1787 }
1788
1789 U_NAMESPACE_END
1790
1791 U_NAMESPACE_USE
1792
1793 U_CAPI int32_t U_EXPORT2
1794 uhash_hashUnicodeString(const UElement key) {
1795     const UnicodeString *str = (const UnicodeString*) key.pointer;
1796     return (str == NULL) ? 0 : str->hashCode();
1797 }
1798
1799 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1800 // does not depend on hashtable code.
1801 U_CAPI UBool U_EXPORT2
1802 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1803     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1804     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1805     if (str1 == str2) {
1806         return TRUE;
1807     }
1808     if (str1 == NULL || str2 == NULL) {
1809         return FALSE;
1810     }
1811     return *str1 == *str2;
1812 }
1813
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. It exists only so that the
virtual vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file,
which keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    // array new followed by array delete is what references the destructor
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif