icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2014, International Business Machines Corporation and
   4 * others. All Rights Reserved.
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/appendable.h"
  23 #include "unicode/putil.h"
  24 #include "cstring.h"
  25 #include "cmemory.h"
  26 #include "unicode/ustring.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf16.h"
  30 #include "uelement.h"
  31 #include "ustr_imp.h"
  32 #include "umutex.h"
  33 #include "uassert.h"
  34
  35 #if 0
  36
  37 #include <iostream>
  38 using namespace std;
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((icu::UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
  99 /* The Replaceable virtual destructor can't be defined in the header
 100    due to how AIX works with multiple definitions of virtual functions.
 101 */
 102 Replaceable::~Replaceable() {}
 103
 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef() {
 121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 122 }
 123
 124 int32_t
 125 UnicodeString::removeRef() {
 126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 127 }
 128
 129 int32_t
 130 UnicodeString::refCount() const {
 131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
 132 }
 133
 134 void
 135 UnicodeString::releaseArray() {
 136   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
 137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 138   }
 139 }
 140
 141
 142
 143 //========================================
 144 // Constructors
 145 //========================================
 146
 147 // The default constructor is inline in unistr.h.
 148
 149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
 150   fUnion.fFields.fLengthAndFlags = 0;
 151   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 152     // just allocate and do not do anything else
 153     allocate(capacity);
 154   } else {
 155     // count > 0, allocate and fill the new string with count c's
 156     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
 157     if(capacity < length) {
 158       capacity = length;
 159     }
 160     if(allocate(capacity)) {
 161       UChar *array = getArrayStart();
 162       int32_t i = 0;
 163
 164       // fill the new string with c
 165       if(unitCount == 1) {
 166         // fill with length UChars
 167         while(i < length) {
 168           array[i++] = (UChar)c;
 169         }
 170       } else {
 171         // get the code units for c
 172         UChar units[U16_MAX_LENGTH];
 173         U16_APPEND_UNSAFE(units, i, c);
 174
 175         // now it must be i==unitCount
 176         i = 0;
 177
 178         // for Unicode, unitCount can only be 1, 2, 3, or 4
 179         // 1 is handled above
 180         while(i < length) {
 181           int32_t unitIdx = 0;
 182           while(unitIdx < unitCount) {
 183             array[i++]=units[unitIdx++];
 184           }
 185         }
 186       }
 187     }
 188     setLength(length);
 189   }
 190 }
 191
 192 UnicodeString::UnicodeString(UChar ch) {
 193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
 194   fUnion.fStackFields.fBuffer[0] = ch;
 195 }
 196
 197 UnicodeString::UnicodeString(UChar32 ch) {
 198   fUnion.fFields.fLengthAndFlags = kShortString;
 199   int32_t i = 0;
 200   UBool isError = FALSE;
 201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
 202   // We test isError so that the compiler does not complain that we don't.
 203   // If isError then i==0 which is what we want anyway.
 204   if(!isError) {
 205     setShortLength(i);
 206   }
 207 }
 208
 209 UnicodeString::UnicodeString(const UChar *text) {
 210   fUnion.fFields.fLengthAndFlags = kShortString;
 211   doReplace(0, 0, text, 0, -1);
 212 }
 213
 214 UnicodeString::UnicodeString(const UChar *text,
 215                              int32_t textLength) {
 216   fUnion.fFields.fLengthAndFlags = kShortString;
 217   doReplace(0, 0, text, 0, textLength);
 218 }
 219
 220 UnicodeString::UnicodeString(UBool isTerminated,
 221                              const UChar *text,
 222                              int32_t textLength) {
 223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
 224   if(text == NULL) {
 225     // treat as an empty string, do not alias
 226     setToEmpty();
 227   } else if(textLength < -1 ||
 228             (textLength == -1 && !isTerminated) ||
 229             (textLength >= 0 && isTerminated && text[textLength] != 0)
 230   ) {
 231     setToBogus();
 232   } else {
 233     if(textLength == -1) {
 234       // text is terminated, or else it would have failed the above test
 235       textLength = u_strlen(text);
 236     }
 237     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 238   }
 239 }
 240
 241 UnicodeString::UnicodeString(UChar *buff,
 242                              int32_t buffLength,
 243                              int32_t buffCapacity) {
 244   fUnion.fFields.fLengthAndFlags = kWritableAlias;
 245   if(buff == NULL) {
 246     // treat as an empty string, do not alias
 247     setToEmpty();
 248   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 249     setToBogus();
 250   } else {
 251     if(buffLength == -1) {
 252       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 253       const UChar *p = buff, *limit = buff + buffCapacity;
 254       while(p != limit && *p != 0) {
 255         ++p;
 256       }
 257       buffLength = (int32_t)(p - buff);
 258     }
 259     setArray(buff, buffLength, buffCapacity);
 260   }
 261 }
 262
 263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
 264   fUnion.fFields.fLengthAndFlags = kShortString;
 265   if(src==NULL) {
 266     // treat as an empty string
 267   } else {
 268     if(length<0) {
 269       length=(int32_t)uprv_strlen(src);
 270     }
 271     if(cloneArrayIfNeeded(length, length, FALSE)) {
 272       u_charsToUChars(src, getArrayStart(), length);
 273       setLength(length);
 274     } else {
 275       setToBogus();
 276     }
 277   }
 278 }
 279
 280 #if U_CHARSET_IS_UTF8
 281
 282 UnicodeString::UnicodeString(const char *codepageData) {
 283   fUnion.fFields.fLengthAndFlags = kShortString;
 284   if(codepageData != 0) {
 285     setToUTF8(codepageData);
 286   }
 287 }
 288
 289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
 290   fUnion.fFields.fLengthAndFlags = kShortString;
 291   // if there's nothing to convert, do nothing
 292   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 293     return;
 294   }
 295   if(dataLength == -1) {
 296     dataLength = (int32_t)uprv_strlen(codepageData);
 297   }
 298   setToUTF8(StringPiece(codepageData, dataLength));
 299 }
 300
 301 // else see unistr_cnv.cpp
 302 #endif
 303
 304 UnicodeString::UnicodeString(const UnicodeString& that) {
 305   fUnion.fFields.fLengthAndFlags = kShortString;
 306   copyFrom(that);
 307 }
 308
 309 UnicodeString::UnicodeString(const UnicodeString& that,
 310                              int32_t srcStart) {
 311   fUnion.fFields.fLengthAndFlags = kShortString;
 312   setTo(that, srcStart);
 313 }
 314
 315 UnicodeString::UnicodeString(const UnicodeString& that,
 316                              int32_t srcStart,
 317                              int32_t srcLength) {
 318   fUnion.fFields.fLengthAndFlags = kShortString;
 319   setTo(that, srcStart, srcLength);
 320 }
 321
 322 // Replaceable base class clone() default implementation, does not clone
 323 Replaceable *
 324 Replaceable::clone() const {
 325   return NULL;
 326 }
 327
 328 // UnicodeString overrides clone() with a real implementation
 329 Replaceable *
 330 UnicodeString::clone() const {
 331   return new UnicodeString(*this);
 332 }
 333
 334 //========================================
 335 // array allocation
 336 //========================================
 337
 338 UBool
 339 UnicodeString::allocate(int32_t capacity) {
 340   if(capacity <= US_STACKBUF_SIZE) {
 341     fUnion.fFields.fLengthAndFlags = kShortString;
 342   } else {
 343     // count bytes for the refCounter and the string capacity, and
 344     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 345     // to be safely aligned for the refCount
 346     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
 347     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 348     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 349     if(array != 0) {
 350       // set initial refCount and point behind the refCount
 351       *array++ = 1;
 352
 353       // have fArray point to the first UChar
 354       fUnion.fFields.fArray = (UChar *)array;
 355       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 356       fUnion.fFields.fLengthAndFlags = kLongString;
 357     } else {
 358       fUnion.fFields.fLengthAndFlags = kIsBogus;
 359       fUnion.fFields.fArray = 0;
 360       fUnion.fFields.fCapacity = 0;
 361       return FALSE;
 362     }
 363   }
 364   return TRUE;
 365 }
 366
 367 //========================================
 368 // Destructor
 369 //========================================
 370 UnicodeString::~UnicodeString()
 371 {
 372   releaseArray();
 373 }
 374
 375 //========================================
 376 // Factory methods
 377 //========================================
 378
 379 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
 380   UnicodeString result;
 381   result.setToUTF8(utf8);
 382   return result;
 383 }
 384
 385 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 386   UnicodeString result;
 387   int32_t capacity;
 388   // Most UTF-32 strings will be BMP-only and result in a same-length
 389   // UTF-16 string. We overestimate the capacity just slightly,
 390   // just in case there are a few supplementary characters.
 391   if(length <= US_STACKBUF_SIZE) {
 392     capacity = US_STACKBUF_SIZE;
 393   } else {
 394     capacity = length + (length >> 4) + 4;
 395   }
 396   do {
 397     UChar *utf16 = result.getBuffer(capacity);
 398     int32_t length16;
 399     UErrorCode errorCode = U_ZERO_ERROR;
 400     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 401         utf32, length,
 402         0xfffd,  // Substitution character.
 403         NULL,    // Don't care about number of substitutions.
 404         &errorCode);
 405     result.releaseBuffer(length16);
 406     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 407       capacity = length16 + 1;  // +1 for the terminating NUL.
 408       continue;
 409     } else if(U_FAILURE(errorCode)) {
 410       result.setToBogus();
 411     }
 412     break;
 413   } while(TRUE);
 414   return result;
 415 }
 416
 417 //========================================
 418 // Assignment
 419 //========================================
 420
 421 UnicodeString &
 422 UnicodeString::operator=(const UnicodeString &src) {
 423   return copyFrom(src);
 424 }
 425
 426 UnicodeString &
 427 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 428   return copyFrom(src, TRUE);
 429 }
 430
 431 UnicodeString &
 432 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 433   // if assigning to ourselves, do nothing
 434   if(this == &src) {
 435     return *this;
 436   }
 437
 438   // is the right side bogus?
 439   if(src.isBogus()) {
 440     setToBogus();
 441     return *this;
 442   }
 443
 444   // delete the current contents
 445   releaseArray();
 446
 447   if(src.isEmpty()) {
 448     // empty string - use the stack buffer
 449     setToEmpty();
 450     return *this;
 451   }
 452
 453   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 454   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 455   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
 456   case kShortString:
 457     // short string using the stack buffer, do the same
 458     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 459                 getShortLength() * U_SIZEOF_UCHAR);
 460     break;
 461   case kLongString:
 462     // src uses a refCounted string buffer, use that buffer with refCount
 463     // src is const, use a cast - we don't actually change it
 464     ((UnicodeString &)src).addRef();
 465     // copy all fields, share the reference-counted buffer
 466     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 467     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 468     if(!hasShortLength()) {
 469       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 470     }
 471     break;
 472   case kReadonlyAlias:
 473     if(fastCopy) {
 474       // src is a readonly alias, do the same
 475       // -> maintain the readonly alias as such
 476       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 477       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 478       if(!hasShortLength()) {
 479         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 480       }
 481       break;
 482     }
 483     // else if(!fastCopy) fall through to case kWritableAlias
 484     // -> allocate a new buffer and copy the contents
 485   case kWritableAlias: {
 486     // src is a writable alias; we make a copy of that instead
 487     int32_t srcLength = src.length();
 488     if(allocate(srcLength)) {
 489       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
 490       setLength(srcLength);
 491       break;
 492     }
 493     // if there is not enough memory, then fall through to setting to bogus
 494   }
 495   default:
 496     // if src is bogus, set ourselves to bogus
 497     // do not call setToBogus() here because fArray and flags are not consistent here
 498     fUnion.fFields.fLengthAndFlags = kIsBogus;
 499     fUnion.fFields.fArray = 0;
 500     fUnion.fFields.fCapacity = 0;
 501     break;
 502   }
 503
 504   return *this;
 505 }
 506
 507 //========================================
 508 // Miscellaneous operations
 509 //========================================
 510
 511 UnicodeString UnicodeString::unescape() const {
 512     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 513     const UChar *array = getBuffer();
 514     int32_t len = length();
 515     int32_t prev = 0;
 516     for (int32_t i=0;;) {
 517         if (i == len) {
 518             result.append(array, prev, len - prev);
 519             break;
 520         }
 521         if (array[i++] == 0x5C /*'\\'*/) {
 522             result.append(array, prev, (i - 1) - prev);
 523             UChar32 c = unescapeAt(i); // advances i
 524             if (c < 0) {
 525                 result.remove(); // return empty string
 526                 break; // invalid escape sequence
 527             }
 528             result.append(c);
 529             prev = i;
 530         }
 531     }
 532     return result;
 533 }
 534
 535 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 536     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 537 }
 538
 539 //========================================
 540 // Read-only implementation
 541 //========================================
 542 UBool
 543 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
 544   // Requires: this & text not bogus and have same lengths.
 545   // Byte-wise comparison works for equality regardless of endianness.
 546   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
 547 }
 548
 549 int8_t
 550 UnicodeString::doCompare( int32_t start,
 551               int32_t length,
 552               const UChar *srcChars,
 553               int32_t srcStart,
 554               int32_t srcLength) const
 555 {
 556   // compare illegal string values
 557   if(isBogus()) {
 558     return -1;
 559   }
 560
 561   // pin indices to legal values
 562   pinIndices(start, length);
 563
 564   if(srcChars == NULL) {
 565     // treat const UChar *srcChars==NULL as an empty string
 566     return length == 0 ? 0 : 1;
 567   }
 568
 569   // get the correct pointer
 570   const UChar *chars = getArrayStart();
 571
 572   chars += start;
 573   srcChars += srcStart;
 574
 575   int32_t minLength;
 576   int8_t lengthResult;
 577
 578   // get the srcLength if necessary
 579   if(srcLength < 0) {
 580     srcLength = u_strlen(srcChars + srcStart);
 581   }
 582
 583   // are we comparing different lengths?
 584   if(length != srcLength) {
 585     if(length < srcLength) {
 586       minLength = length;
 587       lengthResult = -1;
 588     } else {
 589       minLength = srcLength;
 590       lengthResult = 1;
 591     }
 592   } else {
 593     minLength = length;
 594     lengthResult = 0;
 595   }
 596
 597   /*
 598    * note that uprv_memcmp() returns an int but we return an int8_t;
 599    * we need to take care not to truncate the result -
 600    * one way to do this is to right-shift the value to
 601    * move the sign bit into the lower 8 bits and making sure that this
 602    * does not become 0 itself
 603    */
 604
 605   if(minLength > 0 && chars != srcChars) {
 606     int32_t result;
 607
 608 #   if U_IS_BIG_ENDIAN
 609       // big-endian: byte comparison works
 610       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 611       if(result != 0) {
 612         return (int8_t)(result >> 15 | 1);
 613       }
 614 #   else
 615       // little-endian: compare UChar units
 616       do {
 617         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 618         if(result != 0) {
 619           return (int8_t)(result >> 15 | 1);
 620         }
 621       } while(--minLength > 0);
 622 #   endif
 623   }
 624   return lengthResult;
 625 }
 626
 627 /* String compare in code point order - doCompare() compares in code unit order. */
 628 int8_t
 629 UnicodeString::doCompareCodePointOrder(int32_t start,
 630                                        int32_t length,
 631                                        const UChar *srcChars,
 632                                        int32_t srcStart,
 633                                        int32_t srcLength) const
 634 {
 635   // compare illegal string values
 636   // treat const UChar *srcChars==NULL as an empty string
 637   if(isBogus()) {
 638     return -1;
 639   }
 640
 641   // pin indices to legal values
 642   pinIndices(start, length);
 643
 644   if(srcChars == NULL) {
 645     srcStart = srcLength = 0;
 646   }
 647
 648   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 649   /* translate the 32-bit result into an 8-bit one */
 650   if(diff!=0) {
 651     return (int8_t)(diff >> 15 | 1);
 652   } else {
 653     return 0;
 654   }
 655 }
 656
 657 int32_t
 658 UnicodeString::getLength() const {
 659     return length();
 660 }
 661
 662 UChar
 663 UnicodeString::getCharAt(int32_t offset) const {
 664   return charAt(offset);
 665 }
 666
 667 UChar32
 668 UnicodeString::getChar32At(int32_t offset) const {
 669   return char32At(offset);
 670 }
 671
 672 UChar32
 673 UnicodeString::char32At(int32_t offset) const
 674 {
 675   int32_t len = length();
 676   if((uint32_t)offset < (uint32_t)len) {
 677     const UChar *array = getArrayStart();
 678     UChar32 c;
 679     U16_GET(array, 0, offset, len, c);
 680     return c;
 681   } else {
 682     return kInvalidUChar;
 683   }
 684 }
 685
 686 int32_t
 687 UnicodeString::getChar32Start(int32_t offset) const {
 688   if((uint32_t)offset < (uint32_t)length()) {
 689     const UChar *array = getArrayStart();
 690     U16_SET_CP_START(array, 0, offset);
 691     return offset;
 692   } else {
 693     return 0;
 694   }
 695 }
 696
 697 int32_t
 698 UnicodeString::getChar32Limit(int32_t offset) const {
 699   int32_t len = length();
 700   if((uint32_t)offset < (uint32_t)len) {
 701     const UChar *array = getArrayStart();
 702     U16_SET_CP_LIMIT(array, 0, offset, len);
 703     return offset;
 704   } else {
 705     return len;
 706   }
 707 }
 708
 709 int32_t
 710 UnicodeString::countChar32(int32_t start, int32_t length) const {
 711   pinIndices(start, length);
 712   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 713   return u_countChar32(getArrayStart()+start, length);
 714 }
 715
 716 UBool
 717 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 718   pinIndices(start, length);
 719   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 720   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 721 }
 722
 723 int32_t
 724 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 725   // pin index
 726   int32_t len = length();
 727   if(index<0) {
 728     index=0;
 729   } else if(index>len) {
 730     index=len;
 731   }
 732
 733   const UChar *array = getArrayStart();
 734   if(delta>0) {
 735     U16_FWD_N(array, index, len, delta);
 736   } else {
 737     U16_BACK_N(array, 0, index, -delta);
 738   }
 739
 740   return index;
 741 }
 742
 743 void
 744 UnicodeString::doExtract(int32_t start,
 745              int32_t length,
 746              UChar *dst,
 747              int32_t dstStart) const
 748 {
 749   // pin indices to legal values
 750   pinIndices(start, length);
 751
 752   // do not copy anything if we alias dst itself
 753   const UChar *array = getArrayStart();
 754   if(array + start != dst + dstStart) {
 755     us_arrayCopy(array, start, dst, dstStart, length);
 756   }
 757 }
 758
 759 int32_t
 760 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 761                        UErrorCode &errorCode) const {
 762   int32_t len = length();
 763   if(U_SUCCESS(errorCode)) {
 764     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 765       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 766     } else {
 767       const UChar *array = getArrayStart();
 768       if(len>0 && len<=destCapacity && array!=dest) {
 769         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
 770       }
 771       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 772     }
 773   }
 774
 775   return len;
 776 }
 777
 778 int32_t
 779 UnicodeString::extract(int32_t start,
 780                        int32_t length,
 781                        char *target,
 782                        int32_t targetCapacity,
 783                        enum EInvariant) const
 784 {
 785   // if the arguments are illegal, then do nothing
 786   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 787     return 0;
 788   }
 789
 790   // pin the indices to legal values
 791   pinIndices(start, length);
 792
 793   if(length <= targetCapacity) {
 794     u_UCharsToChars(getArrayStart() + start, target, length);
 795   }
 796   UErrorCode status = U_ZERO_ERROR;
 797   return u_terminateChars(target, targetCapacity, length, &status);
 798 }
 799
 800 UnicodeString
 801 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 802   pinIndices(start, len);
 803   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 804   if(array==NULL) {
 805     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
 806     len=-2;  // bogus result string
 807   }
 808   return UnicodeString(FALSE, array + start, len);
 809 }
 810
 811 int32_t
 812 UnicodeString::toUTF8(int32_t start, int32_t len,
 813                       char *target, int32_t capacity) const {
 814   pinIndices(start, len);
 815   int32_t length8;
 816   UErrorCode errorCode = U_ZERO_ERROR;
 817   u_strToUTF8WithSub(target, capacity, &length8,
 818                      getBuffer() + start, len,
 819                      0xFFFD,  // Standard substitution character.
 820                      NULL,    // Don't care about number of substitutions.
 821                      &errorCode);
 822   return length8;
 823 }
 824
 825 #if U_CHARSET_IS_UTF8
 826
 827 int32_t
 828 UnicodeString::extract(int32_t start, int32_t len,
 829                        char *target, uint32_t dstSize) const {
 830   // if the arguments are illegal, then do nothing
 831   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 832     return 0;
 833   }
 834   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 835 }
 836
 837 // else see unistr_cnv.cpp
 838 #endif
 839
 840 void
 841 UnicodeString::extractBetween(int32_t start,
 842                   int32_t limit,
 843                   UnicodeString& target) const {
 844   pinIndex(start);
 845   pinIndex(limit);
 846   doExtract(start, limit - start, target);
 847 }
 848
 849 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 850 // as many bytes as the source has UChars.
 851 // The "worst cases" are writing systems like Indic, Thai and CJK with
 852 // 3:1 bytes:UChars.
 853 void
 854 UnicodeString::toUTF8(ByteSink &sink) const {
 855   int32_t length16 = length();
 856   if(length16 != 0) {
 857     char stackBuffer[1024];
 858     int32_t capacity = (int32_t)sizeof(stackBuffer);
 859     UBool utf8IsOwned = FALSE;
 860     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 861                                       3*length16,
 862                                       stackBuffer, capacity,
 863                                       &capacity);
 864     int32_t length8 = 0;
 865     UErrorCode errorCode = U_ZERO_ERROR;
 866     u_strToUTF8WithSub(utf8, capacity, &length8,
 867                        getBuffer(), length16,
 868                        0xFFFD,  // Standard substitution character.
 869                        NULL,    // Don't care about number of substitutions.
 870                        &errorCode);
 871     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 872       utf8 = (char *)uprv_malloc(length8);
 873       if(utf8 != NULL) {
 874         utf8IsOwned = TRUE;
 875         errorCode = U_ZERO_ERROR;
 876         u_strToUTF8WithSub(utf8, length8, &length8,
 877                            getBuffer(), length16,
 878                            0xFFFD,  // Standard substitution character.
 879                            NULL,    // Don't care about number of substitutions.
 880                            &errorCode);
 881       } else {
 882         errorCode = U_MEMORY_ALLOCATION_ERROR;
 883       }
 884     }
 885     if(U_SUCCESS(errorCode)) {
 886       sink.Append(utf8, length8);
 887       sink.Flush();
 888     }
 889     if(utf8IsOwned) {
 890       uprv_free(utf8);
 891     }
 892   }
 893 }
 894
 895 int32_t
 896 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
 897   int32_t length32=0;
 898   if(U_SUCCESS(errorCode)) {
 899     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
 900     u_strToUTF32WithSub(utf32, capacity, &length32,
 901         getBuffer(), length(),
 902         0xfffd,  // Substitution character.
 903         NULL,    // Don't care about number of substitutions.
 904         &errorCode);
 905   }
 906   return length32;
 907 }
 908
 909 int32_t
 910 UnicodeString::indexOf(const UChar *srcChars,
 911                int32_t srcStart,
 912                int32_t srcLength,
 913                int32_t start,
 914                int32_t length) const
 915 {
 916   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 917     return -1;
 918   }
 919
 920   // UnicodeString does not find empty substrings
 921   if(srcLength < 0 && srcChars[srcStart] == 0) {
 922     return -1;
 923   }
 924
 925   // get the indices within bounds
 926   pinIndices(start, length);
 927
 928   // find the first occurrence of the substring
 929   const UChar *array = getArrayStart();
 930   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
 931   if(match == NULL) {
 932     return -1;
 933   } else {
 934     return (int32_t)(match - array);
 935   }
 936 }
 937
 938 int32_t
 939 UnicodeString::doIndexOf(UChar c,
 940              int32_t start,
 941              int32_t length) const
 942 {
 943   // pin indices
 944   pinIndices(start, length);
 945
 946   // find the first occurrence of c
 947   const UChar *array = getArrayStart();
 948   const UChar *match = u_memchr(array + start, c, length);
 949   if(match == NULL) {
 950     return -1;
 951   } else {
 952     return (int32_t)(match - array);
 953   }
 954 }
 955
 956 int32_t
 957 UnicodeString::doIndexOf(UChar32 c,
 958                          int32_t start,
 959                          int32_t length) const {
 960   // pin indices
 961   pinIndices(start, length);
 962
 963   // find the first occurrence of c
 964   const UChar *array = getArrayStart();
 965   const UChar *match = u_memchr32(array + start, c, length);
 966   if(match == NULL) {
 967     return -1;
 968   } else {
 969     return (int32_t)(match - array);
 970   }
 971 }
 972
 973 int32_t
 974 UnicodeString::lastIndexOf(const UChar *srcChars,
 975                int32_t srcStart,
 976                int32_t srcLength,
 977                int32_t start,
 978                int32_t length) const
 979 {
 980   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 981     return -1;
 982   }
 983
 984   // UnicodeString does not find empty substrings
 985   if(srcLength < 0 && srcChars[srcStart] == 0) {
 986     return -1;
 987   }
 988
 989   // get the indices within bounds
 990   pinIndices(start, length);
 991
 992   // find the last occurrence of the substring
 993   const UChar *array = getArrayStart();
 994   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
 995   if(match == NULL) {
 996     return -1;
 997   } else {
 998     return (int32_t)(match - array);
 999   }
1000 }
1001
1002 int32_t
1003 UnicodeString::doLastIndexOf(UChar c,
1004                  int32_t start,
1005                  int32_t length) const
1006 {
1007   if(isBogus()) {
1008     return -1;
1009   }
1010
1011   // pin indices
1012   pinIndices(start, length);
1013
1014   // find the last occurrence of c
1015   const UChar *array = getArrayStart();
1016   const UChar *match = u_memrchr(array + start, c, length);
1017   if(match == NULL) {
1018     return -1;
1019   } else {
1020     return (int32_t)(match - array);
1021   }
1022 }
1023
1024 int32_t
1025 UnicodeString::doLastIndexOf(UChar32 c,
1026                              int32_t start,
1027                              int32_t length) const {
1028   // pin indices
1029   pinIndices(start, length);
1030
1031   // find the last occurrence of c
1032   const UChar *array = getArrayStart();
1033   const UChar *match = u_memrchr32(array + start, c, length);
1034   if(match == NULL) {
1035     return -1;
1036   } else {
1037     return (int32_t)(match - array);
1038   }
1039 }
1040
1041 //========================================
1042 // Write implementation
1043 //========================================
1044
1045 UnicodeString&
1046 UnicodeString::findAndReplace(int32_t start,
1047                   int32_t length,
1048                   const UnicodeString& oldText,
1049                   int32_t oldStart,
1050                   int32_t oldLength,
1051                   const UnicodeString& newText,
1052                   int32_t newStart,
1053                   int32_t newLength)
1054 {
1055   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1056     return *this;
1057   }
1058
1059   pinIndices(start, length);
1060   oldText.pinIndices(oldStart, oldLength);
1061   newText.pinIndices(newStart, newLength);
1062
1063   if(oldLength == 0) {
1064     return *this;
1065   }
1066
1067   while(length > 0 && length >= oldLength) {
1068     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1069     if(pos < 0) {
1070       // no more oldText's here: done
1071       break;
1072     } else {
1073       // we found oldText, replace it by newText and go beyond it
1074       replace(pos, oldLength, newText, newStart, newLength);
1075       length -= pos + oldLength - start;
1076       start = pos + newLength;
1077     }
1078   }
1079
1080   return *this;
1081 }
1082
1083
1084 void
1085 UnicodeString::setToBogus()
1086 {
1087   releaseArray();
1088
1089   fUnion.fFields.fLengthAndFlags = kIsBogus;
1090   fUnion.fFields.fArray = 0;
1091   fUnion.fFields.fCapacity = 0;
1092 }
1093
1094 // turn a bogus string into an empty one
1095 void
1096 UnicodeString::unBogus() {
1097   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1098     setToEmpty();
1099   }
1100 }
1101
1102 const UChar *
1103 UnicodeString::getTerminatedBuffer() {
1104   if(!isWritable()) {
1105     return 0;
1106   }
1107   UChar *array = getArrayStart();
1108   int32_t len = length();
1109   if(len < getCapacity()) {
1110     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1111       // If len<capacity on a read-only alias, then array[len] is
1112       // either the original NUL (if constructed with (TRUE, s, length))
1113       // or one of the original string contents characters (if later truncated),
1114       // therefore we can assume that array[len] is initialized memory.
1115       if(array[len] == 0) {
1116         return array;
1117       }
1118     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1119       // kRefCounted: Do not write the NUL if the buffer is shared.
1120       // That is mostly safe, except when the length of one copy was modified
1121       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1122       // Then the NUL would be written into the middle of another copy's string.
1123
1124       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1125       // Do not test if there is a NUL already because it might be uninitialized memory.
1126       // (That would be safe, but tools like valgrind & Purify would complain.)
1127       array[len] = 0;
1128       return array;
1129     }
1130   }
1131   if(cloneArrayIfNeeded(len+1)) {
1132     array = getArrayStart();
1133     array[len] = 0;
1134     return array;
1135   } else {
1136     return NULL;
1137   }
1138 }
1139
1140 // setTo() analogous to the readonly-aliasing constructor with the same signature
1141 UnicodeString &
1142 UnicodeString::setTo(UBool isTerminated,
1143                      const UChar *text,
1144                      int32_t textLength)
1145 {
1146   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1147     // do not modify a string that has an "open" getBuffer(minCapacity)
1148     return *this;
1149   }
1150
1151   if(text == NULL) {
1152     // treat as an empty string, do not alias
1153     releaseArray();
1154     setToEmpty();
1155     return *this;
1156   }
1157
1158   if( textLength < -1 ||
1159       (textLength == -1 && !isTerminated) ||
1160       (textLength >= 0 && isTerminated && text[textLength] != 0)
1161   ) {
1162     setToBogus();
1163     return *this;
1164   }
1165
1166   releaseArray();
1167
1168   if(textLength == -1) {
1169     // text is terminated, or else it would have failed the above test
1170     textLength = u_strlen(text);
1171   }
1172   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1173   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1174   return *this;
1175 }
1176
1177 // setTo() analogous to the writable-aliasing constructor with the same signature
1178 UnicodeString &
1179 UnicodeString::setTo(UChar *buffer,
1180                      int32_t buffLength,
1181                      int32_t buffCapacity) {
1182   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1183     // do not modify a string that has an "open" getBuffer(minCapacity)
1184     return *this;
1185   }
1186
1187   if(buffer == NULL) {
1188     // treat as an empty string, do not alias
1189     releaseArray();
1190     setToEmpty();
1191     return *this;
1192   }
1193
1194   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1195     setToBogus();
1196     return *this;
1197   } else if(buffLength == -1) {
1198     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1199     const UChar *p = buffer, *limit = buffer + buffCapacity;
1200     while(p != limit && *p != 0) {
1201       ++p;
1202     }
1203     buffLength = (int32_t)(p - buffer);
1204   }
1205
1206   releaseArray();
1207
1208   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1209   setArray(buffer, buffLength, buffCapacity);
1210   return *this;
1211 }
1212
1213 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1214   unBogus();
1215   int32_t length = utf8.length();
1216   int32_t capacity;
1217   // The UTF-16 string will be at most as long as the UTF-8 string.
1218   if(length <= US_STACKBUF_SIZE) {
1219     capacity = US_STACKBUF_SIZE;
1220   } else {
1221     capacity = length + 1;  // +1 for the terminating NUL.
1222   }
1223   UChar *utf16 = getBuffer(capacity);
1224   int32_t length16;
1225   UErrorCode errorCode = U_ZERO_ERROR;
1226   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1227       utf8.data(), length,
1228       0xfffd,  // Substitution character.
1229       NULL,    // Don't care about number of substitutions.
1230       &errorCode);
1231   releaseBuffer(length16);
1232   if(U_FAILURE(errorCode)) {
1233     setToBogus();
1234   }
1235   return *this;
1236 }
1237
1238 UnicodeString&
1239 UnicodeString::setCharAt(int32_t offset,
1240              UChar c)
1241 {
1242   int32_t len = length();
1243   if(cloneArrayIfNeeded() && len > 0) {
1244     if(offset < 0) {
1245       offset = 0;
1246     } else if(offset >= len) {
1247       offset = len - 1;
1248     }
1249
1250     getArrayStart()[offset] = c;
1251   }
1252   return *this;
1253 }
1254
1255 UnicodeString&
1256 UnicodeString::replace(int32_t start,
1257                int32_t _length,
1258                UChar32 srcChar) {
1259   UChar buffer[U16_MAX_LENGTH];
1260   int32_t count = 0;
1261   UBool isError = FALSE;
1262   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1263   // We test isError so that the compiler does not complain that we don't.
1264   // If isError (srcChar is not a valid code point) then count==0 which means
1265   // we remove the source segment rather than replacing it with srcChar.
1266   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1267 }
1268
1269 UnicodeString&
1270 UnicodeString::append(UChar32 srcChar) {
1271   UChar buffer[U16_MAX_LENGTH];
1272   int32_t _length = 0;
1273   UBool isError = FALSE;
1274   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1275   // We test isError so that the compiler does not complain that we don't.
1276   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1277   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
1278 }
1279
1280 UnicodeString&
1281 UnicodeString::doReplace( int32_t start,
1282               int32_t length,
1283               const UnicodeString& src,
1284               int32_t srcStart,
1285               int32_t srcLength)
1286 {
1287   if(!src.isBogus()) {
1288     // pin the indices to legal values
1289     src.pinIndices(srcStart, srcLength);
1290
1291     // get the characters from src
1292     // and replace the range in ourselves with them
1293     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1294   } else {
1295     // remove the range
1296     return doReplace(start, length, 0, 0, 0);
1297   }
1298 }
1299
1300 UnicodeString&
1301 UnicodeString::doReplace(int32_t start,
1302              int32_t length,
1303              const UChar *srcChars,
1304              int32_t srcStart,
1305              int32_t srcLength)
1306 {
1307   if(!isWritable()) {
1308     return *this;
1309   }
1310
1311   int32_t oldLength = this->length();
1312
1313   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1314   if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1315     if(start == 0) {
1316       // remove prefix by adjusting the array pointer
1317       pinIndex(length);
1318       fUnion.fFields.fArray += length;
1319       fUnion.fFields.fCapacity -= length;
1320       setLength(oldLength - length);
1321       return *this;
1322     } else {
1323       pinIndex(start);
1324       if(length >= (oldLength - start)) {
1325         // remove suffix by reducing the length (like truncate())
1326         setLength(start);
1327         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1328         return *this;
1329       }
1330     }
1331   }
1332
1333   if(srcChars == 0) {
1334     srcStart = srcLength = 0;
1335   } else if(srcLength < 0) {
1336     // get the srcLength if necessary
1337     srcLength = u_strlen(srcChars + srcStart);
1338   }
1339
1340   // calculate the size of the string after the replace
1341   int32_t newLength;
1342
1343   // optimize append() onto a large-enough, owned string
1344   if(start >= oldLength) {
1345     if(srcLength == 0) {
1346       return *this;
1347     }
1348     newLength = oldLength + srcLength;
1349     if(newLength <= getCapacity() && isBufferWritable()) {
1350       UChar *oldArray = getArrayStart();
1351       // Do not copy characters when
1352       //   UChar *buffer=str.getAppendBuffer(...);
1353       // is followed by
1354       //   str.append(buffer, length);
1355       // or
1356       //   str.appendString(buffer, length)
1357       // or similar.
1358       if(srcChars + srcStart != oldArray + start || start > oldLength) {
1359         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1360       }
1361       setLength(newLength);
1362       return *this;
1363     } else {
1364       // pin the indices to legal values
1365       start = oldLength;
1366       length = 0;
1367     }
1368   } else {
1369     // pin the indices to legal values
1370     pinIndices(start, length);
1371
1372     newLength = oldLength - length + srcLength;
1373   }
1374
1375   // the following may change fArray but will not copy the current contents;
1376   // therefore we need to keep the current fArray
1377   UChar oldStackBuffer[US_STACKBUF_SIZE];
1378   UChar *oldArray;
1379   if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1380     // copy the stack buffer contents because it will be overwritten with
1381     // fUnion.fFields values
1382     u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1383     oldArray = oldStackBuffer;
1384   } else {
1385     oldArray = getArrayStart();
1386   }
1387
1388   // clone our array and allocate a bigger array if needed
1389   int32_t *bufferToDelete = 0;
1390   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1391                          FALSE, &bufferToDelete)
1392   ) {
1393     return *this;
1394   }
1395
1396   // now do the replace
1397
1398   UChar *newArray = getArrayStart();
1399   if(newArray != oldArray) {
1400     // if fArray changed, then we need to copy everything except what will change
1401     us_arrayCopy(oldArray, 0, newArray, 0, start);
1402     us_arrayCopy(oldArray, start + length,
1403                  newArray, start + srcLength,
1404                  oldLength - (start + length));
1405   } else if(length != srcLength) {
1406     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1407     us_arrayCopy(oldArray, start + length,
1408                  newArray, start + srcLength,
1409                  oldLength - (start + length));
1410   }
1411
1412   // now fill in the hole with the new string
1413   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1414
1415   setLength(newLength);
1416
1417   // delayed delete in case srcChars == fArray when we started, and
1418   // to keep oldArray alive for the above operations
1419   if (bufferToDelete) {
1420     uprv_free(bufferToDelete);
1421   }
1422
1423   return *this;
1424 }
1425
1426 /**
1427  * Replaceable API
1428  */
1429 void
1430 UnicodeString::handleReplaceBetween(int32_t start,
1431                                     int32_t limit,
1432                                     const UnicodeString& text) {
1433     replaceBetween(start, limit, text);
1434 }
1435
1436 /**
1437  * Replaceable API
1438  */
1439 void
1440 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1441     if (limit <= start) {
1442         return; // Nothing to do; avoid bogus malloc call
1443     }
1444     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1445     // Check to make sure text is not null.
1446     if (text != NULL) {
1447             extractBetween(start, limit, text, 0);
1448             insert(dest, text, 0, limit - start);
1449             uprv_free(text);
1450     }
1451 }
1452
1453 /**
1454  * Replaceable API
1455  *
1456  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1457  * so we implement this function here.
1458  */
1459 UBool Replaceable::hasMetaData() const {
1460     return TRUE;
1461 }
1462
1463 /**
1464  * Replaceable API
1465  */
1466 UBool UnicodeString::hasMetaData() const {
1467     return FALSE;
1468 }
1469
1470 UnicodeString&
1471 UnicodeString::doReverse(int32_t start, int32_t length) {
1472   if(length <= 1 || !cloneArrayIfNeeded()) {
1473     return *this;
1474   }
1475
1476   // pin the indices to legal values
1477   pinIndices(start, length);
1478   if(length <= 1) {  // pinIndices() might have shrunk the length
1479     return *this;
1480   }
1481
1482   UChar *left = getArrayStart() + start;
1483   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1484   UChar swap;
1485   UBool hasSupplementary = FALSE;
1486
1487   // Before the loop we know left<right because length>=2.
1488   do {
1489     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1490     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1491     *right-- = swap;
1492   } while(left < right);
1493   // Make sure to test the middle code unit of an odd-length string.
1494   // Redundant if the length is even.
1495   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1496
1497   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1498   if(hasSupplementary) {
1499     UChar swap2;
1500
1501     left = getArrayStart() + start;
1502     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1503     while(left < right) {
1504       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1505         *left++ = swap2;
1506         *left++ = swap;
1507       } else {
1508         ++left;
1509       }
1510     }
1511   }
1512
1513   return *this;
1514 }
1515
1516 UBool
1517 UnicodeString::padLeading(int32_t targetLength,
1518                           UChar padChar)
1519 {
1520   int32_t oldLength = length();
1521   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1522     return FALSE;
1523   } else {
1524     // move contents up by padding width
1525     UChar *array = getArrayStart();
1526     int32_t start = targetLength - oldLength;
1527     us_arrayCopy(array, 0, array, start, oldLength);
1528
1529     // fill in padding character
1530     while(--start >= 0) {
1531       array[start] = padChar;
1532     }
1533     setLength(targetLength);
1534     return TRUE;
1535   }
1536 }
1537
1538 UBool
1539 UnicodeString::padTrailing(int32_t targetLength,
1540                            UChar padChar)
1541 {
1542   int32_t oldLength = length();
1543   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1544     return FALSE;
1545   } else {
1546     // fill in padding character
1547     UChar *array = getArrayStart();
1548     int32_t length = targetLength;
1549     while(--length >= oldLength) {
1550       array[length] = padChar;
1551     }
1552     setLength(targetLength);
1553     return TRUE;
1554   }
1555 }
1556
1557 //========================================
1558 // Hashing
1559 //========================================
1560 int32_t
1561 UnicodeString::doHashCode() const
1562 {
1563     /* Delegate hash computation to uhash.  This makes UnicodeString
1564      * hashing consistent with UChar* hashing.  */
1565     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1566     if (hashCode == kInvalidHashCode) {
1567         hashCode = kEmptyHashCode;
1568     }
1569     return hashCode;
1570 }
1571
1572 //========================================
1573 // External Buffer
1574 //========================================
1575
1576 UChar *
1577 UnicodeString::getBuffer(int32_t minCapacity) {
1578   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1579     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1580     setZeroLength();
1581     return getArrayStart();
1582   } else {
1583     return 0;
1584   }
1585 }
1586
1587 void
1588 UnicodeString::releaseBuffer(int32_t newLength) {
1589   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1590     // set the new fLength
1591     int32_t capacity=getCapacity();
1592     if(newLength==-1) {
1593       // the new length is the string length, capped by fCapacity
1594       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1595       while(p<limit && *p!=0) {
1596         ++p;
1597       }
1598       newLength=(int32_t)(p-array);
1599     } else if(newLength>capacity) {
1600       newLength=capacity;
1601     }
1602     setLength(newLength);
1603     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1604   }
1605 }
1606
1607 //========================================
1608 // Miscellaneous
1609 //========================================
1610 UBool
1611 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1612                                   int32_t growCapacity,
1613                                   UBool doCopyArray,
1614                                   int32_t **pBufferToDelete,
1615                                   UBool forceClone) {
1616   // default parameters need to be static, therefore
1617   // the defaults are -1 to have convenience defaults
1618   if(newCapacity == -1) {
1619     newCapacity = getCapacity();
1620   }
1621
1622   // while a getBuffer(minCapacity) is "open",
1623   // prevent any modifications of the string by returning FALSE here
1624   // if the string is bogus, then only an assignment or similar can revive it
1625   if(!isWritable()) {
1626     return FALSE;
1627   }
1628
1629   /*
1630    * We need to make a copy of the array if
1631    * the buffer is read-only, or
1632    * the buffer is refCounted (shared), and refCount>1, or
1633    * the buffer is too small.
1634    * Return FALSE if memory could not be allocated.
1635    */
1636   if(forceClone ||
1637      fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1638      (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1639      newCapacity > getCapacity()
1640   ) {
1641     // check growCapacity for default value and use of the stack buffer
1642     if(growCapacity < 0) {
1643       growCapacity = newCapacity;
1644     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1645       growCapacity = US_STACKBUF_SIZE;
1646     }
1647
1648     // save old values
1649     UChar oldStackBuffer[US_STACKBUF_SIZE];
1650     UChar *oldArray;
1651     int32_t oldLength = length();
1652     int16_t flags = fUnion.fFields.fLengthAndFlags;
1653
1654     if(flags&kUsingStackBuffer) {
1655       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1656       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1657         // copy the stack buffer contents because it will be overwritten with
1658         // fUnion.fFields values
1659         us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1660         oldArray = oldStackBuffer;
1661       } else {
1662         oldArray = NULL; // no need to copy from the stack buffer to itself
1663       }
1664     } else {
1665       oldArray = fUnion.fFields.fArray;
1666       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1667     }
1668
1669     // allocate a new array
1670     if(allocate(growCapacity) ||
1671        (newCapacity < growCapacity && allocate(newCapacity))
1672     ) {
1673       if(doCopyArray) {
1674         // copy the contents
1675         // do not copy more than what fits - it may be smaller than before
1676         int32_t minLength = oldLength;
1677         newCapacity = getCapacity();
1678         if(newCapacity < minLength) {
1679           minLength = newCapacity;
1680         }
1681         if(oldArray != NULL) {
1682           us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1683         }
1684         setLength(minLength);
1685       } else {
1686         setZeroLength();
1687       }
1688
1689       // release the old array
1690       if(flags & kRefCounted) {
1691         // the array is refCounted; decrement and release if 0
1692         u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1693         if(umtx_atomic_dec(pRefCount) == 0) {
1694           if(pBufferToDelete == 0) {
1695               // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1696               // is defined as volatile. (Volatile has useful non-standard behavior
1697               //   with this compiler.)
1698             uprv_free((void *)pRefCount);
1699           } else {
1700             // the caller requested to delete it himself
1701             *pBufferToDelete = (int32_t *)pRefCount;
1702           }
1703         }
1704       }
1705     } else {
1706       // not enough memory for growCapacity and not even for the smaller newCapacity
1707       // reset the old values for setToBogus() to release the array
1708       if(!(flags&kUsingStackBuffer)) {
1709         fUnion.fFields.fArray = oldArray;
1710       }
1711       fUnion.fFields.fLengthAndFlags = flags;
1712       setToBogus();
1713       return FALSE;
1714     }
1715   }
1716   return TRUE;
1717 }
1718
1719 // UnicodeStringAppendable ------------------------------------------------- ***
1720
1721 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1722
1723 UBool
1724 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1725   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
1726 }
1727
1728 UBool
1729 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1730   UChar buffer[U16_MAX_LENGTH];
1731   int32_t cLength = 0;
1732   UBool isError = FALSE;
1733   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1734   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
1735 }
1736
1737 UBool
1738 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1739   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
1740 }
1741
1742 UBool
1743 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1744   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1745 }
1746
1747 UChar *
1748 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1749                                          int32_t desiredCapacityHint,
1750                                          UChar *scratch, int32_t scratchCapacity,
1751                                          int32_t *resultCapacity) {
1752   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1753     *resultCapacity = 0;
1754     return NULL;
1755   }
1756   int32_t oldLength = str.length();
1757   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1758     *resultCapacity = str.getCapacity() - oldLength;
1759     return str.getArrayStart() + oldLength;
1760   }
1761   *resultCapacity = scratchCapacity;
1762   return scratch;
1763 }
1764
1765 U_NAMESPACE_END
1766
1767 U_NAMESPACE_USE
1768
1769 U_CAPI int32_t U_EXPORT2
1770 uhash_hashUnicodeString(const UElement key) {
1771     const UnicodeString *str = (const UnicodeString*) key.pointer;
1772     return (str == NULL) ? 0 : str->hashCode();
1773 }
1774
1775 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1776 // does not depend on hashtable code.
1777 U_CAPI UBool U_EXPORT2
1778 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1779     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1780     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1781     if (str1 == str2) {
1782         return TRUE;
1783     }
1784     if (str1 == NULL || str2 == NULL) {
1785         return FALSE;
1786     }
1787     return *str1 == *str2;
1788 }
1789
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. It only exists so that the
virtual vector deleting destructor gets defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file,
which keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *dummyArray = new UnicodeString[2];
    delete [] dummyArray;
}
#endif