icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2005, International Business Machines Corporation and   *
   4 * others. All Rights Reserved.                                               *
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/putil.h"
  23 #include "cstring.h"
  24 #include "cmemory.h"
  25 #include "unicode/ustring.h"
  26 #include "unicode/unistr.h"
  27 #include "uhash.h"
  28 #include "ustr_imp.h"
  29 #include "umutex.h"
  30
  31 #if 0
  32
  33 #if U_IOSTREAM_SOURCE >= 199711
  34 #include <iostream>
  35 using namespace std;
  36 #elif U_IOSTREAM_SOURCE >= 198506
  37 #include <iostream.h>
  38 #endif
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
  99 /* The Replaceable virtual destructor can't be defined in the header
 100    due to how AIX works with multiple definitions of virtual functions.
 101 */
 102 Replaceable::~Replaceable() {}
 103 Replaceable::Replaceable() {}
 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef()
 121 {  umtx_atomic_inc((int32_t *)fArray - 1);}
 122
 123 int32_t
 124 UnicodeString::removeRef()
 125 { return umtx_atomic_dec((int32_t *)fArray - 1);}
 126
 127 int32_t
 128 UnicodeString::refCount() const
 129 {
 130     umtx_lock(NULL);
 131     // Note: without the lock to force a memory barrier, we might see a very
 132     //       stale value on some multi-processor systems.
 133     int32_t  count = *((int32_t *)fArray - 1);
 134     umtx_unlock(NULL);
 135     return count;
 136  }
 137
 138 void
 139 UnicodeString::releaseArray() {
 140   if((fFlags & kRefCounted) && removeRef() == 0) {
 141     uprv_free((int32_t *)fArray - 1);
 142   }
 143 }
 144
 145
 146
 147 //========================================
 148 // Constructors
 149 //========================================
 150 UnicodeString::UnicodeString()
 151   : fLength(0),
 152     fCapacity(US_STACKBUF_SIZE),
 153     fArray(fStackBuffer),
 154     fFlags(kShortString)
 155 {}
 156
 157 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
 158   : fLength(0),
 159     fCapacity(US_STACKBUF_SIZE),
 160     fArray(0),
 161     fFlags(0)
 162 {
 163   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 164     // just allocate and do not do anything else
 165     allocate(capacity);
 166   } else {
 167     // count > 0, allocate and fill the new string with count c's
 168     int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount;
 169     if(capacity < length) {
 170       capacity = length;
 171     }
 172     if(allocate(capacity)) {
 173       int32_t i = 0;
 174
 175       // fill the new string with c
 176       if(unitCount == 1) {
 177         // fill with length UChars
 178         while(i < length) {
 179           fArray[i++] = (UChar)c;
 180         }
 181       } else {
 182         // get the code units for c
 183         UChar units[UTF_MAX_CHAR_LENGTH];
 184         UTF_APPEND_CHAR_UNSAFE(units, i, c);
 185
 186         // now it must be i==unitCount
 187         i = 0;
 188
 189         // for Unicode, unitCount can only be 1, 2, 3, or 4
 190         // 1 is handled above
 191         while(i < length) {
 192           int32_t unitIdx = 0;
 193           while(unitIdx < unitCount) {
 194             fArray[i++]=units[unitIdx++];
 195           }
 196         }
 197       }
 198     }
 199     fLength = length;
 200   }
 201 }
 202
 203 UnicodeString::UnicodeString(UChar ch)
 204   : fLength(1),
 205     fCapacity(US_STACKBUF_SIZE),
 206     fArray(fStackBuffer),
 207     fFlags(kShortString)
 208 {
 209   fStackBuffer[0] = ch;
 210 }
 211
 212 UnicodeString::UnicodeString(UChar32 ch)
 213   : fLength(1),
 214     fCapacity(US_STACKBUF_SIZE),
 215     fArray(fStackBuffer),
 216     fFlags(kShortString)
 217 {
 218   int32_t i = 0;
 219   UBool isError = FALSE;
 220   U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
 221   fLength = i;
 222 }
 223
 224 UnicodeString::UnicodeString(const UChar *text)
 225   : fLength(0),
 226     fCapacity(US_STACKBUF_SIZE),
 227     fArray(fStackBuffer),
 228     fFlags(kShortString)
 229 {
 230   doReplace(0, 0, text, 0, -1);
 231 }
 232
 233 UnicodeString::UnicodeString(const UChar *text,
 234                              int32_t textLength)
 235   : fLength(0),
 236     fCapacity(US_STACKBUF_SIZE),
 237     fArray(fStackBuffer),
 238     fFlags(kShortString)
 239 {
 240   doReplace(0, 0, text, 0, textLength);
 241 }
 242
 243 UnicodeString::UnicodeString(UBool isTerminated,
 244                              const UChar *text,
 245                              int32_t textLength)
 246   : fLength(textLength),
 247     fCapacity(isTerminated ? textLength + 1 : textLength),
 248     fArray((UChar *)text),
 249     fFlags(kReadonlyAlias)
 250 {
 251   if(text == NULL) {
 252     // treat as an empty string, do not alias
 253     fLength = 0;
 254     fCapacity = US_STACKBUF_SIZE;
 255     fArray = fStackBuffer;
 256     fFlags = kShortString;
 257   } else if(textLength < -1 ||
 258             (textLength == -1 && !isTerminated) ||
 259             (textLength >= 0 && isTerminated && text[textLength] != 0)
 260   ) {
 261     setToBogus();
 262   } else if(textLength == -1) {
 263     // text is terminated, or else it would have failed the above test
 264     fLength = u_strlen(text);
 265     fCapacity = fLength + 1;
 266   }
 267 }
 268
 269 UnicodeString::UnicodeString(UChar *buff,
 270                              int32_t buffLength,
 271                              int32_t buffCapacity)
 272   : fLength(buffLength),
 273     fCapacity(buffCapacity),
 274     fArray(buff),
 275     fFlags(kWritableAlias)
 276 {
 277   if(buff == NULL) {
 278     // treat as an empty string, do not alias
 279     fLength = 0;
 280     fCapacity = US_STACKBUF_SIZE;
 281     fArray = fStackBuffer;
 282     fFlags = kShortString;
 283   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 284     setToBogus();
 285   } else if(buffLength == -1) {
 286     // fLength = u_strlen(buff); but do not look beyond buffCapacity
 287     const UChar *p = buff, *limit = buff + buffCapacity;
 288     while(p != limit && *p != 0) {
 289       ++p;
 290     }
 291     fLength = (int32_t)(p - buff);
 292   }
 293 }
 294
 295 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
 296   : fLength(0),
 297     fCapacity(US_STACKBUF_SIZE),
 298     fArray(fStackBuffer),
 299     fFlags(kShortString)
 300 {
 301   if(src==NULL) {
 302     // treat as an empty string
 303   } else {
 304     if(length<0) {
 305       length=(int32_t)uprv_strlen(src);
 306     }
 307     if(cloneArrayIfNeeded(length, length, FALSE)) {
 308       u_charsToUChars(src, getArrayStart(), length);
 309       fLength = length;
 310     } else {
 311       setToBogus();
 312     }
 313   }
 314 }
 315
 316 UnicodeString::UnicodeString(const UnicodeString& that)
 317   : Replaceable(),
 318     fLength(0),
 319     fCapacity(US_STACKBUF_SIZE),
 320     fArray(fStackBuffer),
 321     fFlags(kShortString)
 322 {
 323   copyFrom(that);
 324 }
 325
 326 UnicodeString::UnicodeString(const UnicodeString& that,
 327                              int32_t srcStart)
 328   : Replaceable(),
 329     fLength(0),
 330     fCapacity(US_STACKBUF_SIZE),
 331     fArray(fStackBuffer),
 332     fFlags(kShortString)
 333 {
 334   setTo(that, srcStart);
 335 }
 336
 337 UnicodeString::UnicodeString(const UnicodeString& that,
 338                              int32_t srcStart,
 339                              int32_t srcLength)
 340   : Replaceable(),
 341     fLength(0),
 342     fCapacity(US_STACKBUF_SIZE),
 343     fArray(fStackBuffer),
 344     fFlags(kShortString)
 345 {
 346   setTo(that, srcStart, srcLength);
 347 }
 348
 349 // Replaceable base class clone() default implementation, does not clone
 350 Replaceable *
 351 Replaceable::clone() const {
 352   return NULL;
 353 }
 354
 355 // UnicodeString overrides clone() with a real implementation
 356 Replaceable *
 357 UnicodeString::clone() const {
 358   return new UnicodeString(*this);
 359 }
 360
 361 //========================================
 362 // array allocation
 363 //========================================
 364
 365 UBool
 366 UnicodeString::allocate(int32_t capacity) {
 367   if(capacity <= US_STACKBUF_SIZE) {
 368     fArray = fStackBuffer;
 369     fCapacity = US_STACKBUF_SIZE;
 370     fFlags = kShortString;
 371   } else {
 372     // count bytes for the refCounter and the string capacity, and
 373     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 374     // to be safely aligned for the refCount
 375     int32_t words = (int32_t)(((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 376     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 377     if(array != 0) {
 378       // set initial refCount and point behind the refCount
 379       *array++ = 1;
 380
 381       // have fArray point to the first UChar
 382       fArray = (UChar *)array;
 383       fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 384       fFlags = kLongString;
 385     } else {
 386       fLength = 0;
 387       fCapacity = 0;
 388       fFlags = kIsBogus;
 389       return FALSE;
 390     }
 391   }
 392   return TRUE;
 393 }
 394
 395 //========================================
 396 // Destructor
 397 //========================================
 398 UnicodeString::~UnicodeString()
 399 {
 400   releaseArray();
 401 }
 402
 403
 404 //========================================
 405 // Assignment
 406 //========================================
 407
 408 UnicodeString &
 409 UnicodeString::operator=(const UnicodeString &src) {
 410   return copyFrom(src);
 411 }
 412
 413 UnicodeString &
 414 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 415   return copyFrom(src, TRUE);
 416 }
 417
 418 UnicodeString &
 419 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 420   // if assigning to ourselves, do nothing
 421   if(this == 0 || this == &src) {
 422     return *this;
 423   }
 424
 425   // is the right side bogus?
 426   if(&src == 0 || src.isBogus()) {
 427     setToBogus();
 428     return *this;
 429   }
 430
 431   // delete the current contents
 432   releaseArray();
 433
 434   // we always copy the length
 435   fLength = src.fLength;
 436   if(fLength == 0) {
 437     // empty string - use the stack buffer
 438     fArray = fStackBuffer;
 439     fCapacity = US_STACKBUF_SIZE;
 440     fFlags = kShortString;
 441     return *this;
 442   }
 443
 444   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 445   switch(src.fFlags) {
 446   case kShortString:
 447     // short string using the stack buffer, do the same
 448     fArray = fStackBuffer;
 449     fCapacity = US_STACKBUF_SIZE;
 450     fFlags = kShortString;
 451     uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR);
 452     break;
 453   case kLongString:
 454     // src uses a refCounted string buffer, use that buffer with refCount
 455     // src is const, use a cast - we don't really change it
 456     ((UnicodeString &)src).addRef();
 457     // copy all fields, share the reference-counted buffer
 458     fArray = src.fArray;
 459     fCapacity = src.fCapacity;
 460     fFlags = src.fFlags;
 461     break;
 462   case kReadonlyAlias:
 463     if(fastCopy) {
 464       // src is a readonly alias, do the same
 465       // -> maintain the readonly alias as such
 466       fArray = src.fArray;
 467       fCapacity = src.fCapacity;
 468       fFlags = src.fFlags;
 469       break;
 470     }
 471     // else if(!fastCopy) fall through to case kWritableAlias
 472     // -> allocate a new buffer and copy the contents
 473   case kWritableAlias:
 474     // src is a writable alias; we make a copy of that instead
 475     if(allocate(fLength)) {
 476       uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR);
 477       break;
 478     }
 479     // if there is not enough memory, then fall through to setting to bogus
 480   default:
 481     // if src is bogus, set ourselves to bogus
 482     // do not call setToBogus() here because fArray and fFlags are not consistent here
 483     fArray = 0;
 484     fLength = 0;
 485     fCapacity = 0;
 486     fFlags = kIsBogus;
 487     break;
 488   }
 489
 490   return *this;
 491 }
 492
 493 //========================================
 494 // Miscellaneous operations
 495 //========================================
 496
 497 UnicodeString UnicodeString::unescape() const {
 498     UnicodeString result;
 499     for (int32_t i=0; i<length(); ) {
 500         UChar32 c = charAt(i++);
 501         if (c == 0x005C /*'\\'*/) {
 502             c = unescapeAt(i); // advances i
 503             if (c == (UChar32)0xFFFFFFFF) {
 504                 result.remove(); // return empty string
 505                 break; // invalid escape sequence
 506             }
 507         }
 508         result.append(c);
 509     }
 510     return result;
 511 }
 512
 513 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 514     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 515 }
 516
 517 //========================================
 518 // Read-only implementation
 519 //========================================
 520 int8_t
 521 UnicodeString::doCompare( int32_t start,
 522               int32_t length,
 523               const UChar *srcChars,
 524               int32_t srcStart,
 525               int32_t srcLength) const
 526 {
 527   // compare illegal string values
 528   // treat const UChar *srcChars==NULL as an empty string
 529   if(isBogus()) {
 530     return -1;
 531   }
 532
 533   // pin indices to legal values
 534   pinIndices(start, length);
 535
 536   if(srcChars == NULL) {
 537     srcStart = srcLength = 0;
 538   }
 539
 540   // get the correct pointer
 541   const UChar *chars = getArrayStart();
 542
 543   chars += start;
 544   srcChars += srcStart;
 545
 546   int32_t minLength;
 547   int8_t lengthResult;
 548
 549   // get the srcLength if necessary
 550   if(srcLength < 0) {
 551     srcLength = u_strlen(srcChars + srcStart);
 552   }
 553
 554   // are we comparing different lengths?
 555   if(length != srcLength) {
 556     if(length < srcLength) {
 557       minLength = length;
 558       lengthResult = -1;
 559     } else {
 560       minLength = srcLength;
 561       lengthResult = 1;
 562     }
 563   } else {
 564     minLength = length;
 565     lengthResult = 0;
 566   }
 567
 568   /*
 569    * note that uprv_memcmp() returns an int but we return an int8_t;
 570    * we need to take care not to truncate the result -
 571    * one way to do this is to right-shift the value to
 572    * move the sign bit into the lower 8 bits and making sure that this
 573    * does not become 0 itself
 574    */
 575
 576   if(minLength > 0 && chars != srcChars) {
 577     int32_t result;
 578
 579 #   if U_IS_BIG_ENDIAN
 580       // big-endian: byte comparison works
 581       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 582       if(result != 0) {
 583         return (int8_t)(result >> 15 | 1);
 584       }
 585 #   else
 586       // little-endian: compare UChar units
 587       do {
 588         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 589         if(result != 0) {
 590           return (int8_t)(result >> 15 | 1);
 591         }
 592       } while(--minLength > 0);
 593 #   endif
 594   }
 595   return lengthResult;
 596 }
 597
 598 /* String compare in code point order - doCompare() compares in code unit order. */
 599 int8_t
 600 UnicodeString::doCompareCodePointOrder(int32_t start,
 601                                        int32_t length,
 602                                        const UChar *srcChars,
 603                                        int32_t srcStart,
 604                                        int32_t srcLength) const
 605 {
 606   // compare illegal string values
 607   // treat const UChar *srcChars==NULL as an empty string
 608   if(isBogus()) {
 609     return -1;
 610   }
 611
 612   // pin indices to legal values
 613   pinIndices(start, length);
 614
 615   if(srcChars == NULL) {
 616     srcStart = srcLength = 0;
 617   }
 618
 619   int32_t diff = uprv_strCompare(fArray + start, length, srcChars + srcStart, srcLength, FALSE, TRUE);
 620   /* translate the 32-bit result into an 8-bit one */
 621   if(diff!=0) {
 622     return (int8_t)(diff >> 15 | 1);
 623   } else {
 624     return 0;
 625   }
 626 }
 627
 628 int32_t
 629 UnicodeString::getLength() const {
 630     return length();
 631 }
 632
 633 UChar
 634 UnicodeString::getCharAt(int32_t offset) const {
 635   return charAt(offset);
 636 }
 637
 638 UChar32
 639 UnicodeString::getChar32At(int32_t offset) const {
 640   return char32At(offset);
 641 }
 642
 643 int32_t
 644 UnicodeString::countChar32(int32_t start, int32_t length) const {
 645   pinIndices(start, length);
 646   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 647   return u_countChar32(fArray+start, length);
 648 }
 649
 650 UBool
 651 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 652   pinIndices(start, length);
 653   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 654   return u_strHasMoreChar32Than(fArray+start, length, number);
 655 }
 656
 657 int32_t
 658 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 659   // pin index
 660   if(index<0) {
 661     index=0;
 662   } else if(index>fLength) {
 663     index=fLength;
 664   }
 665
 666   if(delta>0) {
 667     UTF_FWD_N(fArray, index, fLength, delta);
 668   } else {
 669     UTF_BACK_N(fArray, 0, index, -delta);
 670   }
 671
 672   return index;
 673 }
 674
 675 void
 676 UnicodeString::doExtract(int32_t start,
 677              int32_t length,
 678              UChar *dst,
 679              int32_t dstStart) const
 680 {
 681   // pin indices to legal values
 682   pinIndices(start, length);
 683
 684   // do not copy anything if we alias dst itself
 685   if(fArray + start != dst + dstStart) {
 686     us_arrayCopy(getArrayStart(), start, dst, dstStart, length);
 687   }
 688 }
 689
 690 int32_t
 691 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 692                        UErrorCode &errorCode) const {
 693   if(U_SUCCESS(errorCode)) {
 694     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 695       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 696     } else {
 697       if(fLength>0 && fLength<=destCapacity && fArray!=dest) {
 698         uprv_memcpy(dest, fArray, fLength*U_SIZEOF_UCHAR);
 699       }
 700       return u_terminateUChars(dest, destCapacity, fLength, &errorCode);
 701     }
 702   }
 703
 704   return fLength;
 705 }
 706
 707 int32_t
 708 UnicodeString::extract(int32_t start,
 709                        int32_t length,
 710                        char *target,
 711                        int32_t targetCapacity,
 712                        enum EInvariant) const
 713 {
 714   // if the arguments are illegal, then do nothing
 715   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 716     return 0;
 717   }
 718
 719   // pin the indices to legal values
 720   pinIndices(start, length);
 721
 722   if(length <= targetCapacity) {
 723     u_UCharsToChars(getArrayStart() + start, target, length);
 724   }
 725   UErrorCode status = U_ZERO_ERROR;
 726   return u_terminateChars(target, targetCapacity, length, &status);
 727 }
 728
 729 void
 730 UnicodeString::extractBetween(int32_t start,
 731                   int32_t limit,
 732                   UnicodeString& target) const {
 733   pinIndex(start);
 734   pinIndex(limit);
 735   doExtract(start, limit - start, target);
 736 }
 737
 738 int32_t
 739 UnicodeString::indexOf(const UChar *srcChars,
 740                int32_t srcStart,
 741                int32_t srcLength,
 742                int32_t start,
 743                int32_t length) const
 744 {
 745   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 746     return -1;
 747   }
 748
 749   // UnicodeString does not find empty substrings
 750   if(srcLength < 0 && srcChars[srcStart] == 0) {
 751     return -1;
 752   }
 753
 754   // get the indices within bounds
 755   pinIndices(start, length);
 756
 757   // find the first occurrence of the substring
 758   const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength);
 759   if(match == NULL) {
 760     return -1;
 761   } else {
 762     return (int32_t)(match - fArray);
 763   }
 764 }
 765
 766 int32_t
 767 UnicodeString::doIndexOf(UChar c,
 768              int32_t start,
 769              int32_t length) const
 770 {
 771   // pin indices
 772   pinIndices(start, length);
 773
 774   // find the first occurrence of c
 775   const UChar *match = u_memchr(fArray + start, c, length);
 776   if(match == NULL) {
 777     return -1;
 778   } else {
 779     return (int32_t)(match - fArray);
 780   }
 781 }
 782
 783 int32_t
 784 UnicodeString::doIndexOf(UChar32 c,
 785                          int32_t start,
 786                          int32_t length) const {
 787   // pin indices
 788   pinIndices(start, length);
 789
 790   // find the first occurrence of c
 791   const UChar *match = u_memchr32(fArray + start, c, length);
 792   if(match == NULL) {
 793     return -1;
 794   } else {
 795     return (int32_t)(match - fArray);
 796   }
 797 }
 798
 799 int32_t
 800 UnicodeString::lastIndexOf(const UChar *srcChars,
 801                int32_t srcStart,
 802                int32_t srcLength,
 803                int32_t start,
 804                int32_t length) const
 805 {
 806   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 807     return -1;
 808   }
 809
 810   // UnicodeString does not find empty substrings
 811   if(srcLength < 0 && srcChars[srcStart] == 0) {
 812     return -1;
 813   }
 814
 815   // get the indices within bounds
 816   pinIndices(start, length);
 817
 818   // find the last occurrence of the substring
 819   const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength);
 820   if(match == NULL) {
 821     return -1;
 822   } else {
 823     return (int32_t)(match - fArray);
 824   }
 825 }
 826
 827 int32_t
 828 UnicodeString::doLastIndexOf(UChar c,
 829                  int32_t start,
 830                  int32_t length) const
 831 {
 832   if(isBogus()) {
 833     return -1;
 834   }
 835
 836   // pin indices
 837   pinIndices(start, length);
 838
 839   // find the last occurrence of c
 840   const UChar *match = u_memrchr(fArray + start, c, length);
 841   if(match == NULL) {
 842     return -1;
 843   } else {
 844     return (int32_t)(match - fArray);
 845   }
 846 }
 847
 848 int32_t
 849 UnicodeString::doLastIndexOf(UChar32 c,
 850                              int32_t start,
 851                              int32_t length) const {
 852   // pin indices
 853   pinIndices(start, length);
 854
 855   // find the last occurrence of c
 856   const UChar *match = u_memrchr32(fArray + start, c, length);
 857   if(match == NULL) {
 858     return -1;
 859   } else {
 860     return (int32_t)(match - fArray);
 861   }
 862 }
 863
 864 //========================================
 865 // Write implementation
 866 //========================================
 867
 868 UnicodeString&
 869 UnicodeString::findAndReplace(int32_t start,
 870                   int32_t length,
 871                   const UnicodeString& oldText,
 872                   int32_t oldStart,
 873                   int32_t oldLength,
 874                   const UnicodeString& newText,
 875                   int32_t newStart,
 876                   int32_t newLength)
 877 {
 878   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
 879     return *this;
 880   }
 881
 882   pinIndices(start, length);
 883   oldText.pinIndices(oldStart, oldLength);
 884   newText.pinIndices(newStart, newLength);
 885
 886   if(oldLength == 0) {
 887     return *this;
 888   }
 889
 890   while(length > 0 && length >= oldLength) {
 891     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
 892     if(pos < 0) {
 893       // no more oldText's here: done
 894       break;
 895     } else {
 896       // we found oldText, replace it by newText and go beyond it
 897       replace(pos, oldLength, newText, newStart, newLength);
 898       length -= pos + oldLength - start;
 899       start = pos + newLength;
 900     }
 901   }
 902
 903   return *this;
 904 }
 905
 906
 907 void
 908 UnicodeString::setToBogus()
 909 {
 910   releaseArray();
 911
 912   fArray = 0;
 913   fCapacity = fLength = 0;
 914   fFlags = kIsBogus;
 915 }
 916
 917 // turn a bogus string into an empty one
 918 void
 919 UnicodeString::unBogus() {
 920   if(fFlags & kIsBogus) {
 921     fArray = fStackBuffer;
 922     fLength = 0;
 923     fCapacity = US_STACKBUF_SIZE;
 924     fFlags = kShortString;
 925   }
 926 }
 927
 928 // setTo() analogous to the readonly-aliasing constructor with the same signature
 929 UnicodeString &
 930 UnicodeString::setTo(UBool isTerminated,
 931                      const UChar *text,
 932                      int32_t textLength)
 933 {
 934   if(fFlags & kOpenGetBuffer) {
 935     // do not modify a string that has an "open" getBuffer(minCapacity)
 936     return *this;
 937   }
 938
 939   if(text == NULL) {
 940     // treat as an empty string, do not alias
 941     releaseArray();
 942     fLength = 0;
 943     fCapacity = US_STACKBUF_SIZE;
 944     fArray = fStackBuffer;
 945     fFlags = kShortString;
 946     return *this;
 947   }
 948
 949   if( textLength < -1 ||
 950       (textLength == -1 && !isTerminated) ||
 951       (textLength >= 0 && isTerminated && text[textLength] != 0)
 952   ) {
 953     setToBogus();
 954     return *this;
 955   }
 956
 957   releaseArray();
 958
 959   fArray = (UChar *)text;
 960   if(textLength != -1) {
 961     fLength = textLength;
 962     fCapacity = isTerminated ? fLength + 1 : fLength;
 963   } else {
 964     // text is terminated, or else it would have failed the above test
 965     fLength = u_strlen(text);
 966     fCapacity = fLength + 1;
 967   }
 968
 969   fFlags = kReadonlyAlias;
 970   return *this;
 971 }
 972
 973 // setTo() analogous to the writable-aliasing constructor with the same signature
 974 UnicodeString &
 975 UnicodeString::setTo(UChar *buffer,
 976                      int32_t buffLength,
 977                      int32_t buffCapacity) {
 978   if(fFlags & kOpenGetBuffer) {
 979     // do not modify a string that has an "open" getBuffer(minCapacity)
 980     return *this;
 981   }
 982
 983   if(buffer == NULL) {
 984     // treat as an empty string, do not alias
 985     releaseArray();
 986     fLength = 0;
 987     fCapacity = US_STACKBUF_SIZE;
 988     fArray = fStackBuffer;
 989     fFlags = kShortString;
 990     return *this;
 991   }
 992
 993   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 994     setToBogus();
 995     return *this;
 996   } else if(buffLength == -1) {
 997     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
 998     const UChar *p = buffer, *limit = buffer + buffCapacity;
 999     while(p != limit && *p != 0) {
1000       ++p;
1001     }
1002     buffLength = (int32_t)(p - buffer);
1003   }
1004
1005   releaseArray();
1006
1007   fArray = buffer;
1008   fLength = buffLength;
1009   fCapacity = buffCapacity;
1010   fFlags = kWritableAlias;
1011   return *this;
1012 }
1013
1014 UnicodeString&
1015 UnicodeString::setCharAt(int32_t offset,
1016              UChar c)
1017 {
1018   if(cloneArrayIfNeeded() && fLength > 0) {
1019     if(offset < 0) {
1020       offset = 0;
1021     } else if(offset >= fLength) {
1022       offset = fLength - 1;
1023     }
1024
1025     fArray[offset] = c;
1026   }
1027   return *this;
1028 }
1029
1030 UnicodeString&
1031 UnicodeString::doReplace( int32_t start,
1032               int32_t length,
1033               const UnicodeString& src,
1034               int32_t srcStart,
1035               int32_t srcLength)
1036 {
1037   if(!src.isBogus()) {
1038     // pin the indices to legal values
1039     src.pinIndices(srcStart, srcLength);
1040
1041     // get the characters from src
1042     // and replace the range in ourselves with them
1043     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1044   } else {
1045     // remove the range
1046     return doReplace(start, length, 0, 0, 0);
1047   }
1048 }
1049
1050 UnicodeString&
1051 UnicodeString::doReplace(int32_t start,
1052              int32_t length,
1053              const UChar *srcChars,
1054              int32_t srcStart,
1055              int32_t srcLength)
1056 {
1057   if(isBogus()) {
1058     return *this;
1059   }
1060
1061   if(srcChars == 0) {
1062     srcStart = srcLength = 0;
1063   } else if(srcLength < 0) {
1064     // get the srcLength if necessary
1065     srcLength = u_strlen(srcChars + srcStart);
1066   }
1067
1068   int32_t *bufferToDelete = 0;
1069
1070   // the following may change fArray but will not copy the current contents;
1071   // therefore we need to keep the current fArray
1072   UChar *oldArray = fArray;
1073   int32_t oldLength = fLength;
1074
1075   // pin the indices to legal values
1076   pinIndices(start, length);
1077
1078   // calculate the size of the string after the replace
1079   int32_t newSize = oldLength - length + srcLength;
1080
1081   // clone our array and allocate a bigger array if needed
1082   if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize,
1083                          FALSE, &bufferToDelete)
1084   ) {
1085     return *this;
1086   }
1087
1088   // now do the replace
1089
1090   if(fArray != oldArray) {
1091     // if fArray changed, then we need to copy everything except what will change
1092     us_arrayCopy(oldArray, 0, fArray, 0, start);
1093     us_arrayCopy(oldArray, start + length,
1094                  fArray, start + srcLength,
1095                  oldLength - (start + length));
1096   } else if(length != srcLength) {
1097     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1098     us_arrayCopy(oldArray, start + length,
1099                  fArray, start + srcLength,
1100                  oldLength - (start + length));
1101   }
1102
1103   // now fill in the hole with the new string
1104   us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength);
1105
1106   fLength = newSize;
1107
1108   // delayed delete in case srcChars == fArray when we started, and
1109   // to keep oldArray alive for the above operations
1110   if (bufferToDelete) {
1111     uprv_free(bufferToDelete);
1112   }
1113
1114   return *this;
1115 }
1116
1117 /**
1118  * Replaceable API
1119  */
1120 void
1121 UnicodeString::handleReplaceBetween(int32_t start,
1122                                     int32_t limit,
1123                                     const UnicodeString& text) {
1124     replaceBetween(start, limit, text);
1125 }
1126
1127 /**
1128  * Replaceable API
1129  */
1130 void
1131 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1132     if (limit <= start) {
1133         return; // Nothing to do; avoid bogus malloc call
1134     }
1135     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1136     extractBetween(start, limit, text, 0);
1137     insert(dest, text, 0, limit - start);
1138     uprv_free(text);
1139 }
1140
1141 /**
1142  * Replaceable API
1143  *
1144  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1145  * so we implement this function here.
1146  */
1147 UBool Replaceable::hasMetaData() const {
1148     return TRUE;
1149 }
1150
1151 /**
1152  * Replaceable API
1153  */
1154 UBool UnicodeString::hasMetaData() const {
1155     return FALSE;
1156 }
1157
1158 UnicodeString&
1159 UnicodeString::doReverse(int32_t start,
1160              int32_t length)
1161 {
1162   if(fLength <= 1 || !cloneArrayIfNeeded()) {
1163     return *this;
1164   }
1165
1166   // pin the indices to legal values
1167   pinIndices(start, length);
1168
1169   UChar *left = getArrayStart() + start;
1170   UChar *right = getArrayStart() + start + length;
1171   UChar swap;
1172   UBool hasSupplementary = FALSE;
1173
1174   while(left < --right) {
1175     hasSupplementary |= (UBool)UTF_IS_LEAD(swap = *left);
1176     hasSupplementary |= (UBool)UTF_IS_LEAD(*left++ = *right);
1177     *right = swap;
1178   }
1179
1180   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1181   if(hasSupplementary) {
1182     UChar swap2;
1183
1184     left = getArrayStart() + start;
1185     right = getArrayStart() + start + length - 1; // -1 so that we can look at *(left+1) if left<right
1186     while(left < right) {
1187       if(UTF_IS_TRAIL(swap = *left) && UTF_IS_LEAD(swap2 = *(left + 1))) {
1188         *left++ = swap2;
1189         *left++ = swap;
1190       } else {
1191         ++left;
1192       }
1193     }
1194   }
1195
1196   return *this;
1197 }
1198
1199 UBool
1200 UnicodeString::padLeading(int32_t targetLength,
1201                           UChar padChar)
1202 {
1203   if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1204     return FALSE;
1205   } else {
1206     // move contents up by padding width
1207     int32_t start = targetLength - fLength;
1208     us_arrayCopy(fArray, 0, fArray, start, fLength);
1209
1210     // fill in padding character
1211     while(--start >= 0) {
1212       fArray[start] = padChar;
1213     }
1214     fLength = targetLength;
1215     return TRUE;
1216   }
1217 }
1218
1219 UBool
1220 UnicodeString::padTrailing(int32_t targetLength,
1221                            UChar padChar)
1222 {
1223   if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1224     return FALSE;
1225   } else {
1226     // fill in padding character
1227     int32_t length = targetLength;
1228     while(--length >= fLength) {
1229       fArray[length] = padChar;
1230     }
1231     fLength = targetLength;
1232     return TRUE;
1233   }
1234 }
1235
1236 //========================================
1237 // Hashing
1238 //========================================
1239 int32_t
1240 UnicodeString::doHashCode() const
1241 {
1242     /* Delegate hash computation to uhash.  This makes UnicodeString
1243      * hashing consistent with UChar* hashing.  */
1244     int32_t hashCode = uhash_hashUCharsN(getArrayStart(), fLength);
1245     if (hashCode == kInvalidHashCode) {
1246         hashCode = kEmptyHashCode;
1247     }
1248     return hashCode;
1249 }
1250
1251 //========================================
1252 // External Buffer
1253 //========================================
1254
1255 UChar *
1256 UnicodeString::getBuffer(int32_t minCapacity) {
1257   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1258     fFlags|=kOpenGetBuffer;
1259     fLength=0;
1260     return fArray;
1261   } else {
1262     return 0;
1263   }
1264 }
1265
1266 void
1267 UnicodeString::releaseBuffer(int32_t newLength) {
1268   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1269     // set the new fLength
1270     if(newLength==-1) {
1271       // the new length is the string length, capped by fCapacity
1272       const UChar *p=fArray, *limit=fArray+fCapacity;
1273       while(p<limit && *p!=0) {
1274         ++p;
1275       }
1276       fLength=(int32_t)(p-fArray);
1277     } else if(newLength<=fCapacity) {
1278       fLength=newLength;
1279     } else {
1280       fLength=fCapacity;
1281     }
1282     fFlags&=~kOpenGetBuffer;
1283   }
1284 }
1285
1286 //========================================
1287 // Miscellaneous
1288 //========================================
1289 UBool
1290 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1291                                   int32_t growCapacity,
1292                                   UBool doCopyArray,
1293                                   int32_t **pBufferToDelete,
1294                                   UBool forceClone) {
1295   // default parameters need to be static, therefore
1296   // the defaults are -1 to have convenience defaults
1297   if(newCapacity == -1) {
1298     newCapacity = fCapacity;
1299   }
1300
1301   // while a getBuffer(minCapacity) is "open",
1302   // prevent any modifications of the string by returning FALSE here
1303   // if the string is bogus, then only an assignment or similar can revive it
1304   if((fFlags&(kOpenGetBuffer|kIsBogus))!=0) {
1305     return FALSE;
1306   }
1307
1308   /*
1309    * We need to make a copy of the array if
1310    * the buffer is read-only, or
1311    * the buffer is refCounted (shared), and refCount>1, or
1312    * the buffer is too small.
1313    * Return FALSE if memory could not be allocated.
1314    */
1315   if(forceClone ||
1316      fFlags & kBufferIsReadonly ||
1317      fFlags & kRefCounted && refCount() > 1 ||
1318      newCapacity > fCapacity
1319   ) {
1320     // save old values
1321     UChar *array = fArray;
1322     uint16_t flags = fFlags;
1323
1324     // check growCapacity for default value and use of the stack buffer
1325     if(growCapacity == -1) {
1326       growCapacity = newCapacity;
1327     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1328       growCapacity = US_STACKBUF_SIZE;
1329     }
1330
1331     // allocate a new array
1332     if(allocate(growCapacity) ||
1333        newCapacity < growCapacity && allocate(newCapacity)
1334     ) {
1335       if(doCopyArray) {
1336         // copy the contents
1337         // do not copy more than what fits - it may be smaller than before
1338         if(fCapacity < fLength) {
1339           fLength = fCapacity;
1340         }
1341         us_arrayCopy(array, 0, fArray, 0, fLength);
1342       } else {
1343         fLength = 0;
1344       }
1345
1346       // release the old array
1347       if(flags & kRefCounted) {
1348         // the array is refCounted; decrement and release if 0
1349         int32_t *pRefCount = ((int32_t *)array - 1);
1350         if(umtx_atomic_dec(pRefCount) == 0) {
1351           if(pBufferToDelete == 0) {
1352             uprv_free(pRefCount);
1353           } else {
1354             // the caller requested to delete it himself
1355             *pBufferToDelete = pRefCount;
1356           }
1357         }
1358       }
1359     } else {
1360       // not enough memory for growCapacity and not even for the smaller newCapacity
1361       // reset the old values for setToBogus() to release the array
1362       fArray = array;
1363       fFlags = flags;
1364       setToBogus();
1365       return FALSE;
1366     }
1367   }
1368   return TRUE;
1369 }
1370 U_NAMESPACE_END
1371
1372 #ifdef U_STATIC_IMPLEMENTATION
1373 /*
1374 This should never be called. It is defined here to make sure that the
1375 virtual vector deleting destructor is defined within unistr.cpp.
1376 The vector deleting destructor is already a part of UObject,
1377 but defining it here makes sure that it is included with this object file.
1378 This makes sure that static library dependencies are kept to a minimum.
1379 */
1380 static void uprv_UnicodeStringDummy(void) {
1381     U_NAMESPACE_USE
1382     delete [] (new UnicodeString[2]);
1383 }
1384 #endif
1385