icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2004, International Business Machines Corporation and   *
   4 * others. All Rights Reserved.                                               *
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/putil.h"
  23 #include "cstring.h"
  24 #include "cmemory.h"
  25 #include "unicode/ustring.h"
  26 #include "unicode/unistr.h"
  27 #include "uhash.h"
  28 #include "ustr_imp.h"
  29 #include "umutex.h"
  30
  31 #if 0
  32
  33 #if U_IOSTREAM_SOURCE >= 199711
  34 #include <iostream>
  35 using namespace std;
  36 #elif U_IOSTREAM_SOURCE >= 198506
  37 #include <iostream.h>
  38 #endif
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
  99 /* The Replaceable virtual destructor can't be defined in the header
 100    due to how AIX works with multiple definitions of virtual functions.
 101 */
 102 Replaceable::~Replaceable() {}
 103 Replaceable::Replaceable() {}
 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef()
 121 {  umtx_atomic_inc((int32_t *)fArray - 1);}
 122
 123 int32_t
 124 UnicodeString::removeRef()
 125 { return umtx_atomic_dec((int32_t *)fArray - 1);}
 126
 127 int32_t
 128 UnicodeString::refCount() const
 129 {
 130     umtx_lock(NULL);
 131     // Note: without the lock to force a memory barrier, we might see a very
 132     //       stale value on some multi-processor systems.
 133     int32_t  count = *((int32_t *)fArray - 1);
 134     umtx_unlock(NULL);
 135     return count;
 136  }
 137
 138 void
 139 UnicodeString::releaseArray() {
 140   if((fFlags & kRefCounted) && removeRef() == 0) {
 141     uprv_free((int32_t *)fArray - 1);
 142   }
 143 }
 144
 145
 146
 147 //========================================
 148 // Constructors
 149 //========================================
 150 UnicodeString::UnicodeString()
 151   : fLength(0),
 152     fCapacity(US_STACKBUF_SIZE),
 153     fArray(fStackBuffer),
 154     fFlags(kShortString)
 155 {}
 156
 157 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
 158   : fLength(0),
 159     fCapacity(US_STACKBUF_SIZE),
 160     fArray(0),
 161     fFlags(0)
 162 {
 163   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 164     // just allocate and do not do anything else
 165     allocate(capacity);
 166   } else {
 167     // count > 0, allocate and fill the new string with count c's
 168     int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount;
 169     if(capacity < length) {
 170       capacity = length;
 171     }
 172     if(allocate(capacity)) {
 173       int32_t i = 0;
 174
 175       // fill the new string with c
 176       if(unitCount == 1) {
 177         // fill with length UChars
 178         while(i < length) {
 179           fArray[i++] = (UChar)c;
 180         }
 181       } else {
 182         // get the code units for c
 183         UChar units[UTF_MAX_CHAR_LENGTH];
 184         UTF_APPEND_CHAR_UNSAFE(units, i, c);
 185
 186         // now it must be i==unitCount
 187         i = 0;
 188
 189         // for Unicode, unitCount can only be 1, 2, 3, or 4
 190         // 1 is handled above
 191         while(i < length) {
 192           int32_t unitIdx = 0;
 193           while(unitIdx < unitCount) {
 194             fArray[i++]=units[unitIdx++];
 195           }
 196         }
 197       }
 198     }
 199     fLength = length;
 200   }
 201 }
 202
 203 UnicodeString::UnicodeString(UChar ch)
 204   : fLength(1),
 205     fCapacity(US_STACKBUF_SIZE),
 206     fArray(fStackBuffer),
 207     fFlags(kShortString)
 208 {
 209   fStackBuffer[0] = ch;
 210 }
 211
 212 UnicodeString::UnicodeString(UChar32 ch)
 213   : fLength(1),
 214     fCapacity(US_STACKBUF_SIZE),
 215     fArray(fStackBuffer),
 216     fFlags(kShortString)
 217 {
 218   int32_t i = 0;
 219   UBool isError = FALSE;
 220   U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
 221   fLength = i;
 222 }
 223
 224 UnicodeString::UnicodeString(const UChar *text)
 225   : fLength(0),
 226     fCapacity(US_STACKBUF_SIZE),
 227     fArray(fStackBuffer),
 228     fFlags(kShortString)
 229 {
 230   doReplace(0, 0, text, 0, -1);
 231 }
 232
 233 UnicodeString::UnicodeString(const UChar *text,
 234                              int32_t textLength)
 235   : fLength(0),
 236     fCapacity(US_STACKBUF_SIZE),
 237     fArray(fStackBuffer),
 238     fFlags(kShortString)
 239 {
 240   doReplace(0, 0, text, 0, textLength);
 241 }
 242
 243 UnicodeString::UnicodeString(UBool isTerminated,
 244                              const UChar *text,
 245                              int32_t textLength)
 246   : fLength(textLength),
 247     fCapacity(isTerminated ? textLength + 1 : textLength),
 248     fArray((UChar *)text),
 249     fFlags(kReadonlyAlias)
 250 {
 251   if(text == NULL) {
 252     // treat as an empty string, do not alias
 253     fLength = 0;
 254     fCapacity = US_STACKBUF_SIZE;
 255     fArray = fStackBuffer;
 256     fFlags = kShortString;
 257   } else if(textLength < -1 ||
 258             (textLength == -1 && !isTerminated) ||
 259             (textLength >= 0 && isTerminated && text[textLength] != 0)
 260   ) {
 261     setToBogus();
 262   } else if(textLength == -1) {
 263     // text is terminated, or else it would have failed the above test
 264     fLength = u_strlen(text);
 265     fCapacity = fLength + 1;
 266   }
 267 }
 268
 269 UnicodeString::UnicodeString(UChar *buff,
 270                              int32_t buffLength,
 271                              int32_t buffCapacity)
 272   : fLength(buffLength),
 273     fCapacity(buffCapacity),
 274     fArray(buff),
 275     fFlags(kWritableAlias)
 276 {
 277   if(buff == NULL) {
 278     // treat as an empty string, do not alias
 279     fLength = 0;
 280     fCapacity = US_STACKBUF_SIZE;
 281     fArray = fStackBuffer;
 282     fFlags = kShortString;
 283   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 284     setToBogus();
 285   } else if(buffLength == -1) {
 286     // fLength = u_strlen(buff); but do not look beyond buffCapacity
 287     const UChar *p = buff, *limit = buff + buffCapacity;
 288     while(p != limit && *p != 0) {
 289       ++p;
 290     }
 291     fLength = (int32_t)(p - buff);
 292   }
 293 }
 294
 295 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
 296   : fLength(0),
 297     fCapacity(US_STACKBUF_SIZE),
 298     fArray(fStackBuffer),
 299     fFlags(kShortString)
 300 {
 301   if(src==NULL) {
 302     // treat as an empty string
 303   } else {
 304     if(length<0) {
 305       length=uprv_strlen(src);
 306     }
 307     if(cloneArrayIfNeeded(length, length, FALSE)) {
 308       u_charsToUChars(src, getArrayStart(), length);
 309       fLength = length;
 310     } else {
 311       setToBogus();
 312     }
 313   }
 314 }
 315
 316 UnicodeString::UnicodeString(const UnicodeString& that)
 317   : Replaceable(),
 318     fLength(0),
 319     fCapacity(US_STACKBUF_SIZE),
 320     fArray(fStackBuffer),
 321     fFlags(kShortString)
 322 {
 323   copyFrom(that);
 324 }
 325
 326 UnicodeString::UnicodeString(const UnicodeString& that,
 327                              int32_t srcStart)
 328   : Replaceable(),
 329     fLength(0),
 330     fCapacity(US_STACKBUF_SIZE),
 331     fArray(fStackBuffer),
 332     fFlags(kShortString)
 333 {
 334   setTo(that, srcStart);
 335 }
 336
 337 UnicodeString::UnicodeString(const UnicodeString& that,
 338                              int32_t srcStart,
 339                              int32_t srcLength)
 340   : Replaceable(),
 341     fLength(0),
 342     fCapacity(US_STACKBUF_SIZE),
 343     fArray(fStackBuffer),
 344     fFlags(kShortString)
 345 {
 346   setTo(that, srcStart, srcLength);
 347 }
 348
 349 // Replaceable base class clone() default implementation, does not clone
 350 Replaceable *
 351 Replaceable::clone() const {
 352   return NULL;
 353 }
 354
 355 // UnicodeString overrides clone() with a real implementation
 356 Replaceable *
 357 UnicodeString::clone() const {
 358   return new UnicodeString(*this);
 359 }
 360
 361 //========================================
 362 // array allocation
 363 //========================================
 364
 365 UBool
 366 UnicodeString::allocate(int32_t capacity) {
 367   if(capacity <= US_STACKBUF_SIZE) {
 368     fArray = fStackBuffer;
 369     fCapacity = US_STACKBUF_SIZE;
 370     fFlags = kShortString;
 371   } else {
 372     // count bytes for the refCounter and the string capacity, and
 373     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 374     // to be safely aligned for the refCount
 375     int32_t words = (int32_t)(((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 376     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 377     if(array != 0) {
 378       // set initial refCount and point behind the refCount
 379       *array++ = 1;
 380
 381       // have fArray point to the first UChar
 382       fArray = (UChar *)array;
 383       fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 384       fFlags = kLongString;
 385     } else {
 386       fLength = 0;
 387       fCapacity = 0;
 388       fFlags = kIsBogus;
 389       return FALSE;
 390     }
 391   }
 392   return TRUE;
 393 }
 394
 395 //========================================
 396 // Destructor
 397 //========================================
 398 UnicodeString::~UnicodeString()
 399 {
 400   releaseArray();
 401 }
 402
 403
 404 //========================================
 405 // Assignment
 406 //========================================
 407
 408 UnicodeString &
 409 UnicodeString::operator=(const UnicodeString &src) {
 410   return copyFrom(src);
 411 }
 412
 413 UnicodeString &
 414 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 415   return copyFrom(src, TRUE);
 416 }
 417
 418 UnicodeString &
 419 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 420   // if assigning to ourselves, do nothing
 421   if(this == 0 || this == &src) {
 422     return *this;
 423   }
 424
 425   // is the right side bogus?
 426   if(&src == 0 || src.isBogus()) {
 427     setToBogus();
 428     return *this;
 429   }
 430
 431   // delete the current contents
 432   releaseArray();
 433
 434   // we always copy the length
 435   fLength = src.fLength;
 436   if(fLength == 0) {
 437     // empty string - use the stack buffer
 438     fArray = fStackBuffer;
 439     fCapacity = US_STACKBUF_SIZE;
 440     fFlags = kShortString;
 441     return *this;
 442   }
 443
 444   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 445   switch(src.fFlags) {
 446   case kShortString:
 447     // short string using the stack buffer, do the same
 448     fArray = fStackBuffer;
 449     fCapacity = US_STACKBUF_SIZE;
 450     fFlags = kShortString;
 451     uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR);
 452     break;
 453   case kLongString:
 454     // src uses a refCounted string buffer, use that buffer with refCount
 455     // src is const, use a cast - we don't really change it
 456     ((UnicodeString &)src).addRef();
 457     // copy all fields, share the reference-counted buffer
 458     fArray = src.fArray;
 459     fCapacity = src.fCapacity;
 460     fFlags = src.fFlags;
 461     break;
 462   case kReadonlyAlias:
 463     if(fastCopy) {
 464       // src is a readonly alias, do the same
 465       // -> maintain the readonly alias as such
 466       fArray = src.fArray;
 467       fCapacity = src.fCapacity;
 468       fFlags = src.fFlags;
 469       break;
 470     }
 471     // else if(!fastCopy) fall through to case kWritableAlias
 472     // -> allocate a new buffer and copy the contents
 473   case kWritableAlias:
 474     // src is a writable alias; we make a copy of that instead
 475     if(allocate(fLength)) {
 476       uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR);
 477       break;
 478     }
 479     // if there is not enough memory, then fall through to setting to bogus
 480   default:
 481     // if src is bogus, set ourselves to bogus
 482     // do not call setToBogus() here because fArray and fFlags are not consistent here
 483     fArray = 0;
 484     fLength = 0;
 485     fCapacity = 0;
 486     fFlags = kIsBogus;
 487     break;
 488   }
 489
 490   return *this;
 491 }
 492
 493 //========================================
 494 // Miscellaneous operations
 495 //========================================
 496
 497 UnicodeString UnicodeString::unescape() const {
 498     UnicodeString result;
 499     for (int32_t i=0; i<length(); ) {
 500         UChar32 c = charAt(i++);
 501         if (c == 0x005C /*'\\'*/) {
 502             c = unescapeAt(i); // advances i
 503             if (c == (UChar32)0xFFFFFFFF) {
 504                 result.remove(); // return empty string
 505                 break; // invalid escape sequence
 506             }
 507         }
 508         result.append(c);
 509     }
 510     return result;
 511 }
 512
 513 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 514     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 515 }
 516
 517 //========================================
 518 // Read-only implementation
 519 //========================================
 520 int8_t
 521 UnicodeString::doCompare( int32_t start,
 522               int32_t length,
 523               const UChar *srcChars,
 524               int32_t srcStart,
 525               int32_t srcLength) const
 526 {
 527   // compare illegal string values
 528   // treat const UChar *srcChars==NULL as an empty string
 529   if(isBogus()) {
 530     return -1;
 531   }
 532
 533   // pin indices to legal values
 534   pinIndices(start, length);
 535
 536   if(srcChars == NULL) {
 537     srcStart = srcLength = 0;
 538   }
 539
 540   // get the correct pointer
 541   const UChar *chars = getArrayStart();
 542
 543   chars += start;
 544   srcChars += srcStart;
 545
 546   int32_t minLength;
 547   int8_t lengthResult;
 548
 549   // get the srcLength if necessary
 550   if(srcLength < 0) {
 551     srcLength = u_strlen(srcChars + srcStart);
 552   }
 553
 554   // are we comparing different lengths?
 555   if(length != srcLength) {
 556     if(length < srcLength) {
 557       minLength = length;
 558       lengthResult = -1;
 559     } else {
 560       minLength = srcLength;
 561       lengthResult = 1;
 562     }
 563   } else {
 564     minLength = length;
 565     lengthResult = 0;
 566   }
 567
 568   /*
 569    * note that uprv_memcmp() returns an int but we return an int8_t;
 570    * we need to take care not to truncate the result -
 571    * one way to do this is to right-shift the value to
 572    * move the sign bit into the lower 8 bits and making sure that this
 573    * does not become 0 itself
 574    */
 575
 576   if(minLength > 0 && chars != srcChars) {
 577     int32_t result;
 578
 579 #   if U_IS_BIG_ENDIAN
 580       // big-endian: byte comparison works
 581       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 582       if(result != 0) {
 583         return (int8_t)(result >> 15 | 1);
 584       }
 585 #   else
 586       // little-endian: compare UChar units
 587       do {
 588         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 589         if(result != 0) {
 590           return (int8_t)(result >> 15 | 1);
 591         }
 592       } while(--minLength > 0);
 593 #   endif
 594   }
 595   return lengthResult;
 596 }
 597
 598 /* String compare in code point order - doCompare() compares in code unit order. */
 599 int8_t
 600 UnicodeString::doCompareCodePointOrder(int32_t start,
 601                                        int32_t length,
 602                                        const UChar *srcChars,
 603                                        int32_t srcStart,
 604                                        int32_t srcLength) const
 605 {
 606   // compare illegal string values
 607   // treat const UChar *srcChars==NULL as an empty string
 608   if(isBogus()) {
 609     return -1;
 610   }
 611
 612   // pin indices to legal values
 613   pinIndices(start, length);
 614
 615   if(srcChars == NULL) {
 616     srcStart = srcLength = 0;
 617   }
 618
 619   int32_t diff = uprv_strCompare(fArray + start, length, srcChars + srcStart, srcLength, FALSE, TRUE);
 620   /* translate the 32-bit result into an 8-bit one */
 621   if(diff!=0) {
 622     return (int8_t)(diff >> 15 | 1);
 623   } else {
 624     return 0;
 625   }
 626 }
 627
 628 int32_t
 629 UnicodeString::getLength() const {
 630     return length();
 631 }
 632
 633 UChar
 634 UnicodeString::getCharAt(int32_t offset) const {
 635   return charAt(offset);
 636 }
 637
 638 UChar32
 639 UnicodeString::getChar32At(int32_t offset) const {
 640   return char32At(offset);
 641 }
 642
 643 int32_t
 644 UnicodeString::countChar32(int32_t start, int32_t length) const {
 645   pinIndices(start, length);
 646   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 647   return u_countChar32(fArray+start, length);
 648 }
 649
 650 UBool
 651 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 652   pinIndices(start, length);
 653   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 654   return u_strHasMoreChar32Than(fArray+start, length, number);
 655 }
 656
 657 int32_t
 658 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 659   // pin index
 660   if(index<0) {
 661     index=0;
 662   } else if(index>fLength) {
 663     index=fLength;
 664   }
 665
 666   if(delta>0) {
 667     UTF_FWD_N(fArray, index, fLength, delta);
 668   } else {
 669     UTF_BACK_N(fArray, 0, index, -delta);
 670   }
 671
 672   return index;
 673 }
 674
 675 void
 676 UnicodeString::doExtract(int32_t start,
 677              int32_t length,
 678              UChar *dst,
 679              int32_t dstStart) const
 680 {
 681   // pin indices to legal values
 682   pinIndices(start, length);
 683
 684   // do not copy anything if we alias dst itself
 685   if(fArray + start != dst + dstStart) {
 686     us_arrayCopy(getArrayStart(), start, dst, dstStart, length);
 687   }
 688 }
 689
 690 int32_t
 691 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 692                        UErrorCode &errorCode) const {
 693   if(U_SUCCESS(errorCode)) {
 694     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 695       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 696     } else {
 697       if(fLength>0 && fLength<=destCapacity && fArray!=dest) {
 698         uprv_memcpy(dest, fArray, fLength*U_SIZEOF_UCHAR);
 699       }
 700       return u_terminateUChars(dest, destCapacity, fLength, &errorCode);
 701     }
 702   }
 703
 704   return fLength;
 705 }
 706
 707 int32_t
 708 UnicodeString::extract(int32_t start,
 709                        int32_t length,
 710                        char *target,
 711                        int32_t targetCapacity,
 712                        enum EInvariant) const
 713 {
 714   // if the arguments are illegal, then do nothing
 715   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 716     return 0;
 717   }
 718
 719   // pin the indices to legal values
 720   pinIndices(start, length);
 721
 722   if(length <= targetCapacity) {
 723     u_UCharsToChars(getArrayStart() + start, target, length);
 724   }
 725   UErrorCode status = U_ZERO_ERROR;
 726   return u_terminateChars(target, targetCapacity, length, &status);
 727 }
 728
 729 void
 730 UnicodeString::extractBetween(int32_t start,
 731                   int32_t limit,
 732                   UnicodeString& target) const {
 733   pinIndex(start);
 734   pinIndex(limit);
 735   doExtract(start, limit - start, target);
 736 }
 737
 738 int32_t
 739 UnicodeString::indexOf(const UChar *srcChars,
 740                int32_t srcStart,
 741                int32_t srcLength,
 742                int32_t start,
 743                int32_t length) const
 744 {
 745   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 746     return -1;
 747   }
 748
 749   // UnicodeString does not find empty substrings
 750   if(srcLength < 0 && srcChars[srcStart] == 0) {
 751     return -1;
 752   }
 753
 754   // get the indices within bounds
 755   pinIndices(start, length);
 756
 757   // find the first occurrence of the substring
 758   const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength);
 759   if(match == NULL) {
 760     return -1;
 761   } else {
 762     return match - fArray;
 763   }
 764 }
 765
 766 int32_t
 767 UnicodeString::doIndexOf(UChar c,
 768              int32_t start,
 769              int32_t length) const
 770 {
 771   // pin indices
 772   pinIndices(start, length);
 773
 774   // find the first occurrence of c
 775   const UChar *match = u_memchr(fArray + start, c, length);
 776   if(match == NULL) {
 777     return -1;
 778   } else {
 779     return match - fArray;
 780   }
 781 }
 782
 783 int32_t
 784 UnicodeString::doIndexOf(UChar32 c,
 785                          int32_t start,
 786                          int32_t length) const {
 787   // pin indices
 788   pinIndices(start, length);
 789
 790   // find the first occurrence of c
 791   const UChar *match = u_memchr32(fArray + start, c, length);
 792   if(match == NULL) {
 793     return -1;
 794   } else {
 795     return match - fArray;
 796   }
 797 }
 798
 799 int32_t
 800 UnicodeString::lastIndexOf(const UChar *srcChars,
 801                int32_t srcStart,
 802                int32_t srcLength,
 803                int32_t start,
 804                int32_t length) const
 805 {
 806   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 807     return -1;
 808   }
 809
 810   // UnicodeString does not find empty substrings
 811   if(srcLength < 0 && srcChars[srcStart] == 0) {
 812     return -1;
 813   }
 814
 815   // get the indices within bounds
 816   pinIndices(start, length);
 817
 818   // find the last occurrence of the substring
 819   const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength);
 820   if(match == NULL) {
 821     return -1;
 822   } else {
 823     return match - fArray;
 824   }
 825 }
 826
 827 int32_t
 828 UnicodeString::doLastIndexOf(UChar c,
 829                  int32_t start,
 830                  int32_t length) const
 831 {
 832   if(isBogus()) {
 833     return -1;
 834   }
 835
 836   // pin indices
 837   pinIndices(start, length);
 838
 839   // find the last occurrence of c
 840   const UChar *match = u_memrchr(fArray + start, c, length);
 841   if(match == NULL) {
 842     return -1;
 843   } else {
 844     return match - fArray;
 845   }
 846 }
 847
 848 int32_t
 849 UnicodeString::doLastIndexOf(UChar32 c,
 850                              int32_t start,
 851                              int32_t length) const {
 852   // pin indices
 853   pinIndices(start, length);
 854
 855   // find the last occurrence of c
 856   const UChar *match = u_memrchr32(fArray + start, c, length);
 857   if(match == NULL) {
 858     return -1;
 859   } else {
 860     return match - fArray;
 861   }
 862 }
 863
 864 //========================================
 865 // Write implementation
 866 //========================================
 867
 868 UnicodeString&
 869 UnicodeString::findAndReplace(int32_t start,
 870                   int32_t length,
 871                   const UnicodeString& oldText,
 872                   int32_t oldStart,
 873                   int32_t oldLength,
 874                   const UnicodeString& newText,
 875                   int32_t newStart,
 876                   int32_t newLength)
 877 {
 878   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
 879     return *this;
 880   }
 881
 882   pinIndices(start, length);
 883   oldText.pinIndices(oldStart, oldLength);
 884   newText.pinIndices(newStart, newLength);
 885
 886   if(oldLength == 0) {
 887     return *this;
 888   }
 889
 890   while(length > 0 && length >= oldLength) {
 891     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
 892     if(pos < 0) {
 893       // no more oldText's here: done
 894       break;
 895     } else {
 896       // we found oldText, replace it by newText and go beyond it
 897       replace(pos, oldLength, newText, newStart, newLength);
 898       length -= pos + oldLength - start;
 899       start = pos + newLength;
 900     }
 901   }
 902
 903   return *this;
 904 }
 905
 906
 907 void
 908 UnicodeString::setToBogus()
 909 {
 910   releaseArray();
 911
 912   fArray = 0;
 913   fCapacity = fLength = 0;
 914   fFlags = kIsBogus;
 915 }
 916
 917 // turn a bogus string into an empty one
 918 void
 919 UnicodeString::unBogus() {
 920   if(fFlags & kIsBogus) {
 921     fArray = fStackBuffer;
 922     fLength = 0;
 923     fCapacity = US_STACKBUF_SIZE;
 924     fFlags = kShortString;
 925   }
 926 }
 927
 928 // setTo() analogous to the readonly-aliasing constructor with the same signature
 929 UnicodeString &
 930 UnicodeString::setTo(UBool isTerminated,
 931                      const UChar *text,
 932                      int32_t textLength)
 933 {
 934   if(fFlags & kOpenGetBuffer) {
 935     // do not modify a string that has an "open" getBuffer(minCapacity)
 936     return *this;
 937   }
 938
 939   if(text == NULL) {
 940     // treat as an empty string, do not alias
 941     releaseArray();
 942     fLength = 0;
 943     fCapacity = US_STACKBUF_SIZE;
 944     fArray = fStackBuffer;
 945     fFlags = kShortString;
 946     return *this;
 947   }
 948
 949   if( textLength < -1 ||
 950       (textLength == -1 && !isTerminated) ||
 951       (textLength >= 0 && isTerminated && text[textLength] != 0)
 952   ) {
 953     setToBogus();
 954     return *this;
 955   }
 956
 957   releaseArray();
 958
 959   fArray = (UChar *)text;
 960   if(textLength != -1) {
 961     fLength = textLength;
 962     fCapacity = isTerminated ? fLength + 1 : fLength;
 963   } else {
 964     // text is terminated, or else it would have failed the above test
 965     fLength = u_strlen(text);
 966     fCapacity = fLength + 1;
 967   }
 968
 969   fFlags = kReadonlyAlias;
 970   return *this;
 971 }
 972
 973 // setTo() analogous to the writable-aliasing constructor with the same signature
 974 UnicodeString &
 975 UnicodeString::setTo(UChar *buffer,
 976                      int32_t buffLength,
 977                      int32_t buffCapacity) {
 978   if(fFlags & kOpenGetBuffer) {
 979     // do not modify a string that has an "open" getBuffer(minCapacity)
 980     return *this;
 981   }
 982
 983   if(buffer == NULL) {
 984     // treat as an empty string, do not alias
 985     releaseArray();
 986     fLength = 0;
 987     fCapacity = US_STACKBUF_SIZE;
 988     fArray = fStackBuffer;
 989     fFlags = kShortString;
 990     return *this;
 991   }
 992
 993   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 994     setToBogus();
 995     return *this;
 996   } else if(buffLength == -1) {
 997     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
 998     const UChar *p = buffer, *limit = buffer + buffCapacity;
 999     while(p != limit && *p != 0) {
1000       ++p;
1001     }
1002     buffLength = (int32_t)(p - buffer);
1003   }
1004
1005   releaseArray();
1006
1007   fArray = buffer;
1008   fLength = buffLength;
1009   fCapacity = buffCapacity;
1010   fFlags = kWritableAlias;
1011   return *this;
1012 }
1013
1014 UnicodeString&
1015 UnicodeString::setCharAt(int32_t offset,
1016              UChar c)
1017 {
1018   if(cloneArrayIfNeeded() && fLength > 0) {
1019     if(offset < 0) {
1020       offset = 0;
1021     } else if(offset >= fLength) {
1022       offset = fLength - 1;
1023     }
1024
1025     fArray[offset] = c;
1026   }
1027   return *this;
1028 }
1029
1030 UnicodeString&
1031 UnicodeString::doReplace( int32_t start,
1032               int32_t length,
1033               const UnicodeString& src,
1034               int32_t srcStart,
1035               int32_t srcLength)
1036 {
1037   if(!src.isBogus()) {
1038     // pin the indices to legal values
1039     src.pinIndices(srcStart, srcLength);
1040
1041     // get the characters from src
1042     // and replace the range in ourselves with them
1043     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1044   } else {
1045     // remove the range
1046     return doReplace(start, length, 0, 0, 0);
1047   }
1048 }
1049
1050 UnicodeString&
1051 UnicodeString::doReplace(int32_t start,
1052              int32_t length,
1053              const UChar *srcChars,
1054              int32_t srcStart,
1055              int32_t srcLength)
1056 {
1057   if(isBogus()) {
1058     return *this;
1059   }
1060
1061   if(srcChars == 0) {
1062     srcStart = srcLength = 0;
1063   } else if(srcLength < 0) {
1064     // get the srcLength if necessary
1065     srcLength = u_strlen(srcChars + srcStart);
1066   }
1067
1068   int32_t *bufferToDelete = 0;
1069
1070   // the following may change fArray but will not copy the current contents;
1071   // therefore we need to keep the current fArray
1072   UChar *oldArray = fArray;
1073   int32_t oldLength = fLength;
1074
1075   // pin the indices to legal values
1076   pinIndices(start, length);
1077
1078   // calculate the size of the string after the replace
1079   int32_t newSize = oldLength - length + srcLength;
1080
1081   // clone our array and allocate a bigger array if needed
1082   if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize,
1083                          FALSE, &bufferToDelete)
1084   ) {
1085     return *this;
1086   }
1087
1088   // now do the replace
1089
1090   if(fArray != oldArray) {
1091     // if fArray changed, then we need to copy everything except what will change
1092     us_arrayCopy(oldArray, 0, fArray, 0, start);
1093     us_arrayCopy(oldArray, start + length,
1094                  fArray, start + srcLength,
1095                  oldLength - (start + length));
1096   } else if(length != srcLength) {
1097     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1098     us_arrayCopy(oldArray, start + length,
1099                  fArray, start + srcLength,
1100                  oldLength - (start + length));
1101   }
1102
1103   // now fill in the hole with the new string
1104   us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength);
1105
1106   fLength = newSize;
1107
1108   // delayed delete in case srcChars == fArray when we started, and
1109   // to keep oldArray alive for the above operations
1110   if (bufferToDelete) {
1111     uprv_free(bufferToDelete);
1112   }
1113
1114   return *this;
1115 }
1116
1117 /**
1118  * Replaceable API
1119  */
1120 void
1121 UnicodeString::handleReplaceBetween(int32_t start,
1122                                     int32_t limit,
1123                                     const UnicodeString& text) {
1124     replaceBetween(start, limit, text);
1125 }
1126
1127 /**
1128  * Replaceable API
1129  */
1130 void
1131 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1132     if (limit <= start) {
1133         return; // Nothing to do; avoid bogus malloc call
1134     }
1135     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1136     extractBetween(start, limit, text, 0);
1137     insert(dest, text, 0, limit - start);
1138     uprv_free(text);
1139 }
1140
1141 /**
1142  * Replaceable API
1143  *
1144  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1145  * so we implement this function here.
1146  */
1147 UBool Replaceable::hasMetaData() const {
1148     return TRUE;
1149 }
1150
1151 /**
1152  * Replaceable API
1153  */
1154 UBool UnicodeString::hasMetaData() const {
1155     return FALSE;
1156 }
1157
1158 UnicodeString&
1159 UnicodeString::doReverse(int32_t start,
1160              int32_t length)
1161 {
1162   if(fLength <= 1 || !cloneArrayIfNeeded()) {
1163     return *this;
1164   }
1165
1166   // pin the indices to legal values
1167   pinIndices(start, length);
1168
1169   UChar *left = getArrayStart() + start;
1170   UChar *right = getArrayStart() + start + length;
1171   UChar swap;
1172   UBool hasSupplementary = FALSE;
1173
1174   while(left < --right) {
1175     hasSupplementary |= (UBool)UTF_IS_LEAD(swap = *left);
1176     hasSupplementary |= (UBool)UTF_IS_LEAD(*left++ = *right);
1177     *right = swap;
1178   }
1179
1180   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1181   if(hasSupplementary) {
1182     UChar swap2;
1183
1184     left = getArrayStart() + start;
1185     right = getArrayStart() + start + length - 1; // -1 so that we can look at *(left+1) if left<right
1186     while(left < right) {
1187       if(UTF_IS_TRAIL(swap = *left) && UTF_IS_LEAD(swap2 = *(left + 1))) {
1188         *left++ = swap2;
1189         *left++ = swap;
1190       } else {
1191         ++left;
1192       }
1193     }
1194   }
1195
1196   return *this;
1197 }
1198
1199 UBool
1200 UnicodeString::padLeading(int32_t targetLength,
1201                           UChar padChar)
1202 {
1203   if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1204     return FALSE;
1205   } else {
1206     // move contents up by padding width
1207     int32_t start = targetLength - fLength;
1208     us_arrayCopy(fArray, 0, fArray, start, fLength);
1209
1210     // fill in padding character
1211     while(--start >= 0) {
1212       fArray[start] = padChar;
1213     }
1214     fLength = targetLength;
1215     return TRUE;
1216   }
1217 }
1218
1219 UBool
1220 UnicodeString::padTrailing(int32_t targetLength,
1221                            UChar padChar)
1222 {
1223   if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1224     return FALSE;
1225   } else {
1226     // fill in padding character
1227     int32_t length = targetLength;
1228     while(--length >= fLength) {
1229       fArray[length] = padChar;
1230     }
1231     fLength = targetLength;
1232     return TRUE;
1233   }
1234 }
1235
1236 //========================================
1237 // Hashing
1238 //========================================
1239 int32_t
1240 UnicodeString::doHashCode() const
1241 {
1242     /* Delegate hash computation to uhash.  This makes UnicodeString
1243      * hashing consistent with UChar* hashing.  */
1244     int32_t hashCode = uhash_hashUCharsN(getArrayStart(), fLength);
1245     if (hashCode == kInvalidHashCode) {
1246         hashCode = kEmptyHashCode;
1247     }
1248     return hashCode;
1249 }
1250
1251 //========================================
1252 // External Buffer
1253 //========================================
1254
1255 UChar *
1256 UnicodeString::getBuffer(int32_t minCapacity) {
1257   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1258     fFlags|=kOpenGetBuffer;
1259     fLength=0;
1260     return fArray;
1261   } else {
1262     return 0;
1263   }
1264 }
1265
1266 void
1267 UnicodeString::releaseBuffer(int32_t newLength) {
1268   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1269     // set the new fLength
1270     if(newLength==-1) {
1271       // the new length is the string length, capped by fCapacity
1272       const UChar *p=fArray, *limit=fArray+fCapacity;
1273       while(p<limit && *p!=0) {
1274         ++p;
1275       }
1276       fLength=(int32_t)(p-fArray);
1277     } else if(newLength<=fCapacity) {
1278       fLength=newLength;
1279     } else {
1280       fLength=fCapacity;
1281     }
1282     fFlags&=~kOpenGetBuffer;
1283   }
1284 }
1285
1286 //========================================
1287 // Miscellaneous
1288 //========================================
1289 UBool
1290 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1291                                   int32_t growCapacity,
1292                                   UBool doCopyArray,
1293                                   int32_t **pBufferToDelete,
1294                                   UBool forceClone) {
1295   // default parameters need to be static, therefore
1296   // the defaults are -1 to have convenience defaults
1297   if(newCapacity == -1) {
1298     newCapacity = fCapacity;
1299   }
1300
1301   // while a getBuffer(minCapacity) is "open",
1302   // prevent any modifications of the string by returning FALSE here
1303   // if the string is bogus, then only an assignment or similar can revive it
1304   if((fFlags&(kOpenGetBuffer|kIsBogus))!=0) {
1305     return FALSE;
1306   }
1307
1308   /*
1309    * We need to make a copy of the array if
1310    * the buffer is read-only, or
1311    * the buffer is refCounted (shared), and refCount>1, or
1312    * the buffer is too small.
1313    * Return FALSE if memory could not be allocated.
1314    */
1315   if(forceClone ||
1316      fFlags & kBufferIsReadonly ||
1317      fFlags & kRefCounted && refCount() > 1 ||
1318      newCapacity > fCapacity
1319   ) {
1320     // save old values
1321     UChar *array = fArray;
1322     uint16_t flags = fFlags;
1323
1324     // check growCapacity for default value and use of the stack buffer
1325     if(growCapacity == -1) {
1326       growCapacity = newCapacity;
1327     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1328       growCapacity = US_STACKBUF_SIZE;
1329     }
1330
1331     // allocate a new array
1332     if(allocate(growCapacity) ||
1333        newCapacity < growCapacity && allocate(newCapacity)
1334     ) {
1335       if(doCopyArray) {
1336         // copy the contents
1337         // do not copy more than what fits - it may be smaller than before
1338         if(fCapacity < fLength) {
1339           fLength = fCapacity;
1340         }
1341         us_arrayCopy(array, 0, fArray, 0, fLength);
1342       } else {
1343         fLength = 0;
1344       }
1345
1346       // release the old array
1347       if(flags & kRefCounted) {
1348         // the array is refCounted; decrement and release if 0
1349         int32_t *pRefCount = ((int32_t *)array - 1);
1350         if(umtx_atomic_dec(pRefCount) == 0) {
1351           if(pBufferToDelete == 0) {
1352             uprv_free(pRefCount);
1353           } else {
1354             // the caller requested to delete it himself
1355             *pBufferToDelete = pRefCount;
1356           }
1357         }
1358       }
1359     } else {
1360       // not enough memory for growCapacity and not even for the smaller newCapacity
1361       // reset the old values for setToBogus() to release the array
1362       fArray = array;
1363       fFlags = flags;
1364       setToBogus();
1365       return FALSE;
1366     }
1367   }
1368   return TRUE;
1369 }
1370 U_NAMESPACE_END