icuSources/common/unistr.cpp

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 1999-2008, International Business Machines Corporation and   *
   4 * others. All Rights Reserved.                                               *
   5 ******************************************************************************
   6 *
   7 * File unistr.cpp
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   09/25/98    stephen     Creation.
  13 *   04/20/99    stephen     Overhauled per 4/16 code review.
  14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  16 *                           Replaceable.
  17 *   06/25/01    grhoten     Removed the dependency on iostream
  18 ******************************************************************************
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/putil.h"
  23 #include "cstring.h"
  24 #include "cmemory.h"
  25 #include "unicode/ustring.h"
  26 #include "unicode/unistr.h"
  27 #include "uhash.h"
  28 #include "ustr_imp.h"
  29 #include "umutex.h"
  30
  31 #if 0
  32
  33 #if U_IOSTREAM_SOURCE >= 199711
  34 #include <iostream>
  35 using namespace std;
  36 #elif U_IOSTREAM_SOURCE >= 198506
  37 #include <iostream.h>
  38 #endif
  39
  40 //DEBUGGING
  41 void
  42 print(const UnicodeString& s,
  43       const char *name)
  44 {
  45   UChar c;
  46   cout << name << ":|";
  47   for(int i = 0; i < s.length(); ++i) {
  48     c = s[i];
  49     if(c>= 0x007E || c < 0x0020)
  50       cout << "[0x" << hex << s[i] << "]";
  51     else
  52       cout << (char) s[i];
  53   }
  54   cout << '|' << endl;
  55 }
  56
  57 void
  58 print(const UChar *s,
  59       int32_t len,
  60       const char *name)
  61 {
  62   UChar c;
  63   cout << name << ":|";
  64   for(int i = 0; i < len; ++i) {
  65     c = s[i];
  66     if(c>= 0x007E || c < 0x0020)
  67       cout << "[0x" << hex << s[i] << "]";
  68     else
  69       cout << (char) s[i];
  70   }
  71   cout << '|' << endl;
  72 }
  73 // END DEBUGGING
  74 #endif
  75
  76 // Local function definitions for now
  77
  78 // need to copy areas that may overlap
  79 static
  80 inline void
  81 us_arrayCopy(const UChar *src, int32_t srcStart,
  82          UChar *dst, int32_t dstStart, int32_t count)
  83 {
  84   if(count>0) {
  85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
  86   }
  87 }
  88
  89 // u_unescapeAt() callback to get a UChar from a UnicodeString
  90 U_CDECL_BEGIN
  91 static UChar U_CALLCONV
  92 UnicodeString_charAt(int32_t offset, void *context) {
  93     return ((U_NAMESPACE_QUALIFIER UnicodeString*) context)->charAt(offset);
  94 }
  95 U_CDECL_END
  96
  97 U_NAMESPACE_BEGIN
  98
  99 /* The Replaceable virtual destructor can't be defined in the header
 100    due to how AIX works with multiple definitions of virtual functions.
 101 */
 102 Replaceable::~Replaceable() {}
 103 Replaceable::Replaceable() {}
 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 105
 106 UnicodeString U_EXPORT2
 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 108     return
 109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 110             append(s1).
 111                 append(s2);
 112 }
 113
 114 //========================================
 115 // Reference Counting functions, put at top of file so that optimizing compilers
 116 //                               have a chance to automatically inline.
 117 //========================================
 118
 119 void
 120 UnicodeString::addRef()
 121 {  umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);}
 122
 123 int32_t
 124 UnicodeString::removeRef()
 125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);}
 126
 127 int32_t
 128 UnicodeString::refCount() const
 129 {
 130     umtx_lock(NULL);
 131     // Note: without the lock to force a memory barrier, we might see a very
 132     //       stale value on some multi-processor systems.
 133     int32_t  count = *((int32_t *)fUnion.fFields.fArray - 1);
 134     umtx_unlock(NULL);
 135     return count;
 136  }
 137
 138 void
 139 UnicodeString::releaseArray() {
 140   if((fFlags & kRefCounted) && removeRef() == 0) {
 141     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 142   }
 143 }
 144
 145
 146
 147 //========================================
 148 // Constructors
 149 //========================================
 150 UnicodeString::UnicodeString()
 151   : fShortLength(0),
 152     fFlags(kShortString)
 153 {}
 154
 155 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
 156   : fShortLength(0),
 157     fFlags(0)
 158 {
 159   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 160     // just allocate and do not do anything else
 161     allocate(capacity);
 162   } else {
 163     // count > 0, allocate and fill the new string with count c's
 164     int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount;
 165     if(capacity < length) {
 166       capacity = length;
 167     }
 168     if(allocate(capacity)) {
 169       UChar *array = getArrayStart();
 170       int32_t i = 0;
 171
 172       // fill the new string with c
 173       if(unitCount == 1) {
 174         // fill with length UChars
 175         while(i < length) {
 176           array[i++] = (UChar)c;
 177         }
 178       } else {
 179         // get the code units for c
 180         UChar units[UTF_MAX_CHAR_LENGTH];
 181         UTF_APPEND_CHAR_UNSAFE(units, i, c);
 182
 183         // now it must be i==unitCount
 184         i = 0;
 185
 186         // for Unicode, unitCount can only be 1, 2, 3, or 4
 187         // 1 is handled above
 188         while(i < length) {
 189           int32_t unitIdx = 0;
 190           while(unitIdx < unitCount) {
 191             array[i++]=units[unitIdx++];
 192           }
 193         }
 194       }
 195     }
 196     setLength(length);
 197   }
 198 }
 199
 200 UnicodeString::UnicodeString(UChar ch)
 201   : fShortLength(1),
 202     fFlags(kShortString)
 203 {
 204   fUnion.fStackBuffer[0] = ch;
 205 }
 206
 207 UnicodeString::UnicodeString(UChar32 ch)
 208   : fShortLength(0),
 209     fFlags(kShortString)
 210 {
 211   int32_t i = 0;
 212   UBool isError = FALSE;
 213   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
 214   fShortLength = (int8_t)i;
 215 }
 216
 217 UnicodeString::UnicodeString(const UChar *text)
 218   : fShortLength(0),
 219     fFlags(kShortString)
 220 {
 221   doReplace(0, 0, text, 0, -1);
 222 }
 223
 224 UnicodeString::UnicodeString(const UChar *text,
 225                              int32_t textLength)
 226   : fShortLength(0),
 227     fFlags(kShortString)
 228 {
 229   doReplace(0, 0, text, 0, textLength);
 230 }
 231
 232 UnicodeString::UnicodeString(UBool isTerminated,
 233                              const UChar *text,
 234                              int32_t textLength)
 235   : fShortLength(0),
 236     fFlags(kReadonlyAlias)
 237 {
 238   if(text == NULL) {
 239     // treat as an empty string, do not alias
 240     setToEmpty();
 241   } else if(textLength < -1 ||
 242             (textLength == -1 && !isTerminated) ||
 243             (textLength >= 0 && isTerminated && text[textLength] != 0)
 244   ) {
 245     setToBogus();
 246   } else {
 247     if(textLength == -1) {
 248       // text is terminated, or else it would have failed the above test
 249       textLength = u_strlen(text);
 250     }
 251     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 252   }
 253 }
 254
 255 UnicodeString::UnicodeString(UChar *buff,
 256                              int32_t buffLength,
 257                              int32_t buffCapacity)
 258   : fShortLength(0),
 259     fFlags(kWritableAlias)
 260 {
 261   if(buff == NULL) {
 262     // treat as an empty string, do not alias
 263     setToEmpty();
 264   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 265     setToBogus();
 266   } else {
 267     if(buffLength == -1) {
 268       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 269       const UChar *p = buff, *limit = buff + buffCapacity;
 270       while(p != limit && *p != 0) {
 271         ++p;
 272       }
 273       buffLength = (int32_t)(p - buff);
 274     }
 275     setArray(buff, buffLength, buffCapacity);
 276   }
 277 }
 278
 279 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
 280   : fShortLength(0),
 281     fFlags(kShortString)
 282 {
 283   if(src==NULL) {
 284     // treat as an empty string
 285   } else {
 286     if(length<0) {
 287       length=(int32_t)uprv_strlen(src);
 288     }
 289     if(cloneArrayIfNeeded(length, length, FALSE)) {
 290       u_charsToUChars(src, getArrayStart(), length);
 291       setLength(length);
 292     } else {
 293       setToBogus();
 294     }
 295   }
 296 }
 297
 298 UnicodeString::UnicodeString(const UnicodeString& that)
 299   : Replaceable(),
 300     fShortLength(0),
 301     fFlags(kShortString)
 302 {
 303   copyFrom(that);
 304 }
 305
 306 UnicodeString::UnicodeString(const UnicodeString& that,
 307                              int32_t srcStart)
 308   : Replaceable(),
 309     fShortLength(0),
 310     fFlags(kShortString)
 311 {
 312   setTo(that, srcStart);
 313 }
 314
 315 UnicodeString::UnicodeString(const UnicodeString& that,
 316                              int32_t srcStart,
 317                              int32_t srcLength)
 318   : Replaceable(),
 319     fShortLength(0),
 320     fFlags(kShortString)
 321 {
 322   setTo(that, srcStart, srcLength);
 323 }
 324
 325 // Replaceable base class clone() default implementation, does not clone
 326 Replaceable *
 327 Replaceable::clone() const {
 328   return NULL;
 329 }
 330
 331 // UnicodeString overrides clone() with a real implementation
 332 Replaceable *
 333 UnicodeString::clone() const {
 334   return new UnicodeString(*this);
 335 }
 336
 337 //========================================
 338 // array allocation
 339 //========================================
 340
 341 UBool
 342 UnicodeString::allocate(int32_t capacity) {
 343   if(capacity <= US_STACKBUF_SIZE) {
 344     fFlags = kShortString;
 345   } else {
 346     // count bytes for the refCounter and the string capacity, and
 347     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
 348     // to be safely aligned for the refCount
 349     int32_t words = (int32_t)(((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
 350     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
 351     if(array != 0) {
 352       // set initial refCount and point behind the refCount
 353       *array++ = 1;
 354
 355       // have fArray point to the first UChar
 356       fUnion.fFields.fArray = (UChar *)array;
 357       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
 358       fFlags = kLongString;
 359     } else {
 360       fShortLength = 0;
 361       fUnion.fFields.fArray = 0;
 362       fUnion.fFields.fCapacity = 0;
 363       fFlags = kIsBogus;
 364       return FALSE;
 365     }
 366   }
 367   return TRUE;
 368 }
 369
 370 //========================================
 371 // Destructor
 372 //========================================
 373 UnicodeString::~UnicodeString()
 374 {
 375   releaseArray();
 376 }
 377
 378
 379 //========================================
 380 // Assignment
 381 //========================================
 382
 383 UnicodeString &
 384 UnicodeString::operator=(const UnicodeString &src) {
 385   return copyFrom(src);
 386 }
 387
 388 UnicodeString &
 389 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 390   return copyFrom(src, TRUE);
 391 }
 392
 393 UnicodeString &
 394 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 395   // if assigning to ourselves, do nothing
 396   if(this == 0 || this == &src) {
 397     return *this;
 398   }
 399
 400   // is the right side bogus?
 401   if(&src == 0 || src.isBogus()) {
 402     setToBogus();
 403     return *this;
 404   }
 405
 406   // delete the current contents
 407   releaseArray();
 408
 409   if(src.isEmpty()) {
 410     // empty string - use the stack buffer
 411     setToEmpty();
 412     return *this;
 413   }
 414
 415   // we always copy the length
 416   int32_t srcLength = src.length();
 417   setLength(srcLength);
 418
 419   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 420   switch(src.fFlags) {
 421   case kShortString:
 422     // short string using the stack buffer, do the same
 423     fFlags = kShortString;
 424     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, fShortLength * U_SIZEOF_UCHAR);
 425     break;
 426   case kLongString:
 427     // src uses a refCounted string buffer, use that buffer with refCount
 428     // src is const, use a cast - we don't really change it
 429     ((UnicodeString &)src).addRef();
 430     // copy all fields, share the reference-counted buffer
 431     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 432     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 433     fFlags = src.fFlags;
 434     break;
 435   case kReadonlyAlias:
 436     if(fastCopy) {
 437       // src is a readonly alias, do the same
 438       // -> maintain the readonly alias as such
 439       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 440       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 441       fFlags = src.fFlags;
 442       break;
 443     }
 444     // else if(!fastCopy) fall through to case kWritableAlias
 445     // -> allocate a new buffer and copy the contents
 446   case kWritableAlias:
 447     // src is a writable alias; we make a copy of that instead
 448     if(allocate(srcLength)) {
 449       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
 450       break;
 451     }
 452     // if there is not enough memory, then fall through to setting to bogus
 453   default:
 454     // if src is bogus, set ourselves to bogus
 455     // do not call setToBogus() here because fArray and fFlags are not consistent here
 456     fShortLength = 0;
 457     fUnion.fFields.fArray = 0;
 458     fUnion.fFields.fCapacity = 0;
 459     fFlags = kIsBogus;
 460     break;
 461   }
 462
 463   return *this;
 464 }
 465
 466 //========================================
 467 // Miscellaneous operations
 468 //========================================
 469
 470 UnicodeString UnicodeString::unescape() const {
 471     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 472     const UChar *array = getBuffer();
 473     int32_t len = length();
 474     int32_t prev = 0;
 475     for (int32_t i=0;;) {
 476         if (i == len) {
 477             result.append(array, prev, len - prev);
 478             break;
 479         }
 480         if (array[i++] == 0x5C /*'\\'*/) {
 481             result.append(array, prev, (i - 1) - prev);
 482             UChar32 c = unescapeAt(i); // advances i
 483             if (c < 0) {
 484                 result.remove(); // return empty string
 485                 break; // invalid escape sequence
 486             }
 487             result.append(c);
 488             prev = i;
 489         }
 490     }
 491     return result;
 492 }
 493
 494 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 495     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 496 }
 497
 498 //========================================
 499 // Read-only implementation
 500 //========================================
 501 int8_t
 502 UnicodeString::doCompare( int32_t start,
 503               int32_t length,
 504               const UChar *srcChars,
 505               int32_t srcStart,
 506               int32_t srcLength) const
 507 {
 508   // compare illegal string values
 509   // treat const UChar *srcChars==NULL as an empty string
 510   if(isBogus()) {
 511     return -1;
 512   }
 513
 514   // pin indices to legal values
 515   pinIndices(start, length);
 516
 517   if(srcChars == NULL) {
 518     srcStart = srcLength = 0;
 519   }
 520
 521   // get the correct pointer
 522   const UChar *chars = getArrayStart();
 523
 524   chars += start;
 525   srcChars += srcStart;
 526
 527   int32_t minLength;
 528   int8_t lengthResult;
 529
 530   // get the srcLength if necessary
 531   if(srcLength < 0) {
 532     srcLength = u_strlen(srcChars + srcStart);
 533   }
 534
 535   // are we comparing different lengths?
 536   if(length != srcLength) {
 537     if(length < srcLength) {
 538       minLength = length;
 539       lengthResult = -1;
 540     } else {
 541       minLength = srcLength;
 542       lengthResult = 1;
 543     }
 544   } else {
 545     minLength = length;
 546     lengthResult = 0;
 547   }
 548
 549   /*
 550    * note that uprv_memcmp() returns an int but we return an int8_t;
 551    * we need to take care not to truncate the result -
 552    * one way to do this is to right-shift the value to
 553    * move the sign bit into the lower 8 bits and making sure that this
 554    * does not become 0 itself
 555    */
 556
 557   if(minLength > 0 && chars != srcChars) {
 558     int32_t result;
 559
 560 #   if U_IS_BIG_ENDIAN
 561       // big-endian: byte comparison works
 562       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 563       if(result != 0) {
 564         return (int8_t)(result >> 15 | 1);
 565       }
 566 #   else
 567       // little-endian: compare UChar units
 568       do {
 569         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 570         if(result != 0) {
 571           return (int8_t)(result >> 15 | 1);
 572         }
 573       } while(--minLength > 0);
 574 #   endif
 575   }
 576   return lengthResult;
 577 }
 578
 579 /* String compare in code point order - doCompare() compares in code unit order. */
 580 int8_t
 581 UnicodeString::doCompareCodePointOrder(int32_t start,
 582                                        int32_t length,
 583                                        const UChar *srcChars,
 584                                        int32_t srcStart,
 585                                        int32_t srcLength) const
 586 {
 587   // compare illegal string values
 588   // treat const UChar *srcChars==NULL as an empty string
 589   if(isBogus()) {
 590     return -1;
 591   }
 592
 593   // pin indices to legal values
 594   pinIndices(start, length);
 595
 596   if(srcChars == NULL) {
 597     srcStart = srcLength = 0;
 598   }
 599
 600   int32_t diff = uprv_strCompare(getArrayStart() + start, length, srcChars + srcStart, srcLength, FALSE, TRUE);
 601   /* translate the 32-bit result into an 8-bit one */
 602   if(diff!=0) {
 603     return (int8_t)(diff >> 15 | 1);
 604   } else {
 605     return 0;
 606   }
 607 }
 608
 609 int32_t
 610 UnicodeString::getLength() const {
 611     return length();
 612 }
 613
 614 UChar
 615 UnicodeString::getCharAt(int32_t offset) const {
 616   return charAt(offset);
 617 }
 618
 619 UChar32
 620 UnicodeString::getChar32At(int32_t offset) const {
 621   return char32At(offset);
 622 }
 623
 624 int32_t
 625 UnicodeString::countChar32(int32_t start, int32_t length) const {
 626   pinIndices(start, length);
 627   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 628   return u_countChar32(getArrayStart()+start, length);
 629 }
 630
 631 UBool
 632 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 633   pinIndices(start, length);
 634   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 635   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 636 }
 637
 638 int32_t
 639 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 640   // pin index
 641   int32_t len = length();
 642   if(index<0) {
 643     index=0;
 644   } else if(index>len) {
 645     index=len;
 646   }
 647
 648   const UChar *array = getArrayStart();
 649   if(delta>0) {
 650     UTF_FWD_N(array, index, len, delta);
 651   } else {
 652     UTF_BACK_N(array, 0, index, -delta);
 653   }
 654
 655   return index;
 656 }
 657
 658 void
 659 UnicodeString::doExtract(int32_t start,
 660              int32_t length,
 661              UChar *dst,
 662              int32_t dstStart) const
 663 {
 664   // pin indices to legal values
 665   pinIndices(start, length);
 666
 667   // do not copy anything if we alias dst itself
 668   const UChar *array = getArrayStart();
 669   if(array + start != dst + dstStart) {
 670     us_arrayCopy(array, start, dst, dstStart, length);
 671   }
 672 }
 673
 674 int32_t
 675 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 676                        UErrorCode &errorCode) const {
 677   int32_t len = length();
 678   if(U_SUCCESS(errorCode)) {
 679     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 680       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 681     } else {
 682       const UChar *array = getArrayStart();
 683       if(len>0 && len<=destCapacity && array!=dest) {
 684         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
 685       }
 686       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 687     }
 688   }
 689
 690   return len;
 691 }
 692
 693 int32_t
 694 UnicodeString::extract(int32_t start,
 695                        int32_t length,
 696                        char *target,
 697                        int32_t targetCapacity,
 698                        enum EInvariant) const
 699 {
 700   // if the arguments are illegal, then do nothing
 701   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 702     return 0;
 703   }
 704
 705   // pin the indices to legal values
 706   pinIndices(start, length);
 707
 708   if(length <= targetCapacity) {
 709     u_UCharsToChars(getArrayStart() + start, target, length);
 710   }
 711   UErrorCode status = U_ZERO_ERROR;
 712   return u_terminateChars(target, targetCapacity, length, &status);
 713 }
 714
 715 void
 716 UnicodeString::extractBetween(int32_t start,
 717                   int32_t limit,
 718                   UnicodeString& target) const {
 719   pinIndex(start);
 720   pinIndex(limit);
 721   doExtract(start, limit - start, target);
 722 }
 723
 724 int32_t
 725 UnicodeString::indexOf(const UChar *srcChars,
 726                int32_t srcStart,
 727                int32_t srcLength,
 728                int32_t start,
 729                int32_t length) const
 730 {
 731   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 732     return -1;
 733   }
 734
 735   // UnicodeString does not find empty substrings
 736   if(srcLength < 0 && srcChars[srcStart] == 0) {
 737     return -1;
 738   }
 739
 740   // get the indices within bounds
 741   pinIndices(start, length);
 742
 743   // find the first occurrence of the substring
 744   const UChar *array = getArrayStart();
 745   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
 746   if(match == NULL) {
 747     return -1;
 748   } else {
 749     return (int32_t)(match - array);
 750   }
 751 }
 752
 753 int32_t
 754 UnicodeString::doIndexOf(UChar c,
 755              int32_t start,
 756              int32_t length) const
 757 {
 758   // pin indices
 759   pinIndices(start, length);
 760
 761   // find the first occurrence of c
 762   const UChar *array = getArrayStart();
 763   const UChar *match = u_memchr(array + start, c, length);
 764   if(match == NULL) {
 765     return -1;
 766   } else {
 767     return (int32_t)(match - array);
 768   }
 769 }
 770
 771 int32_t
 772 UnicodeString::doIndexOf(UChar32 c,
 773                          int32_t start,
 774                          int32_t length) const {
 775   // pin indices
 776   pinIndices(start, length);
 777
 778   // find the first occurrence of c
 779   const UChar *array = getArrayStart();
 780   const UChar *match = u_memchr32(array + start, c, length);
 781   if(match == NULL) {
 782     return -1;
 783   } else {
 784     return (int32_t)(match - array);
 785   }
 786 }
 787
 788 int32_t
 789 UnicodeString::lastIndexOf(const UChar *srcChars,
 790                int32_t srcStart,
 791                int32_t srcLength,
 792                int32_t start,
 793                int32_t length) const
 794 {
 795   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
 796     return -1;
 797   }
 798
 799   // UnicodeString does not find empty substrings
 800   if(srcLength < 0 && srcChars[srcStart] == 0) {
 801     return -1;
 802   }
 803
 804   // get the indices within bounds
 805   pinIndices(start, length);
 806
 807   // find the last occurrence of the substring
 808   const UChar *array = getArrayStart();
 809   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
 810   if(match == NULL) {
 811     return -1;
 812   } else {
 813     return (int32_t)(match - array);
 814   }
 815 }
 816
 817 int32_t
 818 UnicodeString::doLastIndexOf(UChar c,
 819                  int32_t start,
 820                  int32_t length) const
 821 {
 822   if(isBogus()) {
 823     return -1;
 824   }
 825
 826   // pin indices
 827   pinIndices(start, length);
 828
 829   // find the last occurrence of c
 830   const UChar *array = getArrayStart();
 831   const UChar *match = u_memrchr(array + start, c, length);
 832   if(match == NULL) {
 833     return -1;
 834   } else {
 835     return (int32_t)(match - array);
 836   }
 837 }
 838
 839 int32_t
 840 UnicodeString::doLastIndexOf(UChar32 c,
 841                              int32_t start,
 842                              int32_t length) const {
 843   // pin indices
 844   pinIndices(start, length);
 845
 846   // find the last occurrence of c
 847   const UChar *array = getArrayStart();
 848   const UChar *match = u_memrchr32(array + start, c, length);
 849   if(match == NULL) {
 850     return -1;
 851   } else {
 852     return (int32_t)(match - array);
 853   }
 854 }
 855
 856 //========================================
 857 // Write implementation
 858 //========================================
 859
 860 UnicodeString&
 861 UnicodeString::findAndReplace(int32_t start,
 862                   int32_t length,
 863                   const UnicodeString& oldText,
 864                   int32_t oldStart,
 865                   int32_t oldLength,
 866                   const UnicodeString& newText,
 867                   int32_t newStart,
 868                   int32_t newLength)
 869 {
 870   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
 871     return *this;
 872   }
 873
 874   pinIndices(start, length);
 875   oldText.pinIndices(oldStart, oldLength);
 876   newText.pinIndices(newStart, newLength);
 877
 878   if(oldLength == 0) {
 879     return *this;
 880   }
 881
 882   while(length > 0 && length >= oldLength) {
 883     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
 884     if(pos < 0) {
 885       // no more oldText's here: done
 886       break;
 887     } else {
 888       // we found oldText, replace it by newText and go beyond it
 889       replace(pos, oldLength, newText, newStart, newLength);
 890       length -= pos + oldLength - start;
 891       start = pos + newLength;
 892     }
 893   }
 894
 895   return *this;
 896 }
 897
 898
 899 void
 900 UnicodeString::setToBogus()
 901 {
 902   releaseArray();
 903
 904   fShortLength = 0;
 905   fUnion.fFields.fArray = 0;
 906   fUnion.fFields.fCapacity = 0;
 907   fFlags = kIsBogus;
 908 }
 909
 910 // turn a bogus string into an empty one
 911 void
 912 UnicodeString::unBogus() {
 913   if(fFlags & kIsBogus) {
 914     setToEmpty();
 915   }
 916 }
 917
 918 // setTo() analogous to the readonly-aliasing constructor with the same signature
 919 UnicodeString &
 920 UnicodeString::setTo(UBool isTerminated,
 921                      const UChar *text,
 922                      int32_t textLength)
 923 {
 924   if(fFlags & kOpenGetBuffer) {
 925     // do not modify a string that has an "open" getBuffer(minCapacity)
 926     return *this;
 927   }
 928
 929   if(text == NULL) {
 930     // treat as an empty string, do not alias
 931     releaseArray();
 932     setToEmpty();
 933     return *this;
 934   }
 935
 936   if( textLength < -1 ||
 937       (textLength == -1 && !isTerminated) ||
 938       (textLength >= 0 && isTerminated && text[textLength] != 0)
 939   ) {
 940     setToBogus();
 941     return *this;
 942   }
 943
 944   releaseArray();
 945
 946   if(textLength == -1) {
 947     // text is terminated, or else it would have failed the above test
 948     textLength = u_strlen(text);
 949   }
 950   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 951
 952   fFlags = kReadonlyAlias;
 953   return *this;
 954 }
 955
 956 // setTo() analogous to the writable-aliasing constructor with the same signature
 957 UnicodeString &
 958 UnicodeString::setTo(UChar *buffer,
 959                      int32_t buffLength,
 960                      int32_t buffCapacity) {
 961   if(fFlags & kOpenGetBuffer) {
 962     // do not modify a string that has an "open" getBuffer(minCapacity)
 963     return *this;
 964   }
 965
 966   if(buffer == NULL) {
 967     // treat as an empty string, do not alias
 968     releaseArray();
 969     setToEmpty();
 970     return *this;
 971   }
 972
 973   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 974     setToBogus();
 975     return *this;
 976   } else if(buffLength == -1) {
 977     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
 978     const UChar *p = buffer, *limit = buffer + buffCapacity;
 979     while(p != limit && *p != 0) {
 980       ++p;
 981     }
 982     buffLength = (int32_t)(p - buffer);
 983   }
 984
 985   releaseArray();
 986
 987   setArray(buffer, buffLength, buffCapacity);
 988   fFlags = kWritableAlias;
 989   return *this;
 990 }
 991
 992 UnicodeString&
 993 UnicodeString::setCharAt(int32_t offset,
 994              UChar c)
 995 {
 996   int32_t len = length();
 997   if(cloneArrayIfNeeded() && len > 0) {
 998     if(offset < 0) {
 999       offset = 0;
1000     } else if(offset >= len) {
1001       offset = len - 1;
1002     }
1003
1004     getArrayStart()[offset] = c;
1005   }
1006   return *this;
1007 }
1008
1009 UnicodeString&
1010 UnicodeString::doReplace( int32_t start,
1011               int32_t length,
1012               const UnicodeString& src,
1013               int32_t srcStart,
1014               int32_t srcLength)
1015 {
1016   if(!src.isBogus()) {
1017     // pin the indices to legal values
1018     src.pinIndices(srcStart, srcLength);
1019
1020     // get the characters from src
1021     // and replace the range in ourselves with them
1022     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1023   } else {
1024     // remove the range
1025     return doReplace(start, length, 0, 0, 0);
1026   }
1027 }
1028
1029 UnicodeString&
1030 UnicodeString::doReplace(int32_t start,
1031              int32_t length,
1032              const UChar *srcChars,
1033              int32_t srcStart,
1034              int32_t srcLength)
1035 {
1036   if(!isWritable()) {
1037     return *this;
1038   }
1039
1040   if(srcChars == 0) {
1041     srcStart = srcLength = 0;
1042   } else if(srcLength < 0) {
1043     // get the srcLength if necessary
1044     srcLength = u_strlen(srcChars + srcStart);
1045   }
1046
1047   int32_t oldLength = this->length();
1048
1049   // calculate the size of the string after the replace
1050   int32_t newSize;
1051
1052   // optimize append() onto a large-enough, owned string
1053   if(start >= oldLength) {
1054     newSize = oldLength + srcLength;
1055     if(newSize <= getCapacity() && isBufferWritable()) {
1056       us_arrayCopy(srcChars, srcStart, getArrayStart(), oldLength, srcLength);
1057       setLength(newSize);
1058       return *this;
1059     } else {
1060       // pin the indices to legal values
1061       start = oldLength;
1062       length = 0;
1063     }
1064   } else {
1065     // pin the indices to legal values
1066     pinIndices(start, length);
1067
1068     newSize = oldLength - length + srcLength;
1069   }
1070
1071   // the following may change fArray but will not copy the current contents;
1072   // therefore we need to keep the current fArray
1073   UChar oldStackBuffer[US_STACKBUF_SIZE];
1074   UChar *oldArray;
1075   if((fFlags&kUsingStackBuffer) && (newSize > US_STACKBUF_SIZE)) {
1076     // copy the stack buffer contents because it will be overwritten with
1077     // fUnion.fFields values
1078     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
1079     oldArray = oldStackBuffer;
1080   } else {
1081     oldArray = getArrayStart();
1082   }
1083
1084   // clone our array and allocate a bigger array if needed
1085   int32_t *bufferToDelete = 0;
1086   if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize,
1087                          FALSE, &bufferToDelete)
1088   ) {
1089     return *this;
1090   }
1091
1092   // now do the replace
1093
1094   UChar *newArray = getArrayStart();
1095   if(newArray != oldArray) {
1096     // if fArray changed, then we need to copy everything except what will change
1097     us_arrayCopy(oldArray, 0, newArray, 0, start);
1098     us_arrayCopy(oldArray, start + length,
1099                  newArray, start + srcLength,
1100                  oldLength - (start + length));
1101   } else if(length != srcLength) {
1102     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1103     us_arrayCopy(oldArray, start + length,
1104                  newArray, start + srcLength,
1105                  oldLength - (start + length));
1106   }
1107
1108   // now fill in the hole with the new string
1109   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1110
1111   setLength(newSize);
1112
1113   // delayed delete in case srcChars == fArray when we started, and
1114   // to keep oldArray alive for the above operations
1115   if (bufferToDelete) {
1116     uprv_free(bufferToDelete);
1117   }
1118
1119   return *this;
1120 }
1121
1122 /**
1123  * Replaceable API
1124  */
1125 void
1126 UnicodeString::handleReplaceBetween(int32_t start,
1127                                     int32_t limit,
1128                                     const UnicodeString& text) {
1129     replaceBetween(start, limit, text);
1130 }
1131
1132 /**
1133  * Replaceable API
1134  */
1135 void
1136 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1137     if (limit <= start) {
1138         return; // Nothing to do; avoid bogus malloc call
1139     }
1140     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1141     // Check to make sure text is not null.
1142     if (text != NULL) {
1143             extractBetween(start, limit, text, 0);
1144             insert(dest, text, 0, limit - start);
1145             uprv_free(text);
1146     }
1147 }
1148
1149 /**
1150  * Replaceable API
1151  *
1152  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1153  * so we implement this function here.
1154  */
1155 UBool Replaceable::hasMetaData() const {
1156     return TRUE;
1157 }
1158
1159 /**
1160  * Replaceable API
1161  */
1162 UBool UnicodeString::hasMetaData() const {
1163     return FALSE;
1164 }
1165
1166 UnicodeString&
1167 UnicodeString::doReverse(int32_t start,
1168              int32_t length)
1169 {
1170   if(this->length() <= 1 || !cloneArrayIfNeeded()) {
1171     return *this;
1172   }
1173
1174   // pin the indices to legal values
1175   pinIndices(start, length);
1176
1177   UChar *left = getArrayStart() + start;
1178   UChar *right = left + length;
1179   UChar swap;
1180   UBool hasSupplementary = FALSE;
1181
1182   while(left < --right) {
1183     hasSupplementary |= (UBool)UTF_IS_LEAD(swap = *left);
1184     hasSupplementary |= (UBool)UTF_IS_LEAD(*left++ = *right);
1185     *right = swap;
1186   }
1187
1188   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1189   if(hasSupplementary) {
1190     UChar swap2;
1191
1192     left = getArrayStart() + start;
1193     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1194     while(left < right) {
1195       if(UTF_IS_TRAIL(swap = *left) && UTF_IS_LEAD(swap2 = *(left + 1))) {
1196         *left++ = swap2;
1197         *left++ = swap;
1198       } else {
1199         ++left;
1200       }
1201     }
1202   }
1203
1204   return *this;
1205 }
1206
1207 UBool
1208 UnicodeString::padLeading(int32_t targetLength,
1209                           UChar padChar)
1210 {
1211   int32_t oldLength = length();
1212   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1213     return FALSE;
1214   } else {
1215     // move contents up by padding width
1216     UChar *array = getArrayStart();
1217     int32_t start = targetLength - oldLength;
1218     us_arrayCopy(array, 0, array, start, oldLength);
1219
1220     // fill in padding character
1221     while(--start >= 0) {
1222       array[start] = padChar;
1223     }
1224     setLength(targetLength);
1225     return TRUE;
1226   }
1227 }
1228
1229 UBool
1230 UnicodeString::padTrailing(int32_t targetLength,
1231                            UChar padChar)
1232 {
1233   int32_t oldLength = length();
1234   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1235     return FALSE;
1236   } else {
1237     // fill in padding character
1238     UChar *array = getArrayStart();
1239     int32_t length = targetLength;
1240     while(--length >= oldLength) {
1241       array[length] = padChar;
1242     }
1243     setLength(targetLength);
1244     return TRUE;
1245   }
1246 }
1247
1248 //========================================
1249 // Hashing
1250 //========================================
1251 int32_t
1252 UnicodeString::doHashCode() const
1253 {
1254     /* Delegate hash computation to uhash.  This makes UnicodeString
1255      * hashing consistent with UChar* hashing.  */
1256     int32_t hashCode = uhash_hashUCharsN(getArrayStart(), length());
1257     if (hashCode == kInvalidHashCode) {
1258         hashCode = kEmptyHashCode;
1259     }
1260     return hashCode;
1261 }
1262
1263 //========================================
1264 // External Buffer
1265 //========================================
1266
1267 UChar *
1268 UnicodeString::getBuffer(int32_t minCapacity) {
1269   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1270     fFlags|=kOpenGetBuffer;
1271     fShortLength=0;
1272     return getArrayStart();
1273   } else {
1274     return 0;
1275   }
1276 }
1277
1278 void
1279 UnicodeString::releaseBuffer(int32_t newLength) {
1280   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1281     // set the new fLength
1282     int32_t capacity=getCapacity();
1283     if(newLength==-1) {
1284       // the new length is the string length, capped by fCapacity
1285       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1286       while(p<limit && *p!=0) {
1287         ++p;
1288       }
1289       newLength=(int32_t)(p-array);
1290     } else if(newLength>capacity) {
1291       newLength=capacity;
1292     }
1293     setLength(newLength);
1294     fFlags&=~kOpenGetBuffer;
1295   }
1296 }
1297
1298 //========================================
1299 // Miscellaneous
1300 //========================================
1301 UBool
1302 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1303                                   int32_t growCapacity,
1304                                   UBool doCopyArray,
1305                                   int32_t **pBufferToDelete,
1306                                   UBool forceClone) {
1307   // default parameters need to be static, therefore
1308   // the defaults are -1 to have convenience defaults
1309   if(newCapacity == -1) {
1310     newCapacity = getCapacity();
1311   }
1312
1313   // while a getBuffer(minCapacity) is "open",
1314   // prevent any modifications of the string by returning FALSE here
1315   // if the string is bogus, then only an assignment or similar can revive it
1316   if(!isWritable()) {
1317     return FALSE;
1318   }
1319
1320   /*
1321    * We need to make a copy of the array if
1322    * the buffer is read-only, or
1323    * the buffer is refCounted (shared), and refCount>1, or
1324    * the buffer is too small.
1325    * Return FALSE if memory could not be allocated.
1326    */
1327   if(forceClone ||
1328      fFlags & kBufferIsReadonly ||
1329      fFlags & kRefCounted && refCount() > 1 ||
1330      newCapacity > getCapacity()
1331   ) {
1332     // check growCapacity for default value and use of the stack buffer
1333     if(growCapacity == -1) {
1334       growCapacity = newCapacity;
1335     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1336       growCapacity = US_STACKBUF_SIZE;
1337     }
1338
1339     // save old values
1340     UChar oldStackBuffer[US_STACKBUF_SIZE];
1341     UChar *oldArray;
1342     uint8_t flags = fFlags;
1343
1344     if(flags&kUsingStackBuffer) {
1345       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1346         // copy the stack buffer contents because it will be overwritten with
1347         // fUnion.fFields values
1348         us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
1349         oldArray = oldStackBuffer;
1350       } else {
1351         oldArray = 0; // no need to copy from stack buffer to itself
1352       }
1353     } else {
1354       oldArray = fUnion.fFields.fArray;
1355     }
1356
1357     // allocate a new array
1358     if(allocate(growCapacity) ||
1359        newCapacity < growCapacity && allocate(newCapacity)
1360     ) {
1361       if(doCopyArray && oldArray != 0) {
1362         // copy the contents
1363         // do not copy more than what fits - it may be smaller than before
1364         int32_t minLength = length();
1365         newCapacity = getCapacity();
1366         if(newCapacity < minLength) {
1367           minLength = newCapacity;
1368           setLength(minLength);
1369         }
1370         us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1371       } else {
1372         fShortLength = 0;
1373       }
1374
1375       // release the old array
1376       if(flags & kRefCounted) {
1377         // the array is refCounted; decrement and release if 0
1378         int32_t *pRefCount = ((int32_t *)oldArray - 1);
1379         if(umtx_atomic_dec(pRefCount) == 0) {
1380           if(pBufferToDelete == 0) {
1381             uprv_free(pRefCount);
1382           } else {
1383             // the caller requested to delete it himself
1384             *pBufferToDelete = pRefCount;
1385           }
1386         }
1387       }
1388     } else {
1389       // not enough memory for growCapacity and not even for the smaller newCapacity
1390       // reset the old values for setToBogus() to release the array
1391       if(!(flags&kUsingStackBuffer)) {
1392         fUnion.fFields.fArray = oldArray;
1393       }
1394       fFlags = flags;
1395       setToBogus();
1396       return FALSE;
1397     }
1398   }
1399   return TRUE;
1400 }
1401 U_NAMESPACE_END
1402
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. Its only purpose is to make the
compiler emit the virtual vector deleting destructor in the object file
for unistr.cpp. UObject already has one, but having it here as well
keeps the dependencies of the static library to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    U_NAMESPACE_USE
    UnicodeString *dummyArray = new UnicodeString[2];
    delete [] dummyArray;
}
#endif
1416