]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unistr.cpp
61b54fe06bc860d976ad0a71b6cf54f3bd01e891
[apple/icu.git] / icuSources / common / unistr.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
16 * Replaceable.
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "uelement.h"
31 #include "ustr_imp.h"
32 #include "umutex.h"
33 #include "uassert.h"
34
35 #if 0
36
37 #include <iostream>
38 using namespace std;
39
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43 const char *name)
44 {
45 UChar c;
46 cout << name << ":|";
47 for(int i = 0; i < s.length(); ++i) {
48 c = s[i];
49 if(c>= 0x007E || c < 0x0020)
50 cout << "[0x" << hex << s[i] << "]";
51 else
52 cout << (char) s[i];
53 }
54 cout << '|' << endl;
55 }
56
57 void
58 print(const UChar *s,
59 int32_t len,
60 const char *name)
61 {
62 UChar c;
63 cout << name << ":|";
64 for(int i = 0; i < len; ++i) {
65 c = s[i];
66 if(c>= 0x007E || c < 0x0020)
67 cout << "[0x" << hex << s[i] << "]";
68 else
69 cout << (char) s[i];
70 }
71 cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75
76 // Local function definitions for now
77
78 // need to copy areas that may overlap
79 static
80 inline void
81 us_arrayCopy(const UChar *src, int32_t srcStart,
82 UChar *dst, int32_t dstStart, int32_t count)
83 {
84 if(count>0) {
85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86 }
87 }
88
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
92 UnicodeString_charAt(int32_t offset, void *context) {
93 return ((icu::UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96
97 U_NAMESPACE_BEGIN
98
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
101 */
102 Replaceable::~Replaceable() {}
103
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108 return
109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110 append(s1).
111 append(s2);
112 }
113
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
118
119 void
120 UnicodeString::addRef() {
121 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
122 }
123
124 int32_t
125 UnicodeString::removeRef() {
126 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
127 }
128
129 int32_t
130 UnicodeString::refCount() const {
131 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
132 }
133
134 void
135 UnicodeString::releaseArray() {
136 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
137 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
138 }
139 }
140
141
142
143 //========================================
144 // Constructors
145 //========================================
146
147 // The default constructor is inline in unistr.h.
148
149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
150 fUnion.fFields.fLengthAndFlags = 0;
151 if(count <= 0 || (uint32_t)c > 0x10ffff) {
152 // just allocate and do not do anything else
153 allocate(capacity);
154 } else {
155 // count > 0, allocate and fill the new string with count c's
156 int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
157 if(capacity < length) {
158 capacity = length;
159 }
160 if(allocate(capacity)) {
161 UChar *array = getArrayStart();
162 int32_t i = 0;
163
164 // fill the new string with c
165 if(unitCount == 1) {
166 // fill with length UChars
167 while(i < length) {
168 array[i++] = (UChar)c;
169 }
170 } else {
171 // get the code units for c
172 UChar units[U16_MAX_LENGTH];
173 U16_APPEND_UNSAFE(units, i, c);
174
175 // now it must be i==unitCount
176 i = 0;
177
178 // for Unicode, unitCount can only be 1, 2, 3, or 4
179 // 1 is handled above
180 while(i < length) {
181 int32_t unitIdx = 0;
182 while(unitIdx < unitCount) {
183 array[i++]=units[unitIdx++];
184 }
185 }
186 }
187 }
188 setLength(length);
189 }
190 }
191
192 UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195 }
196
197 UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207 }
208
209 UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212 }
213
214 UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218 }
219
220 UnicodeString::UnicodeString(UBool isTerminated,
221 const UChar *text,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 if(text == NULL) {
225 // treat as an empty string, do not alias
226 setToEmpty();
227 } else if(textLength < -1 ||
228 (textLength == -1 && !isTerminated) ||
229 (textLength >= 0 && isTerminated && text[textLength] != 0)
230 ) {
231 setToBogus();
232 } else {
233 if(textLength == -1) {
234 // text is terminated, or else it would have failed the above test
235 textLength = u_strlen(text);
236 }
237 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
238 }
239 }
240
241 UnicodeString::UnicodeString(UChar *buff,
242 int32_t buffLength,
243 int32_t buffCapacity) {
244 fUnion.fFields.fLengthAndFlags = kWritableAlias;
245 if(buff == NULL) {
246 // treat as an empty string, do not alias
247 setToEmpty();
248 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
249 setToBogus();
250 } else {
251 if(buffLength == -1) {
252 // fLength = u_strlen(buff); but do not look beyond buffCapacity
253 const UChar *p = buff, *limit = buff + buffCapacity;
254 while(p != limit && *p != 0) {
255 ++p;
256 }
257 buffLength = (int32_t)(p - buff);
258 }
259 setArray(buff, buffLength, buffCapacity);
260 }
261 }
262
263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
264 fUnion.fFields.fLengthAndFlags = kShortString;
265 if(src==NULL) {
266 // treat as an empty string
267 } else {
268 if(length<0) {
269 length=(int32_t)uprv_strlen(src);
270 }
271 if(cloneArrayIfNeeded(length, length, FALSE)) {
272 u_charsToUChars(src, getArrayStart(), length);
273 setLength(length);
274 } else {
275 setToBogus();
276 }
277 }
278 }
279
280 #if U_CHARSET_IS_UTF8
281
282 UnicodeString::UnicodeString(const char *codepageData) {
283 fUnion.fFields.fLengthAndFlags = kShortString;
284 if(codepageData != 0) {
285 setToUTF8(codepageData);
286 }
287 }
288
289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
290 fUnion.fFields.fLengthAndFlags = kShortString;
291 // if there's nothing to convert, do nothing
292 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
293 return;
294 }
295 if(dataLength == -1) {
296 dataLength = (int32_t)uprv_strlen(codepageData);
297 }
298 setToUTF8(StringPiece(codepageData, dataLength));
299 }
300
301 // else see unistr_cnv.cpp
302 #endif
303
304 UnicodeString::UnicodeString(const UnicodeString& that) {
305 fUnion.fFields.fLengthAndFlags = kShortString;
306 copyFrom(that);
307 }
308
309 #if U_HAVE_RVALUE_REFERENCES
310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
311 fUnion.fFields.fLengthAndFlags = kShortString;
312 moveFrom(src);
313 }
314 #endif
315
316 UnicodeString::UnicodeString(const UnicodeString& that,
317 int32_t srcStart) {
318 fUnion.fFields.fLengthAndFlags = kShortString;
319 setTo(that, srcStart);
320 }
321
322 UnicodeString::UnicodeString(const UnicodeString& that,
323 int32_t srcStart,
324 int32_t srcLength) {
325 fUnion.fFields.fLengthAndFlags = kShortString;
326 setTo(that, srcStart, srcLength);
327 }
328
329 // Replaceable base class clone() default implementation, does not clone
330 Replaceable *
331 Replaceable::clone() const {
332 return NULL;
333 }
334
335 // UnicodeString overrides clone() with a real implementation
336 Replaceable *
337 UnicodeString::clone() const {
338 return new UnicodeString(*this);
339 }
340
341 //========================================
342 // array allocation
343 //========================================
344
345 UBool
346 UnicodeString::allocate(int32_t capacity) {
347 if(capacity <= US_STACKBUF_SIZE) {
348 fUnion.fFields.fLengthAndFlags = kShortString;
349 } else {
350 // count bytes for the refCounter and the string capacity, and
351 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
352 // to be safely aligned for the refCount
353 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
354 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
355 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
356 if(array != 0) {
357 // set initial refCount and point behind the refCount
358 *array++ = 1;
359
360 // have fArray point to the first UChar
361 fUnion.fFields.fArray = (UChar *)array;
362 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
363 fUnion.fFields.fLengthAndFlags = kLongString;
364 } else {
365 fUnion.fFields.fLengthAndFlags = kIsBogus;
366 fUnion.fFields.fArray = 0;
367 fUnion.fFields.fCapacity = 0;
368 return FALSE;
369 }
370 }
371 return TRUE;
372 }
373
374 //========================================
375 // Destructor
376 //========================================
377
378 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
379 static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
380 static u_atomic_int32_t beyondCount(0);
381
382 U_CAPI void unistr_printLengths() {
383 int32_t i;
384 for(i = 0; i <= 59; ++i) {
385 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
386 }
387 int32_t beyond = beyondCount;
388 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
389 beyond += finalLengthCounts[i];
390 }
391 printf(">59, %9d\n", beyond);
392 }
393 #endif
394
395 UnicodeString::~UnicodeString()
396 {
397 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
398 // Count lengths of strings at the end of their lifetime.
399 // Useful for discussion of a desirable stack buffer size.
400 // Count the contents length, not the optional NUL terminator nor further capacity.
401 // Ignore open-buffer strings and strings which alias external storage.
402 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
403 if(hasShortLength()) {
404 umtx_atomic_inc(finalLengthCounts + getShortLength());
405 } else {
406 umtx_atomic_inc(&beyondCount);
407 }
408 }
409 #endif
410
411 releaseArray();
412 }
413
414 //========================================
415 // Factory methods
416 //========================================
417
418 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
419 UnicodeString result;
420 result.setToUTF8(utf8);
421 return result;
422 }
423
424 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
425 UnicodeString result;
426 int32_t capacity;
427 // Most UTF-32 strings will be BMP-only and result in a same-length
428 // UTF-16 string. We overestimate the capacity just slightly,
429 // just in case there are a few supplementary characters.
430 if(length <= US_STACKBUF_SIZE) {
431 capacity = US_STACKBUF_SIZE;
432 } else {
433 capacity = length + (length >> 4) + 4;
434 }
435 do {
436 UChar *utf16 = result.getBuffer(capacity);
437 int32_t length16;
438 UErrorCode errorCode = U_ZERO_ERROR;
439 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
440 utf32, length,
441 0xfffd, // Substitution character.
442 NULL, // Don't care about number of substitutions.
443 &errorCode);
444 result.releaseBuffer(length16);
445 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
446 capacity = length16 + 1; // +1 for the terminating NUL.
447 continue;
448 } else if(U_FAILURE(errorCode)) {
449 result.setToBogus();
450 }
451 break;
452 } while(TRUE);
453 return result;
454 }
455
456 //========================================
457 // Assignment
458 //========================================
459
460 UnicodeString &
461 UnicodeString::operator=(const UnicodeString &src) {
462 return copyFrom(src);
463 }
464
465 UnicodeString &
466 UnicodeString::fastCopyFrom(const UnicodeString &src) {
467 return copyFrom(src, TRUE);
468 }
469
470 UnicodeString &
471 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
472 // if assigning to ourselves, do nothing
473 if(this == &src) {
474 return *this;
475 }
476
477 // is the right side bogus?
478 if(src.isBogus()) {
479 setToBogus();
480 return *this;
481 }
482
483 // delete the current contents
484 releaseArray();
485
486 if(src.isEmpty()) {
487 // empty string - use the stack buffer
488 setToEmpty();
489 return *this;
490 }
491
492 // fLength>0 and not an "open" src.getBuffer(minCapacity)
493 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
494 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
495 case kShortString:
496 // short string using the stack buffer, do the same
497 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
498 getShortLength() * U_SIZEOF_UCHAR);
499 break;
500 case kLongString:
501 // src uses a refCounted string buffer, use that buffer with refCount
502 // src is const, use a cast - we don't actually change it
503 ((UnicodeString &)src).addRef();
504 // copy all fields, share the reference-counted buffer
505 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
506 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
507 if(!hasShortLength()) {
508 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
509 }
510 break;
511 case kReadonlyAlias:
512 if(fastCopy) {
513 // src is a readonly alias, do the same
514 // -> maintain the readonly alias as such
515 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
516 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
517 if(!hasShortLength()) {
518 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
519 }
520 break;
521 }
522 // else if(!fastCopy) fall through to case kWritableAlias
523 // -> allocate a new buffer and copy the contents
524 U_FALLTHROUGH;
525 case kWritableAlias: {
526 // src is a writable alias; we make a copy of that instead
527 int32_t srcLength = src.length();
528 if(allocate(srcLength)) {
529 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
530 setLength(srcLength);
531 break;
532 }
533 // if there is not enough memory, then fall through to setting to bogus
534 U_FALLTHROUGH;
535 }
536 default:
537 // if src is bogus, set ourselves to bogus
538 // do not call setToBogus() here because fArray and flags are not consistent here
539 fUnion.fFields.fLengthAndFlags = kIsBogus;
540 fUnion.fFields.fArray = 0;
541 fUnion.fFields.fCapacity = 0;
542 break;
543 }
544
545 return *this;
546 }
547
548 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
549 // No explicit check for self move assignment, consistent with standard library.
550 // Self move assignment causes no crash nor leak but might make the object bogus.
551 releaseArray();
552 copyFieldsFrom(src, TRUE);
553 return *this;
554 }
555
556 // Same as moveFrom() except without memory management.
557 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
558 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
559 if(lengthAndFlags & kUsingStackBuffer) {
560 // Short string using the stack buffer, copy the contents.
561 // Check for self assignment to prevent "overlap in memcpy" warnings,
562 // although it should be harmless to copy a buffer to itself exactly.
563 if(this != &src) {
564 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
565 getShortLength() * U_SIZEOF_UCHAR);
566 }
567 } else {
568 // In all other cases, copy all fields.
569 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
570 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
571 if(!hasShortLength()) {
572 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
573 }
574 if(setSrcToBogus) {
575 // Set src to bogus without releasing any memory.
576 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
577 src.fUnion.fFields.fArray = NULL;
578 src.fUnion.fFields.fCapacity = 0;
579 }
580 }
581 }
582
583 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
584 UnicodeString temp; // Empty short string: Known not to need releaseArray().
585 // Copy fields without resetting source values in between.
586 temp.copyFieldsFrom(*this, FALSE);
587 this->copyFieldsFrom(other, FALSE);
588 other.copyFieldsFrom(temp, FALSE);
589 // Set temp to an empty string so that other's memory is not released twice.
590 temp.fUnion.fFields.fLengthAndFlags = kShortString;
591 }
592
593 //========================================
594 // Miscellaneous operations
595 //========================================
596
597 UnicodeString UnicodeString::unescape() const {
598 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
599 if (result.isBogus()) {
600 return result;
601 }
602 const UChar *array = getBuffer();
603 int32_t len = length();
604 int32_t prev = 0;
605 for (int32_t i=0;;) {
606 if (i == len) {
607 result.append(array, prev, len - prev);
608 break;
609 }
610 if (array[i++] == 0x5C /*'\\'*/) {
611 result.append(array, prev, (i - 1) - prev);
612 UChar32 c = unescapeAt(i); // advances i
613 if (c < 0) {
614 result.remove(); // return empty string
615 break; // invalid escape sequence
616 }
617 result.append(c);
618 prev = i;
619 }
620 }
621 return result;
622 }
623
624 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
625 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
626 }
627
628 //========================================
629 // Read-only implementation
630 //========================================
631 UBool
632 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
633 // Requires: this & text not bogus and have same lengths.
634 // Byte-wise comparison works for equality regardless of endianness.
635 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
636 }
637
638 int8_t
639 UnicodeString::doCompare( int32_t start,
640 int32_t length,
641 const UChar *srcChars,
642 int32_t srcStart,
643 int32_t srcLength) const
644 {
645 // compare illegal string values
646 if(isBogus()) {
647 return -1;
648 }
649
650 // pin indices to legal values
651 pinIndices(start, length);
652
653 if(srcChars == NULL) {
654 // treat const UChar *srcChars==NULL as an empty string
655 return length == 0 ? 0 : 1;
656 }
657
658 // get the correct pointer
659 const UChar *chars = getArrayStart();
660
661 chars += start;
662 srcChars += srcStart;
663
664 int32_t minLength;
665 int8_t lengthResult;
666
667 // get the srcLength if necessary
668 if(srcLength < 0) {
669 srcLength = u_strlen(srcChars + srcStart);
670 }
671
672 // are we comparing different lengths?
673 if(length != srcLength) {
674 if(length < srcLength) {
675 minLength = length;
676 lengthResult = -1;
677 } else {
678 minLength = srcLength;
679 lengthResult = 1;
680 }
681 } else {
682 minLength = length;
683 lengthResult = 0;
684 }
685
686 /*
687 * note that uprv_memcmp() returns an int but we return an int8_t;
688 * we need to take care not to truncate the result -
689 * one way to do this is to right-shift the value to
690 * move the sign bit into the lower 8 bits and making sure that this
691 * does not become 0 itself
692 */
693
694 if(minLength > 0 && chars != srcChars) {
695 int32_t result;
696
697 # if U_IS_BIG_ENDIAN
698 // big-endian: byte comparison works
699 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
700 if(result != 0) {
701 return (int8_t)(result >> 15 | 1);
702 }
703 # else
704 // little-endian: compare UChar units
705 do {
706 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
707 if(result != 0) {
708 return (int8_t)(result >> 15 | 1);
709 }
710 } while(--minLength > 0);
711 # endif
712 }
713 return lengthResult;
714 }
715
716 /* String compare in code point order - doCompare() compares in code unit order. */
717 int8_t
718 UnicodeString::doCompareCodePointOrder(int32_t start,
719 int32_t length,
720 const UChar *srcChars,
721 int32_t srcStart,
722 int32_t srcLength) const
723 {
724 // compare illegal string values
725 // treat const UChar *srcChars==NULL as an empty string
726 if(isBogus()) {
727 return -1;
728 }
729
730 // pin indices to legal values
731 pinIndices(start, length);
732
733 if(srcChars == NULL) {
734 srcStart = srcLength = 0;
735 }
736
737 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
738 /* translate the 32-bit result into an 8-bit one */
739 if(diff!=0) {
740 return (int8_t)(diff >> 15 | 1);
741 } else {
742 return 0;
743 }
744 }
745
746 int32_t
747 UnicodeString::getLength() const {
748 return length();
749 }
750
751 UChar
752 UnicodeString::getCharAt(int32_t offset) const {
753 return charAt(offset);
754 }
755
756 UChar32
757 UnicodeString::getChar32At(int32_t offset) const {
758 return char32At(offset);
759 }
760
761 UChar32
762 UnicodeString::char32At(int32_t offset) const
763 {
764 int32_t len = length();
765 if((uint32_t)offset < (uint32_t)len) {
766 const UChar *array = getArrayStart();
767 UChar32 c;
768 U16_GET(array, 0, offset, len, c);
769 return c;
770 } else {
771 return kInvalidUChar;
772 }
773 }
774
775 int32_t
776 UnicodeString::getChar32Start(int32_t offset) const {
777 if((uint32_t)offset < (uint32_t)length()) {
778 const UChar *array = getArrayStart();
779 U16_SET_CP_START(array, 0, offset);
780 return offset;
781 } else {
782 return 0;
783 }
784 }
785
786 int32_t
787 UnicodeString::getChar32Limit(int32_t offset) const {
788 int32_t len = length();
789 if((uint32_t)offset < (uint32_t)len) {
790 const UChar *array = getArrayStart();
791 U16_SET_CP_LIMIT(array, 0, offset, len);
792 return offset;
793 } else {
794 return len;
795 }
796 }
797
798 int32_t
799 UnicodeString::countChar32(int32_t start, int32_t length) const {
800 pinIndices(start, length);
801 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
802 return u_countChar32(getArrayStart()+start, length);
803 }
804
805 UBool
806 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
807 pinIndices(start, length);
808 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
809 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
810 }
811
812 int32_t
813 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
814 // pin index
815 int32_t len = length();
816 if(index<0) {
817 index=0;
818 } else if(index>len) {
819 index=len;
820 }
821
822 const UChar *array = getArrayStart();
823 if(delta>0) {
824 U16_FWD_N(array, index, len, delta);
825 } else {
826 U16_BACK_N(array, 0, index, -delta);
827 }
828
829 return index;
830 }
831
832 void
833 UnicodeString::doExtract(int32_t start,
834 int32_t length,
835 UChar *dst,
836 int32_t dstStart) const
837 {
838 // pin indices to legal values
839 pinIndices(start, length);
840
841 // do not copy anything if we alias dst itself
842 const UChar *array = getArrayStart();
843 if(array + start != dst + dstStart) {
844 us_arrayCopy(array, start, dst, dstStart, length);
845 }
846 }
847
848 int32_t
849 UnicodeString::extract(UChar *dest, int32_t destCapacity,
850 UErrorCode &errorCode) const {
851 int32_t len = length();
852 if(U_SUCCESS(errorCode)) {
853 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
854 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
855 } else {
856 const UChar *array = getArrayStart();
857 if(len>0 && len<=destCapacity && array!=dest) {
858 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
859 }
860 return u_terminateUChars(dest, destCapacity, len, &errorCode);
861 }
862 }
863
864 return len;
865 }
866
867 int32_t
868 UnicodeString::extract(int32_t start,
869 int32_t length,
870 char *target,
871 int32_t targetCapacity,
872 enum EInvariant) const
873 {
874 // if the arguments are illegal, then do nothing
875 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
876 return 0;
877 }
878
879 // pin the indices to legal values
880 pinIndices(start, length);
881
882 if(length <= targetCapacity) {
883 u_UCharsToChars(getArrayStart() + start, target, length);
884 }
885 UErrorCode status = U_ZERO_ERROR;
886 return u_terminateChars(target, targetCapacity, length, &status);
887 }
888
889 UnicodeString
890 UnicodeString::tempSubString(int32_t start, int32_t len) const {
891 pinIndices(start, len);
892 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
893 if(array==NULL) {
894 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
895 len=-2; // bogus result string
896 }
897 return UnicodeString(FALSE, array + start, len);
898 }
899
900 int32_t
901 UnicodeString::toUTF8(int32_t start, int32_t len,
902 char *target, int32_t capacity) const {
903 pinIndices(start, len);
904 int32_t length8;
905 UErrorCode errorCode = U_ZERO_ERROR;
906 u_strToUTF8WithSub(target, capacity, &length8,
907 getBuffer() + start, len,
908 0xFFFD, // Standard substitution character.
909 NULL, // Don't care about number of substitutions.
910 &errorCode);
911 return length8;
912 }
913
914 #if U_CHARSET_IS_UTF8
915
916 int32_t
917 UnicodeString::extract(int32_t start, int32_t len,
918 char *target, uint32_t dstSize) const {
919 // if the arguments are illegal, then do nothing
920 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
921 return 0;
922 }
923 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
924 }
925
926 // else see unistr_cnv.cpp
927 #endif
928
929 void
930 UnicodeString::extractBetween(int32_t start,
931 int32_t limit,
932 UnicodeString& target) const {
933 pinIndex(start);
934 pinIndex(limit);
935 doExtract(start, limit - start, target);
936 }
937
938 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
939 // as many bytes as the source has UChars.
940 // The "worst cases" are writing systems like Indic, Thai and CJK with
941 // 3:1 bytes:UChars.
942 void
943 UnicodeString::toUTF8(ByteSink &sink) const {
944 int32_t length16 = length();
945 if(length16 != 0) {
946 char stackBuffer[1024];
947 int32_t capacity = (int32_t)sizeof(stackBuffer);
948 UBool utf8IsOwned = FALSE;
949 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
950 3*length16,
951 stackBuffer, capacity,
952 &capacity);
953 int32_t length8 = 0;
954 UErrorCode errorCode = U_ZERO_ERROR;
955 u_strToUTF8WithSub(utf8, capacity, &length8,
956 getBuffer(), length16,
957 0xFFFD, // Standard substitution character.
958 NULL, // Don't care about number of substitutions.
959 &errorCode);
960 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
961 utf8 = (char *)uprv_malloc(length8);
962 if(utf8 != NULL) {
963 utf8IsOwned = TRUE;
964 errorCode = U_ZERO_ERROR;
965 u_strToUTF8WithSub(utf8, length8, &length8,
966 getBuffer(), length16,
967 0xFFFD, // Standard substitution character.
968 NULL, // Don't care about number of substitutions.
969 &errorCode);
970 } else {
971 errorCode = U_MEMORY_ALLOCATION_ERROR;
972 }
973 }
974 if(U_SUCCESS(errorCode)) {
975 sink.Append(utf8, length8);
976 sink.Flush();
977 }
978 if(utf8IsOwned) {
979 uprv_free(utf8);
980 }
981 }
982 }
983
984 int32_t
985 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
986 int32_t length32=0;
987 if(U_SUCCESS(errorCode)) {
988 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
989 u_strToUTF32WithSub(utf32, capacity, &length32,
990 getBuffer(), length(),
991 0xfffd, // Substitution character.
992 NULL, // Don't care about number of substitutions.
993 &errorCode);
994 }
995 return length32;
996 }
997
998 int32_t
999 UnicodeString::indexOf(const UChar *srcChars,
1000 int32_t srcStart,
1001 int32_t srcLength,
1002 int32_t start,
1003 int32_t length) const
1004 {
1005 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1006 return -1;
1007 }
1008
1009 // UnicodeString does not find empty substrings
1010 if(srcLength < 0 && srcChars[srcStart] == 0) {
1011 return -1;
1012 }
1013
1014 // get the indices within bounds
1015 pinIndices(start, length);
1016
1017 // find the first occurrence of the substring
1018 const UChar *array = getArrayStart();
1019 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1020 if(match == NULL) {
1021 return -1;
1022 } else {
1023 return (int32_t)(match - array);
1024 }
1025 }
1026
1027 int32_t
1028 UnicodeString::doIndexOf(UChar c,
1029 int32_t start,
1030 int32_t length) const
1031 {
1032 // pin indices
1033 pinIndices(start, length);
1034
1035 // find the first occurrence of c
1036 const UChar *array = getArrayStart();
1037 const UChar *match = u_memchr(array + start, c, length);
1038 if(match == NULL) {
1039 return -1;
1040 } else {
1041 return (int32_t)(match - array);
1042 }
1043 }
1044
1045 int32_t
1046 UnicodeString::doIndexOf(UChar32 c,
1047 int32_t start,
1048 int32_t length) const {
1049 // pin indices
1050 pinIndices(start, length);
1051
1052 // find the first occurrence of c
1053 const UChar *array = getArrayStart();
1054 const UChar *match = u_memchr32(array + start, c, length);
1055 if(match == NULL) {
1056 return -1;
1057 } else {
1058 return (int32_t)(match - array);
1059 }
1060 }
1061
1062 int32_t
1063 UnicodeString::lastIndexOf(const UChar *srcChars,
1064 int32_t srcStart,
1065 int32_t srcLength,
1066 int32_t start,
1067 int32_t length) const
1068 {
1069 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1070 return -1;
1071 }
1072
1073 // UnicodeString does not find empty substrings
1074 if(srcLength < 0 && srcChars[srcStart] == 0) {
1075 return -1;
1076 }
1077
1078 // get the indices within bounds
1079 pinIndices(start, length);
1080
1081 // find the last occurrence of the substring
1082 const UChar *array = getArrayStart();
1083 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1084 if(match == NULL) {
1085 return -1;
1086 } else {
1087 return (int32_t)(match - array);
1088 }
1089 }
1090
1091 int32_t
1092 UnicodeString::doLastIndexOf(UChar c,
1093 int32_t start,
1094 int32_t length) const
1095 {
1096 if(isBogus()) {
1097 return -1;
1098 }
1099
1100 // pin indices
1101 pinIndices(start, length);
1102
1103 // find the last occurrence of c
1104 const UChar *array = getArrayStart();
1105 const UChar *match = u_memrchr(array + start, c, length);
1106 if(match == NULL) {
1107 return -1;
1108 } else {
1109 return (int32_t)(match - array);
1110 }
1111 }
1112
1113 int32_t
1114 UnicodeString::doLastIndexOf(UChar32 c,
1115 int32_t start,
1116 int32_t length) const {
1117 // pin indices
1118 pinIndices(start, length);
1119
1120 // find the last occurrence of c
1121 const UChar *array = getArrayStart();
1122 const UChar *match = u_memrchr32(array + start, c, length);
1123 if(match == NULL) {
1124 return -1;
1125 } else {
1126 return (int32_t)(match - array);
1127 }
1128 }
1129
1130 //========================================
1131 // Write implementation
1132 //========================================
1133
1134 UnicodeString&
1135 UnicodeString::findAndReplace(int32_t start,
1136 int32_t length,
1137 const UnicodeString& oldText,
1138 int32_t oldStart,
1139 int32_t oldLength,
1140 const UnicodeString& newText,
1141 int32_t newStart,
1142 int32_t newLength)
1143 {
1144 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1145 return *this;
1146 }
1147
1148 pinIndices(start, length);
1149 oldText.pinIndices(oldStart, oldLength);
1150 newText.pinIndices(newStart, newLength);
1151
1152 if(oldLength == 0) {
1153 return *this;
1154 }
1155
1156 while(length > 0 && length >= oldLength) {
1157 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1158 if(pos < 0) {
1159 // no more oldText's here: done
1160 break;
1161 } else {
1162 // we found oldText, replace it by newText and go beyond it
1163 replace(pos, oldLength, newText, newStart, newLength);
1164 length -= pos + oldLength - start;
1165 start = pos + newLength;
1166 }
1167 }
1168
1169 return *this;
1170 }
1171
1172
1173 void
1174 UnicodeString::setToBogus()
1175 {
1176 releaseArray();
1177
1178 fUnion.fFields.fLengthAndFlags = kIsBogus;
1179 fUnion.fFields.fArray = 0;
1180 fUnion.fFields.fCapacity = 0;
1181 }
1182
1183 // turn a bogus string into an empty one
1184 void
1185 UnicodeString::unBogus() {
1186 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1187 setToEmpty();
1188 }
1189 }
1190
1191 const UChar *
1192 UnicodeString::getTerminatedBuffer() {
1193 if(!isWritable()) {
1194 return 0;
1195 }
1196 UChar *array = getArrayStart();
1197 int32_t len = length();
1198 if(len < getCapacity()) {
1199 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1200 // If len<capacity on a read-only alias, then array[len] is
1201 // either the original NUL (if constructed with (TRUE, s, length))
1202 // or one of the original string contents characters (if later truncated),
1203 // therefore we can assume that array[len] is initialized memory.
1204 if(array[len] == 0) {
1205 return array;
1206 }
1207 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1208 // kRefCounted: Do not write the NUL if the buffer is shared.
1209 // That is mostly safe, except when the length of one copy was modified
1210 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1211 // Then the NUL would be written into the middle of another copy's string.
1212
1213 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1214 // Do not test if there is a NUL already because it might be uninitialized memory.
1215 // (That would be safe, but tools like valgrind & Purify would complain.)
1216 array[len] = 0;
1217 return array;
1218 }
1219 }
1220 if(cloneArrayIfNeeded(len+1)) {
1221 array = getArrayStart();
1222 array[len] = 0;
1223 return array;
1224 } else {
1225 return NULL;
1226 }
1227 }
1228
1229 // setTo() analogous to the readonly-aliasing constructor with the same signature
1230 UnicodeString &
1231 UnicodeString::setTo(UBool isTerminated,
1232 const UChar *text,
1233 int32_t textLength)
1234 {
1235 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1236 // do not modify a string that has an "open" getBuffer(minCapacity)
1237 return *this;
1238 }
1239
1240 if(text == NULL) {
1241 // treat as an empty string, do not alias
1242 releaseArray();
1243 setToEmpty();
1244 return *this;
1245 }
1246
1247 if( textLength < -1 ||
1248 (textLength == -1 && !isTerminated) ||
1249 (textLength >= 0 && isTerminated && text[textLength] != 0)
1250 ) {
1251 setToBogus();
1252 return *this;
1253 }
1254
1255 releaseArray();
1256
1257 if(textLength == -1) {
1258 // text is terminated, or else it would have failed the above test
1259 textLength = u_strlen(text);
1260 }
1261 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1262 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1263 return *this;
1264 }
1265
1266 // setTo() analogous to the writable-aliasing constructor with the same signature
1267 UnicodeString &
1268 UnicodeString::setTo(UChar *buffer,
1269 int32_t buffLength,
1270 int32_t buffCapacity) {
1271 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1272 // do not modify a string that has an "open" getBuffer(minCapacity)
1273 return *this;
1274 }
1275
1276 if(buffer == NULL) {
1277 // treat as an empty string, do not alias
1278 releaseArray();
1279 setToEmpty();
1280 return *this;
1281 }
1282
1283 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1284 setToBogus();
1285 return *this;
1286 } else if(buffLength == -1) {
1287 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1288 const UChar *p = buffer, *limit = buffer + buffCapacity;
1289 while(p != limit && *p != 0) {
1290 ++p;
1291 }
1292 buffLength = (int32_t)(p - buffer);
1293 }
1294
1295 releaseArray();
1296
1297 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1298 setArray(buffer, buffLength, buffCapacity);
1299 return *this;
1300 }
1301
1302 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1303 unBogus();
1304 int32_t length = utf8.length();
1305 int32_t capacity;
1306 // The UTF-16 string will be at most as long as the UTF-8 string.
1307 if(length <= US_STACKBUF_SIZE) {
1308 capacity = US_STACKBUF_SIZE;
1309 } else {
1310 capacity = length + 1; // +1 for the terminating NUL.
1311 }
1312 UChar *utf16 = getBuffer(capacity);
1313 int32_t length16;
1314 UErrorCode errorCode = U_ZERO_ERROR;
1315 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1316 utf8.data(), length,
1317 0xfffd, // Substitution character.
1318 NULL, // Don't care about number of substitutions.
1319 &errorCode);
1320 releaseBuffer(length16);
1321 if(U_FAILURE(errorCode)) {
1322 setToBogus();
1323 }
1324 return *this;
1325 }
1326
1327 UnicodeString&
1328 UnicodeString::setCharAt(int32_t offset,
1329 UChar c)
1330 {
1331 int32_t len = length();
1332 if(cloneArrayIfNeeded() && len > 0) {
1333 if(offset < 0) {
1334 offset = 0;
1335 } else if(offset >= len) {
1336 offset = len - 1;
1337 }
1338
1339 getArrayStart()[offset] = c;
1340 }
1341 return *this;
1342 }
1343
1344 UnicodeString&
1345 UnicodeString::replace(int32_t start,
1346 int32_t _length,
1347 UChar32 srcChar) {
1348 UChar buffer[U16_MAX_LENGTH];
1349 int32_t count = 0;
1350 UBool isError = FALSE;
1351 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1352 // We test isError so that the compiler does not complain that we don't.
1353 // If isError (srcChar is not a valid code point) then count==0 which means
1354 // we remove the source segment rather than replacing it with srcChar.
1355 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1356 }
1357
1358 UnicodeString&
1359 UnicodeString::append(UChar32 srcChar) {
1360 UChar buffer[U16_MAX_LENGTH];
1361 int32_t _length = 0;
1362 UBool isError = FALSE;
1363 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1364 // We test isError so that the compiler does not complain that we don't.
1365 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1366 return isError ? *this : doAppend(buffer, 0, _length);
1367 }
1368
1369 UnicodeString&
1370 UnicodeString::doReplace( int32_t start,
1371 int32_t length,
1372 const UnicodeString& src,
1373 int32_t srcStart,
1374 int32_t srcLength)
1375 {
1376 // pin the indices to legal values
1377 src.pinIndices(srcStart, srcLength);
1378
1379 // get the characters from src
1380 // and replace the range in ourselves with them
1381 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1382 }
1383
1384 UnicodeString&
1385 UnicodeString::doReplace(int32_t start,
1386 int32_t length,
1387 const UChar *srcChars,
1388 int32_t srcStart,
1389 int32_t srcLength)
1390 {
1391 if(!isWritable()) {
1392 return *this;
1393 }
1394
1395 int32_t oldLength = this->length();
1396
1397 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1398 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1399 if(start == 0) {
1400 // remove prefix by adjusting the array pointer
1401 pinIndex(length);
1402 fUnion.fFields.fArray += length;
1403 fUnion.fFields.fCapacity -= length;
1404 setLength(oldLength - length);
1405 return *this;
1406 } else {
1407 pinIndex(start);
1408 if(length >= (oldLength - start)) {
1409 // remove suffix by reducing the length (like truncate())
1410 setLength(start);
1411 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1412 return *this;
1413 }
1414 }
1415 }
1416
1417 if(start == oldLength) {
1418 return doAppend(srcChars, srcStart, srcLength);
1419 }
1420
1421 if(srcChars == 0) {
1422 srcStart = srcLength = 0;
1423 } else if(srcLength < 0) {
1424 // get the srcLength if necessary
1425 srcLength = u_strlen(srcChars + srcStart);
1426 }
1427
1428 // pin the indices to legal values
1429 pinIndices(start, length);
1430
1431 // calculate the size of the string after the replace
1432 int32_t newLength = oldLength - length + srcLength;
1433
1434 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1435 // therefore we need to keep the current fArray
1436 UChar oldStackBuffer[US_STACKBUF_SIZE];
1437 UChar *oldArray;
1438 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1439 // copy the stack buffer contents because it will be overwritten with
1440 // fUnion.fFields values
1441 u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1442 oldArray = oldStackBuffer;
1443 } else {
1444 oldArray = getArrayStart();
1445 }
1446
1447 // clone our array and allocate a bigger array if needed
1448 int32_t *bufferToDelete = 0;
1449 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1450 FALSE, &bufferToDelete)
1451 ) {
1452 return *this;
1453 }
1454
1455 // now do the replace
1456
1457 UChar *newArray = getArrayStart();
1458 if(newArray != oldArray) {
1459 // if fArray changed, then we need to copy everything except what will change
1460 us_arrayCopy(oldArray, 0, newArray, 0, start);
1461 us_arrayCopy(oldArray, start + length,
1462 newArray, start + srcLength,
1463 oldLength - (start + length));
1464 } else if(length != srcLength) {
1465 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1466 us_arrayCopy(oldArray, start + length,
1467 newArray, start + srcLength,
1468 oldLength - (start + length));
1469 }
1470
1471 // now fill in the hole with the new string
1472 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1473
1474 setLength(newLength);
1475
1476 // delayed delete in case srcChars == fArray when we started, and
1477 // to keep oldArray alive for the above operations
1478 if (bufferToDelete) {
1479 uprv_free(bufferToDelete);
1480 }
1481
1482 return *this;
1483 }
1484
1485 // Versions of doReplace() only for append() variants.
1486 // doReplace() and doAppend() optimize for different cases.
1487
1488 UnicodeString&
1489 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1490 if(srcLength == 0) {
1491 return *this;
1492 }
1493
1494 // pin the indices to legal values
1495 src.pinIndices(srcStart, srcLength);
1496 return doAppend(src.getArrayStart(), srcStart, srcLength);
1497 }
1498
1499 UnicodeString&
1500 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1501 if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1502 return *this;
1503 }
1504
1505 if(srcLength < 0) {
1506 // get the srcLength if necessary
1507 if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
1508 return *this;
1509 }
1510 }
1511
1512 int32_t oldLength = length();
1513 int32_t newLength = oldLength + srcLength;
1514 // optimize append() onto a large-enough, owned string
1515 if((newLength <= getCapacity() && isBufferWritable()) ||
1516 cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize)) {
1517 UChar *newArray = getArrayStart();
1518 // Do not copy characters when
1519 // UChar *buffer=str.getAppendBuffer(...);
1520 // is followed by
1521 // str.append(buffer, length);
1522 // or
1523 // str.appendString(buffer, length)
1524 // or similar.
1525 if(srcChars + srcStart != newArray + oldLength) {
1526 us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
1527 }
1528 setLength(newLength);
1529 }
1530 return *this;
1531 }
1532
1533 /**
1534 * Replaceable API
1535 */
1536 void
1537 UnicodeString::handleReplaceBetween(int32_t start,
1538 int32_t limit,
1539 const UnicodeString& text) {
1540 replaceBetween(start, limit, text);
1541 }
1542
1543 /**
1544 * Replaceable API
1545 */
1546 void
1547 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1548 if (limit <= start) {
1549 return; // Nothing to do; avoid bogus malloc call
1550 }
1551 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1552 // Check to make sure text is not null.
1553 if (text != NULL) {
1554 extractBetween(start, limit, text, 0);
1555 insert(dest, text, 0, limit - start);
1556 uprv_free(text);
1557 }
1558 }
1559
1560 /**
1561 * Replaceable API
1562 *
1563 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1564 * so we implement this function here.
1565 */
1566 UBool Replaceable::hasMetaData() const {
1567 return TRUE;
1568 }
1569
1570 /**
1571 * Replaceable API
1572 */
1573 UBool UnicodeString::hasMetaData() const {
1574 return FALSE;
1575 }
1576
1577 UnicodeString&
1578 UnicodeString::doReverse(int32_t start, int32_t length) {
1579 if(length <= 1 || !cloneArrayIfNeeded()) {
1580 return *this;
1581 }
1582
1583 // pin the indices to legal values
1584 pinIndices(start, length);
1585 if(length <= 1) { // pinIndices() might have shrunk the length
1586 return *this;
1587 }
1588
1589 UChar *left = getArrayStart() + start;
1590 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1591 UChar swap;
1592 UBool hasSupplementary = FALSE;
1593
1594 // Before the loop we know left<right because length>=2.
1595 do {
1596 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1597 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1598 *right-- = swap;
1599 } while(left < right);
1600 // Make sure to test the middle code unit of an odd-length string.
1601 // Redundant if the length is even.
1602 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1603
1604 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1605 if(hasSupplementary) {
1606 UChar swap2;
1607
1608 left = getArrayStart() + start;
1609 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1610 while(left < right) {
1611 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1612 *left++ = swap2;
1613 *left++ = swap;
1614 } else {
1615 ++left;
1616 }
1617 }
1618 }
1619
1620 return *this;
1621 }
1622
1623 UBool
1624 UnicodeString::padLeading(int32_t targetLength,
1625 UChar padChar)
1626 {
1627 int32_t oldLength = length();
1628 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1629 return FALSE;
1630 } else {
1631 // move contents up by padding width
1632 UChar *array = getArrayStart();
1633 int32_t start = targetLength - oldLength;
1634 us_arrayCopy(array, 0, array, start, oldLength);
1635
1636 // fill in padding character
1637 while(--start >= 0) {
1638 array[start] = padChar;
1639 }
1640 setLength(targetLength);
1641 return TRUE;
1642 }
1643 }
1644
1645 UBool
1646 UnicodeString::padTrailing(int32_t targetLength,
1647 UChar padChar)
1648 {
1649 int32_t oldLength = length();
1650 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1651 return FALSE;
1652 } else {
1653 // fill in padding character
1654 UChar *array = getArrayStart();
1655 int32_t length = targetLength;
1656 while(--length >= oldLength) {
1657 array[length] = padChar;
1658 }
1659 setLength(targetLength);
1660 return TRUE;
1661 }
1662 }
1663
1664 //========================================
1665 // Hashing
1666 //========================================
1667 int32_t
1668 UnicodeString::doHashCode() const
1669 {
1670 /* Delegate hash computation to uhash. This makes UnicodeString
1671 * hashing consistent with UChar* hashing. */
1672 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1673 if (hashCode == kInvalidHashCode) {
1674 hashCode = kEmptyHashCode;
1675 }
1676 return hashCode;
1677 }
1678
1679 //========================================
1680 // External Buffer
1681 //========================================
1682
1683 UChar *
1684 UnicodeString::getBuffer(int32_t minCapacity) {
1685 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1686 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1687 setZeroLength();
1688 return getArrayStart();
1689 } else {
1690 return 0;
1691 }
1692 }
1693
1694 void
1695 UnicodeString::releaseBuffer(int32_t newLength) {
1696 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1697 // set the new fLength
1698 int32_t capacity=getCapacity();
1699 if(newLength==-1) {
1700 // the new length is the string length, capped by fCapacity
1701 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1702 while(p<limit && *p!=0) {
1703 ++p;
1704 }
1705 newLength=(int32_t)(p-array);
1706 } else if(newLength>capacity) {
1707 newLength=capacity;
1708 }
1709 setLength(newLength);
1710 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1711 }
1712 }
1713
1714 //========================================
1715 // Miscellaneous
1716 //========================================
1717 UBool
1718 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1719 int32_t growCapacity,
1720 UBool doCopyArray,
1721 int32_t **pBufferToDelete,
1722 UBool forceClone) {
1723 // default parameters need to be static, therefore
1724 // the defaults are -1 to have convenience defaults
1725 if(newCapacity == -1) {
1726 newCapacity = getCapacity();
1727 }
1728
1729 // while a getBuffer(minCapacity) is "open",
1730 // prevent any modifications of the string by returning FALSE here
1731 // if the string is bogus, then only an assignment or similar can revive it
1732 if(!isWritable()) {
1733 return FALSE;
1734 }
1735
1736 /*
1737 * We need to make a copy of the array if
1738 * the buffer is read-only, or
1739 * the buffer is refCounted (shared), and refCount>1, or
1740 * the buffer is too small.
1741 * Return FALSE if memory could not be allocated.
1742 */
1743 if(forceClone ||
1744 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1745 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1746 newCapacity > getCapacity()
1747 ) {
1748 // check growCapacity for default value and use of the stack buffer
1749 if(growCapacity < 0) {
1750 growCapacity = newCapacity;
1751 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1752 growCapacity = US_STACKBUF_SIZE;
1753 }
1754
1755 // save old values
1756 UChar oldStackBuffer[US_STACKBUF_SIZE];
1757 UChar *oldArray;
1758 int32_t oldLength = length();
1759 int16_t flags = fUnion.fFields.fLengthAndFlags;
1760
1761 if(flags&kUsingStackBuffer) {
1762 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1763 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1764 // copy the stack buffer contents because it will be overwritten with
1765 // fUnion.fFields values
1766 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1767 oldArray = oldStackBuffer;
1768 } else {
1769 oldArray = NULL; // no need to copy from the stack buffer to itself
1770 }
1771 } else {
1772 oldArray = fUnion.fFields.fArray;
1773 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1774 }
1775
1776 // allocate a new array
1777 if(allocate(growCapacity) ||
1778 (newCapacity < growCapacity && allocate(newCapacity))
1779 ) {
1780 if(doCopyArray) {
1781 // copy the contents
1782 // do not copy more than what fits - it may be smaller than before
1783 int32_t minLength = oldLength;
1784 newCapacity = getCapacity();
1785 if(newCapacity < minLength) {
1786 minLength = newCapacity;
1787 }
1788 if(oldArray != NULL) {
1789 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1790 }
1791 setLength(minLength);
1792 } else {
1793 setZeroLength();
1794 }
1795
1796 // release the old array
1797 if(flags & kRefCounted) {
1798 // the array is refCounted; decrement and release if 0
1799 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1800 if(umtx_atomic_dec(pRefCount) == 0) {
1801 if(pBufferToDelete == 0) {
1802 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1803 // is defined as volatile. (Volatile has useful non-standard behavior
1804 // with this compiler.)
1805 uprv_free((void *)pRefCount);
1806 } else {
1807 // the caller requested to delete it himself
1808 *pBufferToDelete = (int32_t *)pRefCount;
1809 }
1810 }
1811 }
1812 } else {
1813 // not enough memory for growCapacity and not even for the smaller newCapacity
1814 // reset the old values for setToBogus() to release the array
1815 if(!(flags&kUsingStackBuffer)) {
1816 fUnion.fFields.fArray = oldArray;
1817 }
1818 fUnion.fFields.fLengthAndFlags = flags;
1819 setToBogus();
1820 return FALSE;
1821 }
1822 }
1823 return TRUE;
1824 }
1825
1826 // UnicodeStringAppendable ------------------------------------------------- ***
1827
1828 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1829
1830 UBool
1831 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1832 return str.doAppend(&c, 0, 1).isWritable();
1833 }
1834
1835 UBool
1836 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1837 UChar buffer[U16_MAX_LENGTH];
1838 int32_t cLength = 0;
1839 UBool isError = FALSE;
1840 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1841 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1842 }
1843
1844 UBool
1845 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1846 return str.doAppend(s, 0, length).isWritable();
1847 }
1848
1849 UBool
1850 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1851 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1852 }
1853
1854 UChar *
1855 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1856 int32_t desiredCapacityHint,
1857 UChar *scratch, int32_t scratchCapacity,
1858 int32_t *resultCapacity) {
1859 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1860 *resultCapacity = 0;
1861 return NULL;
1862 }
1863 int32_t oldLength = str.length();
1864 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1865 *resultCapacity = str.getCapacity() - oldLength;
1866 return str.getArrayStart() + oldLength;
1867 }
1868 *resultCapacity = scratchCapacity;
1869 return scratch;
1870 }
1871
1872 U_NAMESPACE_END
1873
1874 U_NAMESPACE_USE
1875
1876 U_CAPI int32_t U_EXPORT2
1877 uhash_hashUnicodeString(const UElement key) {
1878 const UnicodeString *str = (const UnicodeString*) key.pointer;
1879 return (str == NULL) ? 0 : str->hashCode();
1880 }
1881
1882 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1883 // does not depend on hashtable code.
1884 U_CAPI UBool U_EXPORT2
1885 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1886 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1887 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1888 if (str1 == str2) {
1889 return TRUE;
1890 }
1891 if (str1 == NULL || str2 == NULL) {
1892 return FALSE;
1893 }
1894 return *str1 == *str2;
1895 }
1896
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. Its only purpose is to have the
virtual vector deleting destructor defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file,
which keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif