ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / common / unistr.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ******************************************************************************
8 *
9 * File unistr.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 09/25/98 stephen Creation.
15 * 04/20/99 stephen Overhauled per 4/16 code review.
16 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18 * Replaceable.
19 * 06/25/01 grhoten Removed the dependency on iostream
20 ******************************************************************************
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/appendable.h"
25 #include "unicode/putil.h"
26 #include "cstring.h"
27 #include "cmemory.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/utf.h"
31 #include "unicode/utf16.h"
32 #include "uelement.h"
33 #include "ustr_imp.h"
34 #include "umutex.h"
35 #include "uassert.h"
36
37 #if 0
38
39 #include <iostream>
40 using namespace std;
41
42 //DEBUGGING
43 void
44 print(const UnicodeString& s,
45 const char *name)
46 {
47 UChar c;
48 cout << name << ":|";
49 for(int i = 0; i < s.length(); ++i) {
50 c = s[i];
51 if(c>= 0x007E || c < 0x0020)
52 cout << "[0x" << hex << s[i] << "]";
53 else
54 cout << (char) s[i];
55 }
56 cout << '|' << endl;
57 }
58
59 void
60 print(const UChar *s,
61 int32_t len,
62 const char *name)
63 {
64 UChar c;
65 cout << name << ":|";
66 for(int i = 0; i < len; ++i) {
67 c = s[i];
68 if(c>= 0x007E || c < 0x0020)
69 cout << "[0x" << hex << s[i] << "]";
70 else
71 cout << (char) s[i];
72 }
73 cout << '|' << endl;
74 }
75 // END DEBUGGING
76 #endif
77
78 // Local function definitions for now
79
80 // need to copy areas that may overlap
81 static
82 inline void
83 us_arrayCopy(const UChar *src, int32_t srcStart,
84 UChar *dst, int32_t dstStart, int32_t count)
85 {
86 if(count>0) {
87 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88 }
89 }
90
91 // u_unescapeAt() callback to get a UChar from a UnicodeString
92 U_CDECL_BEGIN
93 static UChar U_CALLCONV
94 UnicodeString_charAt(int32_t offset, void *context) {
95 return ((icu::UnicodeString*) context)->charAt(offset);
96 }
97 U_CDECL_END
98
99 U_NAMESPACE_BEGIN
100
101 /* The Replaceable virtual destructor can't be defined in the header
102 due to how AIX works with multiple definitions of virtual functions.
103 */
104 Replaceable::~Replaceable() {}
105
106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107
108 UnicodeString U_EXPORT2
109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110 return
111 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
112 append(s1).
113 append(s2);
114 }
115
116 //========================================
117 // Reference Counting functions, put at top of file so that optimizing compilers
118 // have a chance to automatically inline.
119 //========================================
120
121 void
122 UnicodeString::addRef() {
123 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124 }
125
126 int32_t
127 UnicodeString::removeRef() {
128 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
129 }
130
131 int32_t
132 UnicodeString::refCount() const {
133 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
134 }
135
136 void
137 UnicodeString::releaseArray() {
138 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140 }
141 }
142
143
144
145 //========================================
146 // Constructors
147 //========================================
148
149 // The default constructor is inline in unistr.h.
150
151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152 fUnion.fFields.fLengthAndFlags = 0;
153 if(count <= 0 || (uint32_t)c > 0x10ffff) {
154 // just allocate and do not do anything else
155 allocate(capacity);
156 } else if(c <= 0xffff) {
157 int32_t length = count;
158 if(capacity < length) {
159 capacity = length;
160 }
161 if(allocate(capacity)) {
162 UChar *array = getArrayStart();
163 UChar unit = (UChar)c;
164 for(int32_t i = 0; i < length; ++i) {
165 array[i] = unit;
166 }
167 setLength(length);
168 }
169 } else { // supplementary code point, write surrogate pairs
170 if(count > (INT32_MAX / 2)) {
171 // We would get more than 2G UChars.
172 allocate(capacity);
173 return;
174 }
175 int32_t length = count * 2;
176 if(capacity < length) {
177 capacity = length;
178 }
179 if(allocate(capacity)) {
180 UChar *array = getArrayStart();
181 UChar lead = U16_LEAD(c);
182 UChar trail = U16_TRAIL(c);
183 for(int32_t i = 0; i < length; i += 2) {
184 array[i] = lead;
185 array[i + 1] = trail;
186 }
187 setLength(length);
188 }
189 }
190 }
191
192 UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195 }
196
197 UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207 }
208
209 UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212 }
213
214 UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218 }
219
220 UnicodeString::UnicodeString(UBool isTerminated,
221 ConstChar16Ptr textPtr,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 const UChar *text = textPtr;
225 if(text == NULL) {
226 // treat as an empty string, do not alias
227 setToEmpty();
228 } else if(textLength < -1 ||
229 (textLength == -1 && !isTerminated) ||
230 (textLength >= 0 && isTerminated && text[textLength] != 0)
231 ) {
232 setToBogus();
233 } else {
234 if(textLength == -1) {
235 // text is terminated, or else it would have failed the above test
236 textLength = u_strlen(text);
237 }
238 setArray(const_cast<UChar *>(text), textLength,
239 isTerminated ? textLength + 1 : textLength);
240 }
241 }
242
243 UnicodeString::UnicodeString(UChar *buff,
244 int32_t buffLength,
245 int32_t buffCapacity) {
246 fUnion.fFields.fLengthAndFlags = kWritableAlias;
247 if(buff == NULL) {
248 // treat as an empty string, do not alias
249 setToEmpty();
250 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
251 setToBogus();
252 } else {
253 if(buffLength == -1) {
254 // fLength = u_strlen(buff); but do not look beyond buffCapacity
255 const UChar *p = buff, *limit = buff + buffCapacity;
256 while(p != limit && *p != 0) {
257 ++p;
258 }
259 buffLength = (int32_t)(p - buff);
260 }
261 setArray(buff, buffLength, buffCapacity);
262 }
263 }
264
265 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266 fUnion.fFields.fLengthAndFlags = kShortString;
267 if(src==NULL) {
268 // treat as an empty string
269 } else {
270 if(length<0) {
271 length=(int32_t)uprv_strlen(src);
272 }
273 if(cloneArrayIfNeeded(length, length, FALSE)) {
274 u_charsToUChars(src, getArrayStart(), length);
275 setLength(length);
276 } else {
277 setToBogus();
278 }
279 }
280 }
281
282 #if U_CHARSET_IS_UTF8
283
284 UnicodeString::UnicodeString(const char *codepageData) {
285 fUnion.fFields.fLengthAndFlags = kShortString;
286 if(codepageData != 0) {
287 setToUTF8(codepageData);
288 }
289 }
290
291 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292 fUnion.fFields.fLengthAndFlags = kShortString;
293 // if there's nothing to convert, do nothing
294 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
295 return;
296 }
297 if(dataLength == -1) {
298 dataLength = (int32_t)uprv_strlen(codepageData);
299 }
300 setToUTF8(StringPiece(codepageData, dataLength));
301 }
302
303 // else see unistr_cnv.cpp
304 #endif
305
306 UnicodeString::UnicodeString(const UnicodeString& that) {
307 fUnion.fFields.fLengthAndFlags = kShortString;
308 copyFrom(that);
309 }
310
311 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
312 copyFieldsFrom(src, TRUE);
313 }
314
315 UnicodeString::UnicodeString(const UnicodeString& that,
316 int32_t srcStart) {
317 fUnion.fFields.fLengthAndFlags = kShortString;
318 setTo(that, srcStart);
319 }
320
321 UnicodeString::UnicodeString(const UnicodeString& that,
322 int32_t srcStart,
323 int32_t srcLength) {
324 fUnion.fFields.fLengthAndFlags = kShortString;
325 setTo(that, srcStart, srcLength);
326 }
327
328 // Replaceable base class clone() default implementation, does not clone
329 Replaceable *
330 Replaceable::clone() const {
331 return NULL;
332 }
333
334 // UnicodeString overrides clone() with a real implementation
335 Replaceable *
336 UnicodeString::clone() const {
337 return new UnicodeString(*this);
338 }
339
340 //========================================
341 // array allocation
342 //========================================
343
namespace {

// Fixed part of the growth increment, in UChars.
const int32_t kGrowSize = 128;

// Upper bound for a string capacity, in UChars.
// One int32_t reference counter plus capacity UChars plus one UChar for the
// NUL terminator (so that getTerminatedBuffer() need not reallocate),
// rounded up to a multiple of 16 bytes, has to fit into a 32-bit size_t
// (at least on a 32-bit platform):
//   (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5
// (Without the rounding, more complicated checks could allow 0x7ffffffd,
// which does not seem worth the effort.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns the capacity to grow to for a string of newLength UChars:
// newLength plus a quarter of newLength plus kGrowSize,
// or kMaxCapacity if that sum would exceed kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t headroom = kMaxCapacity - newLength;
  const int32_t growSize = (newLength >> 2) + kGrowSize;
  return growSize <= headroom ? newLength + growSize : kMaxCapacity;
}

}  // namespace
367
368 UBool
369 UnicodeString::allocate(int32_t capacity) {
370 if(capacity <= US_STACKBUF_SIZE) {
371 fUnion.fFields.fLengthAndFlags = kShortString;
372 return TRUE;
373 }
374 if(capacity <= kMaxCapacity) {
375 ++capacity; // for the NUL
376 // Switch to size_t which is unsigned so that we can allocate up to 4GB.
377 // Reference counter + UChars.
378 size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
379 // Round up to a multiple of 16.
380 numBytes = (numBytes + 15) & ~15;
381 int32_t *array = (int32_t *) uprv_malloc(numBytes);
382 if(array != NULL) {
383 // set initial refCount and point behind the refCount
384 *array++ = 1;
385 numBytes -= sizeof(int32_t);
386
387 // have fArray point to the first UChar
388 fUnion.fFields.fArray = (UChar *)array;
389 fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
390 fUnion.fFields.fLengthAndFlags = kLongString;
391 return TRUE;
392 }
393 }
394 fUnion.fFields.fLengthAndFlags = kIsBogus;
395 fUnion.fFields.fArray = 0;
396 fUnion.fFields.fCapacity = 0;
397 return FALSE;
398 }
399
400 //========================================
401 // Destructor
402 //========================================
403
404 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
405 static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
406 static u_atomic_int32_t beyondCount(0);
407
408 U_CAPI void unistr_printLengths() {
409 int32_t i;
410 for(i = 0; i <= 59; ++i) {
411 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
412 }
413 int32_t beyond = beyondCount;
414 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
415 beyond += finalLengthCounts[i];
416 }
417 printf(">59, %9d\n", beyond);
418 }
419 #endif
420
421 UnicodeString::~UnicodeString()
422 {
423 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
424 // Count lengths of strings at the end of their lifetime.
425 // Useful for discussion of a desirable stack buffer size.
426 // Count the contents length, not the optional NUL terminator nor further capacity.
427 // Ignore open-buffer strings and strings which alias external storage.
428 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
429 if(hasShortLength()) {
430 umtx_atomic_inc(finalLengthCounts + getShortLength());
431 } else {
432 umtx_atomic_inc(&beyondCount);
433 }
434 }
435 #endif
436
437 releaseArray();
438 }
439
440 //========================================
441 // Factory methods
442 //========================================
443
444 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
445 UnicodeString result;
446 result.setToUTF8(utf8);
447 return result;
448 }
449
450 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
451 UnicodeString result;
452 int32_t capacity;
453 // Most UTF-32 strings will be BMP-only and result in a same-length
454 // UTF-16 string. We overestimate the capacity just slightly,
455 // just in case there are a few supplementary characters.
456 if(length <= US_STACKBUF_SIZE) {
457 capacity = US_STACKBUF_SIZE;
458 } else {
459 capacity = length + (length >> 4) + 4;
460 }
461 do {
462 UChar *utf16 = result.getBuffer(capacity);
463 int32_t length16;
464 UErrorCode errorCode = U_ZERO_ERROR;
465 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
466 utf32, length,
467 0xfffd, // Substitution character.
468 NULL, // Don't care about number of substitutions.
469 &errorCode);
470 result.releaseBuffer(length16);
471 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
472 capacity = length16 + 1; // +1 for the terminating NUL.
473 continue;
474 } else if(U_FAILURE(errorCode)) {
475 result.setToBogus();
476 }
477 break;
478 } while(TRUE);
479 return result;
480 }
481
482 //========================================
483 // Assignment
484 //========================================
485
486 UnicodeString &
487 UnicodeString::operator=(const UnicodeString &src) {
488 return copyFrom(src);
489 }
490
491 UnicodeString &
492 UnicodeString::fastCopyFrom(const UnicodeString &src) {
493 return copyFrom(src, TRUE);
494 }
495
496 UnicodeString &
497 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
498 // if assigning to ourselves, do nothing
499 if(this == &src) {
500 return *this;
501 }
502
503 // is the right side bogus?
504 if(src.isBogus()) {
505 setToBogus();
506 return *this;
507 }
508
509 // delete the current contents
510 releaseArray();
511
512 if(src.isEmpty()) {
513 // empty string - use the stack buffer
514 setToEmpty();
515 return *this;
516 }
517
518 // fLength>0 and not an "open" src.getBuffer(minCapacity)
519 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
520 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
521 case kShortString:
522 // short string using the stack buffer, do the same
523 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
524 getShortLength() * U_SIZEOF_UCHAR);
525 break;
526 case kLongString:
527 // src uses a refCounted string buffer, use that buffer with refCount
528 // src is const, use a cast - we don't actually change it
529 ((UnicodeString &)src).addRef();
530 // copy all fields, share the reference-counted buffer
531 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
532 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
533 if(!hasShortLength()) {
534 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
535 }
536 break;
537 case kReadonlyAlias:
538 if(fastCopy) {
539 // src is a readonly alias, do the same
540 // -> maintain the readonly alias as such
541 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
542 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
543 if(!hasShortLength()) {
544 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
545 }
546 break;
547 }
548 // else if(!fastCopy) fall through to case kWritableAlias
549 // -> allocate a new buffer and copy the contents
550 U_FALLTHROUGH;
551 case kWritableAlias: {
552 // src is a writable alias; we make a copy of that instead
553 int32_t srcLength = src.length();
554 if(allocate(srcLength)) {
555 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
556 setLength(srcLength);
557 break;
558 }
559 // if there is not enough memory, then fall through to setting to bogus
560 U_FALLTHROUGH;
561 }
562 default:
563 // if src is bogus, set ourselves to bogus
564 // do not call setToBogus() here because fArray and flags are not consistent here
565 fUnion.fFields.fLengthAndFlags = kIsBogus;
566 fUnion.fFields.fArray = 0;
567 fUnion.fFields.fCapacity = 0;
568 break;
569 }
570
571 return *this;
572 }
573
574 UnicodeString &UnicodeString::operator=(UnicodeString &&src) U_NOEXCEPT {
575 // No explicit check for self move assignment, consistent with standard library.
576 // Self move assignment causes no crash nor leak but might make the object bogus.
577 releaseArray();
578 copyFieldsFrom(src, TRUE);
579 return *this;
580 }
581
582 // Same as move assignment except without memory management.
583 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
584 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
585 if(lengthAndFlags & kUsingStackBuffer) {
586 // Short string using the stack buffer, copy the contents.
587 // Check for self assignment to prevent "overlap in memcpy" warnings,
588 // although it should be harmless to copy a buffer to itself exactly.
589 if(this != &src) {
590 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
591 getShortLength() * U_SIZEOF_UCHAR);
592 }
593 } else {
594 // In all other cases, copy all fields.
595 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
596 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
597 if(!hasShortLength()) {
598 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
599 }
600 if(setSrcToBogus) {
601 // Set src to bogus without releasing any memory.
602 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
603 src.fUnion.fFields.fArray = NULL;
604 src.fUnion.fFields.fCapacity = 0;
605 }
606 }
607 }
608
609 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
610 UnicodeString temp; // Empty short string: Known not to need releaseArray().
611 // Copy fields without resetting source values in between.
612 temp.copyFieldsFrom(*this, FALSE);
613 this->copyFieldsFrom(other, FALSE);
614 other.copyFieldsFrom(temp, FALSE);
615 // Set temp to an empty string so that other's memory is not released twice.
616 temp.fUnion.fFields.fLengthAndFlags = kShortString;
617 }
618
619 //========================================
620 // Miscellaneous operations
621 //========================================
622
623 UnicodeString UnicodeString::unescape() const {
624 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
625 if (result.isBogus()) {
626 return result;
627 }
628 const UChar *array = getBuffer();
629 int32_t len = length();
630 int32_t prev = 0;
631 for (int32_t i=0;;) {
632 if (i == len) {
633 result.append(array, prev, len - prev);
634 break;
635 }
636 if (array[i++] == 0x5C /*'\\'*/) {
637 result.append(array, prev, (i - 1) - prev);
638 UChar32 c = unescapeAt(i); // advances i
639 if (c < 0) {
640 result.remove(); // return empty string
641 break; // invalid escape sequence
642 }
643 result.append(c);
644 prev = i;
645 }
646 }
647 return result;
648 }
649
650 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
651 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
652 }
653
654 //========================================
655 // Read-only implementation
656 //========================================
657 UBool
658 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
659 // Requires: this & text not bogus and have same lengths.
660 // Byte-wise comparison works for equality regardless of endianness.
661 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
662 }
663
664 int8_t
665 UnicodeString::doCompare( int32_t start,
666 int32_t length,
667 const UChar *srcChars,
668 int32_t srcStart,
669 int32_t srcLength) const
670 {
671 // compare illegal string values
672 if(isBogus()) {
673 return -1;
674 }
675
676 // pin indices to legal values
677 pinIndices(start, length);
678
679 if(srcChars == NULL) {
680 // treat const UChar *srcChars==NULL as an empty string
681 return length == 0 ? 0 : 1;
682 }
683
684 // get the correct pointer
685 const UChar *chars = getArrayStart();
686
687 chars += start;
688 srcChars += srcStart;
689
690 int32_t minLength;
691 int8_t lengthResult;
692
693 // get the srcLength if necessary
694 if(srcLength < 0) {
695 srcLength = u_strlen(srcChars + srcStart);
696 }
697
698 // are we comparing different lengths?
699 if(length != srcLength) {
700 if(length < srcLength) {
701 minLength = length;
702 lengthResult = -1;
703 } else {
704 minLength = srcLength;
705 lengthResult = 1;
706 }
707 } else {
708 minLength = length;
709 lengthResult = 0;
710 }
711
712 /*
713 * note that uprv_memcmp() returns an int but we return an int8_t;
714 * we need to take care not to truncate the result -
715 * one way to do this is to right-shift the value to
716 * move the sign bit into the lower 8 bits and making sure that this
717 * does not become 0 itself
718 */
719
720 if(minLength > 0 && chars != srcChars) {
721 int32_t result;
722
723 # if U_IS_BIG_ENDIAN
724 // big-endian: byte comparison works
725 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
726 if(result != 0) {
727 return (int8_t)(result >> 15 | 1);
728 }
729 # else
730 // little-endian: compare UChar units
731 do {
732 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
733 if(result != 0) {
734 return (int8_t)(result >> 15 | 1);
735 }
736 } while(--minLength > 0);
737 # endif
738 }
739 return lengthResult;
740 }
741
742 /* String compare in code point order - doCompare() compares in code unit order. */
743 int8_t
744 UnicodeString::doCompareCodePointOrder(int32_t start,
745 int32_t length,
746 const UChar *srcChars,
747 int32_t srcStart,
748 int32_t srcLength) const
749 {
750 // compare illegal string values
751 // treat const UChar *srcChars==NULL as an empty string
752 if(isBogus()) {
753 return -1;
754 }
755
756 // pin indices to legal values
757 pinIndices(start, length);
758
759 if(srcChars == NULL) {
760 srcStart = srcLength = 0;
761 }
762
763 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
764 /* translate the 32-bit result into an 8-bit one */
765 if(diff!=0) {
766 return (int8_t)(diff >> 15 | 1);
767 } else {
768 return 0;
769 }
770 }
771
772 int32_t
773 UnicodeString::getLength() const {
774 return length();
775 }
776
777 UChar
778 UnicodeString::getCharAt(int32_t offset) const {
779 return charAt(offset);
780 }
781
782 UChar32
783 UnicodeString::getChar32At(int32_t offset) const {
784 return char32At(offset);
785 }
786
787 UChar32
788 UnicodeString::char32At(int32_t offset) const
789 {
790 int32_t len = length();
791 if((uint32_t)offset < (uint32_t)len) {
792 const UChar *array = getArrayStart();
793 UChar32 c;
794 U16_GET(array, 0, offset, len, c);
795 return c;
796 } else {
797 return kInvalidUChar;
798 }
799 }
800
801 int32_t
802 UnicodeString::getChar32Start(int32_t offset) const {
803 if((uint32_t)offset < (uint32_t)length()) {
804 const UChar *array = getArrayStart();
805 U16_SET_CP_START(array, 0, offset);
806 return offset;
807 } else {
808 return 0;
809 }
810 }
811
812 int32_t
813 UnicodeString::getChar32Limit(int32_t offset) const {
814 int32_t len = length();
815 if((uint32_t)offset < (uint32_t)len) {
816 const UChar *array = getArrayStart();
817 U16_SET_CP_LIMIT(array, 0, offset, len);
818 return offset;
819 } else {
820 return len;
821 }
822 }
823
824 int32_t
825 UnicodeString::countChar32(int32_t start, int32_t length) const {
826 pinIndices(start, length);
827 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
828 return u_countChar32(getArrayStart()+start, length);
829 }
830
831 UBool
832 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
833 pinIndices(start, length);
834 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
835 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
836 }
837
838 int32_t
839 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
840 // pin index
841 int32_t len = length();
842 if(index<0) {
843 index=0;
844 } else if(index>len) {
845 index=len;
846 }
847
848 const UChar *array = getArrayStart();
849 if(delta>0) {
850 U16_FWD_N(array, index, len, delta);
851 } else {
852 U16_BACK_N(array, 0, index, -delta);
853 }
854
855 return index;
856 }
857
858 void
859 UnicodeString::doExtract(int32_t start,
860 int32_t length,
861 UChar *dst,
862 int32_t dstStart) const
863 {
864 // pin indices to legal values
865 pinIndices(start, length);
866
867 // do not copy anything if we alias dst itself
868 const UChar *array = getArrayStart();
869 if(array + start != dst + dstStart) {
870 us_arrayCopy(array, start, dst, dstStart, length);
871 }
872 }
873
874 int32_t
875 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
876 UErrorCode &errorCode) const {
877 int32_t len = length();
878 if(U_SUCCESS(errorCode)) {
879 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
880 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
881 } else {
882 const UChar *array = getArrayStart();
883 if(len>0 && len<=destCapacity && array!=dest) {
884 u_memcpy(dest, array, len);
885 }
886 return u_terminateUChars(dest, destCapacity, len, &errorCode);
887 }
888 }
889
890 return len;
891 }
892
893 int32_t
894 UnicodeString::extract(int32_t start,
895 int32_t length,
896 char *target,
897 int32_t targetCapacity,
898 enum EInvariant) const
899 {
900 // if the arguments are illegal, then do nothing
901 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
902 return 0;
903 }
904
905 // pin the indices to legal values
906 pinIndices(start, length);
907
908 if(length <= targetCapacity) {
909 u_UCharsToChars(getArrayStart() + start, target, length);
910 }
911 UErrorCode status = U_ZERO_ERROR;
912 return u_terminateChars(target, targetCapacity, length, &status);
913 }
914
915 UnicodeString
916 UnicodeString::tempSubString(int32_t start, int32_t len) const {
917 pinIndices(start, len);
918 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
919 if(array==NULL) {
920 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
921 len=-2; // bogus result string
922 }
923 return UnicodeString(FALSE, array + start, len);
924 }
925
926 int32_t
927 UnicodeString::toUTF8(int32_t start, int32_t len,
928 char *target, int32_t capacity) const {
929 pinIndices(start, len);
930 int32_t length8;
931 UErrorCode errorCode = U_ZERO_ERROR;
932 u_strToUTF8WithSub(target, capacity, &length8,
933 getBuffer() + start, len,
934 0xFFFD, // Standard substitution character.
935 NULL, // Don't care about number of substitutions.
936 &errorCode);
937 return length8;
938 }
939
940 #if U_CHARSET_IS_UTF8
941
942 int32_t
943 UnicodeString::extract(int32_t start, int32_t len,
944 char *target, uint32_t dstSize) const {
945 // if the arguments are illegal, then do nothing
946 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
947 return 0;
948 }
949 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
950 }
951
952 // else see unistr_cnv.cpp
953 #endif
954
955 void
956 UnicodeString::extractBetween(int32_t start,
957 int32_t limit,
958 UnicodeString& target) const {
959 pinIndex(start);
960 pinIndex(limit);
961 doExtract(start, limit - start, target);
962 }
963
964 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
965 // as many bytes as the source has UChars.
966 // The "worst cases" are writing systems like Indic, Thai and CJK with
967 // 3:1 bytes:UChars.
968 void
969 UnicodeString::toUTF8(ByteSink &sink) const {
970 int32_t length16 = length();
971 if(length16 != 0) {
972 char stackBuffer[1024];
973 int32_t capacity = (int32_t)sizeof(stackBuffer);
974 UBool utf8IsOwned = FALSE;
975 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
976 3*length16,
977 stackBuffer, capacity,
978 &capacity);
979 int32_t length8 = 0;
980 UErrorCode errorCode = U_ZERO_ERROR;
981 u_strToUTF8WithSub(utf8, capacity, &length8,
982 getBuffer(), length16,
983 0xFFFD, // Standard substitution character.
984 NULL, // Don't care about number of substitutions.
985 &errorCode);
986 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
987 utf8 = (char *)uprv_malloc(length8);
988 if(utf8 != NULL) {
989 utf8IsOwned = TRUE;
990 errorCode = U_ZERO_ERROR;
991 u_strToUTF8WithSub(utf8, length8, &length8,
992 getBuffer(), length16,
993 0xFFFD, // Standard substitution character.
994 NULL, // Don't care about number of substitutions.
995 &errorCode);
996 } else {
997 errorCode = U_MEMORY_ALLOCATION_ERROR;
998 }
999 }
1000 if(U_SUCCESS(errorCode)) {
1001 sink.Append(utf8, length8);
1002 sink.Flush();
1003 }
1004 if(utf8IsOwned) {
1005 uprv_free(utf8);
1006 }
1007 }
1008 }
1009
1010 int32_t
1011 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1012 int32_t length32=0;
1013 if(U_SUCCESS(errorCode)) {
1014 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1015 u_strToUTF32WithSub(utf32, capacity, &length32,
1016 getBuffer(), length(),
1017 0xfffd, // Substitution character.
1018 NULL, // Don't care about number of substitutions.
1019 &errorCode);
1020 }
1021 return length32;
1022 }
1023
1024 int32_t
1025 UnicodeString::indexOf(const UChar *srcChars,
1026 int32_t srcStart,
1027 int32_t srcLength,
1028 int32_t start,
1029 int32_t length) const
1030 {
1031 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1032 return -1;
1033 }
1034
1035 // UnicodeString does not find empty substrings
1036 if(srcLength < 0 && srcChars[srcStart] == 0) {
1037 return -1;
1038 }
1039
1040 // get the indices within bounds
1041 pinIndices(start, length);
1042
1043 // find the first occurrence of the substring
1044 const UChar *array = getArrayStart();
1045 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1046 if(match == NULL) {
1047 return -1;
1048 } else {
1049 return (int32_t)(match - array);
1050 }
1051 }
1052
1053 int32_t
1054 UnicodeString::doIndexOf(UChar c,
1055 int32_t start,
1056 int32_t length) const
1057 {
1058 // pin indices
1059 pinIndices(start, length);
1060
1061 // find the first occurrence of c
1062 const UChar *array = getArrayStart();
1063 const UChar *match = u_memchr(array + start, c, length);
1064 if(match == NULL) {
1065 return -1;
1066 } else {
1067 return (int32_t)(match - array);
1068 }
1069 }
1070
1071 int32_t
1072 UnicodeString::doIndexOf(UChar32 c,
1073 int32_t start,
1074 int32_t length) const {
1075 // pin indices
1076 pinIndices(start, length);
1077
1078 // find the first occurrence of c
1079 const UChar *array = getArrayStart();
1080 const UChar *match = u_memchr32(array + start, c, length);
1081 if(match == NULL) {
1082 return -1;
1083 } else {
1084 return (int32_t)(match - array);
1085 }
1086 }
1087
1088 int32_t
1089 UnicodeString::lastIndexOf(const UChar *srcChars,
1090 int32_t srcStart,
1091 int32_t srcLength,
1092 int32_t start,
1093 int32_t length) const
1094 {
1095 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1096 return -1;
1097 }
1098
1099 // UnicodeString does not find empty substrings
1100 if(srcLength < 0 && srcChars[srcStart] == 0) {
1101 return -1;
1102 }
1103
1104 // get the indices within bounds
1105 pinIndices(start, length);
1106
1107 // find the last occurrence of the substring
1108 const UChar *array = getArrayStart();
1109 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1110 if(match == NULL) {
1111 return -1;
1112 } else {
1113 return (int32_t)(match - array);
1114 }
1115 }
1116
1117 int32_t
1118 UnicodeString::doLastIndexOf(UChar c,
1119 int32_t start,
1120 int32_t length) const
1121 {
1122 if(isBogus()) {
1123 return -1;
1124 }
1125
1126 // pin indices
1127 pinIndices(start, length);
1128
1129 // find the last occurrence of c
1130 const UChar *array = getArrayStart();
1131 const UChar *match = u_memrchr(array + start, c, length);
1132 if(match == NULL) {
1133 return -1;
1134 } else {
1135 return (int32_t)(match - array);
1136 }
1137 }
1138
1139 int32_t
1140 UnicodeString::doLastIndexOf(UChar32 c,
1141 int32_t start,
1142 int32_t length) const {
1143 // pin indices
1144 pinIndices(start, length);
1145
1146 // find the last occurrence of c
1147 const UChar *array = getArrayStart();
1148 const UChar *match = u_memrchr32(array + start, c, length);
1149 if(match == NULL) {
1150 return -1;
1151 } else {
1152 return (int32_t)(match - array);
1153 }
1154 }
1155
1156 //========================================
1157 // Write implementation
1158 //========================================
1159
1160 UnicodeString&
1161 UnicodeString::findAndReplace(int32_t start,
1162 int32_t length,
1163 const UnicodeString& oldText,
1164 int32_t oldStart,
1165 int32_t oldLength,
1166 const UnicodeString& newText,
1167 int32_t newStart,
1168 int32_t newLength)
1169 {
1170 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1171 return *this;
1172 }
1173
1174 pinIndices(start, length);
1175 oldText.pinIndices(oldStart, oldLength);
1176 newText.pinIndices(newStart, newLength);
1177
1178 if(oldLength == 0) {
1179 return *this;
1180 }
1181
1182 while(length > 0 && length >= oldLength) {
1183 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1184 if(pos < 0) {
1185 // no more oldText's here: done
1186 break;
1187 } else {
1188 // we found oldText, replace it by newText and go beyond it
1189 replace(pos, oldLength, newText, newStart, newLength);
1190 length -= pos + oldLength - start;
1191 start = pos + newLength;
1192 }
1193 }
1194
1195 return *this;
1196 }
1197
1198
1199 void
1200 UnicodeString::setToBogus()
1201 {
1202 releaseArray();
1203
1204 fUnion.fFields.fLengthAndFlags = kIsBogus;
1205 fUnion.fFields.fArray = 0;
1206 fUnion.fFields.fCapacity = 0;
1207 }
1208
1209 // turn a bogus string into an empty one
1210 void
1211 UnicodeString::unBogus() {
1212 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1213 setToEmpty();
1214 }
1215 }
1216
1217 const char16_t *
1218 UnicodeString::getTerminatedBuffer() {
1219 if(!isWritable()) {
1220 return nullptr;
1221 }
1222 UChar *array = getArrayStart();
1223 int32_t len = length();
1224 if(len < getCapacity()) {
1225 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1226 // If len<capacity on a read-only alias, then array[len] is
1227 // either the original NUL (if constructed with (TRUE, s, length))
1228 // or one of the original string contents characters (if later truncated),
1229 // therefore we can assume that array[len] is initialized memory.
1230 if(array[len] == 0) {
1231 return array;
1232 }
1233 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1234 // kRefCounted: Do not write the NUL if the buffer is shared.
1235 // That is mostly safe, except when the length of one copy was modified
1236 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1237 // Then the NUL would be written into the middle of another copy's string.
1238
1239 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1240 // Do not test if there is a NUL already because it might be uninitialized memory.
1241 // (That would be safe, but tools like valgrind & Purify would complain.)
1242 array[len] = 0;
1243 return array;
1244 }
1245 }
1246 if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1247 array = getArrayStart();
1248 array[len] = 0;
1249 return array;
1250 } else {
1251 return nullptr;
1252 }
1253 }
1254
1255 // setTo() analogous to the readonly-aliasing constructor with the same signature
1256 UnicodeString &
1257 UnicodeString::setTo(UBool isTerminated,
1258 ConstChar16Ptr textPtr,
1259 int32_t textLength)
1260 {
1261 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1262 // do not modify a string that has an "open" getBuffer(minCapacity)
1263 return *this;
1264 }
1265
1266 const UChar *text = textPtr;
1267 if(text == NULL) {
1268 // treat as an empty string, do not alias
1269 releaseArray();
1270 setToEmpty();
1271 return *this;
1272 }
1273
1274 if( textLength < -1 ||
1275 (textLength == -1 && !isTerminated) ||
1276 (textLength >= 0 && isTerminated && text[textLength] != 0)
1277 ) {
1278 setToBogus();
1279 return *this;
1280 }
1281
1282 releaseArray();
1283
1284 if(textLength == -1) {
1285 // text is terminated, or else it would have failed the above test
1286 textLength = u_strlen(text);
1287 }
1288 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1289 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1290 return *this;
1291 }
1292
1293 // setTo() analogous to the writable-aliasing constructor with the same signature
1294 UnicodeString &
1295 UnicodeString::setTo(UChar *buffer,
1296 int32_t buffLength,
1297 int32_t buffCapacity) {
1298 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1299 // do not modify a string that has an "open" getBuffer(minCapacity)
1300 return *this;
1301 }
1302
1303 if(buffer == NULL) {
1304 // treat as an empty string, do not alias
1305 releaseArray();
1306 setToEmpty();
1307 return *this;
1308 }
1309
1310 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1311 setToBogus();
1312 return *this;
1313 } else if(buffLength == -1) {
1314 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1315 const UChar *p = buffer, *limit = buffer + buffCapacity;
1316 while(p != limit && *p != 0) {
1317 ++p;
1318 }
1319 buffLength = (int32_t)(p - buffer);
1320 }
1321
1322 releaseArray();
1323
1324 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1325 setArray(buffer, buffLength, buffCapacity);
1326 return *this;
1327 }
1328
1329 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1330 unBogus();
1331 int32_t length = utf8.length();
1332 int32_t capacity;
1333 // The UTF-16 string will be at most as long as the UTF-8 string.
1334 if(length <= US_STACKBUF_SIZE) {
1335 capacity = US_STACKBUF_SIZE;
1336 } else {
1337 capacity = length + 1; // +1 for the terminating NUL.
1338 }
1339 UChar *utf16 = getBuffer(capacity);
1340 int32_t length16;
1341 UErrorCode errorCode = U_ZERO_ERROR;
1342 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1343 utf8.data(), length,
1344 0xfffd, // Substitution character.
1345 NULL, // Don't care about number of substitutions.
1346 &errorCode);
1347 releaseBuffer(length16);
1348 if(U_FAILURE(errorCode)) {
1349 setToBogus();
1350 }
1351 return *this;
1352 }
1353
1354 UnicodeString&
1355 UnicodeString::setCharAt(int32_t offset,
1356 UChar c)
1357 {
1358 int32_t len = length();
1359 if(cloneArrayIfNeeded() && len > 0) {
1360 if(offset < 0) {
1361 offset = 0;
1362 } else if(offset >= len) {
1363 offset = len - 1;
1364 }
1365
1366 getArrayStart()[offset] = c;
1367 }
1368 return *this;
1369 }
1370
1371 UnicodeString&
1372 UnicodeString::replace(int32_t start,
1373 int32_t _length,
1374 UChar32 srcChar) {
1375 UChar buffer[U16_MAX_LENGTH];
1376 int32_t count = 0;
1377 UBool isError = FALSE;
1378 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1379 // We test isError so that the compiler does not complain that we don't.
1380 // If isError (srcChar is not a valid code point) then count==0 which means
1381 // we remove the source segment rather than replacing it with srcChar.
1382 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1383 }
1384
1385 UnicodeString&
1386 UnicodeString::append(UChar32 srcChar) {
1387 UChar buffer[U16_MAX_LENGTH];
1388 int32_t _length = 0;
1389 UBool isError = FALSE;
1390 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1391 // We test isError so that the compiler does not complain that we don't.
1392 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1393 return isError ? *this : doAppend(buffer, 0, _length);
1394 }
1395
1396 UnicodeString&
1397 UnicodeString::doReplace( int32_t start,
1398 int32_t length,
1399 const UnicodeString& src,
1400 int32_t srcStart,
1401 int32_t srcLength)
1402 {
1403 // pin the indices to legal values
1404 src.pinIndices(srcStart, srcLength);
1405
1406 // get the characters from src
1407 // and replace the range in ourselves with them
1408 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1409 }
1410
// Replaces the range [start, start + length) of this string with srcLength
// code units read from srcChars + srcStart, and returns *this.
//
// Behavior implemented below:
//  - Returns unchanged if this string is not writable.
//  - With a read-only buffer and an empty replacement, removing a prefix or a
//    suffix only adjusts the array pointer/capacity or the length.
//  - If start equals the current length, the call is forwarded to doAppend().
//  - A NULL srcChars is treated as an empty source; a negative srcLength is
//    replaced by u_strlen(srcChars + srcStart).
//  - If the resulting length would exceed INT32_MAX, this string is set to bogus.
//  - If the source overlaps this string's own writable buffer, the source is
//    first copied into a temporary UnicodeString and the replace starts over.
//  - If cloneArrayIfNeeded() fails, the function returns without replacing.
1411 UnicodeString&
1412 UnicodeString::doReplace(int32_t start,
1413 int32_t length,
1414 const UChar *srcChars,
1415 int32_t srcStart,
1416 int32_t srcLength)
1417 {
1418 if(!isWritable()) {
1419 return *this;
1420 }
1421
1422 int32_t oldLength = this->length();
1423
1424 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1425 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1426 if(start == 0) {
1427 // remove prefix by adjusting the array pointer
1428 pinIndex(length);
1429 fUnion.fFields.fArray += length;
1430 fUnion.fFields.fCapacity -= length;
1431 setLength(oldLength - length);
1432 return *this;
1433 } else {
1434 pinIndex(start);
1435 if(length >= (oldLength - start)) {
1436 // remove suffix by reducing the length (like truncate())
1437 setLength(start);
1438 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1439 return *this;
1440 }
1441 }
1442 }
1443
// Replacing at the very end is an append; doAppend() handles that case.
1444 if(start == oldLength) {
1445 return doAppend(srcChars, srcStart, srcLength);
1446 }
1447
1448 if(srcChars == 0) {
1449 srcLength = 0;
1450 } else {
1451 // Perform all remaining operations relative to srcChars + srcStart.
1452 // From this point forward, do not use srcStart.
1453 srcChars += srcStart;
1454 if (srcLength < 0) {
1455 // get the srcLength if necessary
1456 srcLength = u_strlen(srcChars);
1457 }
1458 }
1459
1460 // pin the indices to legal values
1461 pinIndices(start, length);
1462
1463 // Calculate the size of the string after the replace.
1464 // Avoid int32_t overflow.
1465 int32_t newLength = oldLength - length;
1466 if(srcLength > (INT32_MAX - newLength)) {
1467 setToBogus();
1468 return *this;
1469 }
1470 newLength += srcLength;
1471
1472 // Check for insertion into ourself
1473 const UChar *oldArray = getArrayStart();
1474 if (isBufferWritable() &&
1475 oldArray < srcChars + srcLength &&
1476 srcChars < oldArray + oldLength) {
1477 // Copy into a new UnicodeString and start over
1478 UnicodeString copy(srcChars, srcLength);
1479 if (copy.isBogus()) {
1480 setToBogus();
1481 return *this;
1482 }
1483 return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
1484 }
1485
1486 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1487 // therefore we need to keep the current fArray
1488 UChar oldStackBuffer[US_STACKBUF_SIZE];
1489 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1490 // copy the stack buffer contents because it will be overwritten with
1491 // fUnion.fFields values
1492 u_memcpy(oldStackBuffer, oldArray, oldLength);
1493 oldArray = oldStackBuffer;
1494 }
1495
1496 // clone our array and allocate a bigger array if needed
1497 int32_t *bufferToDelete = 0;
1498 if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
1499 FALSE, &bufferToDelete)
1500 ) {
1501 return *this;
1502 }
1503
1504 // now do the replace
1505
1506 UChar *newArray = getArrayStart();
1507 if(newArray != oldArray) {
1508 // if fArray changed, then we need to copy everything except what will change
1509 us_arrayCopy(oldArray, 0, newArray, 0, start);
1510 us_arrayCopy(oldArray, start + length,
1511 newArray, start + srcLength,
1512 oldLength - (start + length));
1513 } else if(length != srcLength) {
1514 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1515 us_arrayCopy(oldArray, start + length,
1516 newArray, start + srcLength,
1517 oldLength - (start + length));
1518 }
1519
1520 // now fill in the hole with the new string
1521 us_arrayCopy(srcChars, 0, newArray, start, srcLength);
1522
1523 setLength(newLength);
1524
1525 // delayed delete in case srcChars == fArray when we started, and
1526 // to keep oldArray alive for the above operations
1527 if (bufferToDelete) {
1528 uprv_free(bufferToDelete);
1529 }
1530
1531 return *this;
1532 }
1533
1534 // Versions of doReplace() only for append() variants.
1535 // doReplace() and doAppend() optimize for different cases.
1536
1537 UnicodeString&
1538 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1539 if(srcLength == 0) {
1540 return *this;
1541 }
1542
1543 // pin the indices to legal values
1544 src.pinIndices(srcStart, srcLength);
1545 return doAppend(src.getArrayStart(), srcStart, srcLength);
1546 }
1547
1548 UnicodeString&
1549 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1550 if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1551 return *this;
1552 }
1553
1554 // Perform all remaining operations relative to srcChars + srcStart.
1555 // From this point forward, do not use srcStart.
1556 srcChars += srcStart;
1557
1558 if(srcLength < 0) {
1559 // get the srcLength if necessary
1560 if((srcLength = u_strlen(srcChars)) == 0) {
1561 return *this;
1562 }
1563 }
1564
1565 int32_t oldLength = length();
1566 int32_t newLength = oldLength + srcLength;
1567
1568 // Check for append onto ourself
1569 const UChar* oldArray = getArrayStart();
1570 if (isBufferWritable() &&
1571 oldArray < srcChars + srcLength &&
1572 srcChars < oldArray + oldLength) {
1573 // Copy into a new UnicodeString and start over
1574 UnicodeString copy(srcChars, srcLength);
1575 if (copy.isBogus()) {
1576 setToBogus();
1577 return *this;
1578 }
1579 return doAppend(copy.getArrayStart(), 0, srcLength);
1580 }
1581
1582 // optimize append() onto a large-enough, owned string
1583 if((newLength <= getCapacity() && isBufferWritable()) ||
1584 cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1585 UChar *newArray = getArrayStart();
1586 // Do not copy characters when
1587 // UChar *buffer=str.getAppendBuffer(...);
1588 // is followed by
1589 // str.append(buffer, length);
1590 // or
1591 // str.appendString(buffer, length)
1592 // or similar.
1593 if(srcChars != newArray + oldLength) {
1594 us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
1595 }
1596 setLength(newLength);
1597 }
1598 return *this;
1599 }
1600
1601 /**
1602 * Replaceable API
1603 */
1604 void
1605 UnicodeString::handleReplaceBetween(int32_t start,
1606 int32_t limit,
1607 const UnicodeString& text) {
1608 replaceBetween(start, limit, text);
1609 }
1610
1611 /**
1612 * Replaceable API
1613 */
1614 void
1615 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1616 if (limit <= start) {
1617 return; // Nothing to do; avoid bogus malloc call
1618 }
1619 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1620 // Check to make sure text is not null.
1621 if (text != NULL) {
1622 extractBetween(start, limit, text, 0);
1623 insert(dest, text, 0, limit - start);
1624 uprv_free(text);
1625 }
1626 }
1627
1628 /**
1629 * Replaceable API
1630 *
1631 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1632 * so we implement this function here.
1633 */
1634 UBool Replaceable::hasMetaData() const {
1635 return TRUE;
1636 }
1637
1638 /**
1639 * Replaceable API
1640 */
1641 UBool UnicodeString::hasMetaData() const {
1642 return FALSE;
1643 }
1644
1645 UnicodeString&
1646 UnicodeString::doReverse(int32_t start, int32_t length) {
1647 if(length <= 1 || !cloneArrayIfNeeded()) {
1648 return *this;
1649 }
1650
1651 // pin the indices to legal values
1652 pinIndices(start, length);
1653 if(length <= 1) { // pinIndices() might have shrunk the length
1654 return *this;
1655 }
1656
1657 UChar *left = getArrayStart() + start;
1658 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1659 UChar swap;
1660 UBool hasSupplementary = FALSE;
1661
1662 // Before the loop we know left<right because length>=2.
1663 do {
1664 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1665 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1666 *right-- = swap;
1667 } while(left < right);
1668 // Make sure to test the middle code unit of an odd-length string.
1669 // Redundant if the length is even.
1670 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1671
1672 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1673 if(hasSupplementary) {
1674 UChar swap2;
1675
1676 left = getArrayStart() + start;
1677 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1678 while(left < right) {
1679 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1680 *left++ = swap2;
1681 *left++ = swap;
1682 } else {
1683 ++left;
1684 }
1685 }
1686 }
1687
1688 return *this;
1689 }
1690
1691 UBool
1692 UnicodeString::padLeading(int32_t targetLength,
1693 UChar padChar)
1694 {
1695 int32_t oldLength = length();
1696 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1697 return FALSE;
1698 } else {
1699 // move contents up by padding width
1700 UChar *array = getArrayStart();
1701 int32_t start = targetLength - oldLength;
1702 us_arrayCopy(array, 0, array, start, oldLength);
1703
1704 // fill in padding character
1705 while(--start >= 0) {
1706 array[start] = padChar;
1707 }
1708 setLength(targetLength);
1709 return TRUE;
1710 }
1711 }
1712
1713 UBool
1714 UnicodeString::padTrailing(int32_t targetLength,
1715 UChar padChar)
1716 {
1717 int32_t oldLength = length();
1718 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1719 return FALSE;
1720 } else {
1721 // fill in padding character
1722 UChar *array = getArrayStart();
1723 int32_t length = targetLength;
1724 while(--length >= oldLength) {
1725 array[length] = padChar;
1726 }
1727 setLength(targetLength);
1728 return TRUE;
1729 }
1730 }
1731
1732 //========================================
1733 // Hashing
1734 //========================================
1735 int32_t
1736 UnicodeString::doHashCode() const
1737 {
1738 /* Delegate hash computation to uhash. This makes UnicodeString
1739 * hashing consistent with UChar* hashing. */
1740 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1741 if (hashCode == kInvalidHashCode) {
1742 hashCode = kEmptyHashCode;
1743 }
1744 return hashCode;
1745 }
1746
1747 //========================================
1748 // External Buffer
1749 //========================================
1750
1751 char16_t *
1752 UnicodeString::getBuffer(int32_t minCapacity) {
1753 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1754 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1755 setZeroLength();
1756 return getArrayStart();
1757 } else {
1758 return nullptr;
1759 }
1760 }
1761
1762 void
1763 UnicodeString::releaseBuffer(int32_t newLength) {
1764 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1765 // set the new fLength
1766 int32_t capacity=getCapacity();
1767 if(newLength==-1) {
1768 // the new length is the string length, capped by fCapacity
1769 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1770 while(p<limit && *p!=0) {
1771 ++p;
1772 }
1773 newLength=(int32_t)(p-array);
1774 } else if(newLength>capacity) {
1775 newLength=capacity;
1776 }
1777 setLength(newLength);
1778 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1779 }
1780 }
1781
1782 //========================================
1783 // Miscellaneous
1784 //========================================
// Makes sure this string has its own buffer with room for at least
// newCapacity code units, allocating a new array when the current one is
// read-only, shared (refCount>1), too small, or when forceClone is set.
//
//  newCapacity      required capacity; -1 means the current capacity
//  growCapacity     capacity to try first; a negative value means newCapacity.
//                   It is reduced to US_STACKBUF_SIZE when newCapacity fits
//                   into the stack buffer.
//  doCopyArray      if set, the old contents (as much as fits) are kept in a
//                   newly allocated array; otherwise the length becomes zero
//                   when a new array is allocated
//  pBufferToDelete  if non-NULL and the old refcounted array's count drops to
//                   zero, the block to free is handed to the caller instead of
//                   being freed here
//  forceClone       if set, a new array is always allocated
//
// Returns TRUE on success. Returns FALSE if this string is not writable, or
// if allocation failed, in which case this string is set to bogus.
1785 UBool
1786 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1787 int32_t growCapacity,
1788 UBool doCopyArray,
1789 int32_t **pBufferToDelete,
1790 UBool forceClone) {
1791 // default parameters need to be static, therefore
1792 // the defaults are -1 to have convenience defaults
1793 if(newCapacity == -1) {
1794 newCapacity = getCapacity();
1795 }
1796
1797 // while a getBuffer(minCapacity) is "open",
1798 // prevent any modifications of the string by returning FALSE here
1799 // if the string is bogus, then only an assignment or similar can revive it
1800 if(!isWritable()) {
1801 return FALSE;
1802 }
1803
1804 /*
1805 * We need to make a copy of the array if
1806 * the buffer is read-only, or
1807 * the buffer is refCounted (shared), and refCount>1, or
1808 * the buffer is too small.
1809 * Return FALSE if memory could not be allocated.
1810 */
1811 if(forceClone ||
1812 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1813 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1814 newCapacity > getCapacity()
1815 ) {
1816 // check growCapacity for default value and use of the stack buffer
1817 if(growCapacity < 0) {
1818 growCapacity = newCapacity;
1819 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1820 growCapacity = US_STACKBUF_SIZE;
1821 }
1822
1823 // save old values
1824 UChar oldStackBuffer[US_STACKBUF_SIZE];
1825 UChar *oldArray;
1826 int32_t oldLength = length();
1827 int16_t flags = fUnion.fFields.fLengthAndFlags;
1828
1829 if(flags&kUsingStackBuffer) {
1830 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
1831 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1832 // copy the stack buffer contents because it will be overwritten with
1833 // fUnion.fFields values
1834 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1835 oldArray = oldStackBuffer;
1836 } else {
1837 oldArray = NULL; // no need to copy from the stack buffer to itself
1838 }
1839 } else {
1840 oldArray = fUnion.fFields.fArray;
1841 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1842 }
1843
1844 // allocate a new array
// try the preferred growCapacity first, then fall back to the smaller newCapacity
1845 if(allocate(growCapacity) ||
1846 (newCapacity < growCapacity && allocate(newCapacity))
1847 ) {
1848 if(doCopyArray) {
1849 // copy the contents
1850 // do not copy more than what fits - it may be smaller than before
1851 int32_t minLength = oldLength;
1852 newCapacity = getCapacity();
1853 if(newCapacity < minLength) {
1854 minLength = newCapacity;
1855 }
1856 if(oldArray != NULL) {
1857 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1858 }
1859 setLength(minLength);
1860 } else {
1861 setZeroLength();
1862 }
1863
1864 // release the old array
1865 if(flags & kRefCounted) {
1866 // the array is refCounted; decrement and release if 0
// the reference counter is stored immediately before the array
1867 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1868 if(umtx_atomic_dec(pRefCount) == 0) {
1869 if(pBufferToDelete == 0) {
1870 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1871 // is defined as volatile. (Volatile has useful non-standard behavior
1872 // with this compiler.)
1873 uprv_free((void *)pRefCount);
1874 } else {
1875 // the caller requested to delete it himself
1876 *pBufferToDelete = (int32_t *)pRefCount;
1877 }
1878 }
1879 }
1880 } else {
1881 // not enough memory for growCapacity and not even for the smaller newCapacity
1882 // reset the old values for setToBogus() to release the array
1883 if(!(flags&kUsingStackBuffer)) {
1884 fUnion.fFields.fArray = oldArray;
1885 }
1886 fUnion.fFields.fLengthAndFlags = flags;
1887 setToBogus();
1888 return FALSE;
1889 }
1890 }
1891 return TRUE;
1892 }
1893
1894 // UnicodeStringAppendable ------------------------------------------------- ***
1895
1896 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1897
1898 UBool
1899 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1900 return str.doAppend(&c, 0, 1).isWritable();
1901 }
1902
1903 UBool
1904 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1905 UChar buffer[U16_MAX_LENGTH];
1906 int32_t cLength = 0;
1907 UBool isError = FALSE;
1908 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1909 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1910 }
1911
1912 UBool
1913 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1914 return str.doAppend(s, 0, length).isWritable();
1915 }
1916
1917 UBool
1918 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1919 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1920 }
1921
1922 UChar *
1923 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1924 int32_t desiredCapacityHint,
1925 UChar *scratch, int32_t scratchCapacity,
1926 int32_t *resultCapacity) {
1927 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1928 *resultCapacity = 0;
1929 return NULL;
1930 }
1931 int32_t oldLength = str.length();
1932 if(minCapacity <= (kMaxCapacity - oldLength) &&
1933 desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1934 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1935 *resultCapacity = str.getCapacity() - oldLength;
1936 return str.getArrayStart() + oldLength;
1937 }
1938 *resultCapacity = scratchCapacity;
1939 return scratch;
1940 }
1941
1942 U_NAMESPACE_END
1943
1944 U_NAMESPACE_USE
1945
1946 U_CAPI int32_t U_EXPORT2
1947 uhash_hashUnicodeString(const UElement key) {
1948 const UnicodeString *str = (const UnicodeString*) key.pointer;
1949 return (str == NULL) ? 0 : str->hashCode();
1950 }
1951
1952 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1953 // does not depend on hashtable code.
1954 U_CAPI UBool U_EXPORT2
1955 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1956 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1957 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1958 if (str1 == str2) {
1959 return TRUE;
1960 }
1961 if (str1 == NULL || str2 == NULL) {
1962 return FALSE;
1963 }
1964 return *str1 == *str2;
1965 }
1966
#ifdef U_STATIC_IMPLEMENTATION
/*
 * Never meant to be called. Its only purpose is to make sure that the
 * virtual vector deleting destructor is defined within unistr.cpp.
 * The vector deleting destructor is already a part of UObject, but
 * defining it here makes sure that it is included with this object file,
 * which keeps static library dependencies to a minimum.
 */
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif