]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unistr.cpp
ICU-59131.0.1.tar.gz
[apple/icu.git] / icuSources / common / unistr.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ******************************************************************************
8 *
9 * File unistr.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 09/25/98 stephen Creation.
15 * 04/20/99 stephen Overhauled per 4/16 code review.
16 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18 * Replaceable.
19 * 06/25/01 grhoten Removed the dependency on iostream
20 ******************************************************************************
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/appendable.h"
25 #include "unicode/putil.h"
26 #include "cstring.h"
27 #include "cmemory.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/utf.h"
31 #include "unicode/utf16.h"
32 #include "uelement.h"
33 #include "ustr_imp.h"
34 #include "umutex.h"
35 #include "uassert.h"
36
37 #if 0
38
39 #include <iostream>
40 using namespace std;
41
42 //DEBUGGING
43 void
44 print(const UnicodeString& s,
45 const char *name)
46 {
47 UChar c;
48 cout << name << ":|";
49 for(int i = 0; i < s.length(); ++i) {
50 c = s[i];
51 if(c>= 0x007E || c < 0x0020)
52 cout << "[0x" << hex << s[i] << "]";
53 else
54 cout << (char) s[i];
55 }
56 cout << '|' << endl;
57 }
58
59 void
60 print(const UChar *s,
61 int32_t len,
62 const char *name)
63 {
64 UChar c;
65 cout << name << ":|";
66 for(int i = 0; i < len; ++i) {
67 c = s[i];
68 if(c>= 0x007E || c < 0x0020)
69 cout << "[0x" << hex << s[i] << "]";
70 else
71 cout << (char) s[i];
72 }
73 cout << '|' << endl;
74 }
75 // END DEBUGGING
76 #endif
77
78 // Local function definitions for now
79
80 // need to copy areas that may overlap
81 static
82 inline void
83 us_arrayCopy(const UChar *src, int32_t srcStart,
84 UChar *dst, int32_t dstStart, int32_t count)
85 {
86 if(count>0) {
87 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88 }
89 }
90
91 // u_unescapeAt() callback to get a UChar from a UnicodeString
92 U_CDECL_BEGIN
93 static UChar U_CALLCONV
94 UnicodeString_charAt(int32_t offset, void *context) {
95 return ((icu::UnicodeString*) context)->charAt(offset);
96 }
97 U_CDECL_END
98
99 U_NAMESPACE_BEGIN
100
101 /* The Replaceable virtual destructor can't be defined in the header
102 due to how AIX works with multiple definitions of virtual functions.
103 */
104 Replaceable::~Replaceable() {}
105
106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107
108 UnicodeString U_EXPORT2
109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110 return
111 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
112 append(s1).
113 append(s2);
114 }
115
116 //========================================
117 // Reference Counting functions, put at top of file so that optimizing compilers
118 // have a chance to automatically inline.
119 //========================================
120
121 void
122 UnicodeString::addRef() {
123 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124 }
125
126 int32_t
127 UnicodeString::removeRef() {
128 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
129 }
130
131 int32_t
132 UnicodeString::refCount() const {
133 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
134 }
135
136 void
137 UnicodeString::releaseArray() {
138 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140 }
141 }
142
143
144
145 //========================================
146 // Constructors
147 //========================================
148
149 // The default constructor is inline in unistr.h.
150
151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152 fUnion.fFields.fLengthAndFlags = 0;
153 if(count <= 0 || (uint32_t)c > 0x10ffff) {
154 // just allocate and do not do anything else
155 allocate(capacity);
156 } else if(c <= 0xffff) {
157 int32_t length = count;
158 if(capacity < length) {
159 capacity = length;
160 }
161 if(allocate(capacity)) {
162 UChar *array = getArrayStart();
163 UChar unit = (UChar)c;
164 for(int32_t i = 0; i < length; ++i) {
165 array[i] = unit;
166 }
167 setLength(length);
168 }
169 } else { // supplementary code point, write surrogate pairs
170 if(count > (INT32_MAX / 2)) {
171 // We would get more than 2G UChars.
172 allocate(capacity);
173 return;
174 }
175 int32_t length = count * 2;
176 if(capacity < length) {
177 capacity = length;
178 }
179 if(allocate(capacity)) {
180 UChar *array = getArrayStart();
181 UChar lead = U16_LEAD(c);
182 UChar trail = U16_TRAIL(c);
183 for(int32_t i = 0; i < length; i += 2) {
184 array[i] = lead;
185 array[i + 1] = trail;
186 }
187 setLength(length);
188 }
189 }
190 }
191
192 UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195 }
196
197 UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207 }
208
209 UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212 }
213
214 UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218 }
219
220 UnicodeString::UnicodeString(UBool isTerminated,
221 ConstChar16Ptr textPtr,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 const UChar *text = textPtr;
225 if(text == NULL) {
226 // treat as an empty string, do not alias
227 setToEmpty();
228 } else if(textLength < -1 ||
229 (textLength == -1 && !isTerminated) ||
230 (textLength >= 0 && isTerminated && text[textLength] != 0)
231 ) {
232 setToBogus();
233 } else {
234 if(textLength == -1) {
235 // text is terminated, or else it would have failed the above test
236 textLength = u_strlen(text);
237 }
238 setArray(const_cast<UChar *>(text), textLength,
239 isTerminated ? textLength + 1 : textLength);
240 }
241 }
242
243 UnicodeString::UnicodeString(UChar *buff,
244 int32_t buffLength,
245 int32_t buffCapacity) {
246 fUnion.fFields.fLengthAndFlags = kWritableAlias;
247 if(buff == NULL) {
248 // treat as an empty string, do not alias
249 setToEmpty();
250 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
251 setToBogus();
252 } else {
253 if(buffLength == -1) {
254 // fLength = u_strlen(buff); but do not look beyond buffCapacity
255 const UChar *p = buff, *limit = buff + buffCapacity;
256 while(p != limit && *p != 0) {
257 ++p;
258 }
259 buffLength = (int32_t)(p - buff);
260 }
261 setArray(buff, buffLength, buffCapacity);
262 }
263 }
264
265 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266 fUnion.fFields.fLengthAndFlags = kShortString;
267 if(src==NULL) {
268 // treat as an empty string
269 } else {
270 if(length<0) {
271 length=(int32_t)uprv_strlen(src);
272 }
273 if(cloneArrayIfNeeded(length, length, FALSE)) {
274 u_charsToUChars(src, getArrayStart(), length);
275 setLength(length);
276 } else {
277 setToBogus();
278 }
279 }
280 }
281
282 #if U_CHARSET_IS_UTF8
283
284 UnicodeString::UnicodeString(const char *codepageData) {
285 fUnion.fFields.fLengthAndFlags = kShortString;
286 if(codepageData != 0) {
287 setToUTF8(codepageData);
288 }
289 }
290
291 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292 fUnion.fFields.fLengthAndFlags = kShortString;
293 // if there's nothing to convert, do nothing
294 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
295 return;
296 }
297 if(dataLength == -1) {
298 dataLength = (int32_t)uprv_strlen(codepageData);
299 }
300 setToUTF8(StringPiece(codepageData, dataLength));
301 }
302
303 // else see unistr_cnv.cpp
304 #endif
305
306 UnicodeString::UnicodeString(const UnicodeString& that) {
307 fUnion.fFields.fLengthAndFlags = kShortString;
308 copyFrom(that);
309 }
310
311 #if U_HAVE_RVALUE_REFERENCES
312 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
313 fUnion.fFields.fLengthAndFlags = kShortString;
314 moveFrom(src);
315 }
316 #endif
317
318 UnicodeString::UnicodeString(const UnicodeString& that,
319 int32_t srcStart) {
320 fUnion.fFields.fLengthAndFlags = kShortString;
321 setTo(that, srcStart);
322 }
323
324 UnicodeString::UnicodeString(const UnicodeString& that,
325 int32_t srcStart,
326 int32_t srcLength) {
327 fUnion.fFields.fLengthAndFlags = kShortString;
328 setTo(that, srcStart, srcLength);
329 }
330
331 // Replaceable base class clone() default implementation, does not clone
332 Replaceable *
333 Replaceable::clone() const {
334 return NULL;
335 }
336
337 // UnicodeString overrides clone() with a real implementation
338 Replaceable *
339 UnicodeString::clone() const {
340 return new UnicodeString(*this);
341 }
342
343 //========================================
344 // array allocation
345 //========================================
346
namespace {

// Fixed part of the growth increment, in UChars.
const int32_t kGrowSize = 128;

// Upper bound for a heap buffer capacity, in UChars.
// One int32_t reference counter plus capacity UChars plus one NUL terminator
// (kept so that getTerminatedBuffer() need not reallocate), rounded up to a
// multiple of 16 bytes, must fit into a 32-bit size_t:
//   (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
// (More complicated checks could allow up to 0x7ffffffd without rounding up,
// but that does not seem worth it.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns the capacity to allocate for a string of newLength units:
// newLength plus a quarter of newLength plus kGrowSize, capped at kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t extra = (newLength >> 2) + kGrowSize;
  const int32_t headroom = kMaxCapacity - newLength;
  return (extra <= headroom) ? newLength + extra : kMaxCapacity;
}

}  // namespace
370
371 UBool
372 UnicodeString::allocate(int32_t capacity) {
373 if(capacity <= US_STACKBUF_SIZE) {
374 fUnion.fFields.fLengthAndFlags = kShortString;
375 return TRUE;
376 }
377 if(capacity <= kMaxCapacity) {
378 ++capacity; // for the NUL
379 // Switch to size_t which is unsigned so that we can allocate up to 4GB.
380 // Reference counter + UChars.
381 size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
382 // Round up to a multiple of 16.
383 numBytes = (numBytes + 15) & ~15;
384 int32_t *array = (int32_t *) uprv_malloc(numBytes);
385 if(array != NULL) {
386 // set initial refCount and point behind the refCount
387 *array++ = 1;
388 numBytes -= sizeof(int32_t);
389
390 // have fArray point to the first UChar
391 fUnion.fFields.fArray = (UChar *)array;
392 fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
393 fUnion.fFields.fLengthAndFlags = kLongString;
394 return TRUE;
395 }
396 }
397 fUnion.fFields.fLengthAndFlags = kIsBogus;
398 fUnion.fFields.fArray = 0;
399 fUnion.fFields.fCapacity = 0;
400 return FALSE;
401 }
402
403 //========================================
404 // Destructor
405 //========================================
406
407 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
408 static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
409 static u_atomic_int32_t beyondCount(0);
410
411 U_CAPI void unistr_printLengths() {
412 int32_t i;
413 for(i = 0; i <= 59; ++i) {
414 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
415 }
416 int32_t beyond = beyondCount;
417 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
418 beyond += finalLengthCounts[i];
419 }
420 printf(">59, %9d\n", beyond);
421 }
422 #endif
423
424 UnicodeString::~UnicodeString()
425 {
426 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
427 // Count lengths of strings at the end of their lifetime.
428 // Useful for discussion of a desirable stack buffer size.
429 // Count the contents length, not the optional NUL terminator nor further capacity.
430 // Ignore open-buffer strings and strings which alias external storage.
431 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
432 if(hasShortLength()) {
433 umtx_atomic_inc(finalLengthCounts + getShortLength());
434 } else {
435 umtx_atomic_inc(&beyondCount);
436 }
437 }
438 #endif
439
440 releaseArray();
441 }
442
443 //========================================
444 // Factory methods
445 //========================================
446
447 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
448 UnicodeString result;
449 result.setToUTF8(utf8);
450 return result;
451 }
452
453 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
454 UnicodeString result;
455 int32_t capacity;
456 // Most UTF-32 strings will be BMP-only and result in a same-length
457 // UTF-16 string. We overestimate the capacity just slightly,
458 // just in case there are a few supplementary characters.
459 if(length <= US_STACKBUF_SIZE) {
460 capacity = US_STACKBUF_SIZE;
461 } else {
462 capacity = length + (length >> 4) + 4;
463 }
464 do {
465 UChar *utf16 = result.getBuffer(capacity);
466 int32_t length16;
467 UErrorCode errorCode = U_ZERO_ERROR;
468 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
469 utf32, length,
470 0xfffd, // Substitution character.
471 NULL, // Don't care about number of substitutions.
472 &errorCode);
473 result.releaseBuffer(length16);
474 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
475 capacity = length16 + 1; // +1 for the terminating NUL.
476 continue;
477 } else if(U_FAILURE(errorCode)) {
478 result.setToBogus();
479 }
480 break;
481 } while(TRUE);
482 return result;
483 }
484
485 //========================================
486 // Assignment
487 //========================================
488
489 UnicodeString &
490 UnicodeString::operator=(const UnicodeString &src) {
491 return copyFrom(src);
492 }
493
494 UnicodeString &
495 UnicodeString::fastCopyFrom(const UnicodeString &src) {
496 return copyFrom(src, TRUE);
497 }
498
499 UnicodeString &
500 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
501 // if assigning to ourselves, do nothing
502 if(this == &src) {
503 return *this;
504 }
505
506 // is the right side bogus?
507 if(src.isBogus()) {
508 setToBogus();
509 return *this;
510 }
511
512 // delete the current contents
513 releaseArray();
514
515 if(src.isEmpty()) {
516 // empty string - use the stack buffer
517 setToEmpty();
518 return *this;
519 }
520
521 // fLength>0 and not an "open" src.getBuffer(minCapacity)
522 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
523 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
524 case kShortString:
525 // short string using the stack buffer, do the same
526 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
527 getShortLength() * U_SIZEOF_UCHAR);
528 break;
529 case kLongString:
530 // src uses a refCounted string buffer, use that buffer with refCount
531 // src is const, use a cast - we don't actually change it
532 ((UnicodeString &)src).addRef();
533 // copy all fields, share the reference-counted buffer
534 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
535 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
536 if(!hasShortLength()) {
537 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
538 }
539 break;
540 case kReadonlyAlias:
541 if(fastCopy) {
542 // src is a readonly alias, do the same
543 // -> maintain the readonly alias as such
544 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
545 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
546 if(!hasShortLength()) {
547 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
548 }
549 break;
550 }
551 // else if(!fastCopy) fall through to case kWritableAlias
552 // -> allocate a new buffer and copy the contents
553 U_FALLTHROUGH;
554 case kWritableAlias: {
555 // src is a writable alias; we make a copy of that instead
556 int32_t srcLength = src.length();
557 if(allocate(srcLength)) {
558 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
559 setLength(srcLength);
560 break;
561 }
562 // if there is not enough memory, then fall through to setting to bogus
563 U_FALLTHROUGH;
564 }
565 default:
566 // if src is bogus, set ourselves to bogus
567 // do not call setToBogus() here because fArray and flags are not consistent here
568 fUnion.fFields.fLengthAndFlags = kIsBogus;
569 fUnion.fFields.fArray = 0;
570 fUnion.fFields.fCapacity = 0;
571 break;
572 }
573
574 return *this;
575 }
576
577 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
578 // No explicit check for self move assignment, consistent with standard library.
579 // Self move assignment causes no crash nor leak but might make the object bogus.
580 releaseArray();
581 copyFieldsFrom(src, TRUE);
582 return *this;
583 }
584
585 // Same as moveFrom() except without memory management.
586 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
587 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
588 if(lengthAndFlags & kUsingStackBuffer) {
589 // Short string using the stack buffer, copy the contents.
590 // Check for self assignment to prevent "overlap in memcpy" warnings,
591 // although it should be harmless to copy a buffer to itself exactly.
592 if(this != &src) {
593 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
594 getShortLength() * U_SIZEOF_UCHAR);
595 }
596 } else {
597 // In all other cases, copy all fields.
598 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
599 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
600 if(!hasShortLength()) {
601 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
602 }
603 if(setSrcToBogus) {
604 // Set src to bogus without releasing any memory.
605 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
606 src.fUnion.fFields.fArray = NULL;
607 src.fUnion.fFields.fCapacity = 0;
608 }
609 }
610 }
611
612 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
613 UnicodeString temp; // Empty short string: Known not to need releaseArray().
614 // Copy fields without resetting source values in between.
615 temp.copyFieldsFrom(*this, FALSE);
616 this->copyFieldsFrom(other, FALSE);
617 other.copyFieldsFrom(temp, FALSE);
618 // Set temp to an empty string so that other's memory is not released twice.
619 temp.fUnion.fFields.fLengthAndFlags = kShortString;
620 }
621
622 //========================================
623 // Miscellaneous operations
624 //========================================
625
626 UnicodeString UnicodeString::unescape() const {
627 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
628 if (result.isBogus()) {
629 return result;
630 }
631 const UChar *array = getBuffer();
632 int32_t len = length();
633 int32_t prev = 0;
634 for (int32_t i=0;;) {
635 if (i == len) {
636 result.append(array, prev, len - prev);
637 break;
638 }
639 if (array[i++] == 0x5C /*'\\'*/) {
640 result.append(array, prev, (i - 1) - prev);
641 UChar32 c = unescapeAt(i); // advances i
642 if (c < 0) {
643 result.remove(); // return empty string
644 break; // invalid escape sequence
645 }
646 result.append(c);
647 prev = i;
648 }
649 }
650 return result;
651 }
652
653 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
654 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
655 }
656
657 //========================================
658 // Read-only implementation
659 //========================================
660 UBool
661 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
662 // Requires: this & text not bogus and have same lengths.
663 // Byte-wise comparison works for equality regardless of endianness.
664 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
665 }
666
667 int8_t
668 UnicodeString::doCompare( int32_t start,
669 int32_t length,
670 const UChar *srcChars,
671 int32_t srcStart,
672 int32_t srcLength) const
673 {
674 // compare illegal string values
675 if(isBogus()) {
676 return -1;
677 }
678
679 // pin indices to legal values
680 pinIndices(start, length);
681
682 if(srcChars == NULL) {
683 // treat const UChar *srcChars==NULL as an empty string
684 return length == 0 ? 0 : 1;
685 }
686
687 // get the correct pointer
688 const UChar *chars = getArrayStart();
689
690 chars += start;
691 srcChars += srcStart;
692
693 int32_t minLength;
694 int8_t lengthResult;
695
696 // get the srcLength if necessary
697 if(srcLength < 0) {
698 srcLength = u_strlen(srcChars + srcStart);
699 }
700
701 // are we comparing different lengths?
702 if(length != srcLength) {
703 if(length < srcLength) {
704 minLength = length;
705 lengthResult = -1;
706 } else {
707 minLength = srcLength;
708 lengthResult = 1;
709 }
710 } else {
711 minLength = length;
712 lengthResult = 0;
713 }
714
715 /*
716 * note that uprv_memcmp() returns an int but we return an int8_t;
717 * we need to take care not to truncate the result -
718 * one way to do this is to right-shift the value to
719 * move the sign bit into the lower 8 bits and making sure that this
720 * does not become 0 itself
721 */
722
723 if(minLength > 0 && chars != srcChars) {
724 int32_t result;
725
726 # if U_IS_BIG_ENDIAN
727 // big-endian: byte comparison works
728 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
729 if(result != 0) {
730 return (int8_t)(result >> 15 | 1);
731 }
732 # else
733 // little-endian: compare UChar units
734 do {
735 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
736 if(result != 0) {
737 return (int8_t)(result >> 15 | 1);
738 }
739 } while(--minLength > 0);
740 # endif
741 }
742 return lengthResult;
743 }
744
745 /* String compare in code point order - doCompare() compares in code unit order. */
746 int8_t
747 UnicodeString::doCompareCodePointOrder(int32_t start,
748 int32_t length,
749 const UChar *srcChars,
750 int32_t srcStart,
751 int32_t srcLength) const
752 {
753 // compare illegal string values
754 // treat const UChar *srcChars==NULL as an empty string
755 if(isBogus()) {
756 return -1;
757 }
758
759 // pin indices to legal values
760 pinIndices(start, length);
761
762 if(srcChars == NULL) {
763 srcStart = srcLength = 0;
764 }
765
766 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
767 /* translate the 32-bit result into an 8-bit one */
768 if(diff!=0) {
769 return (int8_t)(diff >> 15 | 1);
770 } else {
771 return 0;
772 }
773 }
774
775 int32_t
776 UnicodeString::getLength() const {
777 return length();
778 }
779
780 UChar
781 UnicodeString::getCharAt(int32_t offset) const {
782 return charAt(offset);
783 }
784
785 UChar32
786 UnicodeString::getChar32At(int32_t offset) const {
787 return char32At(offset);
788 }
789
790 UChar32
791 UnicodeString::char32At(int32_t offset) const
792 {
793 int32_t len = length();
794 if((uint32_t)offset < (uint32_t)len) {
795 const UChar *array = getArrayStart();
796 UChar32 c;
797 U16_GET(array, 0, offset, len, c);
798 return c;
799 } else {
800 return kInvalidUChar;
801 }
802 }
803
804 int32_t
805 UnicodeString::getChar32Start(int32_t offset) const {
806 if((uint32_t)offset < (uint32_t)length()) {
807 const UChar *array = getArrayStart();
808 U16_SET_CP_START(array, 0, offset);
809 return offset;
810 } else {
811 return 0;
812 }
813 }
814
815 int32_t
816 UnicodeString::getChar32Limit(int32_t offset) const {
817 int32_t len = length();
818 if((uint32_t)offset < (uint32_t)len) {
819 const UChar *array = getArrayStart();
820 U16_SET_CP_LIMIT(array, 0, offset, len);
821 return offset;
822 } else {
823 return len;
824 }
825 }
826
827 int32_t
828 UnicodeString::countChar32(int32_t start, int32_t length) const {
829 pinIndices(start, length);
830 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
831 return u_countChar32(getArrayStart()+start, length);
832 }
833
834 UBool
835 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
836 pinIndices(start, length);
837 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
838 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
839 }
840
841 int32_t
842 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
843 // pin index
844 int32_t len = length();
845 if(index<0) {
846 index=0;
847 } else if(index>len) {
848 index=len;
849 }
850
851 const UChar *array = getArrayStart();
852 if(delta>0) {
853 U16_FWD_N(array, index, len, delta);
854 } else {
855 U16_BACK_N(array, 0, index, -delta);
856 }
857
858 return index;
859 }
860
861 void
862 UnicodeString::doExtract(int32_t start,
863 int32_t length,
864 UChar *dst,
865 int32_t dstStart) const
866 {
867 // pin indices to legal values
868 pinIndices(start, length);
869
870 // do not copy anything if we alias dst itself
871 const UChar *array = getArrayStart();
872 if(array + start != dst + dstStart) {
873 us_arrayCopy(array, start, dst, dstStart, length);
874 }
875 }
876
877 int32_t
878 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
879 UErrorCode &errorCode) const {
880 int32_t len = length();
881 if(U_SUCCESS(errorCode)) {
882 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
883 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
884 } else {
885 const UChar *array = getArrayStart();
886 if(len>0 && len<=destCapacity && array!=dest) {
887 u_memcpy(dest, array, len);
888 }
889 return u_terminateUChars(dest, destCapacity, len, &errorCode);
890 }
891 }
892
893 return len;
894 }
895
896 int32_t
897 UnicodeString::extract(int32_t start,
898 int32_t length,
899 char *target,
900 int32_t targetCapacity,
901 enum EInvariant) const
902 {
903 // if the arguments are illegal, then do nothing
904 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
905 return 0;
906 }
907
908 // pin the indices to legal values
909 pinIndices(start, length);
910
911 if(length <= targetCapacity) {
912 u_UCharsToChars(getArrayStart() + start, target, length);
913 }
914 UErrorCode status = U_ZERO_ERROR;
915 return u_terminateChars(target, targetCapacity, length, &status);
916 }
917
918 UnicodeString
919 UnicodeString::tempSubString(int32_t start, int32_t len) const {
920 pinIndices(start, len);
921 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
922 if(array==NULL) {
923 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
924 len=-2; // bogus result string
925 }
926 return UnicodeString(FALSE, array + start, len);
927 }
928
929 int32_t
930 UnicodeString::toUTF8(int32_t start, int32_t len,
931 char *target, int32_t capacity) const {
932 pinIndices(start, len);
933 int32_t length8;
934 UErrorCode errorCode = U_ZERO_ERROR;
935 u_strToUTF8WithSub(target, capacity, &length8,
936 getBuffer() + start, len,
937 0xFFFD, // Standard substitution character.
938 NULL, // Don't care about number of substitutions.
939 &errorCode);
940 return length8;
941 }
942
943 #if U_CHARSET_IS_UTF8
944
945 int32_t
946 UnicodeString::extract(int32_t start, int32_t len,
947 char *target, uint32_t dstSize) const {
948 // if the arguments are illegal, then do nothing
949 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
950 return 0;
951 }
952 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
953 }
954
955 // else see unistr_cnv.cpp
956 #endif
957
958 void
959 UnicodeString::extractBetween(int32_t start,
960 int32_t limit,
961 UnicodeString& target) const {
962 pinIndex(start);
963 pinIndex(limit);
964 doExtract(start, limit - start, target);
965 }
966
967 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
968 // as many bytes as the source has UChars.
969 // The "worst cases" are writing systems like Indic, Thai and CJK with
970 // 3:1 bytes:UChars.
971 void
972 UnicodeString::toUTF8(ByteSink &sink) const {
973 int32_t length16 = length();
974 if(length16 != 0) {
975 char stackBuffer[1024];
976 int32_t capacity = (int32_t)sizeof(stackBuffer);
977 UBool utf8IsOwned = FALSE;
978 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
979 3*length16,
980 stackBuffer, capacity,
981 &capacity);
982 int32_t length8 = 0;
983 UErrorCode errorCode = U_ZERO_ERROR;
984 u_strToUTF8WithSub(utf8, capacity, &length8,
985 getBuffer(), length16,
986 0xFFFD, // Standard substitution character.
987 NULL, // Don't care about number of substitutions.
988 &errorCode);
989 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
990 utf8 = (char *)uprv_malloc(length8);
991 if(utf8 != NULL) {
992 utf8IsOwned = TRUE;
993 errorCode = U_ZERO_ERROR;
994 u_strToUTF8WithSub(utf8, length8, &length8,
995 getBuffer(), length16,
996 0xFFFD, // Standard substitution character.
997 NULL, // Don't care about number of substitutions.
998 &errorCode);
999 } else {
1000 errorCode = U_MEMORY_ALLOCATION_ERROR;
1001 }
1002 }
1003 if(U_SUCCESS(errorCode)) {
1004 sink.Append(utf8, length8);
1005 sink.Flush();
1006 }
1007 if(utf8IsOwned) {
1008 uprv_free(utf8);
1009 }
1010 }
1011 }
1012
1013 int32_t
1014 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1015 int32_t length32=0;
1016 if(U_SUCCESS(errorCode)) {
1017 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1018 u_strToUTF32WithSub(utf32, capacity, &length32,
1019 getBuffer(), length(),
1020 0xfffd, // Substitution character.
1021 NULL, // Don't care about number of substitutions.
1022 &errorCode);
1023 }
1024 return length32;
1025 }
1026
1027 int32_t
1028 UnicodeString::indexOf(const UChar *srcChars,
1029 int32_t srcStart,
1030 int32_t srcLength,
1031 int32_t start,
1032 int32_t length) const
1033 {
1034 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1035 return -1;
1036 }
1037
1038 // UnicodeString does not find empty substrings
1039 if(srcLength < 0 && srcChars[srcStart] == 0) {
1040 return -1;
1041 }
1042
1043 // get the indices within bounds
1044 pinIndices(start, length);
1045
1046 // find the first occurrence of the substring
1047 const UChar *array = getArrayStart();
1048 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1049 if(match == NULL) {
1050 return -1;
1051 } else {
1052 return (int32_t)(match - array);
1053 }
1054 }
1055
1056 int32_t
1057 UnicodeString::doIndexOf(UChar c,
1058 int32_t start,
1059 int32_t length) const
1060 {
1061 // pin indices
1062 pinIndices(start, length);
1063
1064 // find the first occurrence of c
1065 const UChar *array = getArrayStart();
1066 const UChar *match = u_memchr(array + start, c, length);
1067 if(match == NULL) {
1068 return -1;
1069 } else {
1070 return (int32_t)(match - array);
1071 }
1072 }
1073
1074 int32_t
1075 UnicodeString::doIndexOf(UChar32 c,
1076 int32_t start,
1077 int32_t length) const {
1078 // pin indices
1079 pinIndices(start, length);
1080
1081 // find the first occurrence of c
1082 const UChar *array = getArrayStart();
1083 const UChar *match = u_memchr32(array + start, c, length);
1084 if(match == NULL) {
1085 return -1;
1086 } else {
1087 return (int32_t)(match - array);
1088 }
1089 }
1090
1091 int32_t
1092 UnicodeString::lastIndexOf(const UChar *srcChars,
1093 int32_t srcStart,
1094 int32_t srcLength,
1095 int32_t start,
1096 int32_t length) const
1097 {
1098 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1099 return -1;
1100 }
1101
1102 // UnicodeString does not find empty substrings
1103 if(srcLength < 0 && srcChars[srcStart] == 0) {
1104 return -1;
1105 }
1106
1107 // get the indices within bounds
1108 pinIndices(start, length);
1109
1110 // find the last occurrence of the substring
1111 const UChar *array = getArrayStart();
1112 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1113 if(match == NULL) {
1114 return -1;
1115 } else {
1116 return (int32_t)(match - array);
1117 }
1118 }
1119
1120 int32_t
1121 UnicodeString::doLastIndexOf(UChar c,
1122 int32_t start,
1123 int32_t length) const
1124 {
1125 if(isBogus()) {
1126 return -1;
1127 }
1128
1129 // pin indices
1130 pinIndices(start, length);
1131
1132 // find the last occurrence of c
1133 const UChar *array = getArrayStart();
1134 const UChar *match = u_memrchr(array + start, c, length);
1135 if(match == NULL) {
1136 return -1;
1137 } else {
1138 return (int32_t)(match - array);
1139 }
1140 }
1141
1142 int32_t
1143 UnicodeString::doLastIndexOf(UChar32 c,
1144 int32_t start,
1145 int32_t length) const {
1146 // pin indices
1147 pinIndices(start, length);
1148
1149 // find the last occurrence of c
1150 const UChar *array = getArrayStart();
1151 const UChar *match = u_memrchr32(array + start, c, length);
1152 if(match == NULL) {
1153 return -1;
1154 } else {
1155 return (int32_t)(match - array);
1156 }
1157 }
1158
1159 //========================================
1160 // Write implementation
1161 //========================================
1162
1163 UnicodeString&
1164 UnicodeString::findAndReplace(int32_t start,
1165 int32_t length,
1166 const UnicodeString& oldText,
1167 int32_t oldStart,
1168 int32_t oldLength,
1169 const UnicodeString& newText,
1170 int32_t newStart,
1171 int32_t newLength)
1172 {
1173 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1174 return *this;
1175 }
1176
1177 pinIndices(start, length);
1178 oldText.pinIndices(oldStart, oldLength);
1179 newText.pinIndices(newStart, newLength);
1180
1181 if(oldLength == 0) {
1182 return *this;
1183 }
1184
1185 while(length > 0 && length >= oldLength) {
1186 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1187 if(pos < 0) {
1188 // no more oldText's here: done
1189 break;
1190 } else {
1191 // we found oldText, replace it by newText and go beyond it
1192 replace(pos, oldLength, newText, newStart, newLength);
1193 length -= pos + oldLength - start;
1194 start = pos + newLength;
1195 }
1196 }
1197
1198 return *this;
1199 }
1200
1201
1202 void
1203 UnicodeString::setToBogus()
1204 {
1205 releaseArray();
1206
1207 fUnion.fFields.fLengthAndFlags = kIsBogus;
1208 fUnion.fFields.fArray = 0;
1209 fUnion.fFields.fCapacity = 0;
1210 }
1211
1212 // turn a bogus string into an empty one
1213 void
1214 UnicodeString::unBogus() {
1215 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1216 setToEmpty();
1217 }
1218 }
1219
1220 const char16_t *
1221 UnicodeString::getTerminatedBuffer() {
1222 if(!isWritable()) {
1223 return nullptr;
1224 }
1225 UChar *array = getArrayStart();
1226 int32_t len = length();
1227 if(len < getCapacity()) {
1228 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1229 // If len<capacity on a read-only alias, then array[len] is
1230 // either the original NUL (if constructed with (TRUE, s, length))
1231 // or one of the original string contents characters (if later truncated),
1232 // therefore we can assume that array[len] is initialized memory.
1233 if(array[len] == 0) {
1234 return array;
1235 }
1236 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1237 // kRefCounted: Do not write the NUL if the buffer is shared.
1238 // That is mostly safe, except when the length of one copy was modified
1239 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1240 // Then the NUL would be written into the middle of another copy's string.
1241
1242 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1243 // Do not test if there is a NUL already because it might be uninitialized memory.
1244 // (That would be safe, but tools like valgrind & Purify would complain.)
1245 array[len] = 0;
1246 return array;
1247 }
1248 }
1249 if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1250 array = getArrayStart();
1251 array[len] = 0;
1252 return array;
1253 } else {
1254 return nullptr;
1255 }
1256 }
1257
1258 // setTo() analogous to the readonly-aliasing constructor with the same signature
1259 UnicodeString &
1260 UnicodeString::setTo(UBool isTerminated,
1261 ConstChar16Ptr textPtr,
1262 int32_t textLength)
1263 {
1264 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1265 // do not modify a string that has an "open" getBuffer(minCapacity)
1266 return *this;
1267 }
1268
1269 const UChar *text = textPtr;
1270 if(text == NULL) {
1271 // treat as an empty string, do not alias
1272 releaseArray();
1273 setToEmpty();
1274 return *this;
1275 }
1276
1277 if( textLength < -1 ||
1278 (textLength == -1 && !isTerminated) ||
1279 (textLength >= 0 && isTerminated && text[textLength] != 0)
1280 ) {
1281 setToBogus();
1282 return *this;
1283 }
1284
1285 releaseArray();
1286
1287 if(textLength == -1) {
1288 // text is terminated, or else it would have failed the above test
1289 textLength = u_strlen(text);
1290 }
1291 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1292 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1293 return *this;
1294 }
1295
1296 // setTo() analogous to the writable-aliasing constructor with the same signature
1297 UnicodeString &
1298 UnicodeString::setTo(UChar *buffer,
1299 int32_t buffLength,
1300 int32_t buffCapacity) {
1301 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1302 // do not modify a string that has an "open" getBuffer(minCapacity)
1303 return *this;
1304 }
1305
1306 if(buffer == NULL) {
1307 // treat as an empty string, do not alias
1308 releaseArray();
1309 setToEmpty();
1310 return *this;
1311 }
1312
1313 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1314 setToBogus();
1315 return *this;
1316 } else if(buffLength == -1) {
1317 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1318 const UChar *p = buffer, *limit = buffer + buffCapacity;
1319 while(p != limit && *p != 0) {
1320 ++p;
1321 }
1322 buffLength = (int32_t)(p - buffer);
1323 }
1324
1325 releaseArray();
1326
1327 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1328 setArray(buffer, buffLength, buffCapacity);
1329 return *this;
1330 }
1331
1332 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1333 unBogus();
1334 int32_t length = utf8.length();
1335 int32_t capacity;
1336 // The UTF-16 string will be at most as long as the UTF-8 string.
1337 if(length <= US_STACKBUF_SIZE) {
1338 capacity = US_STACKBUF_SIZE;
1339 } else {
1340 capacity = length + 1; // +1 for the terminating NUL.
1341 }
1342 UChar *utf16 = getBuffer(capacity);
1343 int32_t length16;
1344 UErrorCode errorCode = U_ZERO_ERROR;
1345 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1346 utf8.data(), length,
1347 0xfffd, // Substitution character.
1348 NULL, // Don't care about number of substitutions.
1349 &errorCode);
1350 releaseBuffer(length16);
1351 if(U_FAILURE(errorCode)) {
1352 setToBogus();
1353 }
1354 return *this;
1355 }
1356
1357 UnicodeString&
1358 UnicodeString::setCharAt(int32_t offset,
1359 UChar c)
1360 {
1361 int32_t len = length();
1362 if(cloneArrayIfNeeded() && len > 0) {
1363 if(offset < 0) {
1364 offset = 0;
1365 } else if(offset >= len) {
1366 offset = len - 1;
1367 }
1368
1369 getArrayStart()[offset] = c;
1370 }
1371 return *this;
1372 }
1373
1374 UnicodeString&
1375 UnicodeString::replace(int32_t start,
1376 int32_t _length,
1377 UChar32 srcChar) {
1378 UChar buffer[U16_MAX_LENGTH];
1379 int32_t count = 0;
1380 UBool isError = FALSE;
1381 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1382 // We test isError so that the compiler does not complain that we don't.
1383 // If isError (srcChar is not a valid code point) then count==0 which means
1384 // we remove the source segment rather than replacing it with srcChar.
1385 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1386 }
1387
1388 UnicodeString&
1389 UnicodeString::append(UChar32 srcChar) {
1390 UChar buffer[U16_MAX_LENGTH];
1391 int32_t _length = 0;
1392 UBool isError = FALSE;
1393 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1394 // We test isError so that the compiler does not complain that we don't.
1395 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1396 return isError ? *this : doAppend(buffer, 0, _length);
1397 }
1398
1399 UnicodeString&
1400 UnicodeString::doReplace( int32_t start,
1401 int32_t length,
1402 const UnicodeString& src,
1403 int32_t srcStart,
1404 int32_t srcLength)
1405 {
1406 // pin the indices to legal values
1407 src.pinIndices(srcStart, srcLength);
1408
1409 // get the characters from src
1410 // and replace the range in ourselves with them
1411 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1412 }
1413
1414 UnicodeString&
1415 UnicodeString::doReplace(int32_t start,
1416 int32_t length,
1417 const UChar *srcChars,
1418 int32_t srcStart,
1419 int32_t srcLength)
1420 {
1421 if(!isWritable()) {
1422 return *this;
1423 }
1424
1425 int32_t oldLength = this->length();
1426
1427 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1428 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1429 if(start == 0) {
1430 // remove prefix by adjusting the array pointer
1431 pinIndex(length);
1432 fUnion.fFields.fArray += length;
1433 fUnion.fFields.fCapacity -= length;
1434 setLength(oldLength - length);
1435 return *this;
1436 } else {
1437 pinIndex(start);
1438 if(length >= (oldLength - start)) {
1439 // remove suffix by reducing the length (like truncate())
1440 setLength(start);
1441 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1442 return *this;
1443 }
1444 }
1445 }
1446
1447 if(start == oldLength) {
1448 return doAppend(srcChars, srcStart, srcLength);
1449 }
1450
1451 if(srcChars == 0) {
1452 srcStart = srcLength = 0;
1453 } else if(srcLength < 0) {
1454 // get the srcLength if necessary
1455 srcLength = u_strlen(srcChars + srcStart);
1456 }
1457
1458 // pin the indices to legal values
1459 pinIndices(start, length);
1460
1461 // Calculate the size of the string after the replace.
1462 // Avoid int32_t overflow.
1463 int32_t newLength = oldLength - length;
1464 if(srcLength > (INT32_MAX - newLength)) {
1465 setToBogus();
1466 return *this;
1467 }
1468 newLength += srcLength;
1469
1470 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1471 // therefore we need to keep the current fArray
1472 UChar oldStackBuffer[US_STACKBUF_SIZE];
1473 UChar *oldArray;
1474 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1475 // copy the stack buffer contents because it will be overwritten with
1476 // fUnion.fFields values
1477 u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1478 oldArray = oldStackBuffer;
1479 } else {
1480 oldArray = getArrayStart();
1481 }
1482
1483 // clone our array and allocate a bigger array if needed
1484 int32_t *bufferToDelete = 0;
1485 if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
1486 FALSE, &bufferToDelete)
1487 ) {
1488 return *this;
1489 }
1490
1491 // now do the replace
1492
1493 UChar *newArray = getArrayStart();
1494 if(newArray != oldArray) {
1495 // if fArray changed, then we need to copy everything except what will change
1496 us_arrayCopy(oldArray, 0, newArray, 0, start);
1497 us_arrayCopy(oldArray, start + length,
1498 newArray, start + srcLength,
1499 oldLength - (start + length));
1500 } else if(length != srcLength) {
1501 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1502 us_arrayCopy(oldArray, start + length,
1503 newArray, start + srcLength,
1504 oldLength - (start + length));
1505 }
1506
1507 // now fill in the hole with the new string
1508 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1509
1510 setLength(newLength);
1511
1512 // delayed delete in case srcChars == fArray when we started, and
1513 // to keep oldArray alive for the above operations
1514 if (bufferToDelete) {
1515 uprv_free(bufferToDelete);
1516 }
1517
1518 return *this;
1519 }
1520
1521 // Versions of doReplace() only for append() variants.
1522 // doReplace() and doAppend() optimize for different cases.
1523
1524 UnicodeString&
1525 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1526 if(srcLength == 0) {
1527 return *this;
1528 }
1529
1530 // pin the indices to legal values
1531 src.pinIndices(srcStart, srcLength);
1532 return doAppend(src.getArrayStart(), srcStart, srcLength);
1533 }
1534
1535 UnicodeString&
1536 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1537 if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1538 return *this;
1539 }
1540
1541 if(srcLength < 0) {
1542 // get the srcLength if necessary
1543 if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
1544 return *this;
1545 }
1546 }
1547
1548 int32_t oldLength = length();
1549 int32_t newLength = oldLength + srcLength;
1550 // optimize append() onto a large-enough, owned string
1551 if((newLength <= getCapacity() && isBufferWritable()) ||
1552 cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1553 UChar *newArray = getArrayStart();
1554 // Do not copy characters when
1555 // UChar *buffer=str.getAppendBuffer(...);
1556 // is followed by
1557 // str.append(buffer, length);
1558 // or
1559 // str.appendString(buffer, length)
1560 // or similar.
1561 if(srcChars + srcStart != newArray + oldLength) {
1562 us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
1563 }
1564 setLength(newLength);
1565 }
1566 return *this;
1567 }
1568
1569 /**
1570 * Replaceable API
1571 */
1572 void
1573 UnicodeString::handleReplaceBetween(int32_t start,
1574 int32_t limit,
1575 const UnicodeString& text) {
1576 replaceBetween(start, limit, text);
1577 }
1578
1579 /**
1580 * Replaceable API
1581 */
1582 void
1583 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1584 if (limit <= start) {
1585 return; // Nothing to do; avoid bogus malloc call
1586 }
1587 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1588 // Check to make sure text is not null.
1589 if (text != NULL) {
1590 extractBetween(start, limit, text, 0);
1591 insert(dest, text, 0, limit - start);
1592 uprv_free(text);
1593 }
1594 }
1595
1596 /**
1597 * Replaceable API
1598 *
1599 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1600 * so we implement this function here.
1601 */
1602 UBool Replaceable::hasMetaData() const {
1603 return TRUE;
1604 }
1605
1606 /**
1607 * Replaceable API
1608 */
1609 UBool UnicodeString::hasMetaData() const {
1610 return FALSE;
1611 }
1612
1613 UnicodeString&
1614 UnicodeString::doReverse(int32_t start, int32_t length) {
1615 if(length <= 1 || !cloneArrayIfNeeded()) {
1616 return *this;
1617 }
1618
1619 // pin the indices to legal values
1620 pinIndices(start, length);
1621 if(length <= 1) { // pinIndices() might have shrunk the length
1622 return *this;
1623 }
1624
1625 UChar *left = getArrayStart() + start;
1626 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1627 UChar swap;
1628 UBool hasSupplementary = FALSE;
1629
1630 // Before the loop we know left<right because length>=2.
1631 do {
1632 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1633 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1634 *right-- = swap;
1635 } while(left < right);
1636 // Make sure to test the middle code unit of an odd-length string.
1637 // Redundant if the length is even.
1638 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1639
1640 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1641 if(hasSupplementary) {
1642 UChar swap2;
1643
1644 left = getArrayStart() + start;
1645 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1646 while(left < right) {
1647 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1648 *left++ = swap2;
1649 *left++ = swap;
1650 } else {
1651 ++left;
1652 }
1653 }
1654 }
1655
1656 return *this;
1657 }
1658
1659 UBool
1660 UnicodeString::padLeading(int32_t targetLength,
1661 UChar padChar)
1662 {
1663 int32_t oldLength = length();
1664 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1665 return FALSE;
1666 } else {
1667 // move contents up by padding width
1668 UChar *array = getArrayStart();
1669 int32_t start = targetLength - oldLength;
1670 us_arrayCopy(array, 0, array, start, oldLength);
1671
1672 // fill in padding character
1673 while(--start >= 0) {
1674 array[start] = padChar;
1675 }
1676 setLength(targetLength);
1677 return TRUE;
1678 }
1679 }
1680
1681 UBool
1682 UnicodeString::padTrailing(int32_t targetLength,
1683 UChar padChar)
1684 {
1685 int32_t oldLength = length();
1686 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1687 return FALSE;
1688 } else {
1689 // fill in padding character
1690 UChar *array = getArrayStart();
1691 int32_t length = targetLength;
1692 while(--length >= oldLength) {
1693 array[length] = padChar;
1694 }
1695 setLength(targetLength);
1696 return TRUE;
1697 }
1698 }
1699
1700 //========================================
1701 // Hashing
1702 //========================================
1703 int32_t
1704 UnicodeString::doHashCode() const
1705 {
1706 /* Delegate hash computation to uhash. This makes UnicodeString
1707 * hashing consistent with UChar* hashing. */
1708 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1709 if (hashCode == kInvalidHashCode) {
1710 hashCode = kEmptyHashCode;
1711 }
1712 return hashCode;
1713 }
1714
1715 //========================================
1716 // External Buffer
1717 //========================================
1718
1719 char16_t *
1720 UnicodeString::getBuffer(int32_t minCapacity) {
1721 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1722 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1723 setZeroLength();
1724 return getArrayStart();
1725 } else {
1726 return nullptr;
1727 }
1728 }
1729
1730 void
1731 UnicodeString::releaseBuffer(int32_t newLength) {
1732 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1733 // set the new fLength
1734 int32_t capacity=getCapacity();
1735 if(newLength==-1) {
1736 // the new length is the string length, capped by fCapacity
1737 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1738 while(p<limit && *p!=0) {
1739 ++p;
1740 }
1741 newLength=(int32_t)(p-array);
1742 } else if(newLength>capacity) {
1743 newLength=capacity;
1744 }
1745 setLength(newLength);
1746 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1747 }
1748 }
1749
1750 //========================================
1751 // Miscellaneous
1752 //========================================
1753 UBool
1754 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1755 int32_t growCapacity,
1756 UBool doCopyArray,
1757 int32_t **pBufferToDelete,
1758 UBool forceClone) {
1759 // default parameters need to be static, therefore
1760 // the defaults are -1 to have convenience defaults
1761 if(newCapacity == -1) {
1762 newCapacity = getCapacity();
1763 }
1764
1765 // while a getBuffer(minCapacity) is "open",
1766 // prevent any modifications of the string by returning FALSE here
1767 // if the string is bogus, then only an assignment or similar can revive it
1768 if(!isWritable()) {
1769 return FALSE;
1770 }
1771
1772 /*
1773 * We need to make a copy of the array if
1774 * the buffer is read-only, or
1775 * the buffer is refCounted (shared), and refCount>1, or
1776 * the buffer is too small.
1777 * Return FALSE if memory could not be allocated.
1778 */
1779 if(forceClone ||
1780 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1781 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1782 newCapacity > getCapacity()
1783 ) {
1784 // check growCapacity for default value and use of the stack buffer
1785 if(growCapacity < 0) {
1786 growCapacity = newCapacity;
1787 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1788 growCapacity = US_STACKBUF_SIZE;
1789 }
1790
1791 // save old values
1792 UChar oldStackBuffer[US_STACKBUF_SIZE];
1793 UChar *oldArray;
1794 int32_t oldLength = length();
1795 int16_t flags = fUnion.fFields.fLengthAndFlags;
1796
1797 if(flags&kUsingStackBuffer) {
1798 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1799 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1800 // copy the stack buffer contents because it will be overwritten with
1801 // fUnion.fFields values
1802 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1803 oldArray = oldStackBuffer;
1804 } else {
1805 oldArray = NULL; // no need to copy from the stack buffer to itself
1806 }
1807 } else {
1808 oldArray = fUnion.fFields.fArray;
1809 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1810 }
1811
1812 // allocate a new array
1813 if(allocate(growCapacity) ||
1814 (newCapacity < growCapacity && allocate(newCapacity))
1815 ) {
1816 if(doCopyArray) {
1817 // copy the contents
1818 // do not copy more than what fits - it may be smaller than before
1819 int32_t minLength = oldLength;
1820 newCapacity = getCapacity();
1821 if(newCapacity < minLength) {
1822 minLength = newCapacity;
1823 }
1824 if(oldArray != NULL) {
1825 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1826 }
1827 setLength(minLength);
1828 } else {
1829 setZeroLength();
1830 }
1831
1832 // release the old array
1833 if(flags & kRefCounted) {
1834 // the array is refCounted; decrement and release if 0
1835 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1836 if(umtx_atomic_dec(pRefCount) == 0) {
1837 if(pBufferToDelete == 0) {
1838 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1839 // is defined as volatile. (Volatile has useful non-standard behavior
1840 // with this compiler.)
1841 uprv_free((void *)pRefCount);
1842 } else {
1843 // the caller requested to delete it himself
1844 *pBufferToDelete = (int32_t *)pRefCount;
1845 }
1846 }
1847 }
1848 } else {
1849 // not enough memory for growCapacity and not even for the smaller newCapacity
1850 // reset the old values for setToBogus() to release the array
1851 if(!(flags&kUsingStackBuffer)) {
1852 fUnion.fFields.fArray = oldArray;
1853 }
1854 fUnion.fFields.fLengthAndFlags = flags;
1855 setToBogus();
1856 return FALSE;
1857 }
1858 }
1859 return TRUE;
1860 }
1861
1862 // UnicodeStringAppendable ------------------------------------------------- ***
1863
1864 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1865
1866 UBool
1867 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1868 return str.doAppend(&c, 0, 1).isWritable();
1869 }
1870
1871 UBool
1872 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1873 UChar buffer[U16_MAX_LENGTH];
1874 int32_t cLength = 0;
1875 UBool isError = FALSE;
1876 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1877 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1878 }
1879
1880 UBool
1881 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1882 return str.doAppend(s, 0, length).isWritable();
1883 }
1884
1885 UBool
1886 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1887 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1888 }
1889
1890 UChar *
1891 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1892 int32_t desiredCapacityHint,
1893 UChar *scratch, int32_t scratchCapacity,
1894 int32_t *resultCapacity) {
1895 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1896 *resultCapacity = 0;
1897 return NULL;
1898 }
1899 int32_t oldLength = str.length();
1900 if(minCapacity <= (kMaxCapacity - oldLength) &&
1901 desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1902 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1903 *resultCapacity = str.getCapacity() - oldLength;
1904 return str.getArrayStart() + oldLength;
1905 }
1906 *resultCapacity = scratchCapacity;
1907 return scratch;
1908 }
1909
1910 U_NAMESPACE_END
1911
1912 U_NAMESPACE_USE
1913
1914 U_CAPI int32_t U_EXPORT2
1915 uhash_hashUnicodeString(const UElement key) {
1916 const UnicodeString *str = (const UnicodeString*) key.pointer;
1917 return (str == NULL) ? 0 : str->hashCode();
1918 }
1919
1920 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1921 // does not depend on hashtable code.
1922 U_CAPI UBool U_EXPORT2
1923 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1924 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1925 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1926 if (str1 == str2) {
1927 return TRUE;
1928 }
1929 if (str1 == NULL || str2 == NULL) {
1930 return FALSE;
1931 }
1932 return *str1 == *str2;
1933 }
1934
1935 #ifdef U_STATIC_IMPLEMENTATION
1936 /*
1937 This should never be called. It is defined here to make sure that the
1938 virtual vector deleting destructor is defined within unistr.cpp.
1939 The vector deleting destructor is already a part of UObject,
1940 but defining it here makes sure that it is included with this object file.
1941 This makes sure that static library dependencies are kept to a minimum.
1942 */
1943 static void uprv_UnicodeStringDummy(void) {
1944 delete [] (new UnicodeString[2]);
1945 }
1946 #endif