2 ******************************************************************************
3 * Copyright (C) 1999-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
// Debug helper: loops over the indices 0 .. s.length()-1 and, when the tested
// value c is >= 0x007E or < 0x0020, streams s[i] to cout in hexadecimal,
// wrapped as "[0x...]".
// NOTE(review): c is presumably the code unit at index i — confirm.
42 print(const UnicodeString
& s
,
47 for(int i
= 0; i
< s
.length(); ++i
) {
49 if(c
>= 0x007E || c
< 0x0020)
50 cout
<< "[0x" << hex
<< s
[i
] << "]";
// Same debug loop as above, but bounded by an explicit len rather than by
// s.length(): values >= 0x007E or < 0x0020 are streamed as "[0x...]" in hex.
// NOTE(review): presumably the raw-buffer overload of print() — confirm.
64 for(int i
= 0; i
< len
; ++i
) {
66 if(c
>= 0x007E || c
< 0x0020)
67 cout
<< "[0x" << hex
<< s
[i
] << "]";
76 // Local function definitions for now
78 // need to copy areas that may overlap
// Copies count UChars from src+srcStart to dst+dstStart.
// The byte count is count*sizeof(*src); uprv_memmove() is used because the
// two areas may overlap.
81 us_arrayCopy(const UChar
*src
, int32_t srcStart
,
82 UChar
*dst
, int32_t dstStart
, int32_t count
)
85 uprv_memmove(dst
+dstStart
, src
+srcStart
, (size_t)(count
*sizeof(*src
)));
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
// context is treated as a pointer to an icu::UnicodeString; the result is
// that string's charAt(offset).
91 static UChar U_CALLCONV
92 UnicodeString_charAt(int32_t offset
, void *context
) {
93 return ((icu::UnicodeString
*) context
)->charAt(offset
);
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
// Out-of-line, empty destructor for Replaceable.
102 Replaceable::~Replaceable() {}
// NOTE(review): presumably expands to the ICU run-time type information
// boilerplate for UnicodeString — confirm against the macro definition.
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString
)
// Concatenation operator for two UnicodeStrings.
// Starts from a temporary constructed with capacity
// s1.length()+s2.length()+1 (code point 0, count 0) and chains further calls
// onto it.
// NOTE(review): the chained calls are presumably the appends of s1 and s2 —
// confirm.
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString
&s1
, const UnicodeString
&s2
) {
109 UnicodeString(s1
.length()+s2
.length()+1, (UChar32
)0, 0).
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
// Atomically increments the counter stored in the 32-bit slot immediately
// before fArray.
120 UnicodeString::addRef() {
121 umtx_atomic_inc((u_atomic_int32_t
*)fUnion
.fFields
.fArray
- 1);
// Atomically decrements the counter stored in the 32-bit slot immediately
// before fArray and returns the result of umtx_atomic_dec().
125 UnicodeString::removeRef() {
126 return umtx_atomic_dec((u_atomic_int32_t
*)fUnion
.fFields
.fArray
- 1);
// Reads the counter stored in the 32-bit slot immediately before fArray,
// using umtx_loadAcquire().
130 UnicodeString::refCount() const {
131 return umtx_loadAcquire(*((u_atomic_int32_t
*)fUnion
.fFields
.fArray
- 1));
// Drops this string's reference to a ref-counted buffer.
// Only when kRefCounted is set and removeRef() returns 0 is the allocation
// freed; the freed pointer is the int32_t slot just before fArray, i.e. the
// start of the block that holds the counter.
135 UnicodeString::releaseArray() {
136 if((fUnion
.fFields
.fLengthAndFlags
& kRefCounted
) && removeRef() == 0) {
137 uprv_free((int32_t *)fUnion
.fFields
.fArray
- 1);
143 //========================================
145 //========================================
147 // The default constructor is inline in unistr.h.
// Constructs a string with the requested capacity, filled with count copies
// of code point c.
// If count <= 0 or c is above 0x10ffff the string is only allocated.
// Otherwise unitCount = U16_LENGTH(c) code units are written per copy:
// directly as (UChar)c, or via a small units[] buffer filled by
// U16_APPEND_UNSAFE.
// NOTE(review): length = count * unitCount is an int32_t multiplication with
// no visible overflow guard — confirm whether very large counts are rejected
// elsewhere.
149 UnicodeString::UnicodeString(int32_t capacity
, UChar32 c
, int32_t count
) {
150 fUnion
.fFields
.fLengthAndFlags
= 0;
151 if(count
<= 0 || (uint32_t)c
> 0x10ffff) {
152 // just allocate and do not do anything else
155 // count > 0, allocate and fill the new string with count c's
156 int32_t unitCount
= U16_LENGTH(c
), length
= count
* unitCount
;
157 if(capacity
< length
) {
160 if(allocate(capacity
)) {
161 UChar
*array
= getArrayStart();
164 // fill the new string with c
166 // fill with length UChars
168 array
[i
++] = (UChar
)c
;
171 // get the code units for c
172 UChar units
[U16_MAX_LENGTH
];
173 U16_APPEND_UNSAFE(units
, i
, c
);
175 // now it must be i==unitCount
178 // for Unicode, unitCount can only be 1, 2, 3, or 4
179 // 1 is handled above
182 while(unitIdx
< unitCount
) {
183 array
[i
++]=units
[unitIdx
++];
// Constructs a one-code-unit string: the flags are set to
// kLength1 | kShortString and ch is stored in slot 0 of the stack buffer.
192 UnicodeString::UnicodeString(UChar ch
) {
193 fUnion
.fFields
.fLengthAndFlags
= kLength1
| kShortString
;
194 fUnion
.fStackFields
.fBuffer
[0] = ch
;
// Constructs a string from a single code point.
// Starts as a short (stack-buffer) string, then U16_APPEND encodes ch into
// the stack buffer, bounded by US_STACKBUF_SIZE and reporting failure through
// isError.
197 UnicodeString::UnicodeString(UChar32 ch
) {
198 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
200 UBool isError
= FALSE
;
201 U16_APPEND(fUnion
.fStackFields
.fBuffer
, i
, US_STACKBUF_SIZE
, ch
, isError
);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
// Constructs a string from text: starts as an empty short string, then
// doReplace(0, 0, text, 0, -1) inserts text at position 0 with length -1
// (length not supplied by the caller).
209 UnicodeString::UnicodeString(const UChar
*text
) {
210 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
211 doReplace(0, 0, text
, 0, -1);
// Constructs a string from text with an explicit textLength: starts as an
// empty short string, then doReplace() inserts text at position 0.
214 UnicodeString::UnicodeString(const UChar
*text
,
215 int32_t textLength
) {
216 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
217 doReplace(0, 0, text
, 0, textLength
);
// Read-only aliasing constructor: the string refers to text rather than
// copying it (flags start as kReadonlyAlias).
// Inconsistent arguments are detected by the else-if test: textLength < -1,
// textLength == -1 without isTerminated, or isTerminated with a non-NUL at
// text[textLength].
// A textLength of -1 is resolved with u_strlen(). The capacity passed to
// setArray() includes the terminator only when isTerminated.
220 UnicodeString::UnicodeString(UBool isTerminated
,
222 int32_t textLength
) {
223 fUnion
.fFields
.fLengthAndFlags
= kReadonlyAlias
;
225 // treat as an empty string, do not alias
227 } else if(textLength
< -1 ||
228 (textLength
== -1 && !isTerminated
) ||
229 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
233 if(textLength
== -1) {
234 // text is terminated, or else it would have failed the above test
235 textLength
= u_strlen(text
);
237 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
// Writable aliasing constructor: the string uses the caller's buff as its
// storage (flags start as kWritableAlias).
// Inconsistent arguments are detected by the else-if test: buffLength < -1,
// buffCapacity < 0, or buffLength > buffCapacity.
// A buffLength of -1 is resolved by scanning for NUL, never reading past
// buff + buffCapacity.
241 UnicodeString::UnicodeString(UChar
*buff
,
243 int32_t buffCapacity
) {
244 fUnion
.fFields
.fLengthAndFlags
= kWritableAlias
;
246 // treat as an empty string, do not alias
248 } else if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
251 if(buffLength
== -1) {
252 // fLength = u_strlen(buff); but do not look beyond buffCapacity
253 const UChar
*p
= buff
, *limit
= buff
+ buffCapacity
;
254 while(p
!= limit
&& *p
!= 0) {
257 buffLength
= (int32_t)(p
- buff
);
259 setArray(buff
, buffLength
, buffCapacity
);
// Constructs a string from invariant characters in src.
// The length may be computed with uprv_strlen(src). Once
// cloneArrayIfNeeded(length, length, FALSE) succeeds, the chars are converted
// into the string's array with u_charsToUChars().
263 UnicodeString::UnicodeString(const char *src
, int32_t length
, EInvariant
) {
264 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
266 // treat as an empty string
269 length
=(int32_t)uprv_strlen(src
);
271 if(cloneArrayIfNeeded(length
, length
, FALSE
)) {
272 u_charsToUChars(src
, getArrayStart(), length
);
280 #if U_CHARSET_IS_UTF8
// Constructs a string from a char* when U_CHARSET_IS_UTF8 is set: starts as
// an empty short string and, for a non-null pointer, fills it with
// setToUTF8().
282 UnicodeString::UnicodeString(const char *codepageData
) {
283 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
284 if(codepageData
!= 0) {
285 setToUTF8(codepageData
);
// Constructs a string from dataLength bytes of codepageData via setToUTF8().
// A null pointer, a zero length, or a length below -1 is caught by the
// "nothing to convert" test. A length of -1 is replaced by
// uprv_strlen(codepageData).
289 UnicodeString::UnicodeString(const char *codepageData
, int32_t dataLength
) {
290 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
291 // if there's nothing to convert, do nothing
292 if(codepageData
== 0 || dataLength
== 0 || dataLength
< -1) {
295 if(dataLength
== -1) {
296 dataLength
= (int32_t)uprv_strlen(codepageData
);
298 setToUTF8(StringPiece(codepageData
, dataLength
));
301 // else see unistr_cnv.cpp
// Copy constructor: initializes this object as an empty short string.
// NOTE(review): the contents of that are presumably copied right after this —
// confirm.
304 UnicodeString::UnicodeString(const UnicodeString
& that
) {
305 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
// Constructs a string from a tail of that: starts as an empty short string,
// then delegates to setTo(that, srcStart).
309 UnicodeString::UnicodeString(const UnicodeString
& that
,
311 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
312 setTo(that
, srcStart
);
// Constructs a string from a sub-range of that: starts as an empty short
// string, then delegates to setTo(that, srcStart, srcLength).
315 UnicodeString::UnicodeString(const UnicodeString
& that
,
318 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
319 setTo(that
, srcStart
, srcLength
);
322 // Replaceable base class clone() default implementation, does not clone
324 Replaceable::clone() const {
328 // UnicodeString overrides clone() with a real implementation
// Returns a newly heap-allocated copy of this string, made with the copy
// constructor.
330 UnicodeString::clone() const {
331 return new UnicodeString(*this);
334 //========================================
336 //========================================
// Sets up storage for at least capacity UChars.
// A capacity of at most US_STACKBUF_SIZE selects the short (stack-buffer)
// string. Larger requests call uprv_malloc() for a block of int32_t words,
// sized to hold the reference counter plus capacity+1 UChars and rounded up
// to a multiple of 16 bytes.
// fCapacity is then derived from the words actually allocated, minus the
// counter word. The trailing assignments put the object into the bogus state
// (kIsBogus, null array, zero capacity).
// NOTE(review): (capacity + 1) * U_SIZEOF_UCHAR has no visible overflow
// guard for very large capacity values — confirm.
339 UnicodeString::allocate(int32_t capacity
) {
340 if(capacity
<= US_STACKBUF_SIZE
) {
341 fUnion
.fFields
.fLengthAndFlags
= kShortString
;
343 // count bytes for the refCounter and the string capacity, and
344 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
345 // to be safely aligned for the refCount
346 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
347 int32_t words
= (int32_t)(((sizeof(int32_t) + (capacity
+ 1) * U_SIZEOF_UCHAR
+ 15) & ~15) >> 2);
348 int32_t *array
= (int32_t*) uprv_malloc( sizeof(int32_t) * words
);
350 // set initial refCount and point behind the refCount
353 // have fArray point to the first UChar
354 fUnion
.fFields
.fArray
= (UChar
*)array
;
355 fUnion
.fFields
.fCapacity
= (int32_t)((words
- 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR
));
356 fUnion
.fFields
.fLengthAndFlags
= kLongString
;
358 fUnion
.fFields
.fLengthAndFlags
= kIsBogus
;
359 fUnion
.fFields
.fArray
= 0;
360 fUnion
.fFields
.fCapacity
= 0;
367 //========================================
369 //========================================
370 UnicodeString::~UnicodeString()
375 //========================================
377 //========================================
// Factory: creates a local UnicodeString and fills it from utf8 with
// setToUTF8().
379 UnicodeString
UnicodeString::fromUTF8(const StringPiece
&utf8
) {
380 UnicodeString result
;
381 result
.setToUTF8(utf8
);
// Factory: converts length UTF-32 code points into a UnicodeString.
// The initial capacity is US_STACKBUF_SIZE for short input, otherwise
// length + length/16 + 4. The conversion writes into result's buffer with
// u_strFromUTF32WithSub(), substituting 0xfffd.
// On U_BUFFER_OVERFLOW_ERROR the capacity is raised to length16 + 1; any
// other failure is handled by a separate branch.
// NOTE(review): presumably the conversion is retried after the capacity is
// raised — confirm.
385 UnicodeString
UnicodeString::fromUTF32(const UChar32
*utf32
, int32_t length
) {
386 UnicodeString result
;
388 // Most UTF-32 strings will be BMP-only and result in a same-length
389 // UTF-16 string. We overestimate the capacity just slightly,
390 // just in case there are a few supplementary characters.
391 if(length
<= US_STACKBUF_SIZE
) {
392 capacity
= US_STACKBUF_SIZE
;
394 capacity
= length
+ (length
>> 4) + 4;
397 UChar
*utf16
= result
.getBuffer(capacity
);
399 UErrorCode errorCode
= U_ZERO_ERROR
;
400 u_strFromUTF32WithSub(utf16
, result
.getCapacity(), &length16
,
402 0xfffd, // Substitution character.
403 NULL
, // Don't care about number of substitutions.
405 result
.releaseBuffer(length16
);
406 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
407 capacity
= length16
+ 1; // +1 for the terminating NUL.
409 } else if(U_FAILURE(errorCode
)) {
417 //========================================
419 //========================================
// Assignment operator: delegates to copyFrom(src) without passing the
// fastCopy argument.
422 UnicodeString::operator=(const UnicodeString
&src
) {
423 return copyFrom(src
);
// Assignment variant that delegates to copyFrom() with fastCopy set to TRUE.
427 UnicodeString::fastCopyFrom(const UnicodeString
&src
) {
428 return copyFrom(src
, TRUE
);
// Shared implementation of assignment from src.
// After the self-assignment, bogus-source and empty-source checks, the
// length/flags word is copied from src and the storage flags select the
// strategy:
//   - stack buffer: copy the UChars into our own stack buffer;
//   - ref-counted buffer: addRef() on src and share fArray/fCapacity (and
//     fLength when the length is not stored in the flags word);
//   - read-only alias: keep aliasing the same array (per the comments, only
//     on the fastCopy path);
//   - writable alias (and the fall-through): allocate() and copy the contents.
// The trailing assignments put this object into the bogus state without
// calling setToBogus().
432 UnicodeString::copyFrom(const UnicodeString
&src
, UBool fastCopy
) {
433 // if assigning to ourselves, do nothing
438 // is the right side bogus?
444 // delete the current contents
448 // empty string - use the stack buffer
453 // fLength>0 and not an "open" src.getBuffer(minCapacity)
454 fUnion
.fFields
.fLengthAndFlags
= src
.fUnion
.fFields
.fLengthAndFlags
;
455 switch(src
.fUnion
.fFields
.fLengthAndFlags
& kAllStorageFlags
) {
457 // short string using the stack buffer, do the same
458 uprv_memcpy(fUnion
.fStackFields
.fBuffer
, src
.fUnion
.fStackFields
.fBuffer
,
459 getShortLength() * U_SIZEOF_UCHAR
);
462 // src uses a refCounted string buffer, use that buffer with refCount
463 // src is const, use a cast - we don't actually change it
464 ((UnicodeString
&)src
).addRef();
465 // copy all fields, share the reference-counted buffer
466 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
467 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
468 if(!hasShortLength()) {
469 fUnion
.fFields
.fLength
= src
.fUnion
.fFields
.fLength
;
474 // src is a readonly alias, do the same
475 // -> maintain the readonly alias as such
476 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
477 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
478 if(!hasShortLength()) {
479 fUnion
.fFields
.fLength
= src
.fUnion
.fFields
.fLength
;
483 // else if(!fastCopy) fall through to case kWritableAlias
484 // -> allocate a new buffer and copy the contents
485 case kWritableAlias
: {
486 // src is a writable alias; we make a copy of that instead
487 int32_t srcLength
= src
.length();
488 if(allocate(srcLength
)) {
489 uprv_memcpy(getArrayStart(), src
.getArrayStart(), srcLength
* U_SIZEOF_UCHAR
);
490 setLength(srcLength
);
493 // if there is not enough memory, then fall through to setting to bogus
496 // if src is bogus, set ourselves to bogus
497 // do not call setToBogus() here because fArray and flags are not consistent here
498 fUnion
.fFields
.fLengthAndFlags
= kIsBogus
;
499 fUnion
.fFields
.fArray
= 0;
500 fUnion
.fFields
.fCapacity
= 0;
507 //========================================
508 // Miscellaneous operations
509 //========================================
// Returns a copy of this string with backslash escapes resolved.
// Text between escapes is appended to result in runs, array[prev .. i).
// On a backslash (0x5C) the pending run is flushed and unescapeAt(i) decodes
// the escape, advancing i. An invalid escape sequence empties result and
// stops the scan.
511 UnicodeString
UnicodeString::unescape() const {
512 UnicodeString
result(length(), (UChar32
)0, (int32_t)0); // construct with capacity
513 const UChar
*array
= getBuffer();
514 int32_t len
= length();
516 for (int32_t i
=0;;) {
518 result
.append(array
, prev
, len
- prev
);
521 if (array
[i
++] == 0x5C /*'\\'*/) {
522 result
.append(array
, prev
, (i
- 1) - prev
);
523 UChar32 c
= unescapeAt(i
); // advances i
525 result
.remove(); // return empty string
526 break; // invalid escape sequence
// Decodes one escape sequence starting at offset by calling u_unescapeAt().
// UnicodeString_charAt is the character accessor and this object is the
// context. offset is passed by address so the callee can advance it.
535 UChar32
UnicodeString::unescapeAt(int32_t &offset
) const {
536 return u_unescapeAt(UnicodeString_charAt
, &offset
, length(), (void*)this);
539 //========================================
540 // Read-only implementation
541 //========================================
// Equality helper: compares the first len UChars of this string and text
// byte-wise with uprv_memcmp() and returns whether they are identical.
543 UnicodeString::doEquals(const UnicodeString
&text
, int32_t len
) const {
544 // Requires: this & text not bogus and have same lengths.
545 // Byte-wise comparison works for equality regardless of endianness.
546 return uprv_memcmp(getArrayStart(), text
.getArrayStart(), len
* U_SIZEOF_UCHAR
) == 0;
// Compares the range [start, start+length) of this string with srcChars in
// code unit order.
// The indices are pinned first. A NULL srcChars is treated as an empty string:
// the result is 0 when the pinned length is 0, and 1 otherwise.
// The shared prefix of minLength units is compared first: bytes on
// big-endian, UChar by UChar on little-endian. A non-zero difference is
// folded into an int8_t via (result >> 15 | 1), which keeps the sign and can
// never be 0.
550 UnicodeString::doCompare( int32_t start
,
552 const UChar
*srcChars
,
554 int32_t srcLength
) const
556 // compare illegal string values
561 // pin indices to legal values
562 pinIndices(start
, length
);
564 if(srcChars
== NULL
) {
565 // treat const UChar *srcChars==NULL as an empty string
566 return length
== 0 ? 0 : 1;
569 // get the correct pointer
570 const UChar
*chars
= getArrayStart();
573 srcChars
+= srcStart
;
578 // get the srcLength if necessary
// srcChars already points at the first unit to compare (it was advanced by
// srcStart above), so the implicit length is measured from srcChars itself;
// adding srcStart a second time would start counting srcStart units too late.
580 srcLength
= u_strlen(srcChars
);
583 // are we comparing different lengths?
584 if(length
!= srcLength
) {
585 if(length
< srcLength
) {
589 minLength
= srcLength
;
598 * note that uprv_memcmp() returns an int but we return an int8_t;
599 * we need to take care not to truncate the result -
600 * one way to do this is to right-shift the value to
601 * move the sign bit into the lower 8 bits and making sure that this
602 * does not become 0 itself
605 if(minLength
> 0 && chars
!= srcChars
) {
609 // big-endian: byte comparison works
610 result
= uprv_memcmp(chars
, srcChars
, minLength
* sizeof(UChar
));
612 return (int8_t)(result
>> 15 | 1);
615 // little-endian: compare UChar units
617 result
= ((int32_t)*(chars
++) - (int32_t)*(srcChars
++));
619 return (int8_t)(result
>> 15 | 1);
621 } while(--minLength
> 0);
627 /* String compare in code point order - doCompare() compares in code unit order. */
// Compares the range [start, start+length) of this string with srcChars in
// code point order.
// The indices are pinned first. A NULL srcChars is compared as an empty
// string (srcStart and srcLength forced to 0).
// uprv_strCompare() does the comparison and its 32-bit result is folded into
// an int8_t via (diff >> 15 | 1), which keeps the sign and can never be 0.
629 UnicodeString::doCompareCodePointOrder(int32_t start
,
631 const UChar
*srcChars
,
633 int32_t srcLength
) const
635 // compare illegal string values
636 // treat const UChar *srcChars==NULL as an empty string
641 // pin indices to legal values
642 pinIndices(start
, length
);
644 if(srcChars
== NULL
) {
645 srcStart
= srcLength
= 0;
648 int32_t diff
= uprv_strCompare(getArrayStart() + start
, length
, (srcChars
!=NULL
)?(srcChars
+ srcStart
):NULL
, srcLength
, FALSE
, TRUE
);
649 /* translate the 32-bit result into an 8-bit one */
651 return (int8_t)(diff
>> 15 | 1);
658 UnicodeString::getLength() const {
// Thin forwarder: returns charAt(offset).
663 UnicodeString::getCharAt(int32_t offset
) const {
664 return charAt(offset
);
// Thin forwarder: returns char32At(offset).
668 UnicodeString::getChar32At(int32_t offset
) const {
669 return char32At(offset
);
// Returns the code point that contains the code unit at offset.
// The unsigned comparison rejects negative offsets and offsets >= length()
// in a single test. In-range offsets are decoded with U16_GET over
// [0, len); out-of-range offsets yield kInvalidUChar.
673 UnicodeString::char32At(int32_t offset
) const
675 int32_t len
= length();
676 if((uint32_t)offset
< (uint32_t)len
) {
677 const UChar
*array
= getArrayStart();
679 U16_GET(array
, 0, offset
, len
, c
);
682 return kInvalidUChar
;
// For an in-range offset (unsigned compare against length()), adjusts offset
// with U16_SET_CP_START, using 0 as the lower bound of the text.
687 UnicodeString::getChar32Start(int32_t offset
) const {
688 if((uint32_t)offset
< (uint32_t)length()) {
689 const UChar
*array
= getArrayStart();
690 U16_SET_CP_START(array
, 0, offset
);
// For an in-range offset (unsigned compare against length()), adjusts offset
// with U16_SET_CP_LIMIT over the text range [0, len).
698 UnicodeString::getChar32Limit(int32_t offset
) const {
699 int32_t len
= length();
700 if((uint32_t)offset
< (uint32_t)len
) {
701 const UChar
*array
= getArrayStart();
702 U16_SET_CP_LIMIT(array
, 0, offset
, len
);
// Counts the code points in the range [start, start+length) after pinning
// the indices; the counting itself is done by u_countChar32().
710 UnicodeString::countChar32(int32_t start
, int32_t length
) const {
711 pinIndices(start
, length
);
712 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
713 return u_countChar32(getArrayStart()+start
, length
);
// Pins the indices and asks u_strHasMoreChar32Than() whether the range
// [start, start+length) holds more than number code points.
717 UnicodeString::hasMoreChar32Than(int32_t start
, int32_t length
, int32_t number
) const {
718 pinIndices(start
, length
);
719 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
720 return u_strHasMoreChar32Than(getArrayStart()+start
, length
, number
);
// Moves index by delta code points.
// index is range-checked against len first. U16_FWD_N then advances it by
// delta code points, bounded by len; U16_BACK_N moves it back by -delta,
// bounded by 0.
// NOTE(review): the forward/backward choice presumably depends on the sign
// of delta — confirm.
724 UnicodeString::moveIndex32(int32_t index
, int32_t delta
) const {
726 int32_t len
= length();
729 } else if(index
>len
) {
733 const UChar
*array
= getArrayStart();
735 U16_FWD_N(array
, index
, len
, delta
);
737 U16_BACK_N(array
, 0, index
, -delta
);
// Copies the range [start, start+length) of this string to dst + dstStart,
// after pinning the indices.
// The copy is skipped when source and destination addresses coincide (dst
// aliases this string's own array). us_arrayCopy() performs the copy.
744 UnicodeString::doExtract(int32_t start
,
747 int32_t dstStart
) const
749 // pin indices to legal values
750 pinIndices(start
, length
);
752 // do not copy anything if we alias dst itself
753 const UChar
*array
= getArrayStart();
754 if(array
+ start
!= dst
+ dstStart
) {
755 us_arrayCopy(array
, start
, dst
, dstStart
, length
);
// Copies the whole string into dest and returns the result of
// u_terminateUChars().
// Nothing happens when errorCode already holds a failure.
// U_ILLEGAL_ARGUMENT_ERROR is set for a bogus string, a negative
// destCapacity, or a null dest with a positive capacity.
// The UChars are copied only when they fit (len <= destCapacity) and dest is
// not this string's own array.
760 UnicodeString::extract(UChar
*dest
, int32_t destCapacity
,
761 UErrorCode
&errorCode
) const {
762 int32_t len
= length();
763 if(U_SUCCESS(errorCode
)) {
764 if(isBogus() || destCapacity
<0 || (destCapacity
>0 && dest
==0)) {
765 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
767 const UChar
*array
= getArrayStart();
768 if(len
>0 && len
<=destCapacity
&& array
!=dest
) {
769 uprv_memcpy(dest
, array
, len
*U_SIZEOF_UCHAR
);
771 return u_terminateUChars(dest
, destCapacity
, len
, &errorCode
);
// Extracts the range [start, start+length) as invariant chars into target.
// Illegal arguments are a negative targetCapacity, or a NULL target with a
// positive capacity.
// After the indices are pinned, the UChars are converted with
// u_UCharsToChars() only if length fits in targetCapacity.
// Returns the result of u_terminateChars(), called with a local status.
779 UnicodeString::extract(int32_t start
,
782 int32_t targetCapacity
,
783 enum EInvariant
) const
785 // if the arguments are illegal, then do nothing
786 if(targetCapacity
< 0 || (targetCapacity
> 0 && target
== NULL
)) {
790 // pin the indices to legal values
791 pinIndices(start
, length
);
793 if(length
<= targetCapacity
) {
794 u_UCharsToChars(getArrayStart() + start
, target
, length
);
796 UErrorCode status
= U_ZERO_ERROR
;
797 return u_terminateChars(target
, targetCapacity
, length
, &status
);
// Returns a read-only alias of the range [start, start+len) of this string,
// built with the aliasing constructor (isTerminated = FALSE), so no
// characters are copied.
// getBuffer() is used so that bogus strings and strings with an open
// getBuffer() are detected. For those, array is pointed at the stack buffer
// and len is set to -2, which the aliasing constructor treats as invalid.
801 UnicodeString::tempSubString(int32_t start
, int32_t len
) const {
802 pinIndices(start
, len
);
803 const UChar
*array
= getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
805 array
=fUnion
.fStackFields
.fBuffer
; // anything not NULL because that would make an empty string
806 len
=-2; // bogus result string
808 return UnicodeString(FALSE
, array
+ start
, len
);
// Converts the range [start, start+len) to UTF-8 in target (at most capacity
// bytes).
// The indices are pinned first. u_strToUTF8WithSub() does the conversion,
// substituting 0xFFFD; it reports the UTF-8 length through length8 and its
// status through a local errorCode.
812 UnicodeString::toUTF8(int32_t start
, int32_t len
,
813 char *target
, int32_t capacity
) const {
814 pinIndices(start
, len
);
816 UErrorCode errorCode
= U_ZERO_ERROR
;
817 u_strToUTF8WithSub(target
, capacity
, &length8
,
818 getBuffer() + start
, len
,
819 0xFFFD, // Standard substitution character.
820 NULL
, // Don't care about number of substitutions.
825 #if U_CHARSET_IS_UTF8
// UTF-8 build variant of extract(): forwards to toUTF8().
// The unsigned dstSize is clamped to 0x7fffffff so that it fits in an
// int32_t capacity. A null target with a positive dstSize is treated as an
// illegal argument.
828 UnicodeString::extract(int32_t start
, int32_t len
,
829 char *target
, uint32_t dstSize
) const {
830 // if the arguments are illegal, then do nothing
831 if(/*dstSize < 0 || */(dstSize
> 0 && target
== 0)) {
834 return toUTF8(start
, len
, target
, dstSize
<= 0x7fffffff ? (int32_t)dstSize
: 0x7fffffff);
837 // else see unistr_cnv.cpp
// Extracts the range [start, limit) into target by converting the limit to a
// length (limit - start) and delegating to doExtract().
841 UnicodeString::extractBetween(int32_t start
,
843 UnicodeString
& target
) const {
846 doExtract(start
, limit
- start
, target
);
849 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
850 // as many bytes as the source has UChars.
851 // The "worst cases" are writing systems like Indic, Thai and CJK with
// Converts the whole string to UTF-8 and appends it to sink.
// The first attempt writes into a buffer obtained from
// sink.GetAppendBuffer(), with a 1024-byte stack buffer as scratch.
// On U_BUFFER_OVERFLOW_ERROR a heap buffer of exactly length8 bytes is
// allocated with uprv_malloc() and the conversion is repeated;
// U_MEMORY_ALLOCATION_ERROR is the failure status of that path.
// The bytes reach sink.Append() only on success.
// NOTE(review): utf8IsOwned presumably records whether utf8 must be freed
// afterwards — confirm.
854 UnicodeString::toUTF8(ByteSink
&sink
) const {
855 int32_t length16
= length();
857 char stackBuffer
[1024];
858 int32_t capacity
= (int32_t)sizeof(stackBuffer
);
859 UBool utf8IsOwned
= FALSE
;
860 char *utf8
= sink
.GetAppendBuffer(length16
< capacity
? length16
: capacity
,
862 stackBuffer
, capacity
,
865 UErrorCode errorCode
= U_ZERO_ERROR
;
866 u_strToUTF8WithSub(utf8
, capacity
, &length8
,
867 getBuffer(), length16
,
868 0xFFFD, // Standard substitution character.
869 NULL
, // Don't care about number of substitutions.
871 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
872 utf8
= (char *)uprv_malloc(length8
);
875 errorCode
= U_ZERO_ERROR
;
876 u_strToUTF8WithSub(utf8
, length8
, &length8
,
877 getBuffer(), length16
,
878 0xFFFD, // Standard substitution character.
879 NULL
, // Don't care about number of substitutions.
882 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
885 if(U_SUCCESS(errorCode
)) {
886 sink
.Append(utf8
, length8
);
// Converts the whole string to UTF-32 in utf32 (at most capacity code
// points).
// The conversion is attempted only when errorCode holds no failure on entry.
// u_strToUTF32WithSub() does the conversion, substituting 0xfffd, and reports
// the resulting length through length32.
896 UnicodeString::toUTF32(UChar32
*utf32
, int32_t capacity
, UErrorCode
&errorCode
) const {
898 if(U_SUCCESS(errorCode
)) {
899 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
900 u_strToUTF32WithSub(utf32
, capacity
, &length32
,
901 getBuffer(), length(),
902 0xfffd, // Substitution character.
903 NULL
, // Don't care about number of substitutions.
// Finds the first occurrence of the substring srcChars+srcStart (srcLength
// units) inside the range [start, start+length) of this string.
// Rejected up front: a bogus string, null srcChars, negative srcStart, or
// zero srcLength.
// Also rejected: an empty NUL-terminated pattern (srcLength < 0 with a NUL
// at srcChars[srcStart]).
// A match found by u_strFindFirst() is returned as an index relative to the
// start of this string's array.
910 UnicodeString::indexOf(const UChar
*srcChars
,
914 int32_t length
) const
916 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
920 // UnicodeString does not find empty substrings
921 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
925 // get the indices within bounds
926 pinIndices(start
, length
);
928 // find the first occurrence of the substring
929 const UChar
*array
= getArrayStart();
930 const UChar
*match
= u_strFindFirst(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
934 return (int32_t)(match
- array
);
// Finds the first occurrence of code unit c in the pinned range
// [start, start+length) using u_memchr(); a match is returned as an index
// relative to the start of the array.
939 UnicodeString::doIndexOf(UChar c
,
941 int32_t length
) const
944 pinIndices(start
, length
);
946 // find the first occurrence of c
947 const UChar
*array
= getArrayStart();
948 const UChar
*match
= u_memchr(array
+ start
, c
, length
);
952 return (int32_t)(match
- array
);
// Finds the first occurrence of code point c in the pinned range
// [start, start+length) using u_memchr32(); a match is returned as an index
// relative to the start of the array.
957 UnicodeString::doIndexOf(UChar32 c
,
959 int32_t length
) const {
961 pinIndices(start
, length
);
963 // find the first occurrence of c
964 const UChar
*array
= getArrayStart();
965 const UChar
*match
= u_memchr32(array
+ start
, c
, length
);
969 return (int32_t)(match
- array
);
// Finds the last occurrence of the substring srcChars+srcStart (srcLength
// units) inside the range [start, start+length) of this string.
// The argument checks mirror indexOf(): bogus string, null srcChars,
// negative srcStart, zero srcLength, or an empty NUL-terminated pattern.
// A match found by u_strFindLast() is returned as an index relative to the
// start of this string's array.
974 UnicodeString::lastIndexOf(const UChar
*srcChars
,
978 int32_t length
) const
980 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
984 // UnicodeString does not find empty substrings
985 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
989 // get the indices within bounds
990 pinIndices(start
, length
);
992 // find the last occurrence of the substring
993 const UChar
*array
= getArrayStart();
994 const UChar
*match
= u_strFindLast(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
998 return (int32_t)(match
- array
);
// Finds the last occurrence of code unit c in the pinned range
// [start, start+length) using u_memrchr(); a match is returned as an index
// relative to the start of the array.
1003 UnicodeString::doLastIndexOf(UChar c
,
1005 int32_t length
) const
1012 pinIndices(start
, length
);
1014 // find the last occurrence of c
1015 const UChar
*array
= getArrayStart();
1016 const UChar
*match
= u_memrchr(array
+ start
, c
, length
);
1020 return (int32_t)(match
- array
);
// Finds the last occurrence of code point c in the pinned range
// [start, start+length) using u_memrchr32(); a match is returned as an index
// relative to the start of the array.
1025 UnicodeString::doLastIndexOf(UChar32 c
,
1027 int32_t length
) const {
1029 pinIndices(start
, length
);
1031 // find the last occurrence of c
1032 const UChar
*array
= getArrayStart();
1033 const UChar
*match
= u_memrchr32(array
+ start
, c
, length
);
1037 return (int32_t)(match
- array
);
1041 //========================================
1042 // Write implementation
1043 //========================================
// Replaces occurrences of oldText[oldStart, oldStart+oldLength) within
// [start, start+length) of this string by
// newText[newStart, newStart+newLength).
// Bogus operands and an empty oldText are special-cased before the loop.
// The loop runs while the remaining window is non-empty and at least
// oldLength long. Each iteration searches with indexOf() and replaces the
// match at pos. It then shrinks the window by the units consumed
// (pos + oldLength - start) and resumes right after the inserted text
// (pos + newLength), so replacement text is never rescanned.
1046 UnicodeString::findAndReplace(int32_t start
,
1048 const UnicodeString
& oldText
,
1051 const UnicodeString
& newText
,
1055 if(isBogus() || oldText
.isBogus() || newText
.isBogus()) {
1059 pinIndices(start
, length
);
1060 oldText
.pinIndices(oldStart
, oldLength
);
1061 newText
.pinIndices(newStart
, newLength
);
1063 if(oldLength
== 0) {
1067 while(length
> 0 && length
>= oldLength
) {
1068 int32_t pos
= indexOf(oldText
, oldStart
, oldLength
, start
, length
);
1070 // no more oldText's here: done
1073 // we found oldText, replace it by newText and go beyond it
1074 replace(pos
, oldLength
, newText
, newStart
, newLength
);
1075 length
-= pos
+ oldLength
- start
;
1076 start
= pos
+ newLength
;
// Puts the string into the bogus state: flags become kIsBogus, and the array
// pointer and capacity are zeroed.
1085 UnicodeString::setToBogus()
1089 fUnion
.fFields
.fLengthAndFlags
= kIsBogus
;
1090 fUnion
.fFields
.fArray
= 0;
1091 fUnion
.fFields
.fCapacity
= 0;
1094 // turn a bogus string into an empty one
// Acts only when the kIsBogus flag is set in fLengthAndFlags.
1096 UnicodeString::unBogus() {
1097 if(fUnion
.fFields
.fLengthAndFlags
& kIsBogus
) {
// Provides access to the string's buffer with a NUL at index length().
// When there is room (len < capacity):
//   - a read-only buffer is only inspected, to see whether array[len] is
//     already NUL;
//   - a buffer that is not ref-counted, or whose refCount() is 1, takes the
//     in-place branch described by the comments below.
// Otherwise cloneArrayIfNeeded(len+1) is used to obtain a private buffer
// with space for the terminator, and array is re-read afterwards.
1103 UnicodeString::getTerminatedBuffer() {
1107 UChar
*array
= getArrayStart();
1108 int32_t len
= length();
1109 if(len
< getCapacity()) {
1110 if(fUnion
.fFields
.fLengthAndFlags
& kBufferIsReadonly
) {
1111 // If len<capacity on a read-only alias, then array[len] is
1112 // either the original NUL (if constructed with (TRUE, s, length))
1113 // or one of the original string contents characters (if later truncated),
1114 // therefore we can assume that array[len] is initialized memory.
1115 if(array
[len
] == 0) {
1118 } else if(((fUnion
.fFields
.fLengthAndFlags
& kRefCounted
) == 0 || refCount() == 1)) {
1119 // kRefCounted: Do not write the NUL if the buffer is shared.
1120 // That is mostly safe, except when the length of one copy was modified
1121 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1122 // Then the NUL would be written into the middle of another copy's string.
1124 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1125 // Do not test if there is a NUL already because it might be uninitialized memory.
1126 // (That would be safe, but tools like valgrind & Purify would complain.)
1131 if(cloneArrayIfNeeded(len
+1)) {
1132 array
= getArrayStart();
1140 // setTo() analogous to the readonly-aliasing constructor with the same signature
// Re-targets this string as a read-only alias of text.
// A string with an open getBuffer() (kOpenGetBuffer) is left unmodified.
// Inconsistent arguments are detected by the same test as in the aliasing
// constructor: textLength < -1, textLength == -1 without isTerminated, or
// isTerminated with a non-NUL at text[textLength].
// A textLength of -1 is resolved with u_strlen(). The flags then become
// kReadonlyAlias, and setArray() records the pointer, the length, and a
// capacity that includes the terminator only when isTerminated.
1142 UnicodeString::setTo(UBool isTerminated
,
1146 if(fUnion
.fFields
.fLengthAndFlags
& kOpenGetBuffer
) {
1147 // do not modify a string that has an "open" getBuffer(minCapacity)
1152 // treat as an empty string, do not alias
1158 if( textLength
< -1 ||
1159 (textLength
== -1 && !isTerminated
) ||
1160 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
1168 if(textLength
== -1) {
1169 // text is terminated, or else it would have failed the above test
1170 textLength
= u_strlen(text
);
1172 fUnion
.fFields
.fLengthAndFlags
= kReadonlyAlias
;
1173 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
1177 // setTo() analogous to the writable-aliasing constructor with the same signature
// Re-targets this string as a writable alias of buffer.
// A string with an open getBuffer() (kOpenGetBuffer) is left unmodified.
// A NULL buffer is treated as an empty string.
// Inconsistent arguments are detected by the first test: buffLength < -1,
// buffCapacity < 0, or buffLength > buffCapacity.
// A buffLength of -1 is resolved by scanning for NUL, never reading past
// buffer + buffCapacity. The flags then become kWritableAlias, and
// setArray() records the pointer, length and capacity.
1179 UnicodeString::setTo(UChar
*buffer
,
1181 int32_t buffCapacity
) {
1182 if(fUnion
.fFields
.fLengthAndFlags
& kOpenGetBuffer
) {
1183 // do not modify a string that has an "open" getBuffer(minCapacity)
1187 if(buffer
== NULL
) {
1188 // treat as an empty string, do not alias
1194 if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
1197 } else if(buffLength
== -1) {
1198 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1199 const UChar
*p
= buffer
, *limit
= buffer
+ buffCapacity
;
1200 while(p
!= limit
&& *p
!= 0) {
1203 buffLength
= (int32_t)(p
- buffer
);
1208 fUnion
.fFields
.fLengthAndFlags
= kWritableAlias
;
1209 setArray(buffer
, buffLength
, buffCapacity
);
// Replaces the contents with the UTF-16 conversion of utf8.
// The capacity is US_STACKBUF_SIZE for short input, otherwise length + 1.
// u_strFromUTF8WithSub() converts straight into the buffer returned by
// getBuffer(), substituting 0xfffd. releaseBuffer(length16) then fixes the
// length. A failing errorCode is handled afterwards.
1213 UnicodeString
&UnicodeString::setToUTF8(const StringPiece
&utf8
) {
1215 int32_t length
= utf8
.length();
1217 // The UTF-16 string will be at most as long as the UTF-8 string.
1218 if(length
<= US_STACKBUF_SIZE
) {
1219 capacity
= US_STACKBUF_SIZE
;
1221 capacity
= length
+ 1; // +1 for the terminating NUL.
1223 UChar
*utf16
= getBuffer(capacity
);
1225 UErrorCode errorCode
= U_ZERO_ERROR
;
1226 u_strFromUTF8WithSub(utf16
, getCapacity(), &length16
,
1227 utf8
.data(), length
,
1228 0xfffd, // Substitution character.
1229 NULL
, // Don't care about number of substitutions.
1231 releaseBuffer(length16
);
1232 if(U_FAILURE(errorCode
)) {
// Overwrites the code unit at offset with c.
// Writing happens only if cloneArrayIfNeeded() succeeds and the string is
// non-empty. offset is range-checked against len before the store into
// getArrayStart()[offset].
// NOTE(review): out-of-range offsets are presumably pinned into [0, len-1] —
// confirm.
1239 UnicodeString::setCharAt(int32_t offset
,
1242 int32_t len
= length();
1243 if(cloneArrayIfNeeded() && len
> 0) {
1246 } else if(offset
>= len
) {
1250 getArrayStart()[offset
] = c
;
// Replaces the range [start, start+_length) with the single code point
// srcChar.
// srcChar is encoded into a local buffer with U16_APPEND. If it is not a
// valid code point (isError), a count of 0 is passed to doReplace(), so the
// range is simply removed.
1256 UnicodeString::replace(int32_t start
,
1259 UChar buffer
[U16_MAX_LENGTH
];
1261 UBool isError
= FALSE
;
1262 U16_APPEND(buffer
, count
, U16_MAX_LENGTH
, srcChar
, isError
);
1263 // We test isError so that the compiler does not complain that we don't.
1264 // If isError (srcChar is not a valid code point) then count==0 which means
1265 // we remove the source segment rather than replacing it with srcChar.
1266 return doReplace(start
, _length
, buffer
, 0, isError
? 0 : count
);
// Appends the code point srcChar.
// srcChar is encoded into a local buffer with U16_APPEND. On isError the
// string is returned unchanged; otherwise the encoded units are inserted at
// index length() via doReplace().
1270 UnicodeString::append(UChar32 srcChar
) {
1271 UChar buffer
[U16_MAX_LENGTH
];
1272 int32_t _length
= 0;
1273 UBool isError
= FALSE
;
1274 U16_APPEND(buffer
, _length
, U16_MAX_LENGTH
, srcChar
, isError
);
1275 // We test isError so that the compiler does not complain that we don't.
1276 // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1277 return isError
? *this : doReplace(length(), 0, buffer
, 0, _length
);
// UnicodeString-source overload of doReplace().
// If src is not bogus, its indices are pinned and the work is forwarded to
// the UChar* overload with src's array. The other call forwards a null
// source with zero start and length, which makes the operation a pure
// removal.
1281 UnicodeString::doReplace( int32_t start
,
1283 const UnicodeString
& src
,
1287 if(!src
.isBogus()) {
1288 // pin the indices to legal values
1289 src
.pinIndices(srcStart
, srcLength
);
1291 // get the characters from src
1292 // and replace the range in ourselves with them
1293 return doReplace(start
, length
, src
.getArrayStart(), srcStart
, srcLength
);
1296 return doReplace(start
, length
, 0, 0, 0);
// Core mutation routine: replaces the range [start, start+length) of this
// string with srcLength UChars taken from srcChars + srcStart.
// Paths, in order:
//   1. Read-only alias with nothing to insert: a prefix is removed by
//      advancing fArray and shrinking fCapacity; a suffix is removed by
//      shortening the string.
//   2. A negative srcLength is resolved with u_strlen(srcChars + srcStart).
//   3. Append (start >= oldLength) onto a writable buffer with enough
//      capacity: the new units are copied to the end. The copy is skipped
//      when the source already sits exactly there.
//   4. General case: the indices are pinned and, if the string lives in the
//      stack buffer and will outgrow it, the old contents are saved first.
//      cloneArrayIfNeeded() supplies a large enough buffer, the unchanged
//      head and tail are moved into place, and the gap is filled from
//      srcChars. A superseded heap buffer is freed only at the end, since
//      srcChars may point into it.
// NOTE(review): newLength = oldLength + srcLength (and the general-case
// equivalent) has no visible int32_t overflow guard — confirm.
1301 UnicodeString::doReplace(int32_t start
,
1303 const UChar
*srcChars
,
1311 int32_t oldLength
= this->length();
1313 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1314 if((fUnion
.fFields
.fLengthAndFlags
&kBufferIsReadonly
) && srcLength
== 0) {
1316 // remove prefix by adjusting the array pointer
1318 fUnion
.fFields
.fArray
+= length
;
1319 fUnion
.fFields
.fCapacity
-= length
;
1320 setLength(oldLength
- length
);
1324 if(length
>= (oldLength
- start
)) {
1325 // remove suffix by reducing the length (like truncate())
1327 fUnion
.fFields
.fCapacity
= start
; // not NUL-terminated any more
1334 srcStart
= srcLength
= 0;
1335 } else if(srcLength
< 0) {
1336 // get the srcLength if necessary
1337 srcLength
= u_strlen(srcChars
+ srcStart
);
1340 // calculate the size of the string after the replace
1343 // optimize append() onto a large-enough, owned string
1344 if(start
>= oldLength
) {
1345 if(srcLength
== 0) {
1348 newLength
= oldLength
+ srcLength
;
1349 if(newLength
<= getCapacity() && isBufferWritable()) {
1350 UChar
*oldArray
= getArrayStart();
1351 // Do not copy characters when
1352 // UChar *buffer=str.getAppendBuffer(...);
1354 // str.append(buffer, length);
1356 // str.appendString(buffer, length)
1358 if(srcChars
+ srcStart
!= oldArray
+ start
|| start
> oldLength
) {
1359 us_arrayCopy(srcChars
, srcStart
, oldArray
, oldLength
, srcLength
);
1361 setLength(newLength
);
1364 // pin the indices to legal values
1369 // pin the indices to legal values
1370 pinIndices(start
, length
);
1372 newLength
= oldLength
- length
+ srcLength
;
1375 // the following may change fArray but will not copy the current contents;
1376 // therefore we need to keep the current fArray
1377 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1379 if((fUnion
.fFields
.fLengthAndFlags
&kUsingStackBuffer
) && (newLength
> US_STACKBUF_SIZE
)) {
1380 // copy the stack buffer contents because it will be overwritten with
1381 // fUnion.fFields values
1382 u_memcpy(oldStackBuffer
, fUnion
.fStackFields
.fBuffer
, oldLength
);
1383 oldArray
= oldStackBuffer
;
1385 oldArray
= getArrayStart();
1388 // clone our array and allocate a bigger array if needed
1389 int32_t *bufferToDelete
= 0;
1390 if(!cloneArrayIfNeeded(newLength
, newLength
+ (newLength
>> 2) + kGrowSize
,
1391 FALSE
, &bufferToDelete
)
1396 // now do the replace
1398 UChar
*newArray
= getArrayStart();
1399 if(newArray
!= oldArray
) {
1400 // if fArray changed, then we need to copy everything except what will change
1401 us_arrayCopy(oldArray
, 0, newArray
, 0, start
);
1402 us_arrayCopy(oldArray
, start
+ length
,
1403 newArray
, start
+ srcLength
,
1404 oldLength
- (start
+ length
));
1405 } else if(length
!= srcLength
) {
1406 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1407 us_arrayCopy(oldArray
, start
+ length
,
1408 newArray
, start
+ srcLength
,
1409 oldLength
- (start
+ length
));
1412 // now fill in the hole with the new string
1413 us_arrayCopy(srcChars
, srcStart
, newArray
, start
, srcLength
);
1415 setLength(newLength
);
1417 // delayed delete in case srcChars == fArray when we started, and
1418 // to keep oldArray alive for the above operations
1419 if (bufferToDelete
) {
1420 uprv_free(bufferToDelete
);
1430 UnicodeString::handleReplaceBetween(int32_t start
,
1432 const UnicodeString
& text
) {
1433 replaceBetween(start
, limit
, text
);
1440 UnicodeString::copy(int32_t start
, int32_t limit
, int32_t dest
) {
1441 if (limit
<= start
) {
1442 return; // Nothing to do; avoid bogus malloc call
1444 UChar
* text
= (UChar
*) uprv_malloc( sizeof(UChar
) * (limit
- start
) );
1445 // Check to make sure text is not null.
1447 extractBetween(start
, limit
, text
, 0);
1448 insert(dest
, text
, 0, limit
- start
);
1456 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1457 * so we implement this function here.
1459 UBool
Replaceable::hasMetaData() const {
1466 UBool
UnicodeString::hasMetaData() const {
1471 UnicodeString::doReverse(int32_t start
, int32_t length
) {
1472 if(length
<= 1 || !cloneArrayIfNeeded()) {
1476 // pin the indices to legal values
1477 pinIndices(start
, length
);
1478 if(length
<= 1) { // pinIndices() might have shrunk the length
1482 UChar
*left
= getArrayStart() + start
;
1483 UChar
*right
= left
+ length
- 1; // -1 for inclusive boundary (length>=2)
1485 UBool hasSupplementary
= FALSE
;
1487 // Before the loop we know left<right because length>=2.
1489 hasSupplementary
|= (UBool
)U16_IS_LEAD(swap
= *left
);
1490 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
++ = *right
);
1492 } while(left
< right
);
1493 // Make sure to test the middle code unit of an odd-length string.
1494 // Redundant if the length is even.
1495 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
);
1497 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1498 if(hasSupplementary
) {
1501 left
= getArrayStart() + start
;
1502 right
= left
+ length
- 1; // -1 so that we can look at *(left+1) if left<right
1503 while(left
< right
) {
1504 if(U16_IS_TRAIL(swap
= *left
) && U16_IS_LEAD(swap2
= *(left
+ 1))) {
1517 UnicodeString::padLeading(int32_t targetLength
,
1520 int32_t oldLength
= length();
1521 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1524 // move contents up by padding width
1525 UChar
*array
= getArrayStart();
1526 int32_t start
= targetLength
- oldLength
;
1527 us_arrayCopy(array
, 0, array
, start
, oldLength
);
1529 // fill in padding character
1530 while(--start
>= 0) {
1531 array
[start
] = padChar
;
1533 setLength(targetLength
);
1539 UnicodeString::padTrailing(int32_t targetLength
,
1542 int32_t oldLength
= length();
1543 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1546 // fill in padding character
1547 UChar
*array
= getArrayStart();
1548 int32_t length
= targetLength
;
1549 while(--length
>= oldLength
) {
1550 array
[length
] = padChar
;
1552 setLength(targetLength
);
1557 //========================================
1559 //========================================
1561 UnicodeString::doHashCode() const
1563 /* Delegate hash computation to uhash. This makes UnicodeString
1564 * hashing consistent with UChar* hashing. */
1565 int32_t hashCode
= ustr_hashUCharsN(getArrayStart(), length());
1566 if (hashCode
== kInvalidHashCode
) {
1567 hashCode
= kEmptyHashCode
;
1572 //========================================
1574 //========================================
1577 UnicodeString::getBuffer(int32_t minCapacity
) {
1578 if(minCapacity
>=-1 && cloneArrayIfNeeded(minCapacity
)) {
1579 fUnion
.fFields
.fLengthAndFlags
|=kOpenGetBuffer
;
1581 return getArrayStart();
1588 UnicodeString::releaseBuffer(int32_t newLength
) {
1589 if(fUnion
.fFields
.fLengthAndFlags
&kOpenGetBuffer
&& newLength
>=-1) {
1590 // set the new fLength
1591 int32_t capacity
=getCapacity();
1593 // the new length is the string length, capped by fCapacity
1594 const UChar
*array
=getArrayStart(), *p
=array
, *limit
=array
+capacity
;
1595 while(p
<limit
&& *p
!=0) {
1598 newLength
=(int32_t)(p
-array
);
1599 } else if(newLength
>capacity
) {
1602 setLength(newLength
);
1603 fUnion
.fFields
.fLengthAndFlags
&=~kOpenGetBuffer
;
1607 //========================================
1609 //========================================
1611 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity
,
1612 int32_t growCapacity
,
1614 int32_t **pBufferToDelete
,
1616 // default parameters need to be static, therefore
1617 // the defaults are -1 to have convenience defaults
1618 if(newCapacity
== -1) {
1619 newCapacity
= getCapacity();
1622 // while a getBuffer(minCapacity) is "open",
1623 // prevent any modifications of the string by returning FALSE here
1624 // if the string is bogus, then only an assignment or similar can revive it
1630 * We need to make a copy of the array if
1631 * the buffer is read-only, or
1632 * the buffer is refCounted (shared), and refCount>1, or
1633 * the buffer is too small.
1634 * Return FALSE if memory could not be allocated.
1637 fUnion
.fFields
.fLengthAndFlags
& kBufferIsReadonly
||
1638 (fUnion
.fFields
.fLengthAndFlags
& kRefCounted
&& refCount() > 1) ||
1639 newCapacity
> getCapacity()
1641 // check growCapacity for default value and use of the stack buffer
1642 if(growCapacity
< 0) {
1643 growCapacity
= newCapacity
;
1644 } else if(newCapacity
<= US_STACKBUF_SIZE
&& growCapacity
> US_STACKBUF_SIZE
) {
1645 growCapacity
= US_STACKBUF_SIZE
;
1649 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1651 int32_t oldLength
= length();
1652 int16_t flags
= fUnion
.fFields
.fLengthAndFlags
;
1654 if(flags
&kUsingStackBuffer
) {
1655 U_ASSERT(!(flags
&kRefCounted
)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1656 if(doCopyArray
&& growCapacity
> US_STACKBUF_SIZE
) {
1657 // copy the stack buffer contents because it will be overwritten with
1658 // fUnion.fFields values
1659 us_arrayCopy(fUnion
.fStackFields
.fBuffer
, 0, oldStackBuffer
, 0, oldLength
);
1660 oldArray
= oldStackBuffer
;
1662 oldArray
= NULL
; // no need to copy from the stack buffer to itself
1665 oldArray
= fUnion
.fFields
.fArray
;
1666 U_ASSERT(oldArray
!=NULL
); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1669 // allocate a new array
1670 if(allocate(growCapacity
) ||
1671 (newCapacity
< growCapacity
&& allocate(newCapacity
))
1674 // copy the contents
1675 // do not copy more than what fits - it may be smaller than before
1676 int32_t minLength
= oldLength
;
1677 newCapacity
= getCapacity();
1678 if(newCapacity
< minLength
) {
1679 minLength
= newCapacity
;
1681 if(oldArray
!= NULL
) {
1682 us_arrayCopy(oldArray
, 0, getArrayStart(), 0, minLength
);
1684 setLength(minLength
);
1689 // release the old array
1690 if(flags
& kRefCounted
) {
1691 // the array is refCounted; decrement and release if 0
1692 u_atomic_int32_t
*pRefCount
= ((u_atomic_int32_t
*)oldArray
- 1);
1693 if(umtx_atomic_dec(pRefCount
) == 0) {
1694 if(pBufferToDelete
== 0) {
1695 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1696 // is defined as volatile. (Volatile has useful non-standard behavior
1697 // with this compiler.)
1698 uprv_free((void *)pRefCount
);
1700 // the caller requested to delete it himself
1701 *pBufferToDelete
= (int32_t *)pRefCount
;
1706 // not enough memory for growCapacity and not even for the smaller newCapacity
1707 // reset the old values for setToBogus() to release the array
1708 if(!(flags
&kUsingStackBuffer
)) {
1709 fUnion
.fFields
.fArray
= oldArray
;
1711 fUnion
.fFields
.fLengthAndFlags
= flags
;
1719 // UnicodeStringAppendable ------------------------------------------------- ***
1721 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1724 UnicodeStringAppendable::appendCodeUnit(UChar c
) {
1725 return str
.doReplace(str
.length(), 0, &c
, 0, 1).isWritable();
1729 UnicodeStringAppendable::appendCodePoint(UChar32 c
) {
1730 UChar buffer
[U16_MAX_LENGTH
];
1731 int32_t cLength
= 0;
1732 UBool isError
= FALSE
;
1733 U16_APPEND(buffer
, cLength
, U16_MAX_LENGTH
, c
, isError
);
1734 return !isError
&& str
.doReplace(str
.length(), 0, buffer
, 0, cLength
).isWritable();
1738 UnicodeStringAppendable::appendString(const UChar
*s
, int32_t length
) {
1739 return str
.doReplace(str
.length(), 0, s
, 0, length
).isWritable();
1743 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity
) {
1744 return str
.cloneArrayIfNeeded(str
.length() + appendCapacity
);
1748 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity
,
1749 int32_t desiredCapacityHint
,
1750 UChar
*scratch
, int32_t scratchCapacity
,
1751 int32_t *resultCapacity
) {
1752 if(minCapacity
< 1 || scratchCapacity
< minCapacity
) {
1753 *resultCapacity
= 0;
1756 int32_t oldLength
= str
.length();
1757 if(str
.cloneArrayIfNeeded(oldLength
+ minCapacity
, oldLength
+ desiredCapacityHint
)) {
1758 *resultCapacity
= str
.getCapacity() - oldLength
;
1759 return str
.getArrayStart() + oldLength
;
1761 *resultCapacity
= scratchCapacity
;
1769 U_CAPI
int32_t U_EXPORT2
1770 uhash_hashUnicodeString(const UElement key
) {
1771 const UnicodeString
*str
= (const UnicodeString
*) key
.pointer
;
1772 return (str
== NULL
) ? 0 : str
->hashCode();
1775 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1776 // does not depend on hashtable code.
1777 U_CAPI UBool U_EXPORT2
1778 uhash_compareUnicodeString(const UElement key1
, const UElement key2
) {
1779 const UnicodeString
*str1
= (const UnicodeString
*) key1
.pointer
;
1780 const UnicodeString
*str2
= (const UnicodeString
*) key2
.pointer
;
1784 if (str1
== NULL
|| str2
== NULL
) {
1787 return *str1
== *str2
;
1790 #ifdef U_STATIC_IMPLEMENTATION
1792 This should never be called. It is defined here to make sure that the
1793 virtual vector deleting destructor is defined within unistr.cpp.
1794 The vector deleting destructor is already a part of UObject,
1795 but defining it here makes sure that it is included with this object file.
1796 This makes sure that static library dependencies are kept to a minimum.
1798 static void uprv_UnicodeStringDummy(void) {
1799 delete [] (new UnicodeString
[2]);