2 ******************************************************************************
3 * Copyright (C) 1999-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
21 #include "unicode/utypes.h"
22 #include "unicode/putil.h"
25 #include "unicode/ustring.h"
26 #include "unicode/unistr.h"
33 #if U_IOSTREAM_SOURCE >= 199711
36 #elif U_IOSTREAM_SOURCE >= 198506
42 print(const UnicodeString
& s
,
47 for(int i
= 0; i
< s
.length(); ++i
) {
49 if(c
>= 0x007E || c
< 0x0020)
50 cout
<< "[0x" << hex
<< s
[i
] << "]";
64 for(int i
= 0; i
< len
; ++i
) {
66 if(c
>= 0x007E || c
< 0x0020)
67 cout
<< "[0x" << hex
<< s
[i
] << "]";
76 // Local function definitions for now
78 // need to copy areas that may overlap
81 us_arrayCopy(const UChar
*src
, int32_t srcStart
,
82 UChar
*dst
, int32_t dstStart
, int32_t count
)
85 uprv_memmove(dst
+dstStart
, src
+srcStart
, (size_t)(count
*sizeof(*src
)));
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
91 static UChar U_CALLCONV
92 UnicodeString_charAt(int32_t offset
, void *context
) {
93 return ((U_NAMESPACE_QUALIFIER UnicodeString
*) context
)->charAt(offset
);
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
102 Replaceable::~Replaceable() {}
103 Replaceable::Replaceable() {}
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString
)
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString
&s1
, const UnicodeString
&s2
) {
109 UnicodeString(s1
.length()+s2
.length()+1, (UChar32
)0, 0).
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
120 UnicodeString::addRef()
121 { umtx_atomic_inc((int32_t *)fUnion
.fFields
.fArray
- 1);}
124 UnicodeString::removeRef()
125 { return umtx_atomic_dec((int32_t *)fUnion
.fFields
.fArray
- 1);}
128 UnicodeString::refCount() const
131 // Note: without the lock to force a memory barrier, we might see a very
132 // stale value on some multi-processor systems.
133 int32_t count
= *((int32_t *)fUnion
.fFields
.fArray
- 1);
139 UnicodeString::releaseArray() {
140 if((fFlags
& kRefCounted
) && removeRef() == 0) {
141 uprv_free((int32_t *)fUnion
.fFields
.fArray
- 1);
147 //========================================
149 //========================================
150 UnicodeString::UnicodeString()
155 UnicodeString::UnicodeString(int32_t capacity
, UChar32 c
, int32_t count
)
159 if(count
<= 0 || (uint32_t)c
> 0x10ffff) {
160 // just allocate and do not do anything else
163 // count > 0, allocate and fill the new string with count c's
164 int32_t unitCount
= UTF_CHAR_LENGTH(c
), length
= count
* unitCount
;
165 if(capacity
< length
) {
168 if(allocate(capacity
)) {
169 UChar
*array
= getArrayStart();
172 // fill the new string with c
174 // fill with length UChars
176 array
[i
++] = (UChar
)c
;
179 // get the code units for c
180 UChar units
[UTF_MAX_CHAR_LENGTH
];
181 UTF_APPEND_CHAR_UNSAFE(units
, i
, c
);
183 // now it must be i==unitCount
186 // for Unicode, unitCount can only be 1, 2, 3, or 4
187 // 1 is handled above
190 while(unitIdx
< unitCount
) {
191 array
[i
++]=units
[unitIdx
++];
200 UnicodeString::UnicodeString(UChar ch
)
204 fUnion
.fStackBuffer
[0] = ch
;
207 UnicodeString::UnicodeString(UChar32 ch
)
212 UBool isError
= FALSE
;
213 U16_APPEND(fUnion
.fStackBuffer
, i
, US_STACKBUF_SIZE
, ch
, isError
);
214 fShortLength
= (int8_t)i
;
217 UnicodeString::UnicodeString(const UChar
*text
)
221 doReplace(0, 0, text
, 0, -1);
224 UnicodeString::UnicodeString(const UChar
*text
,
229 doReplace(0, 0, text
, 0, textLength
);
232 UnicodeString::UnicodeString(UBool isTerminated
,
236 fFlags(kReadonlyAlias
)
239 // treat as an empty string, do not alias
241 } else if(textLength
< -1 ||
242 (textLength
== -1 && !isTerminated
) ||
243 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
247 if(textLength
== -1) {
248 // text is terminated, or else it would have failed the above test
249 textLength
= u_strlen(text
);
251 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
255 UnicodeString::UnicodeString(UChar
*buff
,
257 int32_t buffCapacity
)
259 fFlags(kWritableAlias
)
262 // treat as an empty string, do not alias
264 } else if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
267 if(buffLength
== -1) {
268 // fLength = u_strlen(buff); but do not look beyond buffCapacity
269 const UChar
*p
= buff
, *limit
= buff
+ buffCapacity
;
270 while(p
!= limit
&& *p
!= 0) {
273 buffLength
= (int32_t)(p
- buff
);
275 setArray(buff
, buffLength
, buffCapacity
);
279 UnicodeString::UnicodeString(const char *src
, int32_t length
, EInvariant
)
284 // treat as an empty string
287 length
=(int32_t)uprv_strlen(src
);
289 if(cloneArrayIfNeeded(length
, length
, FALSE
)) {
290 u_charsToUChars(src
, getArrayStart(), length
);
298 #if U_CHARSET_IS_UTF8
300 UnicodeString::UnicodeString(const char *codepageData
)
302 fFlags(kShortString
) {
303 if(codepageData
!= 0) {
304 setToUTF8(codepageData
);
308 UnicodeString::UnicodeString(const char *codepageData
, int32_t dataLength
)
310 fFlags(kShortString
) {
311 // if there's nothing to convert, do nothing
312 if(codepageData
== 0 || dataLength
== 0 || dataLength
< -1) {
315 if(dataLength
== -1) {
316 dataLength
= (int32_t)uprv_strlen(codepageData
);
318 setToUTF8(StringPiece(codepageData
, dataLength
));
321 // else see unistr_cnv.cpp
324 UnicodeString::UnicodeString(const UnicodeString
& that
)
332 UnicodeString::UnicodeString(const UnicodeString
& that
,
338 setTo(that
, srcStart
);
341 UnicodeString::UnicodeString(const UnicodeString
& that
,
348 setTo(that
, srcStart
, srcLength
);
351 // Replaceable base class clone() default implementation, does not clone
353 Replaceable::clone() const {
357 // UnicodeString overrides clone() with a real implementation
359 UnicodeString::clone() const {
360 return new UnicodeString(*this);
363 //========================================
365 //========================================
368 UnicodeString::allocate(int32_t capacity
) {
369 if(capacity
<= US_STACKBUF_SIZE
) {
370 fFlags
= kShortString
;
372 // count bytes for the refCounter and the string capacity, and
373 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
374 // to be safely aligned for the refCount
375 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
376 int32_t words
= (int32_t)(((sizeof(int32_t) + (capacity
+ 1) * U_SIZEOF_UCHAR
+ 15) & ~15) >> 2);
377 int32_t *array
= (int32_t*) uprv_malloc( sizeof(int32_t) * words
);
379 // set initial refCount and point behind the refCount
382 // have fArray point to the first UChar
383 fUnion
.fFields
.fArray
= (UChar
*)array
;
384 fUnion
.fFields
.fCapacity
= (int32_t)((words
- 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR
));
385 fFlags
= kLongString
;
388 fUnion
.fFields
.fArray
= 0;
389 fUnion
.fFields
.fCapacity
= 0;
397 //========================================
399 //========================================
400 UnicodeString::~UnicodeString()
405 //========================================
407 //========================================
409 UnicodeString
UnicodeString::fromUTF8(const StringPiece
&utf8
) {
410 UnicodeString result
;
411 result
.setToUTF8(utf8
);
415 UnicodeString
UnicodeString::fromUTF32(const UChar32
*utf32
, int32_t length
) {
416 UnicodeString result
;
418 // Most UTF-32 strings will be BMP-only and result in a same-length
419 // UTF-16 string. We overestimate the capacity just slightly,
420 // just in case there are a few supplementary characters.
421 if(length
<= US_STACKBUF_SIZE
) {
422 capacity
= US_STACKBUF_SIZE
;
424 capacity
= length
+ (length
>> 4) + 4;
427 UChar
*utf16
= result
.getBuffer(capacity
);
429 UErrorCode errorCode
= U_ZERO_ERROR
;
430 u_strFromUTF32WithSub(utf16
, result
.getCapacity(), &length16
,
432 0xfffd, // Substitution character.
433 NULL
, // Don't care about number of substitutions.
435 result
.releaseBuffer(length16
);
436 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
437 capacity
= length16
+ 1; // +1 for the terminating NUL.
439 } else if(U_FAILURE(errorCode
)) {
447 //========================================
449 //========================================
452 UnicodeString::operator=(const UnicodeString
&src
) {
453 return copyFrom(src
);
457 UnicodeString::fastCopyFrom(const UnicodeString
&src
) {
458 return copyFrom(src
, TRUE
);
462 UnicodeString::copyFrom(const UnicodeString
&src
, UBool fastCopy
) {
463 // if assigning to ourselves, do nothing
464 if(this == 0 || this == &src
) {
468 // is the right side bogus?
469 if(&src
== 0 || src
.isBogus()) {
474 // delete the current contents
478 // empty string - use the stack buffer
483 // we always copy the length
484 int32_t srcLength
= src
.length();
485 setLength(srcLength
);
487 // fLength>0 and not an "open" src.getBuffer(minCapacity)
490 // short string using the stack buffer, do the same
491 fFlags
= kShortString
;
492 uprv_memcpy(fUnion
.fStackBuffer
, src
.fUnion
.fStackBuffer
, srcLength
* U_SIZEOF_UCHAR
);
495 // src uses a refCounted string buffer, use that buffer with refCount
496 // src is const, use a cast - we don't really change it
497 ((UnicodeString
&)src
).addRef();
498 // copy all fields, share the reference-counted buffer
499 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
500 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
505 // src is a readonly alias, do the same
506 // -> maintain the readonly alias as such
507 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
508 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
512 // else if(!fastCopy) fall through to case kWritableAlias
513 // -> allocate a new buffer and copy the contents
515 // src is a writable alias; we make a copy of that instead
516 if(allocate(srcLength
)) {
517 uprv_memcpy(getArrayStart(), src
.getArrayStart(), srcLength
* U_SIZEOF_UCHAR
);
520 // if there is not enough memory, then fall through to setting to bogus
522 // if src is bogus, set ourselves to bogus
523 // do not call setToBogus() here because fArray and fFlags are not consistent here
525 fUnion
.fFields
.fArray
= 0;
526 fUnion
.fFields
.fCapacity
= 0;
534 //========================================
535 // Miscellaneous operations
536 //========================================
538 UnicodeString
UnicodeString::unescape() const {
539 UnicodeString
result(length(), (UChar32
)0, (int32_t)0); // construct with capacity
540 const UChar
*array
= getBuffer();
541 int32_t len
= length();
543 for (int32_t i
=0;;) {
545 result
.append(array
, prev
, len
- prev
);
548 if (array
[i
++] == 0x5C /*'\\'*/) {
549 result
.append(array
, prev
, (i
- 1) - prev
);
550 UChar32 c
= unescapeAt(i
); // advances i
552 result
.remove(); // return empty string
553 break; // invalid escape sequence
562 UChar32
UnicodeString::unescapeAt(int32_t &offset
) const {
563 return u_unescapeAt(UnicodeString_charAt
, &offset
, length(), (void*)this);
566 //========================================
567 // Read-only implementation
568 //========================================
570 UnicodeString::doCompare( int32_t start
,
572 const UChar
*srcChars
,
574 int32_t srcLength
) const
576 // compare illegal string values
577 // treat const UChar *srcChars==NULL as an empty string
582 // pin indices to legal values
583 pinIndices(start
, length
);
585 if(srcChars
== NULL
) {
586 srcStart
= srcLength
= 0;
589 // get the correct pointer
590 const UChar
*chars
= getArrayStart();
593 srcChars
+= srcStart
;
598 // get the srcLength if necessary
600 srcLength
= u_strlen(srcChars
+ srcStart
);
603 // are we comparing different lengths?
604 if(length
!= srcLength
) {
605 if(length
< srcLength
) {
609 minLength
= srcLength
;
618 * note that uprv_memcmp() returns an int but we return an int8_t;
619 * we need to take care not to truncate the result -
620 * one way to do this is to right-shift the value to
621 * move the sign bit into the lower 8 bits and making sure that this
622 * does not become 0 itself
625 if(minLength
> 0 && chars
!= srcChars
) {
629 // big-endian: byte comparison works
630 result
= uprv_memcmp(chars
, srcChars
, minLength
* sizeof(UChar
));
632 return (int8_t)(result
>> 15 | 1);
635 // little-endian: compare UChar units
637 result
= ((int32_t)*(chars
++) - (int32_t)*(srcChars
++));
639 return (int8_t)(result
>> 15 | 1);
641 } while(--minLength
> 0);
647 /* String compare in code point order - doCompare() compares in code unit order. */
649 UnicodeString::doCompareCodePointOrder(int32_t start
,
651 const UChar
*srcChars
,
653 int32_t srcLength
) const
655 // compare illegal string values
656 // treat const UChar *srcChars==NULL as an empty string
661 // pin indices to legal values
662 pinIndices(start
, length
);
664 if(srcChars
== NULL
) {
665 srcStart
= srcLength
= 0;
668 int32_t diff
= uprv_strCompare(getArrayStart() + start
, length
, srcChars
+ srcStart
, srcLength
, FALSE
, TRUE
);
669 /* translate the 32-bit result into an 8-bit one */
671 return (int8_t)(diff
>> 15 | 1);
678 UnicodeString::getLength() const {
683 UnicodeString::getCharAt(int32_t offset
) const {
684 return charAt(offset
);
688 UnicodeString::getChar32At(int32_t offset
) const {
689 return char32At(offset
);
693 UnicodeString::countChar32(int32_t start
, int32_t length
) const {
694 pinIndices(start
, length
);
695 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
696 return u_countChar32(getArrayStart()+start
, length
);
700 UnicodeString::hasMoreChar32Than(int32_t start
, int32_t length
, int32_t number
) const {
701 pinIndices(start
, length
);
702 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
703 return u_strHasMoreChar32Than(getArrayStart()+start
, length
, number
);
707 UnicodeString::moveIndex32(int32_t index
, int32_t delta
) const {
709 int32_t len
= length();
712 } else if(index
>len
) {
716 const UChar
*array
= getArrayStart();
718 UTF_FWD_N(array
, index
, len
, delta
);
720 UTF_BACK_N(array
, 0, index
, -delta
);
727 UnicodeString::doExtract(int32_t start
,
730 int32_t dstStart
) const
732 // pin indices to legal values
733 pinIndices(start
, length
);
735 // do not copy anything if we alias dst itself
736 const UChar
*array
= getArrayStart();
737 if(array
+ start
!= dst
+ dstStart
) {
738 us_arrayCopy(array
, start
, dst
, dstStart
, length
);
743 UnicodeString::extract(UChar
*dest
, int32_t destCapacity
,
744 UErrorCode
&errorCode
) const {
745 int32_t len
= length();
746 if(U_SUCCESS(errorCode
)) {
747 if(isBogus() || destCapacity
<0 || (destCapacity
>0 && dest
==0)) {
748 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
750 const UChar
*array
= getArrayStart();
751 if(len
>0 && len
<=destCapacity
&& array
!=dest
) {
752 uprv_memcpy(dest
, array
, len
*U_SIZEOF_UCHAR
);
754 return u_terminateUChars(dest
, destCapacity
, len
, &errorCode
);
762 UnicodeString::extract(int32_t start
,
765 int32_t targetCapacity
,
766 enum EInvariant
) const
768 // if the arguments are illegal, then do nothing
769 if(targetCapacity
< 0 || (targetCapacity
> 0 && target
== NULL
)) {
773 // pin the indices to legal values
774 pinIndices(start
, length
);
776 if(length
<= targetCapacity
) {
777 u_UCharsToChars(getArrayStart() + start
, target
, length
);
779 UErrorCode status
= U_ZERO_ERROR
;
780 return u_terminateChars(target
, targetCapacity
, length
, &status
);
784 UnicodeString::tempSubString(int32_t start
, int32_t len
) const {
785 pinIndices(start
, len
);
786 const UChar
*array
= getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
788 array
=fUnion
.fStackBuffer
; // anything not NULL because that would make an empty string
789 len
=-2; // bogus result string
791 return UnicodeString(FALSE
, array
+ start
, len
);
795 UnicodeString::toUTF8(int32_t start
, int32_t len
,
796 char *target
, int32_t capacity
) const {
797 pinIndices(start
, len
);
799 UErrorCode errorCode
= U_ZERO_ERROR
;
800 u_strToUTF8WithSub(target
, capacity
, &length8
,
801 getBuffer() + start
, len
,
802 0xFFFD, // Standard substitution character.
803 NULL
, // Don't care about number of substitutions.
808 #if U_CHARSET_IS_UTF8
811 UnicodeString::extract(int32_t start
, int32_t len
,
812 char *target
, uint32_t dstSize
) const {
813 // if the arguments are illegal, then do nothing
814 if(/*dstSize < 0 || */(dstSize
> 0 && target
== 0)) {
817 return toUTF8(start
, len
, target
, dstSize
<= 0x7fffffff ? (int32_t)dstSize
: 0x7fffffff);
820 // else see unistr_cnv.cpp
824 UnicodeString::extractBetween(int32_t start
,
826 UnicodeString
& target
) const {
829 doExtract(start
, limit
- start
, target
);
832 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
833 // as many bytes as the source has UChars.
834 // The "worst cases" are writing systems like Indic, Thai and CJK with
837 UnicodeString::toUTF8(ByteSink
&sink
) const {
838 int32_t length16
= length();
840 char stackBuffer
[1024];
841 int32_t capacity
= (int32_t)sizeof(stackBuffer
);
842 UBool utf8IsOwned
= FALSE
;
843 char *utf8
= sink
.GetAppendBuffer(length16
< capacity
? length16
: capacity
,
845 stackBuffer
, capacity
,
848 UErrorCode errorCode
= U_ZERO_ERROR
;
849 u_strToUTF8WithSub(utf8
, capacity
, &length8
,
850 getBuffer(), length16
,
851 0xFFFD, // Standard substitution character.
852 NULL
, // Don't care about number of substitutions.
854 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
855 utf8
= (char *)uprv_malloc(length8
);
858 errorCode
= U_ZERO_ERROR
;
859 u_strToUTF8WithSub(utf8
, length8
, &length8
,
860 getBuffer(), length16
,
861 0xFFFD, // Standard substitution character.
862 NULL
, // Don't care about number of substitutions.
865 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
868 if(U_SUCCESS(errorCode
)) {
869 sink
.Append(utf8
, length8
);
879 UnicodeString::toUTF32(UChar32
*utf32
, int32_t capacity
, UErrorCode
&errorCode
) const {
881 if(U_SUCCESS(errorCode
)) {
882 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
883 u_strToUTF32WithSub(utf32
, capacity
, &length32
,
884 getBuffer(), length(),
885 0xfffd, // Substitution character.
886 NULL
, // Don't care about number of substitutions.
893 UnicodeString::indexOf(const UChar
*srcChars
,
897 int32_t length
) const
899 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
903 // UnicodeString does not find empty substrings
904 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
908 // get the indices within bounds
909 pinIndices(start
, length
);
911 // find the first occurrence of the substring
912 const UChar
*array
= getArrayStart();
913 const UChar
*match
= u_strFindFirst(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
917 return (int32_t)(match
- array
);
922 UnicodeString::doIndexOf(UChar c
,
924 int32_t length
) const
927 pinIndices(start
, length
);
929 // find the first occurrence of c
930 const UChar
*array
= getArrayStart();
931 const UChar
*match
= u_memchr(array
+ start
, c
, length
);
935 return (int32_t)(match
- array
);
940 UnicodeString::doIndexOf(UChar32 c
,
942 int32_t length
) const {
944 pinIndices(start
, length
);
946 // find the first occurrence of c
947 const UChar
*array
= getArrayStart();
948 const UChar
*match
= u_memchr32(array
+ start
, c
, length
);
952 return (int32_t)(match
- array
);
957 UnicodeString::lastIndexOf(const UChar
*srcChars
,
961 int32_t length
) const
963 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
967 // UnicodeString does not find empty substrings
968 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
972 // get the indices within bounds
973 pinIndices(start
, length
);
975 // find the last occurrence of the substring
976 const UChar
*array
= getArrayStart();
977 const UChar
*match
= u_strFindLast(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
981 return (int32_t)(match
- array
);
986 UnicodeString::doLastIndexOf(UChar c
,
988 int32_t length
) const
995 pinIndices(start
, length
);
997 // find the last occurrence of c
998 const UChar
*array
= getArrayStart();
999 const UChar
*match
= u_memrchr(array
+ start
, c
, length
);
1003 return (int32_t)(match
- array
);
1008 UnicodeString::doLastIndexOf(UChar32 c
,
1010 int32_t length
) const {
1012 pinIndices(start
, length
);
1014 // find the last occurrence of c
1015 const UChar
*array
= getArrayStart();
1016 const UChar
*match
= u_memrchr32(array
+ start
, c
, length
);
1020 return (int32_t)(match
- array
);
1024 //========================================
1025 // Write implementation
1026 //========================================
1029 UnicodeString::findAndReplace(int32_t start
,
1031 const UnicodeString
& oldText
,
1034 const UnicodeString
& newText
,
1038 if(isBogus() || oldText
.isBogus() || newText
.isBogus()) {
1042 pinIndices(start
, length
);
1043 oldText
.pinIndices(oldStart
, oldLength
);
1044 newText
.pinIndices(newStart
, newLength
);
1046 if(oldLength
== 0) {
1050 while(length
> 0 && length
>= oldLength
) {
1051 int32_t pos
= indexOf(oldText
, oldStart
, oldLength
, start
, length
);
1053 // no more oldText's here: done
1056 // we found oldText, replace it by newText and go beyond it
1057 replace(pos
, oldLength
, newText
, newStart
, newLength
);
1058 length
-= pos
+ oldLength
- start
;
1059 start
= pos
+ newLength
;
1068 UnicodeString::setToBogus()
1073 fUnion
.fFields
.fArray
= 0;
1074 fUnion
.fFields
.fCapacity
= 0;
1078 // turn a bogus string into an empty one
1080 UnicodeString::unBogus() {
1081 if(fFlags
& kIsBogus
) {
1086 // setTo() analogous to the readonly-aliasing constructor with the same signature
1088 UnicodeString::setTo(UBool isTerminated
,
1092 if(fFlags
& kOpenGetBuffer
) {
1093 // do not modify a string that has an "open" getBuffer(minCapacity)
1098 // treat as an empty string, do not alias
1104 if( textLength
< -1 ||
1105 (textLength
== -1 && !isTerminated
) ||
1106 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
1114 if(textLength
== -1) {
1115 // text is terminated, or else it would have failed the above test
1116 textLength
= u_strlen(text
);
1118 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
1120 fFlags
= kReadonlyAlias
;
1124 // setTo() analogous to the writable-aliasing constructor with the same signature
1126 UnicodeString::setTo(UChar
*buffer
,
1128 int32_t buffCapacity
) {
1129 if(fFlags
& kOpenGetBuffer
) {
1130 // do not modify a string that has an "open" getBuffer(minCapacity)
1134 if(buffer
== NULL
) {
1135 // treat as an empty string, do not alias
1141 if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
1144 } else if(buffLength
== -1) {
1145 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1146 const UChar
*p
= buffer
, *limit
= buffer
+ buffCapacity
;
1147 while(p
!= limit
&& *p
!= 0) {
1150 buffLength
= (int32_t)(p
- buffer
);
1155 setArray(buffer
, buffLength
, buffCapacity
);
1156 fFlags
= kWritableAlias
;
1160 UnicodeString
&UnicodeString::setToUTF8(const StringPiece
&utf8
) {
1162 int32_t length
= utf8
.length();
1164 // The UTF-16 string will be at most as long as the UTF-8 string.
1165 if(length
<= US_STACKBUF_SIZE
) {
1166 capacity
= US_STACKBUF_SIZE
;
1168 capacity
= length
+ 1; // +1 for the terminating NUL.
1170 UChar
*utf16
= getBuffer(capacity
);
1172 UErrorCode errorCode
= U_ZERO_ERROR
;
1173 u_strFromUTF8WithSub(utf16
, getCapacity(), &length16
,
1174 utf8
.data(), length
,
1175 0xfffd, // Substitution character.
1176 NULL
, // Don't care about number of substitutions.
1178 releaseBuffer(length16
);
1179 if(U_FAILURE(errorCode
)) {
1186 UnicodeString::setCharAt(int32_t offset
,
1189 int32_t len
= length();
1190 if(cloneArrayIfNeeded() && len
> 0) {
1193 } else if(offset
>= len
) {
1197 getArrayStart()[offset
] = c
;
1203 UnicodeString::doReplace( int32_t start
,
1205 const UnicodeString
& src
,
1209 if(!src
.isBogus()) {
1210 // pin the indices to legal values
1211 src
.pinIndices(srcStart
, srcLength
);
1213 // get the characters from src
1214 // and replace the range in ourselves with them
1215 return doReplace(start
, length
, src
.getArrayStart(), srcStart
, srcLength
);
1218 return doReplace(start
, length
, 0, 0, 0);
1223 UnicodeString::doReplace(int32_t start
,
1225 const UChar
*srcChars
,
1233 int32_t oldLength
= this->length();
1235 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1236 if((fFlags
&kBufferIsReadonly
) && srcLength
== 0) {
1238 // remove prefix by adjusting the array pointer
1240 fUnion
.fFields
.fArray
+= length
;
1241 fUnion
.fFields
.fCapacity
-= length
;
1242 setLength(oldLength
- length
);
1246 if(length
>= (oldLength
- start
)) {
1247 // remove suffix by reducing the length (like truncate())
1249 fUnion
.fFields
.fCapacity
= start
; // not NUL-terminated any more
1256 srcStart
= srcLength
= 0;
1257 } else if(srcLength
< 0) {
1258 // get the srcLength if necessary
1259 srcLength
= u_strlen(srcChars
+ srcStart
);
1262 // calculate the size of the string after the replace
1265 // optimize append() onto a large-enough, owned string
1266 if(start
>= oldLength
) {
1267 newSize
= oldLength
+ srcLength
;
1268 if(newSize
<= getCapacity() && isBufferWritable()) {
1269 us_arrayCopy(srcChars
, srcStart
, getArrayStart(), oldLength
, srcLength
);
1273 // pin the indices to legal values
1278 // pin the indices to legal values
1279 pinIndices(start
, length
);
1281 newSize
= oldLength
- length
+ srcLength
;
1284 // the following may change fArray but will not copy the current contents;
1285 // therefore we need to keep the current fArray
1286 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1288 if((fFlags
&kUsingStackBuffer
) && (newSize
> US_STACKBUF_SIZE
)) {
1289 // copy the stack buffer contents because it will be overwritten with
1290 // fUnion.fFields values
1291 u_memcpy(oldStackBuffer
, fUnion
.fStackBuffer
, oldLength
);
1292 oldArray
= oldStackBuffer
;
1294 oldArray
= getArrayStart();
1297 // clone our array and allocate a bigger array if needed
1298 int32_t *bufferToDelete
= 0;
1299 if(!cloneArrayIfNeeded(newSize
, newSize
+ (newSize
>> 2) + kGrowSize
,
1300 FALSE
, &bufferToDelete
)
1305 // now do the replace
1307 UChar
*newArray
= getArrayStart();
1308 if(newArray
!= oldArray
) {
1309 // if fArray changed, then we need to copy everything except what will change
1310 us_arrayCopy(oldArray
, 0, newArray
, 0, start
);
1311 us_arrayCopy(oldArray
, start
+ length
,
1312 newArray
, start
+ srcLength
,
1313 oldLength
- (start
+ length
));
1314 } else if(length
!= srcLength
) {
1315 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1316 us_arrayCopy(oldArray
, start
+ length
,
1317 newArray
, start
+ srcLength
,
1318 oldLength
- (start
+ length
));
1321 // now fill in the hole with the new string
1322 us_arrayCopy(srcChars
, srcStart
, newArray
, start
, srcLength
);
1326 // delayed delete in case srcChars == fArray when we started, and
1327 // to keep oldArray alive for the above operations
1328 if (bufferToDelete
) {
1329 uprv_free(bufferToDelete
);
1339 UnicodeString::handleReplaceBetween(int32_t start
,
1341 const UnicodeString
& text
) {
1342 replaceBetween(start
, limit
, text
);
1349 UnicodeString::copy(int32_t start
, int32_t limit
, int32_t dest
) {
1350 if (limit
<= start
) {
1351 return; // Nothing to do; avoid bogus malloc call
1353 UChar
* text
= (UChar
*) uprv_malloc( sizeof(UChar
) * (limit
- start
) );
1354 // Check to make sure text is not null.
1356 extractBetween(start
, limit
, text
, 0);
1357 insert(dest
, text
, 0, limit
- start
);
1365 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1366 * so we implement this function here.
1368 UBool
Replaceable::hasMetaData() const {
1375 UBool
UnicodeString::hasMetaData() const {
1380 UnicodeString::doReverse(int32_t start
, int32_t length
) {
1381 if(length
<= 1 || !cloneArrayIfNeeded()) {
1385 // pin the indices to legal values
1386 pinIndices(start
, length
);
1387 if(length
<= 1) { // pinIndices() might have shrunk the length
1391 UChar
*left
= getArrayStart() + start
;
1392 UChar
*right
= left
+ length
- 1; // -1 for inclusive boundary (length>=2)
1394 UBool hasSupplementary
= FALSE
;
1396 // Before the loop we know left<right because length>=2.
1398 hasSupplementary
|= (UBool
)U16_IS_LEAD(swap
= *left
);
1399 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
++ = *right
);
1401 } while(left
< right
);
1402 // Make sure to test the middle code unit of an odd-length string.
1403 // Redundant if the length is even.
1404 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
);
1406 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1407 if(hasSupplementary
) {
1410 left
= getArrayStart() + start
;
1411 right
= left
+ length
- 1; // -1 so that we can look at *(left+1) if left<right
1412 while(left
< right
) {
1413 if(U16_IS_TRAIL(swap
= *left
) && U16_IS_LEAD(swap2
= *(left
+ 1))) {
1426 UnicodeString::padLeading(int32_t targetLength
,
1429 int32_t oldLength
= length();
1430 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1433 // move contents up by padding width
1434 UChar
*array
= getArrayStart();
1435 int32_t start
= targetLength
- oldLength
;
1436 us_arrayCopy(array
, 0, array
, start
, oldLength
);
1438 // fill in padding character
1439 while(--start
>= 0) {
1440 array
[start
] = padChar
;
1442 setLength(targetLength
);
1448 UnicodeString::padTrailing(int32_t targetLength
,
1451 int32_t oldLength
= length();
1452 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1455 // fill in padding character
1456 UChar
*array
= getArrayStart();
1457 int32_t length
= targetLength
;
1458 while(--length
>= oldLength
) {
1459 array
[length
] = padChar
;
1461 setLength(targetLength
);
1466 //========================================
1468 //========================================
1470 UnicodeString::doHashCode() const
1472 /* Delegate hash computation to uhash. This makes UnicodeString
1473 * hashing consistent with UChar* hashing. */
1474 int32_t hashCode
= uhash_hashUCharsN(getArrayStart(), length());
1475 if (hashCode
== kInvalidHashCode
) {
1476 hashCode
= kEmptyHashCode
;
1481 //========================================
1483 //========================================
1486 UnicodeString::getBuffer(int32_t minCapacity
) {
1487 if(minCapacity
>=-1 && cloneArrayIfNeeded(minCapacity
)) {
1488 fFlags
|=kOpenGetBuffer
;
1490 return getArrayStart();
1497 UnicodeString::releaseBuffer(int32_t newLength
) {
1498 if(fFlags
&kOpenGetBuffer
&& newLength
>=-1) {
1499 // set the new fLength
1500 int32_t capacity
=getCapacity();
1502 // the new length is the string length, capped by fCapacity
1503 const UChar
*array
=getArrayStart(), *p
=array
, *limit
=array
+capacity
;
1504 while(p
<limit
&& *p
!=0) {
1507 newLength
=(int32_t)(p
-array
);
1508 } else if(newLength
>capacity
) {
1511 setLength(newLength
);
1512 fFlags
&=~kOpenGetBuffer
;
1516 //========================================
1518 //========================================
1520 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity
,
1521 int32_t growCapacity
,
1523 int32_t **pBufferToDelete
,
1525 // default parameters need to be static, therefore
1526 // the defaults are -1 to have convenience defaults
1527 if(newCapacity
== -1) {
1528 newCapacity
= getCapacity();
1531 // while a getBuffer(minCapacity) is "open",
1532 // prevent any modifications of the string by returning FALSE here
1533 // if the string is bogus, then only an assignment or similar can revive it
1539 * We need to make a copy of the array if
1540 * the buffer is read-only, or
1541 * the buffer is refCounted (shared), and refCount>1, or
1542 * the buffer is too small.
1543 * Return FALSE if memory could not be allocated.
1546 fFlags
& kBufferIsReadonly
||
1547 (fFlags
& kRefCounted
&& refCount() > 1) ||
1548 newCapacity
> getCapacity()
1550 // check growCapacity for default value and use of the stack buffer
1551 if(growCapacity
== -1) {
1552 growCapacity
= newCapacity
;
1553 } else if(newCapacity
<= US_STACKBUF_SIZE
&& growCapacity
> US_STACKBUF_SIZE
) {
1554 growCapacity
= US_STACKBUF_SIZE
;
1558 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1560 uint8_t flags
= fFlags
;
1562 if(flags
&kUsingStackBuffer
) {
1563 if(doCopyArray
&& growCapacity
> US_STACKBUF_SIZE
) {
1564 // copy the stack buffer contents because it will be overwritten with
1565 // fUnion.fFields values
1566 us_arrayCopy(fUnion
.fStackBuffer
, 0, oldStackBuffer
, 0, fShortLength
);
1567 oldArray
= oldStackBuffer
;
1569 oldArray
= 0; // no need to copy from stack buffer to itself
1572 oldArray
= fUnion
.fFields
.fArray
;
1575 // allocate a new array
1576 if(allocate(growCapacity
) ||
1577 (newCapacity
< growCapacity
&& allocate(newCapacity
))
1579 if(doCopyArray
&& oldArray
!= 0) {
1580 // copy the contents
1581 // do not copy more than what fits - it may be smaller than before
1582 int32_t minLength
= length();
1583 newCapacity
= getCapacity();
1584 if(newCapacity
< minLength
) {
1585 minLength
= newCapacity
;
1586 setLength(minLength
);
1588 us_arrayCopy(oldArray
, 0, getArrayStart(), 0, minLength
);
1593 // release the old array
1594 if(flags
& kRefCounted
) {
1595 // the array is refCounted; decrement and release if 0
1596 int32_t *pRefCount
= ((int32_t *)oldArray
- 1);
1597 if(umtx_atomic_dec(pRefCount
) == 0) {
1598 if(pBufferToDelete
== 0) {
1599 uprv_free(pRefCount
);
1601 // the caller requested to delete it himself
1602 *pBufferToDelete
= pRefCount
;
1607 // not enough memory for growCapacity and not even for the smaller newCapacity
1608 // reset the old values for setToBogus() to release the array
1609 if(!(flags
&kUsingStackBuffer
)) {
1610 fUnion
.fFields
.fArray
= oldArray
;
1621 #ifdef U_STATIC_IMPLEMENTATION
1623 This should never be called. It is defined here to make sure that the
1624 virtual vector deleting destructor is defined within unistr.cpp.
1625 The vector deleting destructor is already a part of UObject,
1626 but defining it here makes sure that it is included with this object file.
1627 This makes sure that static library dependencies are kept to a minimum.
1629 static void uprv_UnicodeStringDummy(void) {
1631 delete [] (new UnicodeString
[2]);