2 ******************************************************************************
3 * Copyright (C) 1999-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
// Debug printing helper. Walks every code unit of s; a unit c with
// c >= 0x007E or c < 0x0020 is written to cout as [0x<hex value>].
// NOTE(review): the declaration of c and the branch taken for the remaining
// units are not visible in this extract.
42 print(const UnicodeString
& s
,
47 for(int i
= 0; i
< s
.length(); ++i
) {
49 if(c
>= 0x007E || c
< 0x0020)
50 cout
<< "[0x" << hex
<< s
[i
] << "]";
// Second loop applying the same escaping rule, bounded by len instead of
// s.length(); its enclosing signature is not visible in this extract.
64 for(int i
= 0; i
< len
; ++i
) {
66 if(c
>= 0x007E || c
< 0x0020)
67 cout
<< "[0x" << hex
<< s
[i
] << "]";
76 // Local function definitions for now
78 // need to copy areas that may overlap
// Copies count UChars from src+srcStart to dst+dstStart.
// Uses uprv_memmove with a byte count of count*sizeof(*src), so source and
// destination ranges are allowed to overlap.
// NOTE(review): any guard on count is not visible in this extract.
81 us_arrayCopy(const UChar
*src
, int32_t srcStart
,
82 UChar
*dst
, int32_t dstStart
, int32_t count
)
85 uprv_memmove(dst
+dstStart
, src
+srcStart
, (size_t)(count
*sizeof(*src
)));
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
// Callback adapter: context is cast to an icu::UnicodeString pointer and the
// code unit at offset is returned via charAt(). It is passed to
// u_unescapeAt() by UnicodeString::unescapeAt() with context set to this.
91 static UChar U_CALLCONV
92 UnicodeString_charAt(int32_t offset
, void *context
) {
93 return ((icu::UnicodeString
*) context
)->charAt(offset
);
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
// Out-of-line, empty destructor and default constructor for Replaceable,
// followed by the RTTI implementation macro for UnicodeString.
102 Replaceable::~Replaceable() {}
103 Replaceable::Replaceable() {}
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString
)
// Concatenation of s1 and s2.
// The visible code starts from a temporary UnicodeString constructed with
// capacity s1.length()+s2.length()+1 and a count of 0 (allocate only, no fill).
// NOTE(review): the calls chained onto that temporary are not visible in this
// extract.
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString
&s1
, const UnicodeString
&s2
) {
109 UnicodeString(s1
.length()+s2
.length()+1, (UChar32
)0, 0).
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
// Atomically increments the reference counter, which is the int32_t located
// immediately before fUnion.fFields.fArray.
120 UnicodeString::addRef()
121 { umtx_atomic_inc((int32_t *)fUnion
.fFields
.fArray
- 1);}
// Atomically decrements the reference counter stored in the int32_t
// immediately before fUnion.fFields.fArray and returns the result of
// umtx_atomic_dec().
124 UnicodeString::removeRef()
125 { return umtx_atomic_dec((int32_t *)fUnion
.fFields
.fArray
- 1);}
// Reads the reference counter from the int32_t immediately before
// fUnion.fFields.fArray.
// NOTE(review): the locking referred to by the comment below and the return
// statement are not visible in this extract.
128 UnicodeString::refCount() const
131 // Note: without the lock to force a memory barrier, we might see a very
132 // stale value on some multi-processor systems.
133 int32_t count
= *((int32_t *)fUnion
.fFields
.fArray
- 1);
// Drops this string's reference to a shared buffer.
// Only acts when fFlags has kRefCounted set; if removeRef() returns 0 the
// allocation is freed. The pointer passed to uprv_free is one int32_t before
// fArray because the counter precedes the characters in the same allocation.
139 UnicodeString::releaseArray() {
140 if((fFlags
& kRefCounted
) && removeRef() == 0) {
141 uprv_free((int32_t *)fUnion
.fFields
.fArray
- 1);
147 //========================================
149 //========================================
150 UnicodeString::UnicodeString()
// Constructs a string with at least the given capacity, optionally filled
// with count copies of code point c.
//   capacity - requested capacity in UChars
//   c        - code point to repeat; values above 0x10ffff skip the fill
//   count    - number of repetitions; values <= 0 skip the fill
// The fill length is count * U16_LENGTH(c) code units.
// NOTE(review): the body of the capacity < length branch and several
// declarations (i, unitIdx) are not visible in this extract.
155 UnicodeString::UnicodeString(int32_t capacity
, UChar32 c
, int32_t count
)
159 if(count
<= 0 || (uint32_t)c
> 0x10ffff) {
160 // just allocate and do not do anything else
163 // count > 0, allocate and fill the new string with count c's
164 int32_t unitCount
= U16_LENGTH(c
), length
= count
* unitCount
;
165 if(capacity
< length
) {
168 if(allocate(capacity
)) {
169 UChar
*array
= getArrayStart();
172 // fill the new string with c
174 // fill with length UChars
176 array
[i
++] = (UChar
)c
;
179 // get the code units for c
180 UChar units
[U16_MAX_LENGTH
];
181 U16_APPEND_UNSAFE(units
, i
, c
);
183 // now it must be i==unitCount
// NOTE(review): units holds U16_MAX_LENGTH entries and unitCount comes from
// U16_LENGTH, so the 3 or 4 mentioned in the next comment looks stale for
// UTF-16 - confirm against the utf16.h documentation.
186 // for Unicode, unitCount can only be 1, 2, 3, or 4
187 // 1 is handled above
190 while(unitIdx
< unitCount
) {
191 array
[i
++]=units
[unitIdx
++];
// Single code unit constructor: stores ch as the first element of the
// internal stack buffer.
// NOTE(review): the member initializer list is not visible in this extract.
200 UnicodeString::UnicodeString(UChar ch
)
204 fUnion
.fStackBuffer
[0] = ch
;
// Single code point constructor: encodes ch into the stack buffer with
// U16_APPEND (bounded by US_STACKBUF_SIZE) and stores the resulting unit
// count i in fShortLength.
207 UnicodeString::UnicodeString(UChar32 ch
)
212 UBool isError
= FALSE
;
213 U16_APPEND(fUnion
.fStackBuffer
, i
, US_STACKBUF_SIZE
, ch
, isError
);
214 // We test isError so that the compiler does not complain that we don't.
215 // If isError then i==0 which is what we want anyway.
217 fShortLength
= (int8_t)i
;
// Constructs from a UChar string by replacing the empty range [0, 0) with
// text. The source length is passed as -1, which doReplace() resolves with
// u_strlen().
221 UnicodeString::UnicodeString(const UChar
*text
)
225 doReplace(0, 0, text
, 0, -1);
// Constructs from text with an explicit textLength by replacing the empty
// range [0, 0) with the first textLength units of text.
// NOTE(review): the rest of the parameter list is not visible in this extract.
228 UnicodeString::UnicodeString(const UChar
*text
,
233 doReplace(0, 0, text
, 0, textLength
);
// Read-only aliasing constructor (fFlags starts as kReadonlyAlias).
// Argument combinations rejected by the visible checks:
//   - textLength < -1
//   - textLength == -1 without isTerminated
//   - textLength >= 0 with isTerminated but text[textLength] != 0
// A textLength of -1 is resolved with u_strlen(). The aliased capacity is
// textLength + 1 when the text is terminated, otherwise textLength.
// NOTE(review): the action taken for rejected arguments is not visible in
// this extract.
236 UnicodeString::UnicodeString(UBool isTerminated
,
240 fFlags(kReadonlyAlias
)
243 // treat as an empty string, do not alias
245 } else if(textLength
< -1 ||
246 (textLength
== -1 && !isTerminated
) ||
247 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
251 if(textLength
== -1) {
252 // text is terminated, or else it would have failed the above test
253 textLength
= u_strlen(text
);
255 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
// Writable aliasing constructor (fFlags starts as kWritableAlias).
// Rejects buffLength < -1, buffCapacity < 0 and buffLength > buffCapacity.
// A buffLength of -1 is resolved by scanning buff for a NUL without reading
// past buff + buffCapacity. The caller buffer is then installed via setArray().
// NOTE(review): the action taken for rejected arguments is not visible in
// this extract.
259 UnicodeString::UnicodeString(UChar
*buff
,
261 int32_t buffCapacity
)
263 fFlags(kWritableAlias
)
266 // treat as an empty string, do not alias
268 } else if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
271 if(buffLength
== -1) {
272 // fLength = u_strlen(buff); but do not look beyond buffCapacity
273 const UChar
*p
= buff
, *limit
= buff
+ buffCapacity
;
274 while(p
!= limit
&& *p
!= 0) {
277 buffLength
= (int32_t)(p
- buff
);
279 setArray(buff
, buffLength
, buffCapacity
);
// Constructs from invariant characters in src.
// After making sure the buffer can hold length units via
// cloneArrayIfNeeded(length, length, FALSE), the chars are converted with
// u_charsToUChars().
// NOTE(review): the condition under which length is recomputed with
// uprv_strlen() is not visible in this extract.
283 UnicodeString::UnicodeString(const char *src
, int32_t length
, EInvariant
)
288 // treat as an empty string
291 length
=(int32_t)uprv_strlen(src
);
293 if(cloneArrayIfNeeded(length
, length
, FALSE
)) {
294 u_charsToUChars(src
, getArrayStart(), length
);
302 #if U_CHARSET_IS_UTF8
// Variant compiled when U_CHARSET_IS_UTF8 is set: starts as a short string
// and, for a non-null codepageData, loads it with setToUTF8().
304 UnicodeString::UnicodeString(const char *codepageData
)
306 fFlags(kShortString
) {
307 if(codepageData
!= 0) {
308 setToUTF8(codepageData
);
// Variant compiled when U_CHARSET_IS_UTF8 is set, with an explicit length.
// Nothing is converted for a null pointer, a dataLength of 0, or a
// dataLength below -1. A dataLength of -1 is resolved with uprv_strlen().
// The bytes are then loaded through setToUTF8(StringPiece).
312 UnicodeString::UnicodeString(const char *codepageData
, int32_t dataLength
)
314 fFlags(kShortString
) {
315 // if there's nothing to convert, do nothing
316 if(codepageData
== 0 || dataLength
== 0 || dataLength
< -1) {
319 if(dataLength
== -1) {
320 dataLength
= (int32_t)uprv_strlen(codepageData
);
322 setToUTF8(StringPiece(codepageData
, dataLength
));
325 // else see unistr_cnv.cpp
// Copy constructor and the two substring-copy constructors.
// The substring forms delegate to setTo(that, srcStart) and
// setTo(that, srcStart, srcLength) respectively.
// NOTE(review): the initializer lists and the body of the plain copy
// constructor are not visible in this extract.
328 UnicodeString::UnicodeString(const UnicodeString
& that
)
336 UnicodeString::UnicodeString(const UnicodeString
& that
,
342 setTo(that
, srcStart
);
345 UnicodeString::UnicodeString(const UnicodeString
& that
,
352 setTo(that
, srcStart
, srcLength
);
355 // Replaceable base class clone() default implementation, does not clone
// Replaceable::clone(): base-class default.
// NOTE(review): its body is not visible in this extract.
357 Replaceable::clone() const {
361 // UnicodeString overrides clone() with a real implementation
// Returns a heap-allocated copy of this string made with the copy constructor.
363 UnicodeString::clone() const {
364 return new UnicodeString(*this);
367 //========================================
369 //========================================
// Prepares storage for at least capacity UChars.
// A capacity that fits US_STACKBUF_SIZE switches to the short-string
// representation. Otherwise a block of int32_t words is allocated with
// uprv_malloc; the size covers the reference counter, capacity + 1 UChars and
// rounding up to a multiple of 16 bytes. On success fArray, fCapacity and
// fFlags (kLongString) are set; the final assignments zero fArray and
// fCapacity.
// NOTE(review): the null check after uprv_malloc, the counter initialization
// and the return statements are not visible in this extract.
372 UnicodeString::allocate(int32_t capacity
) {
373 if(capacity
<= US_STACKBUF_SIZE
) {
374 fFlags
= kShortString
;
376 // count bytes for the refCounter and the string capacity, and
377 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
378 // to be safely aligned for the refCount
379 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
380 int32_t words
= (int32_t)(((sizeof(int32_t) + (capacity
+ 1) * U_SIZEOF_UCHAR
+ 15) & ~15) >> 2);
381 int32_t *array
= (int32_t*) uprv_malloc( sizeof(int32_t) * words
);
383 // set initial refCount and point behind the refCount
386 // have fArray point to the first UChar
387 fUnion
.fFields
.fArray
= (UChar
*)array
;
// Usable capacity in UChars: every word except the one holding the counter.
388 fUnion
.fFields
.fCapacity
= (int32_t)((words
- 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR
));
389 fFlags
= kLongString
;
392 fUnion
.fFields
.fArray
= 0;
393 fUnion
.fFields
.fCapacity
= 0;
401 //========================================
403 //========================================
404 UnicodeString::~UnicodeString()
409 //========================================
411 //========================================
// Factory: builds a new UnicodeString from UTF-8 bytes by calling
// setToUTF8(utf8) on a default-constructed result.
// NOTE(review): the return statement is not visible in this extract.
413 UnicodeString
UnicodeString::fromUTF8(const StringPiece
&utf8
) {
414 UnicodeString result
;
415 result
.setToUTF8(utf8
);
// Factory: builds a new UnicodeString from length UTF-32 code points.
// The initial capacity is US_STACKBUF_SIZE for short input, otherwise
// length + length/16 + 4. Conversion uses u_strFromUTF32WithSub() with 0xfffd
// as the substitution character. On U_BUFFER_OVERFLOW_ERROR the capacity is
// raised to length16 + 1.
// NOTE(review): the retry loop, the handling of other failures and the
// return statement are not visible in this extract.
419 UnicodeString
UnicodeString::fromUTF32(const UChar32
*utf32
, int32_t length
) {
420 UnicodeString result
;
422 // Most UTF-32 strings will be BMP-only and result in a same-length
423 // UTF-16 string. We overestimate the capacity just slightly,
424 // just in case there are a few supplementary characters.
425 if(length
<= US_STACKBUF_SIZE
) {
426 capacity
= US_STACKBUF_SIZE
;
428 capacity
= length
+ (length
>> 4) + 4;
431 UChar
*utf16
= result
.getBuffer(capacity
);
433 UErrorCode errorCode
= U_ZERO_ERROR
;
434 u_strFromUTF32WithSub(utf16
, result
.getCapacity(), &length16
,
436 0xfffd, // Substitution character.
437 NULL
, // Don't care about number of substitutions.
439 result
.releaseBuffer(length16
);
440 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
441 capacity
= length16
+ 1; // +1 for the terminating NUL.
443 } else if(U_FAILURE(errorCode
)) {
451 //========================================
453 //========================================
// Assignment operator: forwards to copyFrom(src) without the fastCopy
// argument.
456 UnicodeString::operator=(const UnicodeString
&src
) {
457 return copyFrom(src
);
// Forwards to copyFrom(src, TRUE), i.e. with fastCopy enabled.
461 UnicodeString::fastCopyFrom(const UnicodeString
&src
) {
462 return copyFrom(src
, TRUE
);
// Shared implementation of assignment.
//   src      - string to copy from
//   fastCopy - see the comments near the read-only alias case below
// Visible behavior, by source storage kind:
//   - stack buffer: the units are copied into this string's stack buffer
//   - reference-counted buffer: the buffer is shared and addRef() is called
//   - read-only alias: the alias pointer and capacity are copied
//   - writable alias: a new buffer is allocated and the contents copied
// The final assignments zero fArray and fCapacity.
// NOTE(review): the switch/case labels, returns and flag updates are not
// visible in this extract.
466 UnicodeString::copyFrom(const UnicodeString
&src
, UBool fastCopy
) {
467 // if assigning to ourselves, do nothing
// NOTE(review): testing this against 0 is not meaningful in standard C++ and
// optimizing compilers may drop the check - confirm whether it is still needed.
468 if(this == 0 || this == &src
) {
472 // is the right side bogus?
// NOTE(review): the same caveat applies to taking the address of a reference
// and comparing it with 0.
473 if(&src
== 0 || src
.isBogus()) {
478 // delete the current contents
482 // empty string - use the stack buffer
487 // we always copy the length
488 int32_t srcLength
= src
.length();
489 setLength(srcLength
);
491 // fLength>0 and not an "open" src.getBuffer(minCapacity)
494 // short string using the stack buffer, do the same
495 fFlags
= kShortString
;
496 uprv_memcpy(fUnion
.fStackBuffer
, src
.fUnion
.fStackBuffer
, srcLength
* U_SIZEOF_UCHAR
);
499 // src uses a refCounted string buffer, use that buffer with refCount
500 // src is const, use a cast - we don't really change it
501 ((UnicodeString
&)src
).addRef();
502 // copy all fields, share the reference-counted buffer
503 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
504 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
509 // src is a readonly alias, do the same
510 // -> maintain the readonly alias as such
511 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
512 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
516 // else if(!fastCopy) fall through to case kWritableAlias
517 // -> allocate a new buffer and copy the contents
519 // src is a writable alias; we make a copy of that instead
520 if(allocate(srcLength
)) {
521 uprv_memcpy(getArrayStart(), src
.getArrayStart(), srcLength
* U_SIZEOF_UCHAR
);
524 // if there is not enough memory, then fall through to setting to bogus
526 // if src is bogus, set ourselves to bogus
527 // do not call setToBogus() here because fArray and fFlags are not consistent here
529 fUnion
.fFields
.fArray
= 0;
530 fUnion
.fFields
.fCapacity
= 0;
538 //========================================
539 // Miscellaneous operations
540 //========================================
// Returns a copy of this string with backslash escapes resolved.
// The result is pre-sized to length(). Text between escapes is appended in
// runs (from prev up to the backslash); each backslash (0x5C) is handed to
// unescapeAt(), which advances i. An invalid escape sequence empties the
// result and stops the scan.
// NOTE(review): the declaration of prev, the end-of-input test and the
// append of the decoded code point are not visible in this extract.
542 UnicodeString
UnicodeString::unescape() const {
543 UnicodeString
result(length(), (UChar32
)0, (int32_t)0); // construct with capacity
544 const UChar
*array
= getBuffer();
545 int32_t len
= length();
547 for (int32_t i
=0;;) {
549 result
.append(array
, prev
, len
- prev
);
552 if (array
[i
++] == 0x5C /*'\\'*/) {
553 result
.append(array
, prev
, (i
- 1) - prev
);
554 UChar32 c
= unescapeAt(i
); // advances i
556 result
.remove(); // return empty string
557 break; // invalid escape sequence
// Decodes one escape sequence starting at offset by delegating to
// u_unescapeAt(), with UnicodeString_charAt as the character callback,
// length() as the limit and this as the callback context. offset is passed
// by address so the callee can advance it.
566 UChar32
UnicodeString::unescapeAt(int32_t &offset
) const {
567 return u_unescapeAt(UnicodeString_charAt
, &offset
, length(), (void*)this);
570 //========================================
571 // Read-only implementation
572 //========================================
// Compares the range [start, start+length) of this string with srcChars in
// code unit order and returns an int8_t result.
// A NULL srcChars is treated as an empty string: 0 if length is 0, else 1.
// After pinning, the common prefix of both ranges is compared (memcmp on
// big-endian builds, unit by unit on little-endian builds per the comments
// below); a nonzero difference is reduced with (result >> 15 | 1).
// NOTE(review): the bogus check, the length tie-break and the final return
// are not visible in this extract.
574 UnicodeString::doCompare( int32_t start
,
576 const UChar
*srcChars
,
578 int32_t srcLength
) const
580 // compare illegal string values
585 // pin indices to legal values
586 pinIndices(start
, length
);
588 if(srcChars
== NULL
) {
589 // treat const UChar *srcChars==NULL as an empty string
590 return length
== 0 ? 0 : 1;
593 // get the correct pointer
594 const UChar
*chars
= getArrayStart();
597 srcChars
+= srcStart
;
602 // get the srcLength if necessary
// NOTE(review): srcChars was already advanced by srcStart a few lines above,
// so adding srcStart again here looks like a double offset when srcLength is
// negative and srcStart is nonzero - confirm and fix if so.
604 srcLength
= u_strlen(srcChars
+ srcStart
);
607 // are we comparing different lengths?
608 if(length
!= srcLength
) {
609 if(length
< srcLength
) {
613 minLength
= srcLength
;
622 * note that uprv_memcmp() returns an int but we return an int8_t;
623 * we need to take care not to truncate the result -
624 * one way to do this is to right-shift the value to
625 * move the sign bit into the lower 8 bits and making sure that this
626 * does not become 0 itself
629 if(minLength
> 0 && chars
!= srcChars
) {
633 // big-endian: byte comparison works
634 result
= uprv_memcmp(chars
, srcChars
, minLength
* sizeof(UChar
));
636 return (int8_t)(result
>> 15 | 1);
639 // little-endian: compare UChar units
641 result
= ((int32_t)*(chars
++) - (int32_t)*(srcChars
++));
643 return (int8_t)(result
>> 15 | 1);
645 } while(--minLength
> 0);
651 /* String compare in code point order - doCompare() compares in code unit order. */
// Compares the range [start, start+length) of this string with srcChars in
// code point order. The comparison is delegated to uprv_strCompare() with the
// trailing arguments FALSE and TRUE; a NULL srcChars zeroes srcStart and
// srcLength and is passed on as NULL. The 32-bit difference is reduced to an
// int8_t with (diff >> 15 | 1).
// NOTE(review): the bogus check and the zero-difference return are not
// visible in this extract.
653 UnicodeString::doCompareCodePointOrder(int32_t start
,
655 const UChar
*srcChars
,
657 int32_t srcLength
) const
659 // compare illegal string values
660 // treat const UChar *srcChars==NULL as an empty string
665 // pin indices to legal values
666 pinIndices(start
, length
);
668 if(srcChars
== NULL
) {
669 srcStart
= srcLength
= 0;
672 int32_t diff
= uprv_strCompare(getArrayStart() + start
, length
, (srcChars
!=NULL
)?(srcChars
+ srcStart
):NULL
, srcLength
, FALSE
, TRUE
);
673 /* translate the 32-bit result into an 8-bit one */
675 return (int8_t)(diff
>> 15 | 1);
// Accessor wrappers: getCharAt() forwards to charAt(offset) and
// getChar32At() forwards to char32At(offset).
// NOTE(review): the body of getLength() is not visible in this extract.
682 UnicodeString::getLength() const {
687 UnicodeString::getCharAt(int32_t offset
) const {
688 return charAt(offset
);
692 UnicodeString::getChar32At(int32_t offset
) const {
693 return char32At(offset
);
// Returns the code point that contains the code unit at offset.
// The unsigned comparison rejects negative offsets and offsets >= length()
// in a single test; in range, U16_GET reads the code point from the array
// with bounds 0 and len. kInvalidUChar is the value returned on the other
// path.
697 UnicodeString::char32At(int32_t offset
) const
699 int32_t len
= length();
700 if((uint32_t)offset
< (uint32_t)len
) {
701 const UChar
*array
= getArrayStart();
703 U16_GET(array
, 0, offset
, len
, c
);
706 return kInvalidUChar
;
// For an offset inside the string (checked with one unsigned comparison),
// adjusts offset with U16_SET_CP_START using 0 as the lower bound.
// NOTE(review): the return statements are not visible in this extract.
711 UnicodeString::getChar32Start(int32_t offset
) const {
712 if((uint32_t)offset
< (uint32_t)length()) {
713 const UChar
*array
= getArrayStart();
714 U16_SET_CP_START(array
, 0, offset
);
// For an offset inside the string (checked with one unsigned comparison),
// adjusts offset with U16_SET_CP_LIMIT using bounds 0 and len.
// NOTE(review): the return statements are not visible in this extract.
722 UnicodeString::getChar32Limit(int32_t offset
) const {
723 int32_t len
= length();
724 if((uint32_t)offset
< (uint32_t)len
) {
725 const UChar
*array
= getArrayStart();
726 U16_SET_CP_LIMIT(array
, 0, offset
, len
);
// Counts the code points in the range [start, start+length).
// The range is pinned to the string first, then u_countChar32() does the
// counting on the pinned range.
734 UnicodeString::countChar32(int32_t start
, int32_t length
) const {
735 pinIndices(start
, length
);
736 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
737 return u_countChar32(getArrayStart()+start
, length
);
// Pins [start, start+length) to the string and returns the result of
// u_strHasMoreChar32Than() for that range and the given number.
741 UnicodeString::hasMoreChar32Than(int32_t start
, int32_t length
, int32_t number
) const {
742 pinIndices(start
, length
);
743 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
744 return u_strHasMoreChar32Than(getArrayStart()+start
, length
, number
);
// Moves index by delta code points.
// Forward movement uses U16_FWD_N bounded by len; backward movement uses
// U16_BACK_N bounded by 0 with the negated delta.
// NOTE(review): the pinning of index, the sign test on delta and the return
// statement are not visible in this extract.
748 UnicodeString::moveIndex32(int32_t index
, int32_t delta
) const {
750 int32_t len
= length();
753 } else if(index
>len
) {
757 const UChar
*array
= getArrayStart();
759 U16_FWD_N(array
, index
, len
, delta
);
761 U16_BACK_N(array
, 0, index
, -delta
);
// Copies the pinned range [start, start+length) into dst at dstStart.
// The copy is skipped when source and destination addresses coincide;
// otherwise us_arrayCopy() is used.
768 UnicodeString::doExtract(int32_t start
,
771 int32_t dstStart
) const
773 // pin indices to legal values
774 pinIndices(start
, length
);
776 // do not copy anything if we alias dst itself
777 const UChar
*array
= getArrayStart();
778 if(array
+ start
!= dst
+ dstStart
) {
779 us_arrayCopy(array
, start
, dst
, dstStart
, length
);
// Copies the whole string into dest and terminates it where possible.
//   dest, destCapacity - output buffer and its size in UChars
//   errorCode          - in/out ICU error code; nothing is done on failure
// Sets U_ILLEGAL_ARGUMENT_ERROR for a bogus string, a negative capacity, or a
// NULL dest with a positive capacity. The contents are copied only when they
// fit and dest is not the internal array itself. The result of
// u_terminateUChars() is returned on the success path.
// NOTE(review): the return for the failure path is not visible in this extract.
784 UnicodeString::extract(UChar
*dest
, int32_t destCapacity
,
785 UErrorCode
&errorCode
) const {
786 int32_t len
= length();
787 if(U_SUCCESS(errorCode
)) {
788 if(isBogus() || destCapacity
<0 || (destCapacity
>0 && dest
==0)) {
789 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
791 const UChar
*array
= getArrayStart();
792 if(len
>0 && len
<=destCapacity
&& array
!=dest
) {
793 uprv_memcpy(dest
, array
, len
*U_SIZEOF_UCHAR
);
795 return u_terminateUChars(dest
, destCapacity
, len
, &errorCode
);
// Extracts the pinned range as invariant chars into target.
// Illegal arguments (negative targetCapacity, or NULL target with a positive
// capacity) are rejected up front. The units are converted with
// u_UCharsToChars() only when they fit, and the result of u_terminateChars()
// is returned, using a local status variable.
// NOTE(review): the value returned for illegal arguments is not visible in
// this extract.
803 UnicodeString::extract(int32_t start
,
806 int32_t targetCapacity
,
807 enum EInvariant
) const
809 // if the arguments are illegal, then do nothing
810 if(targetCapacity
< 0 || (targetCapacity
> 0 && target
== NULL
)) {
814 // pin the indices to legal values
815 pinIndices(start
, length
);
817 if(length
<= targetCapacity
) {
818 u_UCharsToChars(getArrayStart() + start
, target
, length
);
820 UErrorCode status
= U_ZERO_ERROR
;
821 return u_terminateChars(target
, targetCapacity
, length
, &status
);
// Returns a read-only alias (isTerminated = FALSE) onto the pinned range
// [start, start+len) of this string.
// When the buffer cannot be used, array is pointed at the stack buffer and
// len is set to -2 so that the aliasing constructor produces a bogus string.
// NOTE(review): the condition guarding that fallback is not visible in this
// extract; presumably it is a NULL result from getBuffer().
825 UnicodeString::tempSubString(int32_t start
, int32_t len
) const {
826 pinIndices(start
, len
);
827 const UChar
*array
= getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
829 array
=fUnion
.fStackBuffer
; // anything not NULL because that would make an empty string
830 len
=-2; // bogus result string
832 return UnicodeString(FALSE
, array
+ start
, len
);
// Converts the pinned range [start, start+len) to UTF-8 in target (size
// capacity) using u_strToUTF8WithSub() with U+FFFD as the substitution
// character.
// NOTE(review): the declaration of length8 and the return statement are not
// visible in this extract.
836 UnicodeString::toUTF8(int32_t start
, int32_t len
,
837 char *target
, int32_t capacity
) const {
838 pinIndices(start
, len
);
840 UErrorCode errorCode
= U_ZERO_ERROR
;
841 u_strToUTF8WithSub(target
, capacity
, &length8
,
842 getBuffer() + start
, len
,
843 0xFFFD, // Standard substitution character.
844 NULL
, // Don't care about number of substitutions.
849 #if U_CHARSET_IS_UTF8
// Variant compiled when U_CHARSET_IS_UTF8 is set: extracts the range as
// UTF-8 by forwarding to toUTF8(). dstSize is unsigned, so it is clamped to
// 0x7fffffff before being narrowed to int32_t.
// NOTE(review): the value returned for a NULL target with a positive dstSize
// is not visible in this extract.
852 UnicodeString::extract(int32_t start
, int32_t len
,
853 char *target
, uint32_t dstSize
) const {
854 // if the arguments are illegal, then do nothing
855 if(/*dstSize < 0 || */(dstSize
> 0 && target
== 0)) {
858 return toUTF8(start
, len
, target
, dstSize
<= 0x7fffffff ? (int32_t)dstSize
: 0x7fffffff);
861 // else see unistr_cnv.cpp
// Extracts the units between start and limit into target by calling
// doExtract() with a length of limit - start.
// NOTE(review): the pinning of start and limit is not visible in this extract.
865 UnicodeString::extractBetween(int32_t start
,
867 UnicodeString
& target
) const {
870 doExtract(start
, limit
- start
, target
);
873 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
874 // as many bytes as the source has UChars.
875 // The "worst cases" are writing systems like Indic, Thai and CJK with
// Converts the whole string to UTF-8 and appends it to sink.
// A first attempt writes into a buffer obtained from sink.GetAppendBuffer(),
// with a 1024-byte stack buffer as scratch. If that overflows, a heap buffer
// of exactly length8 bytes is allocated and the conversion is repeated; a
// failed allocation yields U_MEMORY_ALLOCATION_ERROR. Bytes are appended to
// the sink only on success.
// NOTE(review): the release of the heap buffer (tracked by utf8IsOwned) and
// the sink flush are not visible in this extract.
878 UnicodeString::toUTF8(ByteSink
&sink
) const {
879 int32_t length16
= length();
881 char stackBuffer
[1024];
882 int32_t capacity
= (int32_t)sizeof(stackBuffer
);
883 UBool utf8IsOwned
= FALSE
;
884 char *utf8
= sink
.GetAppendBuffer(length16
< capacity
? length16
: capacity
,
886 stackBuffer
, capacity
,
889 UErrorCode errorCode
= U_ZERO_ERROR
;
890 u_strToUTF8WithSub(utf8
, capacity
, &length8
,
891 getBuffer(), length16
,
892 0xFFFD, // Standard substitution character.
893 NULL
, // Don't care about number of substitutions.
895 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
896 utf8
= (char *)uprv_malloc(length8
);
899 errorCode
= U_ZERO_ERROR
;
900 u_strToUTF8WithSub(utf8
, length8
, &length8
,
901 getBuffer(), length16
,
902 0xFFFD, // Standard substitution character.
903 NULL
, // Don't care about number of substitutions.
906 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
909 if(U_SUCCESS(errorCode
)) {
910 sink
.Append(utf8
, length8
);
// Converts the whole string to UTF-32 in utf32 (size capacity code points)
// with u_strToUTF32WithSub(), substituting 0xfffd. Nothing is converted when
// errorCode already indicates failure.
// NOTE(review): the declaration of length32 and the return statement are not
// visible in this extract.
920 UnicodeString::toUTF32(UChar32
*utf32
, int32_t capacity
, UErrorCode
&errorCode
) const {
922 if(U_SUCCESS(errorCode
)) {
923 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
924 u_strToUTF32WithSub(utf32
, capacity
, &length32
,
925 getBuffer(), length(),
926 0xfffd, // Substitution character.
927 NULL
, // Don't care about number of substitutions.
// Finds the first occurrence of a UChar substring inside the pinned range
// [start, start+length) of this string.
// The early-exit tests cover a bogus string, NULL srcChars, negative srcStart,
// srcLength of 0, and a NUL-terminated source (srcLength < 0) that is empty.
// On a match, the index is the distance of the match pointer from the start
// of the array.
// NOTE(review): the early-exit and no-match return values are not visible in
// this extract.
934 UnicodeString::indexOf(const UChar
*srcChars
,
938 int32_t length
) const
940 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
944 // UnicodeString does not find empty substrings
945 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
949 // get the indices within bounds
950 pinIndices(start
, length
);
952 // find the first occurrence of the substring
953 const UChar
*array
= getArrayStart();
954 const UChar
*match
= u_strFindFirst(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
958 return (int32_t)(match
- array
);
// Finds the first occurrence of code unit c inside the pinned range using
// u_memchr(); the index is the distance of the match from the array start.
// NOTE(review): the no-match return is not visible in this extract.
963 UnicodeString::doIndexOf(UChar c
,
965 int32_t length
) const
968 pinIndices(start
, length
);
970 // find the first occurrence of c
971 const UChar
*array
= getArrayStart();
972 const UChar
*match
= u_memchr(array
+ start
, c
, length
);
976 return (int32_t)(match
- array
);
// Finds the first occurrence of code point c inside the pinned range using
// u_memchr32(); the index is the distance of the match from the array start.
// NOTE(review): the no-match return is not visible in this extract.
981 UnicodeString::doIndexOf(UChar32 c
,
983 int32_t length
) const {
985 pinIndices(start
, length
);
987 // find the first occurrence of c
988 const UChar
*array
= getArrayStart();
989 const UChar
*match
= u_memchr32(array
+ start
, c
, length
);
993 return (int32_t)(match
- array
);
// Finds the last occurrence of a UChar substring inside the pinned range
// [start, start+length) of this string. Uses the same early-exit tests as
// indexOf() and u_strFindLast() for the search.
// NOTE(review): the early-exit and no-match return values are not visible in
// this extract.
998 UnicodeString::lastIndexOf(const UChar
*srcChars
,
1002 int32_t length
) const
1004 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
1008 // UnicodeString does not find empty substrings
1009 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
1013 // get the indices within bounds
1014 pinIndices(start
, length
);
1016 // find the last occurrence of the substring
1017 const UChar
*array
= getArrayStart();
1018 const UChar
*match
= u_strFindLast(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
1022 return (int32_t)(match
- array
);
// Finds the last occurrence of code unit c inside the pinned range using
// u_memrchr(); the index is the distance of the match from the array start.
// NOTE(review): the checks before pinIndices() and the no-match return are
// not visible in this extract.
1027 UnicodeString::doLastIndexOf(UChar c
,
1029 int32_t length
) const
1036 pinIndices(start
, length
);
1038 // find the last occurrence of c
1039 const UChar
*array
= getArrayStart();
1040 const UChar
*match
= u_memrchr(array
+ start
, c
, length
);
1044 return (int32_t)(match
- array
);
// Finds the last occurrence of code point c inside the pinned range using
// u_memrchr32(); the index is the distance of the match from the array start.
// NOTE(review): the no-match return is not visible in this extract.
1049 UnicodeString::doLastIndexOf(UChar32 c
,
1051 int32_t length
) const {
1053 pinIndices(start
, length
);
1055 // find the last occurrence of c
1056 const UChar
*array
= getArrayStart();
1057 const UChar
*match
= u_memrchr32(array
+ start
, c
, length
);
1061 return (int32_t)(match
- array
);
1065 //========================================
1066 // Write implementation
1067 //========================================
// Replaces occurrences of a range of oldText with a range of newText inside
// [start, start+length) of this string.
// Bogus operands and an empty old range (oldLength == 0 after pinning) are
// special-cased before the loop. The loop searches with indexOf(), replaces
// the match, shrinks the remaining length by the skipped and matched units,
// and restarts the search right after the inserted text.
// NOTE(review): the actions of the special cases, the no-match exit and the
// final return are not visible in this extract.
1070 UnicodeString::findAndReplace(int32_t start
,
1072 const UnicodeString
& oldText
,
1075 const UnicodeString
& newText
,
1079 if(isBogus() || oldText
.isBogus() || newText
.isBogus()) {
1083 pinIndices(start
, length
);
1084 oldText
.pinIndices(oldStart
, oldLength
);
1085 newText
.pinIndices(newStart
, newLength
);
1087 if(oldLength
== 0) {
1091 while(length
> 0 && length
>= oldLength
) {
1092 int32_t pos
= indexOf(oldText
, oldStart
, oldLength
, start
, length
);
1094 // no more oldText's here: done
1097 // we found oldText, replace it by newText and go beyond it
1098 replace(pos
, oldLength
, newText
, newStart
, newLength
);
1099 length
-= pos
+ oldLength
- start
;
1100 start
= pos
+ newLength
;
// Puts the string into the bogus state; the visible part clears the array
// pointer and the capacity.
// NOTE(review): the buffer release and the flag update are not visible in
// this extract.
1109 UnicodeString::setToBogus()
1114 fUnion
.fFields
.fArray
= 0;
1115 fUnion
.fFields
.fCapacity
= 0;
1119 // turn a bogus string into an empty one
// Acts only when the kIsBogus flag is set.
// NOTE(review): the reset performed in that case is not visible in this
// extract.
1121 UnicodeString::unBogus() {
1122 if(fFlags
& kIsBogus
) {
1127 // setTo() analogous to the readonly-aliasing constructor with the same signature
// Re-points this string at caller-owned text as a read-only alias.
// A string with an open getBuffer() (kOpenGetBuffer) is left unmodified.
// Argument validation mirrors the read-only aliasing constructor; a
// textLength of -1 is resolved with u_strlen(). On success the array is
// installed via setArray() and fFlags becomes kReadonlyAlias.
// NOTE(review): the release of the previous buffer and the returns are not
// visible in this extract.
1129 UnicodeString::setTo(UBool isTerminated
,
1133 if(fFlags
& kOpenGetBuffer
) {
1134 // do not modify a string that has an "open" getBuffer(minCapacity)
1139 // treat as an empty string, do not alias
1145 if( textLength
< -1 ||
1146 (textLength
== -1 && !isTerminated
) ||
1147 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
1155 if(textLength
== -1) {
1156 // text is terminated, or else it would have failed the above test
1157 textLength
= u_strlen(text
);
1159 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
1161 fFlags
= kReadonlyAlias
;
1165 // setTo() analogous to the writable-aliasing constructor with the same signature
// Re-points this string at a caller-owned writable buffer.
// A string with an open getBuffer() is left unmodified, and a NULL buffer is
// treated as an empty string. Rejects buffLength < -1, buffCapacity < 0 and
// buffLength > buffCapacity. A buffLength of -1 is resolved by scanning for a
// NUL within buffCapacity. On success the buffer is installed via setArray()
// and fFlags becomes kWritableAlias.
// NOTE(review): the release of the previous buffer and the returns are not
// visible in this extract.
1167 UnicodeString::setTo(UChar
*buffer
,
1169 int32_t buffCapacity
) {
1170 if(fFlags
& kOpenGetBuffer
) {
1171 // do not modify a string that has an "open" getBuffer(minCapacity)
1175 if(buffer
== NULL
) {
1176 // treat as an empty string, do not alias
1182 if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
1185 } else if(buffLength
== -1) {
1186 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1187 const UChar
*p
= buffer
, *limit
= buffer
+ buffCapacity
;
1188 while(p
!= limit
&& *p
!= 0) {
1191 buffLength
= (int32_t)(p
- buffer
);
1196 setArray(buffer
, buffLength
, buffCapacity
);
1197 fFlags
= kWritableAlias
;
// Replaces the contents with the UTF-16 form of the UTF-8 bytes in utf8.
// The buffer capacity is US_STACKBUF_SIZE for short input, otherwise
// length + 1. Conversion writes directly into the buffer from getBuffer()
// using u_strFromUTF8WithSub() with 0xfffd as the substitution character,
// then releaseBuffer(length16) fixes the length.
// NOTE(review): the handling of a failed conversion and the return statement
// are not visible in this extract.
1201 UnicodeString
&UnicodeString::setToUTF8(const StringPiece
&utf8
) {
1203 int32_t length
= utf8
.length();
1205 // The UTF-16 string will be at most as long as the UTF-8 string.
1206 if(length
<= US_STACKBUF_SIZE
) {
1207 capacity
= US_STACKBUF_SIZE
;
1209 capacity
= length
+ 1; // +1 for the terminating NUL.
1211 UChar
*utf16
= getBuffer(capacity
);
1213 UErrorCode errorCode
= U_ZERO_ERROR
;
1214 u_strFromUTF8WithSub(utf16
, getCapacity(), &length16
,
1215 utf8
.data(), length
,
1216 0xfffd, // Substitution character.
1217 NULL
, // Don't care about number of substitutions.
1219 releaseBuffer(length16
);
1220 if(U_FAILURE(errorCode
)) {
// Overwrites the code unit at offset with c.
// The write happens only when the buffer could be made writable
// (cloneArrayIfNeeded()) and the string is not empty.
// NOTE(review): the adjustment of out-of-range offsets (see the
// offset >= len branch) and the return statement are not visible in this
// extract.
1227 UnicodeString::setCharAt(int32_t offset
,
1230 int32_t len
= length();
1231 if(cloneArrayIfNeeded() && len
> 0) {
1234 } else if(offset
>= len
) {
1238 getArrayStart()[offset
] = c
;
// Replaces a range with the single code point srcChar.
// srcChar is encoded into a local buffer with U16_APPEND; if encoding fails
// the string is returned unchanged, otherwise doReplace() substitutes the
// count encoded units for the range (start, _length).
1244 UnicodeString::replace(int32_t start
,
1247 UChar buffer
[U16_MAX_LENGTH
];
1249 UBool isError
= FALSE
;
1250 U16_APPEND(buffer
, count
, U16_MAX_LENGTH
, srcChar
, isError
);
1251 // We test isError so that the compiler does not complain that we don't.
1252 // If isError then count==0 which turns the doReplace() into a no-op anyway.
1253 return isError
? *this : doReplace(start
, _length
, buffer
, 0, count
);
// Appends the single code point srcChar.
// srcChar is encoded into a local buffer with U16_APPEND; if encoding fails
// the string is returned unchanged, otherwise the encoded units are inserted
// at length() through doReplace().
1257 UnicodeString::append(UChar32 srcChar
) {
1258 UChar buffer
[U16_MAX_LENGTH
];
1259 int32_t _length
= 0;
1260 UBool isError
= FALSE
;
1261 U16_APPEND(buffer
, _length
, U16_MAX_LENGTH
, srcChar
, isError
);
1262 // We test isError so that the compiler does not complain that we don't.
1263 // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1264 return isError
? *this : doReplace(length(), 0, buffer
, 0, _length
);
// Replaces a range of this string with a range of another UnicodeString.
// For a non-bogus src the source range is pinned and the work is handed to
// the UChar-pointer overload with src.getArrayStart(). The other visible call
// passes a null source with zero start and length, which removes the range.
1268 UnicodeString::doReplace( int32_t start
,
1270 const UnicodeString
& src
,
1274 if(!src
.isBogus()) {
1275 // pin the indices to legal values
1276 src
.pinIndices(srcStart
, srcLength
);
1278 // get the characters from src
1279 // and replace the range in ourselves with them
1280 return doReplace(start
, length
, src
.getArrayStart(), srcStart
, srcLength
);
1283 return doReplace(start
, length
, 0, 0, 0);
// Core mutation primitive: replaces the range (start, length) of this string
// with srcLength units taken from srcChars + srcStart.
// Visible stages:
//   1. Pure removals on a read-only buffer are done by adjusting the array
//      pointer, capacity or length instead of copying.
//   2. A negative srcLength is resolved with u_strlen().
//   3. Appends that fit a writable buffer are copied straight to the end.
//   4. Otherwise the range is pinned, the buffer is cloned or grown via
//      cloneArrayIfNeeded(), the unchanged head and tail are moved, and the
//      new units are copied into the gap.
//   5. The old heap buffer, if replaced, is freed only at the very end.
// NOTE(review): several conditions, declarations and returns of this function
// are not visible in this extract.
1288 UnicodeString::doReplace(int32_t start
,
1290 const UChar
*srcChars
,
1298 int32_t oldLength
= this->length();
1300 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1301 if((fFlags
&kBufferIsReadonly
) && srcLength
== 0) {
1303 // remove prefix by adjusting the array pointer
1305 fUnion
.fFields
.fArray
+= length
;
1306 fUnion
.fFields
.fCapacity
-= length
;
1307 setLength(oldLength
- length
);
1311 if(length
>= (oldLength
- start
)) {
1312 // remove suffix by reducing the length (like truncate())
1314 fUnion
.fFields
.fCapacity
= start
; // not NUL-terminated any more
1321 srcStart
= srcLength
= 0;
1322 } else if(srcLength
< 0) {
1323 // get the srcLength if necessary
1324 srcLength
= u_strlen(srcChars
+ srcStart
);
1327 // calculate the size of the string after the replace
1330 // optimize append() onto a large-enough, owned string
1331 if(start
>= oldLength
) {
1332 if(srcLength
== 0) {
1335 newLength
= oldLength
+ srcLength
;
1336 if(newLength
<= getCapacity() && isBufferWritable()) {
1337 UChar
*oldArray
= getArrayStart();
1338 // Do not copy characters when
1339 // UChar *buffer=str.getAppendBuffer(...);
1341 // str.append(buffer, length);
1343 // str.appendString(buffer, length)
1345 if(srcChars
+ srcStart
!= oldArray
+ start
|| start
> oldLength
) {
1346 us_arrayCopy(srcChars
, srcStart
, oldArray
, oldLength
, srcLength
);
1348 setLength(newLength
);
1351 // pin the indices to legal values
1356 // pin the indices to legal values
1357 pinIndices(start
, length
);
1359 newLength
= oldLength
- length
+ srcLength
;
1362 // the following may change fArray but will not copy the current contents;
1363 // therefore we need to keep the current fArray
1364 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1366 if((fFlags
&kUsingStackBuffer
) && (newLength
> US_STACKBUF_SIZE
)) {
1367 // copy the stack buffer contents because it will be overwritten with
1368 // fUnion.fFields values
1369 u_memcpy(oldStackBuffer
, fUnion
.fStackBuffer
, oldLength
);
1370 oldArray
= oldStackBuffer
;
1372 oldArray
= getArrayStart();
1375 // clone our array and allocate a bigger array if needed
1376 int32_t *bufferToDelete
= 0;
// Growth request: newLength plus a quarter of it plus kGrowSize.
1377 if(!cloneArrayIfNeeded(newLength
, newLength
+ (newLength
>> 2) + kGrowSize
,
1378 FALSE
, &bufferToDelete
)
1383 // now do the replace
1385 UChar
*newArray
= getArrayStart();
1386 if(newArray
!= oldArray
) {
1387 // if fArray changed, then we need to copy everything except what will change
1388 us_arrayCopy(oldArray
, 0, newArray
, 0, start
);
1389 us_arrayCopy(oldArray
, start
+ length
,
1390 newArray
, start
+ srcLength
,
1391 oldLength
- (start
+ length
));
1392 } else if(length
!= srcLength
) {
1393 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1394 us_arrayCopy(oldArray
, start
+ length
,
1395 newArray
, start
+ srcLength
,
1396 oldLength
- (start
+ length
));
1399 // now fill in the hole with the new string
1400 us_arrayCopy(srcChars
, srcStart
, newArray
, start
, srcLength
);
1402 setLength(newLength
);
1404 // delayed delete in case srcChars == fArray when we started, and
1405 // to keep oldArray alive for the above operations
1406 if (bufferToDelete
) {
1407 uprv_free(bufferToDelete
);
// Replaceable hook: forwards to replaceBetween(start, limit, text).
1417 UnicodeString::handleReplaceBetween(int32_t start
,
1419 const UnicodeString
& text
) {
1420 replaceBetween(start
, limit
, text
);
// Copies the units in [start, limit) and inserts them at index dest.
// An empty or inverted range returns immediately. Otherwise the range is
// first extracted into a temporary heap buffer of limit - start UChars and
// then inserted from that buffer.
// NOTE(review): the null check on the allocation and the matching uprv_free
// are not visible in this extract - confirm both exist in the full file.
1427 UnicodeString::copy(int32_t start
, int32_t limit
, int32_t dest
) {
1428 if (limit
<= start
) {
1429 return; // Nothing to do; avoid bogus malloc call
1431 UChar
* text
= (UChar
*) uprv_malloc( sizeof(UChar
) * (limit
- start
) );
1432 // Check to make sure text is not null.
1434 extractBetween(start
, limit
, text
, 0);
1435 insert(dest
, text
, 0, limit
- start
);
1443 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1444 * so we implement this function here.
// hasMetaData() for Replaceable and its UnicodeString override.
// NOTE(review): neither return value is visible in this extract.
1446 UBool
Replaceable::hasMetaData() const {
1453 UBool
UnicodeString::hasMetaData() const {
// Reverses the code units in the pinned range [start, start+length) in place.
// Ranges of at most one unit, or a buffer that cannot be made writable, are
// left alone. The first pass swaps units from both ends toward the middle
// and records whether any lead surrogate was seen. If so, a second pass looks
// for a trail surrogate followed by a lead surrogate, i.e. a pair whose order
// was inverted by the first pass.
// NOTE(review): the swap statements of both passes and the return are not
// visible in this extract.
1458 UnicodeString::doReverse(int32_t start
, int32_t length
) {
1459 if(length
<= 1 || !cloneArrayIfNeeded()) {
1463 // pin the indices to legal values
1464 pinIndices(start
, length
);
1465 if(length
<= 1) { // pinIndices() might have shrunk the length
1469 UChar
*left
= getArrayStart() + start
;
1470 UChar
*right
= left
+ length
- 1; // -1 for inclusive boundary (length>=2)
1472 UBool hasSupplementary
= FALSE
;
1474 // Before the loop we know left<right because length>=2.
1476 hasSupplementary
|= (UBool
)U16_IS_LEAD(swap
= *left
);
1477 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
++ = *right
);
1479 } while(left
< right
);
1480 // Make sure to test the middle code unit of an odd-length string.
1481 // Redundant if the length is even.
1482 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
);
1484 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1485 if(hasSupplementary
) {
1488 left
= getArrayStart() + start
;
1489 right
= left
+ length
- 1; // -1 so that we can look at *(left+1) if left<right
1490 while(left
< right
) {
1491 if(U16_IS_TRAIL(swap
= *left
) && U16_IS_LEAD(swap2
= *(left
+ 1))) {
// Pads the string on the left with padChar up to targetLength units.
// Nothing is padded when the string is already long enough or the buffer
// cannot be grown to targetLength. The existing contents are shifted up by
// the padding width (overlapping copy via us_arrayCopy), the gap is filled
// from its last index down to 0, and the length is updated.
// NOTE(review): the return statements are not visible in this extract.
1504 UnicodeString::padLeading(int32_t targetLength
,
1507 int32_t oldLength
= length();
1508 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1511 // move contents up by padding width
1512 UChar
*array
= getArrayStart();
1513 int32_t start
= targetLength
- oldLength
;
1514 us_arrayCopy(array
, 0, array
, start
, oldLength
);
1516 // fill in padding character
1517 while(--start
>= 0) {
1518 array
[start
] = padChar
;
1520 setLength(targetLength
);
1526 UnicodeString::padTrailing(int32_t targetLength
,
1529 int32_t oldLength
= length();
1530 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1533 // fill in padding character
1534 UChar
*array
= getArrayStart();
1535 int32_t length
= targetLength
;
1536 while(--length
>= oldLength
) {
1537 array
[length
] = padChar
;
1539 setLength(targetLength
);
1544 //========================================
1546 //========================================
1548 UnicodeString::doHashCode() const
1550 /* Delegate hash computation to uhash. This makes UnicodeString
1551 * hashing consistent with UChar* hashing. */
1552 int32_t hashCode
= ustr_hashUCharsN(getArrayStart(), length());
1553 if (hashCode
== kInvalidHashCode
) {
1554 hashCode
= kEmptyHashCode
;
1559 //========================================
1561 //========================================
1564 UnicodeString::getBuffer(int32_t minCapacity
) {
1565 if(minCapacity
>=-1 && cloneArrayIfNeeded(minCapacity
)) {
1566 fFlags
|=kOpenGetBuffer
;
1568 return getArrayStart();
1575 UnicodeString::releaseBuffer(int32_t newLength
) {
1576 if(fFlags
&kOpenGetBuffer
&& newLength
>=-1) {
1577 // set the new fLength
1578 int32_t capacity
=getCapacity();
1580 // the new length is the string length, capped by fCapacity
1581 const UChar
*array
=getArrayStart(), *p
=array
, *limit
=array
+capacity
;
1582 while(p
<limit
&& *p
!=0) {
1585 newLength
=(int32_t)(p
-array
);
1586 } else if(newLength
>capacity
) {
1589 setLength(newLength
);
1590 fFlags
&=~kOpenGetBuffer
;
1594 //========================================
1596 //========================================
1598 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity
,
1599 int32_t growCapacity
,
1601 int32_t **pBufferToDelete
,
1603 // default parameters need to be static, therefore
1604 // the defaults are -1 to have convenience defaults
1605 if(newCapacity
== -1) {
1606 newCapacity
= getCapacity();
1609 // while a getBuffer(minCapacity) is "open",
1610 // prevent any modifications of the string by returning FALSE here
1611 // if the string is bogus, then only an assignment or similar can revive it
1617 * We need to make a copy of the array if
1618 * the buffer is read-only, or
1619 * the buffer is refCounted (shared), and refCount>1, or
1620 * the buffer is too small.
1621 * Return FALSE if memory could not be allocated.
1624 fFlags
& kBufferIsReadonly
||
1625 (fFlags
& kRefCounted
&& refCount() > 1) ||
1626 newCapacity
> getCapacity()
1628 // check growCapacity for default value and use of the stack buffer
1629 if(growCapacity
< 0) {
1630 growCapacity
= newCapacity
;
1631 } else if(newCapacity
<= US_STACKBUF_SIZE
&& growCapacity
> US_STACKBUF_SIZE
) {
1632 growCapacity
= US_STACKBUF_SIZE
;
1636 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1638 uint8_t flags
= fFlags
;
1640 if(flags
&kUsingStackBuffer
) {
1641 U_ASSERT(!(flags
&kRefCounted
)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1642 if(doCopyArray
&& growCapacity
> US_STACKBUF_SIZE
) {
1643 // copy the stack buffer contents because it will be overwritten with
1644 // fUnion.fFields values
1645 us_arrayCopy(fUnion
.fStackBuffer
, 0, oldStackBuffer
, 0, fShortLength
);
1646 oldArray
= oldStackBuffer
;
1648 oldArray
= 0; // no need to copy from stack buffer to itself
1651 oldArray
= fUnion
.fFields
.fArray
;
1652 U_ASSERT(oldArray
!=NULL
); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1655 // allocate a new array
1656 if(allocate(growCapacity
) ||
1657 (newCapacity
< growCapacity
&& allocate(newCapacity
))
1659 if(doCopyArray
&& oldArray
!= 0) {
1660 // copy the contents
1661 // do not copy more than what fits - it may be smaller than before
1662 int32_t minLength
= length();
1663 newCapacity
= getCapacity();
1664 if(newCapacity
< minLength
) {
1665 minLength
= newCapacity
;
1666 setLength(minLength
);
1668 us_arrayCopy(oldArray
, 0, getArrayStart(), 0, minLength
);
1673 // release the old array
1674 if(flags
& kRefCounted
) {
1675 // the array is refCounted; decrement and release if 0
1676 int32_t *pRefCount
= ((int32_t *)oldArray
- 1);
1677 if(umtx_atomic_dec(pRefCount
) == 0) {
1678 if(pBufferToDelete
== 0) {
1679 uprv_free(pRefCount
);
1681 // the caller requested to delete it himself
1682 *pBufferToDelete
= pRefCount
;
1687 // not enough memory for growCapacity and not even for the smaller newCapacity
1688 // reset the old values for setToBogus() to release the array
1689 if(!(flags
&kUsingStackBuffer
)) {
1690 fUnion
.fFields
.fArray
= oldArray
;
1700 // UnicodeStringAppendable ------------------------------------------------- ***
1702 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1705 UnicodeStringAppendable::appendCodeUnit(UChar c
) {
1706 return str
.doReplace(str
.length(), 0, &c
, 0, 1).isWritable();
1710 UnicodeStringAppendable::appendCodePoint(UChar32 c
) {
1711 UChar buffer
[U16_MAX_LENGTH
];
1712 int32_t cLength
= 0;
1713 UBool isError
= FALSE
;
1714 U16_APPEND(buffer
, cLength
, U16_MAX_LENGTH
, c
, isError
);
1715 return !isError
&& str
.doReplace(str
.length(), 0, buffer
, 0, cLength
).isWritable();
1719 UnicodeStringAppendable::appendString(const UChar
*s
, int32_t length
) {
1720 return str
.doReplace(str
.length(), 0, s
, 0, length
).isWritable();
1724 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity
) {
1725 return str
.cloneArrayIfNeeded(str
.length() + appendCapacity
);
1729 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity
,
1730 int32_t desiredCapacityHint
,
1731 UChar
*scratch
, int32_t scratchCapacity
,
1732 int32_t *resultCapacity
) {
1733 if(minCapacity
< 1 || scratchCapacity
< minCapacity
) {
1734 *resultCapacity
= 0;
1737 int32_t oldLength
= str
.length();
1738 if(str
.cloneArrayIfNeeded(oldLength
+ minCapacity
, oldLength
+ desiredCapacityHint
)) {
1739 *resultCapacity
= str
.getCapacity() - oldLength
;
1740 return str
.getArrayStart() + oldLength
;
1742 *resultCapacity
= scratchCapacity
;
1750 U_CAPI
int32_t U_EXPORT2
1751 uhash_hashUnicodeString(const UElement key
) {
1752 const UnicodeString
*str
= (const UnicodeString
*) key
.pointer
;
1753 return (str
== NULL
) ? 0 : str
->hashCode();
1756 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1757 // does not depend on hashtable code.
1758 U_CAPI UBool U_EXPORT2
1759 uhash_compareUnicodeString(const UElement key1
, const UElement key2
) {
1760 const UnicodeString
*str1
= (const UnicodeString
*) key1
.pointer
;
1761 const UnicodeString
*str2
= (const UnicodeString
*) key2
.pointer
;
1765 if (str1
== NULL
|| str2
== NULL
) {
1768 return *str1
== *str2
;
1771 #ifdef U_STATIC_IMPLEMENTATION
1773 This should never be called. It is defined here to make sure that the
1774 virtual vector deleting destructor is defined within unistr.cpp.
1775 The vector deleting destructor is already a part of UObject,
1776 but defining it here makes sure that it is included with this object file.
1777 This makes sure that static library dependencies are kept to a minimum.
1779 static void uprv_UnicodeStringDummy(void) {
1780 delete [] (new UnicodeString
[2]);