2 ******************************************************************************
3 * Copyright (C) 1999-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
// Debug print helper for a UnicodeString. The loop visits every index of s and,
// when the tested value c is >= 0x007E or < 0x0020, writes the code unit s[i]
// to cout in hex, wrapped in square brackets.
// NOTE(review): the declaration of c and the remaining parameters are not
// visible in this excerpt - presumably c is s[i]; confirm.
42 print(const UnicodeString
& s
,
47 for(int i
= 0; i
< s
.length(); ++i
) {
49 if(c
>= 0x007E || c
< 0x0020)
50 cout
<< "[0x" << hex
<< s
[i
] << "]";
// Loop of a second debug print helper; its function header is not visible in
// this excerpt. It iterates i over [0, len) and applies the same rule as the
// UnicodeString variant: values >= 0x007E or < 0x0020 are written as
// bracketed hex.
// NOTE(review): s and len are presumably a raw buffer and its length - confirm.
64 for(int i
= 0; i
< len
; ++i
) {
66 if(c
>= 0x007E || c
< 0x0020)
67 cout
<< "[0x" << hex
<< s
[i
] << "]";
76 // Local function definitions for now
78 // need to copy areas that may overlap
// Copies count UChars from src + srcStart to dst + dstStart.
// The byte count is count * sizeof(*src). uprv_memmove is used because,
// as noted above, the source and destination areas may overlap.
// NOTE(review): a guard on count may exist on a line not visible here.
81 us_arrayCopy(const UChar
*src
, int32_t srcStart
,
82 UChar
*dst
, int32_t dstStart
, int32_t count
)
85 uprv_memmove(dst
+dstStart
, src
+srcStart
, (size_t)(count
*sizeof(*src
)));
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
// Callback with the signature expected by u_unescapeAt() (see unescapeAt()).
// context is cast to an icu::UnicodeString pointer and the code unit at
// offset is returned via charAt().
91 static UChar U_CALLCONV
92 UnicodeString_charAt(int32_t offset
, void *context
) {
93 return ((icu::UnicodeString
*) context
)->charAt(offset
);
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
// Out-of-line, empty destructor for Replaceable (defined here rather than in
// the header, for the reason given in the comment above).
102 Replaceable::~Replaceable() {}
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for UnicodeString.
// NOTE(review): the macro expansion is not visible in this file - presumably
// it supplies the class-ID based RTTI functions; confirm in the macro header.
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString
)
// Concatenation operator for two UnicodeStrings.
// Starts from a temporary constructed with capacity
// s1.length() + s2.length() + 1 and count 0, i.e. an empty string with room
// for both operands plus one extra unit. The call chain continues after the
// trailing '.' on lines not visible here (presumably appending s1 and s2).
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString
&s1
, const UnicodeString
&s2
) {
109 UnicodeString(s1
.length()+s2
.length()+1, (UChar32
)0, 0).
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
// Increments the reference counter with umtx_atomic_inc. The counter is the
// int32_t stored immediately before fUnion.fFields.fArray.
120 UnicodeString::addRef()
121 { umtx_atomic_inc((int32_t *)fUnion
.fFields
.fArray
- 1);}
// Decrements the reference counter (the int32_t stored immediately before
// fUnion.fFields.fArray) with umtx_atomic_dec and returns that call's result.
124 UnicodeString::removeRef()
125 { return umtx_atomic_dec((int32_t *)fUnion
.fFields
.fArray
- 1);}
// Reads the current reference counter, i.e. the int32_t stored immediately
// before fUnion.fFields.fArray. The read shown here is a plain load; the
// locking mentioned in the comment below is on lines not visible here.
128 UnicodeString::refCount() const
131 // Note: without the lock to force a memory barrier, we might see a very
132 // stale value on some multi-processor systems.
133 int32_t count
= *((int32_t *)fUnion
.fFields
.fArray
- 1);
// Drops this string's reference to a shared buffer. Only acts when the
// kRefCounted flag is set; if removeRef() then returns 0, the allocation is
// freed starting at the counter slot (fArray - 1 as int32_t), which is the
// pointer that was originally returned by the allocator.
139 UnicodeString::releaseArray() {
140 if((fFlags
& kRefCounted
) && removeRef() == 0) {
141 uprv_free((int32_t *)fUnion
.fFields
.fArray
- 1);
147 //========================================
149 //========================================
151 // The default constructor is inline in unistr.h.
// Constructs a string with the requested capacity, optionally pre-filled with
// count copies of the code point c.
//  - count <= 0 or c above 0x10ffff: allocate only, no fill.
//  - otherwise the fill length is count * U16_LENGTH(c) code units; when
//    capacity is smaller than that length it is adjusted (adjustment line not
//    visible here - presumably capacity = length).
// A single-unit c is stored directly; a multi-unit c is first encoded into
// units[] with U16_APPEND_UNSAFE and then copied repeatedly.
153 UnicodeString::UnicodeString(int32_t capacity
, UChar32 c
, int32_t count
)
157 if(count
<= 0 || (uint32_t)c
> 0x10ffff) {
158 // just allocate and do not do anything else
161 // count > 0, allocate and fill the new string with count c's
162 int32_t unitCount
= U16_LENGTH(c
), length
= count
* unitCount
;
163 if(capacity
< length
) {
166 if(allocate(capacity
)) {
167 UChar
*array
= getArrayStart();
170 // fill the new string with c
172 // fill with length UChars
174 array
[i
++] = (UChar
)c
;
177 // get the code units for c
178 UChar units
[U16_MAX_LENGTH
];
179 U16_APPEND_UNSAFE(units
, i
, c
);
181 // now it must be i==unitCount
184 // for Unicode, unitCount can only be 1, 2, 3, or 4
185 // 1 is handled above
188 while(unitIdx
< unitCount
) {
189 array
[i
++]=units
[unitIdx
++];
// Constructs a string from a single code unit: ch is stored in slot 0 of the
// internal stack buffer. The member initializers (length, flags) are on lines
// not visible here.
198 UnicodeString::UnicodeString(UChar ch
)
202 fUnion
.fStackBuffer
[0] = ch
;
// Constructs a string from a single code point. ch is encoded into the stack
// buffer with U16_APPEND (bounded by US_STACKBUF_SIZE); i ends up as the
// number of code units written and becomes fShortLength. Per the comment
// below, an invalid ch leaves i == 0, i.e. an empty string.
205 UnicodeString::UnicodeString(UChar32 ch
)
210 UBool isError
= FALSE
;
211 U16_APPEND(fUnion
.fStackBuffer
, i
, US_STACKBUF_SIZE
, ch
, isError
);
212 // We test isError so that the compiler does not complain that we don't.
213 // If isError then i==0 which is what we want anyway.
215 fShortLength
= (int8_t)i
;
// Constructs a string by copying text. Implemented as
// doReplace(0, 0, text, 0, -1); the srcLength of -1 makes doReplace determine
// the length with u_strlen (see the srcLength < 0 branch in doReplace).
219 UnicodeString::UnicodeString(const UChar
*text
)
223 doReplace(0, 0, text
, 0, -1);
// Constructs a string by copying textLength code units from text, via
// doReplace(0, 0, text, 0, textLength). The textLength parameter declaration
// is on a line not visible here.
226 UnicodeString::UnicodeString(const UChar
*text
,
231 doReplace(0, 0, text
, 0, textLength
);
// Read-only aliasing constructor: the string refers to the caller's text
// buffer instead of copying it (fFlags starts as kReadonlyAlias).
// Arguments are rejected when
//  - textLength < -1, or
//  - textLength == -1 without isTerminated, or
//  - textLength >= 0 with isTerminated but text[textLength] is not NUL.
// (The rejection action itself is on a line not visible here.)
// For textLength == -1 the length is computed with u_strlen. The capacity
// passed to setArray includes the terminator when isTerminated is set.
234 UnicodeString::UnicodeString(UBool isTerminated
,
238 fFlags(kReadonlyAlias
)
241 // treat as an empty string, do not alias
243 } else if(textLength
< -1 ||
244 (textLength
== -1 && !isTerminated
) ||
245 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
249 if(textLength
== -1) {
250 // text is terminated, or else it would have failed the above test
251 textLength
= u_strlen(text
);
253 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
// Writable aliasing constructor: the string uses the caller's buff as its
// storage (fFlags starts as kWritableAlias).
// Arguments are rejected when buffLength < -1, buffCapacity < 0, or
// buffLength > buffCapacity (rejection action not visible here).
// For buffLength == -1 the length is found by scanning for NUL, but never
// beyond buff + buffCapacity.
257 UnicodeString::UnicodeString(UChar
*buff
,
259 int32_t buffCapacity
)
261 fFlags(kWritableAlias
)
264 // treat as an empty string, do not alias
266 } else if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
269 if(buffLength
== -1) {
270 // fLength = u_strlen(buff); but do not look beyond buffCapacity
271 const UChar
*p
= buff
, *limit
= buff
+ buffCapacity
;
272 while(p
!= limit
&& *p
!= 0) {
275 buffLength
= (int32_t)(p
- buff
);
277 setArray(buff
, buffLength
, buffCapacity
);
// Constructs a string from invariant-character char data (EInvariant tag).
// length is replaced by uprv_strlen(src) under a condition on a line not
// visible here (presumably length < 0). Once cloneArrayIfNeeded() has
// provided room for length units, the chars are converted into the string
// storage with u_charsToUChars.
281 UnicodeString::UnicodeString(const char *src
, int32_t length
, EInvariant
)
286 // treat as an empty string
289 length
=(int32_t)uprv_strlen(src
);
291 if(cloneArrayIfNeeded(length
, length
, FALSE
)) {
292 u_charsToUChars(src
, getArrayStart(), length
);
300 #if U_CHARSET_IS_UTF8
// Only compiled when U_CHARSET_IS_UTF8 is set: constructs a string from
// char data treated as UTF-8. Starts as a short string (kShortString); a
// null codepageData leaves it that way, otherwise setToUTF8() fills it.
302 UnicodeString::UnicodeString(const char *codepageData
)
304 fFlags(kShortString
) {
305 if(codepageData
!= 0) {
306 setToUTF8(codepageData
);
// Only compiled when U_CHARSET_IS_UTF8 is set: constructs a string from
// dataLength bytes of char data treated as UTF-8.
// Nothing is converted for a null pointer, dataLength == 0, or
// dataLength < -1. dataLength == -1 means NUL-terminated: the length is
// taken from uprv_strlen before calling setToUTF8().
310 UnicodeString::UnicodeString(const char *codepageData
, int32_t dataLength
)
312 fFlags(kShortString
) {
313 // if there's nothing to convert, do nothing
314 if(codepageData
== 0 || dataLength
== 0 || dataLength
< -1) {
317 if(dataLength
== -1) {
318 dataLength
= (int32_t)uprv_strlen(codepageData
);
320 setToUTF8(StringPiece(codepageData
, dataLength
));
323 // else see unistr_cnv.cpp
// Copy constructor taking the source string that.
// NOTE(review): initializers and body are not visible in this excerpt.
326 UnicodeString::UnicodeString(const UnicodeString
& that
)
// Constructs a string from the part of that beginning at srcStart, by
// delegating to setTo(that, srcStart). The srcStart parameter declaration is
// on a line not visible here.
334 UnicodeString::UnicodeString(const UnicodeString
& that
,
340 setTo(that
, srcStart
);
// Constructs a string from the range of that described by srcStart and
// srcLength, by delegating to setTo(that, srcStart, srcLength). The
// declarations of both range parameters are on lines not visible here.
343 UnicodeString::UnicodeString(const UnicodeString
& that
,
350 setTo(that
, srcStart
, srcLength
);
353 // Replaceable base class clone() default implementation, does not clone
// Default clone() for the Replaceable base class; per the comment above it
// does not actually clone. The body is not visible in this excerpt.
355 Replaceable::clone() const {
359 // UnicodeString overrides clone() with a real implementation
// Returns a new heap-allocated UnicodeString copy-constructed from *this.
// The result comes from operator new, so the caller owns it.
361 UnicodeString::clone() const {
362 return new UnicodeString(*this);
365 //========================================
367 //========================================
// Sets up storage for at least capacity code units.
//  - capacity <= US_STACKBUF_SIZE: use the internal stack buffer
//    (fFlags = kShortString), no heap allocation.
//  - otherwise: heap-allocate a block of int32_t words holding a leading
//    reference counter followed by the UChars. fArray points past the
//    counter, fCapacity is derived from the rounded-up block size (so it may
//    exceed the request), and fFlags = kLongString.
// The final assignments reset fArray and fCapacity to 0.
// NOTE(review): that reset is presumably the malloc-failure path; the
// branching lines are not visible here.
370 UnicodeString::allocate(int32_t capacity
) {
371 if(capacity
<= US_STACKBUF_SIZE
) {
372 fFlags
= kShortString
;
374 // count bytes for the refCounter and the string capacity, and
375 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
376 // to be safely aligned for the refCount
377 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
378 int32_t words
= (int32_t)(((sizeof(int32_t) + (capacity
+ 1) * U_SIZEOF_UCHAR
+ 15) & ~15) >> 2);
379 int32_t *array
= (int32_t*) uprv_malloc( sizeof(int32_t) * words
);
381 // set initial refCount and point behind the refCount
384 // have fArray point to the first UChar
385 fUnion
.fFields
.fArray
= (UChar
*)array
;
386 fUnion
.fFields
.fCapacity
= (int32_t)((words
- 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR
));
387 fFlags
= kLongString
;
390 fUnion
.fFields
.fArray
= 0;
391 fUnion
.fFields
.fCapacity
= 0;
399 //========================================
401 //========================================
// Destructor.
// NOTE(review): the body is not visible in this excerpt - presumably it
// releases the buffer via releaseArray(); confirm.
402 UnicodeString::~UnicodeString()
407 //========================================
409 //========================================
// Factory: creates a default-constructed string and fills it from the UTF-8
// bytes in utf8 via setToUTF8(). The return statement is on a line not
// visible here.
411 UnicodeString
UnicodeString::fromUTF8(const StringPiece
&utf8
) {
412 UnicodeString result
;
413 result
.setToUTF8(utf8
);
// Factory: converts length UTF-32 code points at utf32 into a UnicodeString.
// The initial capacity guess is US_STACKBUF_SIZE for short input, otherwise
// length + length/16 + 4. The conversion writes into result.getBuffer() using
// u_strFromUTF32WithSub with U+FFFD as the substitution character.
// On U_BUFFER_OVERFLOW_ERROR the capacity is set to length16 + 1
// (presumably followed by a retry on lines not visible here); any other
// failure is handled in the final else-if branch (body not visible).
417 UnicodeString
UnicodeString::fromUTF32(const UChar32
*utf32
, int32_t length
) {
418 UnicodeString result
;
420 // Most UTF-32 strings will be BMP-only and result in a same-length
421 // UTF-16 string. We overestimate the capacity just slightly,
422 // just in case there are a few supplementary characters.
423 if(length
<= US_STACKBUF_SIZE
) {
424 capacity
= US_STACKBUF_SIZE
;
426 capacity
= length
+ (length
>> 4) + 4;
429 UChar
*utf16
= result
.getBuffer(capacity
);
431 UErrorCode errorCode
= U_ZERO_ERROR
;
432 u_strFromUTF32WithSub(utf16
, result
.getCapacity(), &length16
,
434 0xfffd, // Substitution character.
435 NULL
, // Don't care about number of substitutions.
437 result
.releaseBuffer(length16
);
438 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
439 capacity
= length16
+ 1; // +1 for the terminating NUL.
441 } else if(U_FAILURE(errorCode
)) {
449 //========================================
451 //========================================
// Assignment operator: delegates to copyFrom(src) without passing the
// fastCopy argument.
454 UnicodeString::operator=(const UnicodeString
&src
) {
455 return copyFrom(src
);
// Assignment variant that delegates to copyFrom(src, TRUE), i.e. with the
// fastCopy option enabled.
459 UnicodeString::fastCopyFrom(const UnicodeString
&src
) {
460 return copyFrom(src
, TRUE
);
// Core of assignment: makes this string equal to src.
// Per the comments and the assignments visible below, the strategy depends on
// how src stores its data:
//  - stack buffer: copy srcLength units into our own stack buffer;
//  - ref-counted heap buffer: share it (addRef on src, copy fArray/fCapacity);
//  - read-only alias: keep aliasing the same memory (see the comment about
//    fastCopy below);
//  - writable alias: allocate our own buffer and copy the contents.
// If src is bogus, or allocation fails, this string ends with fArray and
// fCapacity reset to 0.
// The dispatch lines (switch/case) are not visible in this excerpt.
// NOTE(review): the tests this == 0 and &src == 0 cannot be true in
// well-formed C++ (neither this nor the address of a reference may be null),
// so optimizing compilers are free to drop them - consider removing.
464 UnicodeString::copyFrom(const UnicodeString
&src
, UBool fastCopy
) {
465 // if assigning to ourselves, do nothing
466 if(this == 0 || this == &src
) {
470 // is the right side bogus?
471 if(&src
== 0 || src
.isBogus()) {
476 // delete the current contents
480 // empty string - use the stack buffer
485 // we always copy the length
486 int32_t srcLength
= src
.length();
487 setLength(srcLength
);
489 // fLength>0 and not an "open" src.getBuffer(minCapacity)
492 // short string using the stack buffer, do the same
493 fFlags
= kShortString
;
494 uprv_memcpy(fUnion
.fStackBuffer
, src
.fUnion
.fStackBuffer
, srcLength
* U_SIZEOF_UCHAR
);
497 // src uses a refCounted string buffer, use that buffer with refCount
498 // src is const, use a cast - we don't really change it
499 ((UnicodeString
&)src
).addRef();
500 // copy all fields, share the reference-counted buffer
501 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
502 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
507 // src is a readonly alias, do the same
508 // -> maintain the readonly alias as such
509 fUnion
.fFields
.fArray
= src
.fUnion
.fFields
.fArray
;
510 fUnion
.fFields
.fCapacity
= src
.fUnion
.fFields
.fCapacity
;
514 // else if(!fastCopy) fall through to case kWritableAlias
515 // -> allocate a new buffer and copy the contents
517 // src is a writable alias; we make a copy of that instead
518 if(allocate(srcLength
)) {
519 uprv_memcpy(getArrayStart(), src
.getArrayStart(), srcLength
* U_SIZEOF_UCHAR
);
522 // if there is not enough memory, then fall through to setting to bogus
524 // if src is bogus, set ourselves to bogus
525 // do not call setToBogus() here because fArray and fFlags are not consistent here
527 fUnion
.fFields
.fArray
= 0;
528 fUnion
.fFields
.fCapacity
= 0;
536 //========================================
537 // Miscellaneous operations
538 //========================================
// Returns a copy of this string with backslash escapes resolved.
// The text is scanned for 0x5C (backslash). Literal runs [prev, i) are
// appended unchanged, and each escape is decoded with unescapeAt(), which
// advances i past the sequence. On an invalid escape the result is cleared
// with remove() and the scan stops, so the caller receives an empty string.
// The handling of prev and of the decoded c is on lines not visible here.
540 UnicodeString
UnicodeString::unescape() const {
541 UnicodeString
result(length(), (UChar32
)0, (int32_t)0); // construct with capacity
542 const UChar
*array
= getBuffer();
543 int32_t len
= length();
545 for (int32_t i
=0;;) {
547 result
.append(array
, prev
, len
- prev
);
550 if (array
[i
++] == 0x5C /*'\\'*/) {
551 result
.append(array
, prev
, (i
- 1) - prev
);
552 UChar32 c
= unescapeAt(i
); // advances i
554 result
.remove(); // return empty string
555 break; // invalid escape sequence
// Decodes one escape sequence at offset by calling u_unescapeAt(), with
// UnicodeString_charAt as the character-access callback and this string as
// its context. offset is passed by address so u_unescapeAt can update it.
564 UChar32
UnicodeString::unescapeAt(int32_t &offset
) const {
565 return u_unescapeAt(UnicodeString_charAt
, &offset
, length(), (void*)this);
568 //========================================
569 // Read-only implementation
570 //========================================
// Equality helper: compares the first len code units of this string and text
// bytewise with uprv_memcmp and returns whether they are identical.
// Preconditions are stated in the comment below; they are not re-checked here.
572 UnicodeString::doEquals(const UnicodeString
&text
, int32_t len
) const {
573 // Requires: this & text not bogus and have same lengths.
574 // Byte-wise comparison works for equality regardless of endianness.
575 return uprv_memcmp(getArrayStart(), text
.getArrayStart(), len
* U_SIZEOF_UCHAR
) == 0;
// Compares the range of this string described by start and length with
// srcChars in code unit order.
// A NULL srcChars is treated as an empty string: the result is 0 when the
// pinned length is 0 and 1 otherwise. Otherwise the common prefix
// (minLength units) is compared, bytewise with uprv_memcmp on big-endian
// platforms and unit by unit on little-endian ones. A nonzero difference is
// folded to -1 or +1 via (result >> 15 | 1).
// NOTE(review): srcChars is advanced by srcStart and then, for a negative
// srcLength, u_strlen is called on srcChars + srcStart again. Unless a line
// not visible here compensates, srcStart is applied twice for nonzero
// srcStart - confirm and fix if so.
579 UnicodeString::doCompare( int32_t start
,
581 const UChar
*srcChars
,
583 int32_t srcLength
) const
585 // compare illegal string values
590 // pin indices to legal values
591 pinIndices(start
, length
);
593 if(srcChars
== NULL
) {
594 // treat const UChar *srcChars==NULL as an empty string
595 return length
== 0 ? 0 : 1;
598 // get the correct pointer
599 const UChar
*chars
= getArrayStart();
602 srcChars
+= srcStart
;
607 // get the srcLength if necessary
609 srcLength
= u_strlen(srcChars
+ srcStart
);
612 // are we comparing different lengths?
613 if(length
!= srcLength
) {
614 if(length
< srcLength
) {
618 minLength
= srcLength
;
627 * note that uprv_memcmp() returns an int but we return an int8_t;
628 * we need to take care not to truncate the result -
629 * one way to do this is to right-shift the value to
630 * move the sign bit into the lower 8 bits and making sure that this
631 * does not become 0 itself
634 if(minLength
> 0 && chars
!= srcChars
) {
638 // big-endian: byte comparison works
639 result
= uprv_memcmp(chars
, srcChars
, minLength
* sizeof(UChar
));
641 return (int8_t)(result
>> 15 | 1);
644 // little-endian: compare UChar units
646 result
= ((int32_t)*(chars
++) - (int32_t)*(srcChars
++));
648 return (int8_t)(result
>> 15 | 1);
650 } while(--minLength
> 0);
656 /* String compare in code point order - doCompare() compares in code unit order. */
// Compares the range of this string described by start and length with
// srcChars in code point order (doCompare() uses code unit order).
// A NULL srcChars is treated as empty by zeroing srcStart and srcLength.
// The comparison itself is delegated to uprv_strCompare. A nonzero diff is
// folded to -1 or +1 via (diff >> 15 | 1); the zero case is on a line not
// visible here.
658 UnicodeString::doCompareCodePointOrder(int32_t start
,
660 const UChar
*srcChars
,
662 int32_t srcLength
) const
664 // compare illegal string values
665 // treat const UChar *srcChars==NULL as an empty string
670 // pin indices to legal values
671 pinIndices(start
, length
);
673 if(srcChars
== NULL
) {
674 srcStart
= srcLength
= 0;
677 int32_t diff
= uprv_strCompare(getArrayStart() + start
, length
, (srcChars
!=NULL
)?(srcChars
+ srcStart
):NULL
, srcLength
, FALSE
, TRUE
);
678 /* translate the 32-bit result into an 8-bit one */
680 return (int8_t)(diff
>> 15 | 1);
// Length accessor.
// NOTE(review): the body is not visible in this excerpt - presumably it
// forwards to length(), like the neighbouring get*() wrappers; confirm.
687 UnicodeString::getLength() const {
// Thin wrapper: returns charAt(offset).
692 UnicodeString::getCharAt(int32_t offset
) const {
693 return charAt(offset
);
// Thin wrapper: returns char32At(offset).
697 UnicodeString::getChar32At(int32_t offset
) const {
698 return char32At(offset
);
// Returns the code point that contains the code unit at offset.
// The unsigned comparison rejects both negative offsets and offsets >= len
// in a single test. In range, U16_GET reads the code point, looking at
// neighbouring units within [0, len). Out of range, kInvalidUChar is
// returned.
702 UnicodeString::char32At(int32_t offset
) const
704 int32_t len
= length();
705 if((uint32_t)offset
< (uint32_t)len
) {
706 const UChar
*array
= getArrayStart();
708 U16_GET(array
, 0, offset
, len
, c
);
711 return kInvalidUChar
;
// For an in-range offset (unsigned test against length()), applies
// U16_SET_CP_START, which moves offset back to the first code unit of the
// code point it points into. The return statements are on lines not visible
// here.
716 UnicodeString::getChar32Start(int32_t offset
) const {
717 if((uint32_t)offset
< (uint32_t)length()) {
718 const UChar
*array
= getArrayStart();
719 U16_SET_CP_START(array
, 0, offset
);
// For an in-range offset (unsigned test against len), applies
// U16_SET_CP_LIMIT, which moves offset forward to just behind the code point
// it points into, bounded by len. The return statements are on lines not
// visible here.
727 UnicodeString::getChar32Limit(int32_t offset
) const {
728 int32_t len
= length();
729 if((uint32_t)offset
< (uint32_t)len
) {
730 const UChar
*array
= getArrayStart();
731 U16_SET_CP_LIMIT(array
, 0, offset
, len
);
// Counts the code points in the range described by start and length.
// The range is first pinned to legal values, then u_countChar32 does the
// counting.
739 UnicodeString::countChar32(int32_t start
, int32_t length
) const {
740 pinIndices(start
, length
);
741 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
742 return u_countChar32(getArrayStart()+start
, length
);
// Reports whether the range described by start and length contains more than
// number code points. The range is first pinned to legal values, then the
// result of u_strHasMoreChar32Than is returned.
746 UnicodeString::hasMoreChar32Than(int32_t start
, int32_t length
, int32_t number
) const {
747 pinIndices(start
, length
);
748 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
749 return u_strHasMoreChar32Than(getArrayStart()+start
, length
, number
);
// Moves index by delta code points. U16_FWD_N moves forward, bounded by len;
// U16_BACK_N moves backward by -delta, bounded by 0.
// NOTE(review): the clamping of index and the sign test on delta are on
// lines not visible here; presumably index is pinned to [0, len] and the
// forward branch is taken for delta > 0.
753 UnicodeString::moveIndex32(int32_t index
, int32_t delta
) const {
755 int32_t len
= length();
758 } else if(index
>len
) {
762 const UChar
*array
= getArrayStart();
764 U16_FWD_N(array
, index
, len
, delta
);
766 U16_BACK_N(array
, 0, index
, -delta
);
// Copies the range described by start and length into dst at dstStart.
// The range is pinned to legal values first. Nothing is copied when the
// source and destination addresses coincide (dst aliases our own storage at
// the same position).
773 UnicodeString::doExtract(int32_t start
,
776 int32_t dstStart
) const
778 // pin indices to legal values
779 pinIndices(start
, length
);
781 // do not copy anything if we alias dst itself
782 const UChar
*array
= getArrayStart();
783 if(array
+ start
!= dst
+ dstStart
) {
784 us_arrayCopy(array
, start
, dst
, dstStart
, length
);
// Copies the whole string into dest (capacity destCapacity).
// Does nothing if errorCode already indicates failure on entry.
// Sets U_ILLEGAL_ARGUMENT_ERROR when this string is bogus, destCapacity is
// negative, or dest is null while destCapacity > 0.
// The contents are copied only when they fit and dest is not our own array.
// u_terminateUChars then finishes termination and status reporting, and its
// result is returned.
789 UnicodeString::extract(UChar
*dest
, int32_t destCapacity
,
790 UErrorCode
&errorCode
) const {
791 int32_t len
= length();
792 if(U_SUCCESS(errorCode
)) {
793 if(isBogus() || destCapacity
<0 || (destCapacity
>0 && dest
==0)) {
794 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
796 const UChar
*array
= getArrayStart();
797 if(len
>0 && len
<=destCapacity
&& array
!=dest
) {
798 uprv_memcpy(dest
, array
, len
*U_SIZEOF_UCHAR
);
800 return u_terminateUChars(dest
, destCapacity
, len
, &errorCode
);
// Extracts the range described by start and length as invariant chars into
// target (EInvariant overload).
// Illegal arguments (negative targetCapacity, or a null target with a
// positive capacity) are rejected up front. The range is pinned to legal
// values and converted with u_UCharsToChars only when it fits.
// u_terminateChars handles termination, and its result is returned; the
// local status is not reported to the caller.
808 UnicodeString::extract(int32_t start
,
811 int32_t targetCapacity
,
812 enum EInvariant
) const
814 // if the arguments are illegal, then do nothing
815 if(targetCapacity
< 0 || (targetCapacity
> 0 && target
== NULL
)) {
819 // pin the indices to legal values
820 pinIndices(start
, length
);
822 if(length
<= targetCapacity
) {
823 u_UCharsToChars(getArrayStart() + start
, target
, length
);
825 UErrorCode status
= U_ZERO_ERROR
;
826 return u_terminateChars(target
, targetCapacity
, length
, &status
);
// Returns a read-only alias onto the range described by start and len of
// this string (no copy), built with the (FALSE, text, length) aliasing
// constructor.
// If getBuffer() cannot supply an array, a non-NULL dummy pointer and
// len = -2 are passed instead, which per the comments below yields a bogus
// result string.
// NOTE(review): the null test on array is on a line not visible here.
830 UnicodeString::tempSubString(int32_t start
, int32_t len
) const {
831 pinIndices(start
, len
);
832 const UChar
*array
= getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
834 array
=fUnion
.fStackBuffer
; // anything not NULL because that would make an empty string
835 len
=-2; // bogus result string
837 return UnicodeString(FALSE
, array
+ start
, len
);
// Converts the range described by start and len to UTF-8 into target
// (capacity bytes). The range is pinned to legal values and converted with
// u_strToUTF8WithSub, using U+FFFD as the substitution character. The
// resulting byte count is received in length8; the return statement is on a
// line not visible here.
841 UnicodeString::toUTF8(int32_t start
, int32_t len
,
842 char *target
, int32_t capacity
) const {
843 pinIndices(start
, len
);
845 UErrorCode errorCode
= U_ZERO_ERROR
;
846 u_strToUTF8WithSub(target
, capacity
, &length8
,
847 getBuffer() + start
, len
,
848 0xFFFD, // Standard substitution character.
849 NULL
, // Don't care about number of substitutions.
854 #if U_CHARSET_IS_UTF8
// Only compiled when U_CHARSET_IS_UTF8 is set: extracts the range described
// by start and len into target as UTF-8.
// A null target with a nonzero dstSize is rejected. dstSize is unsigned, so
// it is clamped to 0x7fffffff before being passed to toUTF8() as an int32_t
// capacity.
857 UnicodeString::extract(int32_t start
, int32_t len
,
858 char *target
, uint32_t dstSize
) const {
859 // if the arguments are illegal, then do nothing
860 if(/*dstSize < 0 || */(dstSize
> 0 && target
== 0)) {
863 return toUTF8(start
, len
, target
, dstSize
<= 0x7fffffff ? (int32_t)dstSize
: 0x7fffffff);
866 // else see unistr_cnv.cpp
// Extracts the code units in [start, limit) into target by calling
// doExtract with the length limit - start. Any pinning of start and limit
// happens on lines not visible here.
870 UnicodeString::extractBetween(int32_t start
,
872 UnicodeString
& target
) const {
875 doExtract(start
, limit
- start
, target
);
878 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
879 // as many bytes as the source has UChars.
880 // The "worst cases" are writing systems like Indic, Thai and CJK with
// Converts the whole string to UTF-8 and appends the bytes to sink.
// First attempt: convert into a buffer obtained from sink.GetAppendBuffer(),
// with a 1024-byte stack buffer as scratch space.
// On U_BUFFER_OVERFLOW_ERROR: length8 then holds the required size, so a
// heap buffer of exactly that size is allocated and the conversion repeated.
// If that allocation fails, errorCode becomes U_MEMORY_ALLOCATION_ERROR.
// The bytes are passed to sink.Append() only on success.
// utf8IsOwned presumably tracks whether utf8 must be freed afterwards; the
// lines that set and use it are not visible here.
883 UnicodeString::toUTF8(ByteSink
&sink
) const {
884 int32_t length16
= length();
886 char stackBuffer
[1024];
887 int32_t capacity
= (int32_t)sizeof(stackBuffer
);
888 UBool utf8IsOwned
= FALSE
;
889 char *utf8
= sink
.GetAppendBuffer(length16
< capacity
? length16
: capacity
,
891 stackBuffer
, capacity
,
894 UErrorCode errorCode
= U_ZERO_ERROR
;
895 u_strToUTF8WithSub(utf8
, capacity
, &length8
,
896 getBuffer(), length16
,
897 0xFFFD, // Standard substitution character.
898 NULL
, // Don't care about number of substitutions.
900 if(errorCode
== U_BUFFER_OVERFLOW_ERROR
) {
901 utf8
= (char *)uprv_malloc(length8
);
904 errorCode
= U_ZERO_ERROR
;
905 u_strToUTF8WithSub(utf8
, length8
, &length8
,
906 getBuffer(), length16
,
907 0xFFFD, // Standard substitution character.
908 NULL
, // Don't care about number of substitutions.
911 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
914 if(U_SUCCESS(errorCode
)) {
915 sink
.Append(utf8
, length8
);
// Converts the whole string to UTF-32 into utf32 (capacity code points).
// Nothing is converted if errorCode already indicates failure. Otherwise
// u_strToUTF32WithSub performs the conversion with U+FFFD as the
// substitution character and stores the output length in length32. The
// return statement is on a line not visible here.
925 UnicodeString::toUTF32(UChar32
*utf32
, int32_t capacity
, UErrorCode
&errorCode
) const {
927 if(U_SUCCESS(errorCode
)) {
928 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
929 u_strToUTF32WithSub(utf32
, capacity
, &length32
,
930 getBuffer(), length(),
931 0xfffd, // Substitution character.
932 NULL
, // Don't care about number of substitutions.
// Finds the first occurrence of the substring given by srcChars, srcStart
// and srcLength within the range described by start and length.
// Early exits (action not visible here, presumably return -1) cover a bogus
// string, a null srcChars, a negative srcStart, and an empty search string:
// either srcLength == 0, or a NUL-terminated one that starts with NUL.
// The search is done by u_strFindFirst on the pinned range. A hit is
// returned as an index relative to the start of the string (match - array).
939 UnicodeString::indexOf(const UChar
*srcChars
,
943 int32_t length
) const
945 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
949 // UnicodeString does not find empty substrings
950 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
954 // get the indices within bounds
955 pinIndices(start
, length
);
957 // find the first occurrence of the substring
958 const UChar
*array
= getArrayStart();
959 const UChar
*match
= u_strFindFirst(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
963 return (int32_t)(match
- array
);
// Finds the first occurrence of the code unit c in the range described by
// start and length. The range is pinned, u_memchr does the search, and a hit
// is returned as an index relative to the start of the string. The
// not-found path is on a line not visible here.
968 UnicodeString::doIndexOf(UChar c
,
970 int32_t length
) const
973 pinIndices(start
, length
);
975 // find the first occurrence of c
976 const UChar
*array
= getArrayStart();
977 const UChar
*match
= u_memchr(array
+ start
, c
, length
);
981 return (int32_t)(match
- array
);
// Finds the first occurrence of the code point c in the range described by
// start and length. The range is pinned, u_memchr32 does the search, and a
// hit is returned as an index relative to the start of the string. The
// not-found path is on a line not visible here.
986 UnicodeString::doIndexOf(UChar32 c
,
988 int32_t length
) const {
990 pinIndices(start
, length
);
992 // find the first occurrence of c
993 const UChar
*array
= getArrayStart();
994 const UChar
*match
= u_memchr32(array
+ start
, c
, length
);
998 return (int32_t)(match
- array
);
// Finds the last occurrence of the substring given by srcChars, srcStart and
// srcLength within the range described by start and length.
// It has the same early-exit conditions as indexOf(): bogus string, null
// srcChars, negative srcStart, or an empty search string.
// The search is done by u_strFindLast on the pinned range. A hit is returned
// as an index relative to the start of the string (match - array).
1003 UnicodeString::lastIndexOf(const UChar
*srcChars
,
1007 int32_t length
) const
1009 if(isBogus() || srcChars
== 0 || srcStart
< 0 || srcLength
== 0) {
1013 // UnicodeString does not find empty substrings
1014 if(srcLength
< 0 && srcChars
[srcStart
] == 0) {
1018 // get the indices within bounds
1019 pinIndices(start
, length
);
1021 // find the last occurrence of the substring
1022 const UChar
*array
= getArrayStart();
1023 const UChar
*match
= u_strFindLast(array
+ start
, length
, srcChars
+ srcStart
, srcLength
);
1027 return (int32_t)(match
- array
);
// Finds the last occurrence of the code unit c in the range described by
// start and length. The range is pinned, u_memrchr does the search, and a
// hit is returned as an index relative to the start of the string. Any
// early exit and the not-found path are on lines not visible here.
1032 UnicodeString::doLastIndexOf(UChar c
,
1034 int32_t length
) const
1041 pinIndices(start
, length
);
1043 // find the last occurrence of c
1044 const UChar
*array
= getArrayStart();
1045 const UChar
*match
= u_memrchr(array
+ start
, c
, length
);
1049 return (int32_t)(match
- array
);
// Finds the last occurrence of the code point c in the range described by
// start and length. The range is pinned, u_memrchr32 does the search, and a
// hit is returned as an index relative to the start of the string. The
// not-found path is on a line not visible here.
1054 UnicodeString::doLastIndexOf(UChar32 c
,
1056 int32_t length
) const {
1058 pinIndices(start
, length
);
1060 // find the last occurrence of c
1061 const UChar
*array
= getArrayStart();
1062 const UChar
*match
= u_memrchr32(array
+ start
, c
, length
);
1066 return (int32_t)(match
- array
);
1070 //========================================
1071 // Write implementation
1072 //========================================
// Replaces every occurrence of a range of oldText with a range of newText,
// within the range of this string described by start and length.
// Bails out early if any of the three strings is bogus, or if the pinned
// oldLength is 0 (the exit actions are on lines not visible here).
// The loop searches with indexOf, replaces the hit, and then
//  - shrinks the remaining search length by the units consumed up to and
//    including the match, and
//  - restarts the search directly behind the inserted newText, so replaced
//    text is never rescanned.
1075 UnicodeString::findAndReplace(int32_t start
,
1077 const UnicodeString
& oldText
,
1080 const UnicodeString
& newText
,
1084 if(isBogus() || oldText
.isBogus() || newText
.isBogus()) {
1088 pinIndices(start
, length
);
1089 oldText
.pinIndices(oldStart
, oldLength
);
1090 newText
.pinIndices(newStart
, newLength
);
1092 if(oldLength
== 0) {
1096 while(length
> 0 && length
>= oldLength
) {
1097 int32_t pos
= indexOf(oldText
, oldStart
, oldLength
, start
, length
);
1099 // no more oldText's here: done
1102 // we found oldText, replace it by newText and go beyond it
1103 replace(pos
, oldLength
, newText
, newStart
, newLength
);
1104 length
-= pos
+ oldLength
- start
;
1105 start
= pos
+ newLength
;
// Puts the string into the bogus state. The visible part resets fArray and
// fCapacity to 0; releasing the old buffer and setting the flags happen on
// lines not visible here.
1114 UnicodeString::setToBogus()
1119 fUnion
.fFields
.fArray
= 0;
1120 fUnion
.fFields
.fCapacity
= 0;
1124 // turn a bogus string into an empty one
// Acts only when the kIsBogus flag is set; per the comment above, the string
// is then turned into an empty one (the conversion itself is on a line not
// visible here).
1126 UnicodeString::unBogus() {
1127 if(fFlags
& kIsBogus
) {
1132 // setTo() analogous to the readonly-aliasing constructor with the same signature
// Makes this string a read-only alias of the caller's text buffer.
// The string is left untouched while a getBuffer(minCapacity) is still open
// (kOpenGetBuffer set). The argument checks mirror the aliasing constructor:
// textLength < -1, an unterminated text with textLength == -1, or a
// supposedly terminated text whose unit at textLength is not NUL are
// rejected (rejection action not visible here).
// For textLength == -1 the length is computed with u_strlen. The array is
// then installed with setArray and fFlags is set to kReadonlyAlias.
1134 UnicodeString::setTo(UBool isTerminated
,
1138 if(fFlags
& kOpenGetBuffer
) {
1139 // do not modify a string that has an "open" getBuffer(minCapacity)
1144 // treat as an empty string, do not alias
1150 if( textLength
< -1 ||
1151 (textLength
== -1 && !isTerminated
) ||
1152 (textLength
>= 0 && isTerminated
&& text
[textLength
] != 0)
1160 if(textLength
== -1) {
1161 // text is terminated, or else it would have failed the above test
1162 textLength
= u_strlen(text
);
1164 setArray((UChar
*)text
, textLength
, isTerminated
? textLength
+ 1 : textLength
);
1166 fFlags
= kReadonlyAlias
;
1170 // setTo() analogous to the writable-aliasing constructor with the same signature
// Makes this string a writable alias that uses the caller's buffer as its
// storage.
// The string is left untouched while a getBuffer(minCapacity) is still open.
// A NULL buffer is treated as an empty string (no alias). buffLength < -1,
// buffCapacity < 0 or buffLength > buffCapacity are rejected (action not
// visible here).
// For buffLength == -1 the length is found by scanning for NUL without
// reading beyond buffer + buffCapacity. The array is then installed with
// setArray and fFlags is set to kWritableAlias.
1172 UnicodeString::setTo(UChar
*buffer
,
1174 int32_t buffCapacity
) {
1175 if(fFlags
& kOpenGetBuffer
) {
1176 // do not modify a string that has an "open" getBuffer(minCapacity)
1180 if(buffer
== NULL
) {
1181 // treat as an empty string, do not alias
1187 if(buffLength
< -1 || buffCapacity
< 0 || buffLength
> buffCapacity
) {
1190 } else if(buffLength
== -1) {
1191 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1192 const UChar
*p
= buffer
, *limit
= buffer
+ buffCapacity
;
1193 while(p
!= limit
&& *p
!= 0) {
1196 buffLength
= (int32_t)(p
- buffer
);
1201 setArray(buffer
, buffLength
, buffCapacity
);
1202 fFlags
= kWritableAlias
;
// Replaces the contents with the UTF-16 conversion of the UTF-8 bytes in
// utf8.
// Capacity: US_STACKBUF_SIZE for short input, otherwise the UTF-8 length + 1.
// This is sufficient because, per the comment below, the UTF-16 form is never
// longer than the UTF-8 form.
// The conversion writes into getBuffer(capacity) via u_strFromUTF8WithSub,
// with U+FFFD as the substitution character, and releaseBuffer(length16)
// commits the new length. The failure branch body is on a line not visible
// here.
1206 UnicodeString
&UnicodeString::setToUTF8(const StringPiece
&utf8
) {
1208 int32_t length
= utf8
.length();
1210 // The UTF-16 string will be at most as long as the UTF-8 string.
1211 if(length
<= US_STACKBUF_SIZE
) {
1212 capacity
= US_STACKBUF_SIZE
;
1214 capacity
= length
+ 1; // +1 for the terminating NUL.
1216 UChar
*utf16
= getBuffer(capacity
);
1218 UErrorCode errorCode
= U_ZERO_ERROR
;
1219 u_strFromUTF8WithSub(utf16
, getCapacity(), &length16
,
1220 utf8
.data(), length
,
1221 0xfffd, // Substitution character.
1222 NULL
, // Don't care about number of substitutions.
1224 releaseBuffer(length16
);
1225 if(U_FAILURE(errorCode
)) {
// Overwrites the code unit at offset with c.
// The write happens only if the buffer could be made writable
// (cloneArrayIfNeeded) and the string is not empty.
// NOTE(review): the offset clamping around the offset >= len test is on
// lines not visible here; presumably offset is pinned into [0, len - 1].
1232 UnicodeString::setCharAt(int32_t offset
,
1235 int32_t len
= length();
1236 if(cloneArrayIfNeeded() && len
> 0) {
1239 } else if(offset
>= len
) {
1243 getArrayStart()[offset
] = c
;
// Replaces the range described by start and _length with the single code
// point srcChar.
// srcChar is encoded into a local buffer with U16_APPEND and count receives
// the number of code units. For an invalid srcChar, isError is set and a
// length of 0 is passed on, so the range is removed rather than replaced.
1249 UnicodeString::replace(int32_t start
,
1252 UChar buffer
[U16_MAX_LENGTH
];
1254 UBool isError
= FALSE
;
1255 U16_APPEND(buffer
, count
, U16_MAX_LENGTH
, srcChar
, isError
);
1256 // We test isError so that the compiler does not complain that we don't.
1257 // If isError (srcChar is not a valid code point) then count==0 which means
1258 // we remove the source segment rather than replacing it with srcChar.
1259 return doReplace(start
, _length
, buffer
, 0, isError
? 0 : count
);
// Appends the code point srcChar.
// srcChar is encoded into a local buffer with U16_APPEND. For an invalid
// srcChar the string is returned unchanged; otherwise the encoded units are
// inserted at the end via doReplace(length(), 0, ...).
1263 UnicodeString::append(UChar32 srcChar
) {
1264 UChar buffer
[U16_MAX_LENGTH
];
1265 int32_t _length
= 0;
1266 UBool isError
= FALSE
;
1267 U16_APPEND(buffer
, _length
, U16_MAX_LENGTH
, srcChar
, isError
);
1268 // We test isError so that the compiler does not complain that we don't.
1269 // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1270 return isError
? *this : doReplace(length(), 0, buffer
, 0, _length
);
// Replaces the range described by start and length with a range of another
// UnicodeString.
// Non-bogus src: its range is pinned and the work is forwarded to the UChar
// pointer overload using src's array.
// Bogus src: forwarded with a null source and zero length, so the range is
// simply removed.
1274 UnicodeString::doReplace( int32_t start
,
1276 const UnicodeString
& src
,
1280 if(!src
.isBogus()) {
1281 // pin the indices to legal values
1282 src
.pinIndices(srcStart
, srcLength
);
1284 // get the characters from src
1285 // and replace the range in ourselves with them
1286 return doReplace(start
, length
, src
.getArrayStart(), srcStart
, srcLength
);
1289 return doReplace(start
, length
, 0, 0, 0);
// Central mutation primitive: replaces the range described by start and
// length with srcLength code units from srcChars + srcStart. The visible code
// has these stages:
//  1. Read-only alias fast paths for pure removals (srcLength == 0): a prefix
//     is dropped by advancing fArray, a suffix by shrinking the capacity and
//     length, both without copying.
//  2. Source normalization: srcStart and srcLength are zeroed in one case
//     (condition not visible, presumably a null srcChars); a negative
//     srcLength is resolved with u_strlen.
//  3. Append fast path: when start >= oldLength and the buffer is writable
//     and large enough, the new units are copied directly behind the old
//     contents. The copy is skipped when the source already sits at that
//     position.
//  4. General path: the old contents are kept reachable (the stack buffer is
//     copied aside if it is about to be overwritten by heap fields), the
//     array is cloned or grown via cloneArrayIfNeeded, the unchanged prefix
//     and suffix are moved into place, and the hole is filled from srcChars.
//  5. Any replaced heap buffer is freed only at the very end, because
//     srcChars may point into it.
1294 UnicodeString::doReplace(int32_t start
,
1296 const UChar
*srcChars
,
1304 int32_t oldLength
= this->length();
1306 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1307 if((fFlags
&kBufferIsReadonly
) && srcLength
== 0) {
1309 // remove prefix by adjusting the array pointer
1311 fUnion
.fFields
.fArray
+= length
;
1312 fUnion
.fFields
.fCapacity
-= length
;
1313 setLength(oldLength
- length
);
1317 if(length
>= (oldLength
- start
)) {
1318 // remove suffix by reducing the length (like truncate())
1320 fUnion
.fFields
.fCapacity
= start
; // not NUL-terminated any more
1327 srcStart
= srcLength
= 0;
1328 } else if(srcLength
< 0) {
1329 // get the srcLength if necessary
1330 srcLength
= u_strlen(srcChars
+ srcStart
);
1333 // calculate the size of the string after the replace
1336 // optimize append() onto a large-enough, owned string
1337 if(start
>= oldLength
) {
1338 if(srcLength
== 0) {
1341 newLength
= oldLength
+ srcLength
;
1342 if(newLength
<= getCapacity() && isBufferWritable()) {
1343 UChar
*oldArray
= getArrayStart();
1344 // Do not copy characters when
1345 // UChar *buffer=str.getAppendBuffer(...);
1347 // str.append(buffer, length);
1349 // str.appendString(buffer, length)
1351 if(srcChars
+ srcStart
!= oldArray
+ start
|| start
> oldLength
) {
1352 us_arrayCopy(srcChars
, srcStart
, oldArray
, oldLength
, srcLength
);
1354 setLength(newLength
);
1357 // pin the indices to legal values
1362 // pin the indices to legal values
1363 pinIndices(start
, length
);
1365 newLength
= oldLength
- length
+ srcLength
;
1368 // the following may change fArray but will not copy the current contents;
1369 // therefore we need to keep the current fArray
1370 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1372 if((fFlags
&kUsingStackBuffer
) && (newLength
> US_STACKBUF_SIZE
)) {
1373 // copy the stack buffer contents because it will be overwritten with
1374 // fUnion.fFields values
1375 u_memcpy(oldStackBuffer
, fUnion
.fStackBuffer
, oldLength
);
1376 oldArray
= oldStackBuffer
;
1378 oldArray
= getArrayStart();
1381 // clone our array and allocate a bigger array if needed
1382 int32_t *bufferToDelete
= 0;
1383 if(!cloneArrayIfNeeded(newLength
, newLength
+ (newLength
>> 2) + kGrowSize
,
1384 FALSE
, &bufferToDelete
)
1389 // now do the replace
1391 UChar
*newArray
= getArrayStart();
1392 if(newArray
!= oldArray
) {
1393 // if fArray changed, then we need to copy everything except what will change
1394 us_arrayCopy(oldArray
, 0, newArray
, 0, start
);
1395 us_arrayCopy(oldArray
, start
+ length
,
1396 newArray
, start
+ srcLength
,
1397 oldLength
- (start
+ length
));
1398 } else if(length
!= srcLength
) {
1399 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1400 us_arrayCopy(oldArray
, start
+ length
,
1401 newArray
, start
+ srcLength
,
1402 oldLength
- (start
+ length
));
1405 // now fill in the hole with the new string
1406 us_arrayCopy(srcChars
, srcStart
, newArray
, start
, srcLength
);
1408 setLength(newLength
);
1410 // delayed delete in case srcChars == fArray when we started, and
1411 // to keep oldArray alive for the above operations
1412 if (bufferToDelete
) {
1413 uprv_free(bufferToDelete
);
// Forwards to replaceBetween(start, limit, text). The limit parameter
// declaration is on a line not visible here.
1423 UnicodeString::handleReplaceBetween(int32_t start
,
1425 const UnicodeString
& text
) {
1426 replaceBetween(start
, limit
, text
);
// Copies the code units in [start, limit) of this string and inserts them at
// position dest.
// An empty or inverted range returns immediately. Otherwise the range is
// first extracted into a temporary heap buffer of limit - start UChars and
// then inserted from that buffer.
// NOTE(review): the null check announced by the comment below and the
// release of text are on lines not visible here - confirm both exist.
1433 UnicodeString::copy(int32_t start
, int32_t limit
, int32_t dest
) {
1434 if (limit
<= start
) {
1435 return; // Nothing to do; avoid bogus malloc call
1437 UChar
* text
= (UChar
*) uprv_malloc( sizeof(UChar
) * (limit
- start
) );
1438 // Check to make sure text is not null.
1440 extractBetween(start
, limit
, text
, 0);
1441 insert(dest
, text
, 0, limit
- start
);
1449 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1450 * so we implement this function here.
// Replaceable::hasMetaData, implemented in this file for the reason given in
// the note above. The body (the returned value) is not visible in this
// excerpt.
1452 UBool
Replaceable::hasMetaData() const {
// UnicodeString::hasMetaData, the UnicodeString counterpart of
// Replaceable::hasMetaData. The body (the returned value) is not visible in
// this excerpt.
1459 UBool
UnicodeString::hasMetaData() const {
// Reverses the code units in the range described by start and length, in
// place, while keeping supplementary code points intact.
// Pass 1 swaps units from both ends towards the middle and records in
// hasSupplementary whether any lead surrogate was seen, including the middle
// unit of an odd-length range.
// Pass 2 runs only if a lead surrogate was seen. It walks the range again
// and looks for a trail surrogate directly followed by a lead surrogate,
// i.e. a pair that pass 1 turned around; per the comment below such pairs
// are swapped back (the swap itself is on lines not visible here).
// Ranges of length <= 1, and a failed cloneArrayIfNeeded(), return early.
1464 UnicodeString::doReverse(int32_t start
, int32_t length
) {
1465 if(length
<= 1 || !cloneArrayIfNeeded()) {
1469 // pin the indices to legal values
1470 pinIndices(start
, length
);
1471 if(length
<= 1) { // pinIndices() might have shrunk the length
1475 UChar
*left
= getArrayStart() + start
;
1476 UChar
*right
= left
+ length
- 1; // -1 for inclusive boundary (length>=2)
1478 UBool hasSupplementary
= FALSE
;
1480 // Before the loop we know left<right because length>=2.
1482 hasSupplementary
|= (UBool
)U16_IS_LEAD(swap
= *left
);
1483 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
++ = *right
);
1485 } while(left
< right
);
1486 // Make sure to test the middle code unit of an odd-length string.
1487 // Redundant if the length is even.
1488 hasSupplementary
|= (UBool
)U16_IS_LEAD(*left
);
1490 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1491 if(hasSupplementary
) {
1494 left
= getArrayStart() + start
;
1495 right
= left
+ length
- 1; // -1 so that we can look at *(left+1) if left<right
1496 while(left
< right
) {
1497 if(U16_IS_TRAIL(swap
= *left
) && U16_IS_LEAD(swap2
= *(left
+ 1))) {
1510 UnicodeString::padLeading(int32_t targetLength
,
1513 int32_t oldLength
= length();
1514 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1517 // move contents up by padding width
1518 UChar
*array
= getArrayStart();
1519 int32_t start
= targetLength
- oldLength
;
1520 us_arrayCopy(array
, 0, array
, start
, oldLength
);
1522 // fill in padding character
1523 while(--start
>= 0) {
1524 array
[start
] = padChar
;
1526 setLength(targetLength
);
1532 UnicodeString::padTrailing(int32_t targetLength
,
1535 int32_t oldLength
= length();
1536 if(oldLength
>= targetLength
|| !cloneArrayIfNeeded(targetLength
)) {
1539 // fill in padding character
1540 UChar
*array
= getArrayStart();
1541 int32_t length
= targetLength
;
1542 while(--length
>= oldLength
) {
1543 array
[length
] = padChar
;
1545 setLength(targetLength
);
1550 //========================================
1552 //========================================
1554 UnicodeString::doHashCode() const
1556 /* Delegate hash computation to uhash. This makes UnicodeString
1557 * hashing consistent with UChar* hashing. */
1558 int32_t hashCode
= ustr_hashUCharsN(getArrayStart(), length());
1559 if (hashCode
== kInvalidHashCode
) {
1560 hashCode
= kEmptyHashCode
;
1565 //========================================
1567 //========================================
1570 UnicodeString::getBuffer(int32_t minCapacity
) {
1571 if(minCapacity
>=-1 && cloneArrayIfNeeded(minCapacity
)) {
1572 fFlags
|=kOpenGetBuffer
;
1574 return getArrayStart();
1581 UnicodeString::releaseBuffer(int32_t newLength
) {
1582 if(fFlags
&kOpenGetBuffer
&& newLength
>=-1) {
1583 // set the new fLength
1584 int32_t capacity
=getCapacity();
1586 // the new length is the string length, capped by fCapacity
1587 const UChar
*array
=getArrayStart(), *p
=array
, *limit
=array
+capacity
;
1588 while(p
<limit
&& *p
!=0) {
1591 newLength
=(int32_t)(p
-array
);
1592 } else if(newLength
>capacity
) {
1595 setLength(newLength
);
1596 fFlags
&=~kOpenGetBuffer
;
1600 //========================================
1602 //========================================
1604 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity
,
1605 int32_t growCapacity
,
1607 int32_t **pBufferToDelete
,
1609 // default parameters need to be static, therefore
1610 // the defaults are -1 to have convenience defaults
1611 if(newCapacity
== -1) {
1612 newCapacity
= getCapacity();
1615 // while a getBuffer(minCapacity) is "open",
1616 // prevent any modifications of the string by returning FALSE here
1617 // if the string is bogus, then only an assignment or similar can revive it
1623 * We need to make a copy of the array if
1624 * the buffer is read-only, or
1625 * the buffer is refCounted (shared), and refCount>1, or
1626 * the buffer is too small.
1627 * Return FALSE if memory could not be allocated.
1630 fFlags
& kBufferIsReadonly
||
1631 (fFlags
& kRefCounted
&& refCount() > 1) ||
1632 newCapacity
> getCapacity()
1634 // check growCapacity for default value and use of the stack buffer
1635 if(growCapacity
< 0) {
1636 growCapacity
= newCapacity
;
1637 } else if(newCapacity
<= US_STACKBUF_SIZE
&& growCapacity
> US_STACKBUF_SIZE
) {
1638 growCapacity
= US_STACKBUF_SIZE
;
1642 UChar oldStackBuffer
[US_STACKBUF_SIZE
];
1644 uint8_t flags
= fFlags
;
1646 if(flags
&kUsingStackBuffer
) {
1647 U_ASSERT(!(flags
&kRefCounted
)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1648 if(doCopyArray
&& growCapacity
> US_STACKBUF_SIZE
) {
1649 // copy the stack buffer contents because it will be overwritten with
1650 // fUnion.fFields values
1651 us_arrayCopy(fUnion
.fStackBuffer
, 0, oldStackBuffer
, 0, fShortLength
);
1652 oldArray
= oldStackBuffer
;
1654 oldArray
= 0; // no need to copy from stack buffer to itself
1657 oldArray
= fUnion
.fFields
.fArray
;
1658 U_ASSERT(oldArray
!=NULL
); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1661 // allocate a new array
1662 if(allocate(growCapacity
) ||
1663 (newCapacity
< growCapacity
&& allocate(newCapacity
))
1665 if(doCopyArray
&& oldArray
!= 0) {
1666 // copy the contents
1667 // do not copy more than what fits - it may be smaller than before
1668 int32_t minLength
= length();
1669 newCapacity
= getCapacity();
1670 if(newCapacity
< minLength
) {
1671 minLength
= newCapacity
;
1672 setLength(minLength
);
1674 us_arrayCopy(oldArray
, 0, getArrayStart(), 0, minLength
);
1679 // release the old array
1680 if(flags
& kRefCounted
) {
1681 // the array is refCounted; decrement and release if 0
1682 int32_t *pRefCount
= ((int32_t *)oldArray
- 1);
1683 if(umtx_atomic_dec(pRefCount
) == 0) {
1684 if(pBufferToDelete
== 0) {
1685 uprv_free(pRefCount
);
1687 // the caller requested to delete it himself
1688 *pBufferToDelete
= pRefCount
;
1693 // not enough memory for growCapacity and not even for the smaller newCapacity
1694 // reset the old values for setToBogus() to release the array
1695 if(!(flags
&kUsingStackBuffer
)) {
1696 fUnion
.fFields
.fArray
= oldArray
;
1706 // UnicodeStringAppendable ------------------------------------------------- ***
1708 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1711 UnicodeStringAppendable::appendCodeUnit(UChar c
) {
1712 return str
.doReplace(str
.length(), 0, &c
, 0, 1).isWritable();
1716 UnicodeStringAppendable::appendCodePoint(UChar32 c
) {
1717 UChar buffer
[U16_MAX_LENGTH
];
1718 int32_t cLength
= 0;
1719 UBool isError
= FALSE
;
1720 U16_APPEND(buffer
, cLength
, U16_MAX_LENGTH
, c
, isError
);
1721 return !isError
&& str
.doReplace(str
.length(), 0, buffer
, 0, cLength
).isWritable();
1725 UnicodeStringAppendable::appendString(const UChar
*s
, int32_t length
) {
1726 return str
.doReplace(str
.length(), 0, s
, 0, length
).isWritable();
1730 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity
) {
1731 return str
.cloneArrayIfNeeded(str
.length() + appendCapacity
);
1735 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity
,
1736 int32_t desiredCapacityHint
,
1737 UChar
*scratch
, int32_t scratchCapacity
,
1738 int32_t *resultCapacity
) {
1739 if(minCapacity
< 1 || scratchCapacity
< minCapacity
) {
1740 *resultCapacity
= 0;
1743 int32_t oldLength
= str
.length();
1744 if(str
.cloneArrayIfNeeded(oldLength
+ minCapacity
, oldLength
+ desiredCapacityHint
)) {
1745 *resultCapacity
= str
.getCapacity() - oldLength
;
1746 return str
.getArrayStart() + oldLength
;
1748 *resultCapacity
= scratchCapacity
;
1756 U_CAPI
int32_t U_EXPORT2
1757 uhash_hashUnicodeString(const UElement key
) {
1758 const UnicodeString
*str
= (const UnicodeString
*) key
.pointer
;
1759 return (str
== NULL
) ? 0 : str
->hashCode();
1762 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1763 // does not depend on hashtable code.
1764 U_CAPI UBool U_EXPORT2
1765 uhash_compareUnicodeString(const UElement key1
, const UElement key2
) {
1766 const UnicodeString
*str1
= (const UnicodeString
*) key1
.pointer
;
1767 const UnicodeString
*str2
= (const UnicodeString
*) key2
.pointer
;
1771 if (str1
== NULL
|| str2
== NULL
) {
1774 return *str1
== *str2
;
#ifdef U_STATIC_IMPLEMENTATION
/*
This should never be called. It is defined here to make sure that the
virtual vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file.
This makes sure that static library dependencies are kept to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    delete [] (new UnicodeString[2]);
}
#endif