2 **********************************************************************
3 * Copyright (C) 1998-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
9 * Modification History:
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 11/11/98 stephen Changed per 11/9 code review.
14 * 04/20/99 stephen Overhauled per 4/16 code review.
15 * 11/18/99 aliu Made to inherit from Replaceable. Added method
16 * handleReplaceBetween(); other methods unchanged.
17 * 06/25/01 grhoten Remove dependency on iostream.
18 ******************************************************************************
26 * \brief C++ API: Unicode String
29 #include "unicode/utypes.h"
30 #include "unicode/rep.h"
31 #include "unicode/std_string.h"
32 #include "unicode/stringpiece.h"
33 #include "unicode/bytestream.h"
34 #include "unicode/ucasemap.h"
// Forward declarations only: both types are named here without being defined.
36 struct UConverter
; // defined in unicode/ucnv.h
37 class StringThreadTest
; // NOTE(review): presumably a test-suite class; its definition is not visible in this span
// Defined only when no earlier header has already defined it (see the
// #ifndef guard); the comment below names ustring.h and unorm.h as related headers.
39 #ifndef U_COMPARE_CODE_POINT_ORDER
40 /* see also ustring.h and unorm.h */
42 * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc:
43 * Compare strings in code point order instead of code unit order.
// Single-bit option value (bit 15).
46 #define U_COMPARE_CODE_POINT_ORDER 0x8000
51 * \ingroup ustring_ustrlen
// Prototype of the C function u_strlen: takes a pointer to const UChar and
// returns an int32_t.
// NOTE(review): presumably redeclared here so that this header does not have to
// include ustring.h - confirm.
53 U_STABLE
int32_t U_EXPORT2
54 u_strlen(const UChar
*s
);
57 #ifndef U_HIDE_INTERNAL_API
59 * \def U_STRING_CASE_MAPPER_DEFINED
63 #ifndef U_STRING_CASE_MAPPER_DEFINED
64 #define U_STRING_CASE_MAPPER_DEFINED
67 * Internal string case mapping function type.
// UStringCaseMapper is a function type (not a pointer-to-function type) with
// the U_CALLCONV calling convention, returning int32_t.
// Parameters, in order:
//   csm          - const UCaseMap pointer
//   dest         - writable UChar buffer
//   destCapacity - int32_t paired with dest
//   src          - read-only UChar buffer
//   srcLength    - int32_t paired with src
//   pErrorCode   - UErrorCode pointer
// NOTE(review): the meaning of the return value (presumably the output length)
// is not stated in this span - confirm against the implementations.
70 typedef int32_t U_CALLCONV
71 UStringCaseMapper(const UCaseMap
*csm
,
72 UChar
*dest
, int32_t destCapacity
,
73 const UChar
*src
, int32_t srcLength
,
74 UErrorCode
*pErrorCode
);
77 #endif /* U_HIDE_INTERNAL_API */
// Forward declarations of classes that are named but not defined here.
// Where present, the trailing comment gives the header that defines the class.
81 class BreakIterator
; // unicode/brkiter.h
82 class Locale
; // unicode/locid.h
83 class StringCharacterIterator
; // defining header is not named in this span
84 class UnicodeStringAppendable
; // unicode/appendable.h
86 /* The <iostream> include has been moved to unicode/ustream.h */
89 * Constant to be used in the UnicodeString(char *, int32_t, EInvariant) constructor
90 * which constructs a Unicode string from an invariant-character char * string.
91 * About invariant characters see utypes.h.
92 * This constructor has no runtime dependency on conversion code and is
93 * therefore recommended over ones taking a charset name string
94 * (where the empty string "" indicates invariant-character conversion).
// Shorthand for the fully qualified constant icu::UnicodeString::kInvariant.
98 #define US_INV icu::UnicodeString::kInvariant
101 * Unicode String literals in C++.
102 * Dependent on the platform properties, different UnicodeString
103 * constructors should be used to create a UnicodeString object from a string literal.
105 * The macros are defined for maximum performance.
106 * They work only for strings that contain "invariant characters", i.e.,
107 * only latin letters, digits, and some punctuation.
108 * See utypes.h for details.
110 * The string parameter must be a C string literal.
111 * The length of the string, not including the terminating
112 * <code>NUL</code>, must be specified as a constant.
113 * The U_STRING_DECL macro should be invoked exactly once for one
114 * such string variable before it is used.
// UNICODE_STRING(cs, _length) expands to an icu::UnicodeString construction
// expression; which expression is chosen depends on platform macros.
// Case 1: U_DECLARE_UTF16 is available - wrap the literal with it and cast the
// result to const UChar *.
117 #if defined(U_DECLARE_UTF16)
118 # define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)U_DECLARE_UTF16(cs), _length)
// Case 2: wchar_t and UChar have the same size, and either the charset family
// is ASCII or UChar is 2 bytes with U_WCHAR_IS_UTF16 defined - paste an L
// prefix onto the literal and cast the wide literal to const UChar *.
119 #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16)))
120 # define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)L ## cs, _length)
// Case 3: UChar is one byte wide on an ASCII-family platform - cast the char
// literal directly.
121 #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
122 # define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)cs, _length)
// Remaining definition: pass the char literal, the length and US_INV to the
// constructor.
// NOTE(review): no "#else" before this definition and no closing "#endif" are
// visible in this span - confirm against the complete header.
124 # define UNICODE_STRING(cs, _length) icu::UnicodeString(cs, _length, US_INV)
128 * Unicode String literals in C++.
129 * Dependent on the platform properties, different UnicodeString
130 * constructors should be used to create a UnicodeString object from a string literal.
132 * The macros are defined for improved performance.
133 * They work only for strings that contain "invariant characters", i.e.,
134 * only latin letters, digits, and some punctuation.
135 * See utypes.h for details.
137 * The string parameter must be a C string literal.
// Forwards to UNICODE_STRING with -1 as the length argument, so the caller
// supplies only the literal.
// NOTE(review): presumably -1 asks the constructor to determine the length from
// the NUL terminator - confirm against the constructor documentation.
140 #define UNICODE_STRING_SIMPLE(cs) UNICODE_STRING(cs, -1)
143 * \def UNISTR_FROM_CHAR_EXPLICIT
144 * This can be defined to be empty or "explicit".
145 * If explicit, then the UnicodeString(UChar) and UnicodeString(UChar32)
146 * constructors are marked as explicit, preventing their inadvertent use.
// A definition supplied before this point wins (see the #ifndef guard).
149 #ifndef UNISTR_FROM_CHAR_EXPLICIT
// True when any of the four *_IMPLEMENTATION macros is defined.
150 # if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
151 // Auto-"explicit" in ICU library code.
152 # define UNISTR_FROM_CHAR_EXPLICIT explicit
// NOTE(review): the "#else" that should separate the two definitions, and the
// closing "#endif" lines, are not visible in this span - confirm against the
// complete header.
154 // Empty by default for source code compatibility.
155 # define UNISTR_FROM_CHAR_EXPLICIT
160 * \def UNISTR_FROM_STRING_EXPLICIT
161 * This can be defined to be empty or "explicit".
162 * If explicit, then the UnicodeString(const char *) and UnicodeString(const UChar *)
163 * constructors are marked as explicit, preventing their inadvertent use.
165 * In particular, this helps prevent accidentally depending on ICU conversion code
166 * by passing a string literal into an API with a const UnicodeString & parameter.
// A definition supplied before this point wins (see the #ifndef guard).
169 #ifndef UNISTR_FROM_STRING_EXPLICIT
// True when any of the four *_IMPLEMENTATION macros is defined.
170 # if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
171 // Auto-"explicit" in ICU library code.
172 # define UNISTR_FROM_STRING_EXPLICIT explicit
// NOTE(review): the "#else" that should separate the two definitions, and the
// closing "#endif" lines, are not visible in this span - confirm against the
// complete header.
174 // Empty by default for source code compatibility.
175 # define UNISTR_FROM_STRING_EXPLICIT
180 * UnicodeString is a string class that stores Unicode characters directly and provides
181 * similar functionality as the Java String and StringBuffer classes.
182 * It is a concrete implementation of the abstract class Replaceable (for transliteration).
184 * The UnicodeString class is not suitable for subclassing.
186 * <p>For an overview of Unicode strings in C and C++ see the
187 * <a href="http://icu-project.org/userguide/strings.html">User Guide Strings chapter</a>.</p>
189 * <p>In ICU, a Unicode string consists of 16-bit Unicode <em>code units</em>.
190 * A Unicode character may be stored with either one code unit
191 * (the most common case) or with a matched pair of special code units
192 * ("surrogates"). The data type for code units is UChar.
193 * For single-character handling, a Unicode character code <em>point</em> is a value
194 * in the range 0..0x10ffff. ICU uses the UChar32 type for code points.</p>
196 * <p>Indexes and offsets into and lengths of strings always count code units, not code points.
197 * This is the same as with multi-byte char* strings in traditional string handling.
198 * Operations on partial strings typically do not test for code point boundaries.
199 * If necessary, the user needs to take care of such boundaries by testing for the code unit
200 * values or by using functions like
201 * UnicodeString::getChar32Start() and UnicodeString::getChar32Limit()
202 * (or, in C, the equivalent macros U16_SET_CP_START() and U16_SET_CP_LIMIT(), see utf.h).</p>
204 * UnicodeString methods are more lenient with regard to input parameter values
205 * than other ICU APIs. In particular:
206 * - If indexes are out of bounds for a UnicodeString object
207 * (<0 or >length()) then they are "pinned" to the nearest boundary.
208 * - If primitive string pointer values (e.g., const UChar * or char *)
209 * for input strings are NULL, then those input string parameters are treated
210 * as if they pointed to an empty string.
211 * However, this is <em>not</em> the case for char * parameters for charset names.
213 * - Most UnicodeString methods do not take a UErrorCode parameter because
214 * there are usually very few opportunities for failure other than a shortage
215 * of memory, error codes in low-level C++ string methods would be inconvenient,
216 * and the error code as the last parameter (ICU convention) would prevent
217 * the use of default parameter values.
218 * Instead, such methods set the UnicodeString into a "bogus" state
219 * (see isBogus()) if an error occurs.
221 * In string comparisons, two UnicodeString objects that are both "bogus"
222 * compare equal (to be transitive and prevent endless loops in sorting),
223 * and a "bogus" string compares less than any non-"bogus" one.
225 * Const UnicodeString methods are thread-safe. Multiple threads can use
226 * const methods on the same UnicodeString object simultaneously,
227 * but non-const methods must not be called concurrently (in multiple threads)
228 * with any other (const or non-const) methods.
230 * Similarly, const UnicodeString & parameters are thread-safe.
231 * One object may be passed in as such a parameter concurrently in multiple threads.
232 * This includes the const UnicodeString & parameters for
233 * copy construction, assignment, and cloning.
235 * <p>UnicodeString uses several storage methods.
236 * String contents can be stored inside the UnicodeString object itself,
237 * in an allocated and shared buffer, or in an outside buffer that is "aliased".
238 * Most of this is done transparently, but careful aliasing in particular provides
239 * significant performance improvements.
240 * Also, the internal buffer is accessible via special functions.
241 * For details see the
242 * <a href="http://icu-project.org/userguide/strings.html">User Guide Strings chapter</a>.</p>
245 * @see CharacterIterator
248 class U_COMMON_API UnicodeString
: public Replaceable
253 * Constant to be used in the UnicodeString(char *, int32_t, EInvariant) constructor
254 * which constructs a Unicode string from an invariant-character char * string.
255 * Use the macro US_INV instead of the full qualification for this value.
268 //========================================
269 // Read-only operations
270 //========================================
272 /* Comparison - bitwise only - for international comparison use collation */
275 * Equality operator. Performs only bitwise comparison.
276 * @param text The UnicodeString to compare to this one.
277 * @return TRUE if <TT>text</TT> contains the same characters as this one, FALSE otherwise.
281 inline UBool
operator== (const UnicodeString
& text
) const;
284 * Inequality operator. Performs only bitwise comparison.
285 * @param text The UnicodeString to compare to this one.
286 * @return FALSE if <TT>text</TT> contains the same characters as this one, TRUE otherwise.
290 inline UBool
operator!= (const UnicodeString
& text
) const;
293 * Greater than operator. Performs only bitwise comparison.
294 * @param text The UnicodeString to compare to this one.
295 * @return TRUE if the characters in this are bitwise
296 * greater than the characters in <code>text</code>, FALSE otherwise
299 inline UBool
operator> (const UnicodeString
& text
) const;
302 * Less than operator. Performs only bitwise comparison.
303 * @param text The UnicodeString to compare to this one.
304 * @return TRUE if the characters in this are bitwise
305 * less than the characters in <code>text</code>, FALSE otherwise
308 inline UBool
operator< (const UnicodeString
& text
) const;
311 * Greater than or equal operator. Performs only bitwise comparison.
312 * @param text The UnicodeString to compare to this one.
313 * @return TRUE if the characters in this are bitwise
314 * greater than or equal to the characters in <code>text</code>, FALSE otherwise
317 inline UBool
operator>= (const UnicodeString
& text
) const;
320 * Less than or equal operator. Performs only bitwise comparison.
321 * @param text The UnicodeString to compare to this one.
322 * @return TRUE if the characters in this are bitwise
323 * less than or equal to the characters in <code>text</code>, FALSE otherwise
326 inline UBool
operator<= (const UnicodeString
& text
) const;
329 * Compare the characters bitwise in this UnicodeString to
330 * the characters in <code>text</code>.
331 * @param text The UnicodeString to compare to this one.
332 * @return The result of bitwise character comparison: 0 if this
333 * contains the same characters as <code>text</code>, -1 if the characters in
334 * this are bitwise less than the characters in <code>text</code>, +1 if the
335 * characters in this are bitwise greater than the characters
336 * in <code>text</code>.
339 inline int8_t compare(const UnicodeString
& text
) const;
342 * Compare the characters bitwise in the range
343 * [<TT>start</TT>, <TT>start + length</TT>) with the characters in <TT>text</TT>.
345 * @param start the offset at which the compare operation begins
346 * @param length the number of characters of text to compare.
347 * @param text the other text to be compared against this string.
348 * @return The result of bitwise character comparison: 0 if this
349 * contains the same characters as <code>text</code>, -1 if the characters in
350 * this are bitwise less than the characters in <code>text</code>, +1 if the
351 * characters in this are bitwise greater than the characters
352 * in <code>text</code>.
355 inline int8_t compare(int32_t start
,
357 const UnicodeString
& text
) const;
360 * Compare the characters bitwise in the range
361 * [<TT>start</TT>, <TT>start + length</TT>) with the characters
362 * in <TT>srcText</TT> in the range
363 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
364 * @param start the offset at which the compare operation begins
365 * @param length the number of characters in this to compare.
366 * @param srcText the text to be compared
367 * @param srcStart the offset into <TT>srcText</TT> to start comparison
368 * @param srcLength the number of characters in <TT>srcText</TT> to compare
369 * @return The result of bitwise character comparison: 0 if this
370 * contains the same characters as <code>srcText</code>, -1 if the characters in
371 * this are bitwise less than the characters in <code>srcText</code>, +1 if the
372 * characters in this are bitwise greater than the characters
373 * in <code>srcText</code>.
376 inline int8_t compare(int32_t start
,
378 const UnicodeString
& srcText
,
380 int32_t srcLength
) const;
383 * Compare the characters bitwise in this UnicodeString with the first
384 * <TT>srcLength</TT> characters in <TT>srcChars</TT>.
385 * @param srcChars The characters to compare to this UnicodeString.
386 * @param srcLength the number of characters in <TT>srcChars</TT> to compare
387 * @return The result of bitwise character comparison: 0 if this
388 * contains the same characters as <code>srcChars</code>, -1 if the characters in
389 * this are bitwise less than the characters in <code>srcChars</code>, +1 if the
390 * characters in this are bitwise greater than the characters
391 * in <code>srcChars</code>.
394 inline int8_t compare(const UChar
*srcChars
,
395 int32_t srcLength
) const;
398 * Compare the characters bitwise in the range
399 * [<TT>start</TT>, <TT>start + length</TT>) with the first
400 * <TT>length</TT> characters in <TT>srcChars</TT>
401 * @param start the offset at which the compare operation begins
402 * @param length the number of characters to compare.
403 * @param srcChars the characters to be compared
404 * @return The result of bitwise character comparison: 0 if this
405 * contains the same characters as <code>srcChars</code>, -1 if the characters in
406 * this are bitwise less than the characters in <code>srcChars</code>, +1 if the
407 * characters in this are bitwise greater than the characters
408 * in <code>srcChars</code>.
411 inline int8_t compare(int32_t start
,
413 const UChar
*srcChars
) const;
416 * Compare the characters bitwise in the range
417 * [<TT>start</TT>, <TT>start + length</TT>) with the characters
418 * in <TT>srcChars</TT> in the range
419 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
420 * @param start the offset at which the compare operation begins
421 * @param length the number of characters in this to compare
422 * @param srcChars the characters to be compared
423 * @param srcStart the offset into <TT>srcChars</TT> to start comparison
424 * @param srcLength the number of characters in <TT>srcChars</TT> to compare
425 * @return The result of bitwise character comparison: 0 if this
426 * contains the same characters as <code>srcChars</code>, -1 if the characters in
427 * this are bitwise less than the characters in <code>srcChars</code>, +1 if the
428 * characters in this are bitwise greater than the characters
429 * in <code>srcChars</code>.
432 inline int8_t compare(int32_t start
,
434 const UChar
*srcChars
,
436 int32_t srcLength
) const;
439 * Compare the characters bitwise in the range
440 * [<TT>start</TT>, <TT>limit</TT>) with the characters
441 * in <TT>srcText</TT> in the range
442 * [<TT>srcStart</TT>, <TT>srcLimit</TT>).
443 * @param start the offset at which the compare operation begins
444 * @param limit the offset immediately following the compare operation
445 * @param srcText the text to be compared
446 * @param srcStart the offset into <TT>srcText</TT> to start comparison
447 * @param srcLimit the offset into <TT>srcText</TT> to limit comparison
448 * @return The result of bitwise character comparison: 0 if this
449 * contains the same characters as <code>srcText</code>, -1 if the characters in
450 * this are bitwise less than the characters in <code>srcText</code>, +1 if the
451 * characters in this are bitwise greater than the characters
452 * in <code>srcText</code>.
455 inline int8_t compareBetween(int32_t start
,
457 const UnicodeString
& srcText
,
459 int32_t srcLimit
) const;
462 * Compare two Unicode strings in code point order.
463 * The result may be different from the results of compare(), operator<, etc.
464 * if supplementary characters are present:
466 * In UTF-16, supplementary characters (with code points U+10000 and above) are
467 * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
468 * which means that they compare as less than some other BMP characters like U+feff.
469 * This function compares Unicode strings in code point order.
470 * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
472 * @param text Another string to compare this one to.
473 * @return a negative/zero/positive integer corresponding to whether
474 * this string is less than/equal to/greater than the second one
475 * in code point order
478 inline int8_t compareCodePointOrder(const UnicodeString
& text
) const;
481 * Compare two Unicode strings in code point order.
482 * The result may be different from the results of compare(), operator<, etc.
483 * if supplementary characters are present:
485 * In UTF-16, supplementary characters (with code points U+10000 and above) are
486 * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
487 * which means that they compare as less than some other BMP characters like U+feff.
488 * This function compares Unicode strings in code point order.
489 * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
491 * @param start The start offset in this string at which the compare operation begins.
492 * @param length The number of code units from this string to compare.
493 * @param srcText Another string to compare this one to.
494 * @return a negative/zero/positive integer corresponding to whether
495 * this string is less than/equal to/greater than the second one
496 * in code point order
499 inline int8_t compareCodePointOrder(int32_t start
,
501 const UnicodeString
& srcText
) const;
504 * Compare two Unicode strings in code point order.
505 * The result may be different from the results of compare(), operator<, etc.
506 * if supplementary characters are present:
508 * In UTF-16, supplementary characters (with code points U+10000 and above) are
509 * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
510 * which means that they compare as less than some other BMP characters like U+feff.
511 * This function compares Unicode strings in code point order.
512 * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
514 * @param start The start offset in this string at which the compare operation begins.
515 * @param length The number of code units from this string to compare.
516 * @param srcText Another string to compare this one to.
517 * @param srcStart The start offset in that string at which the compare operation begins.
518 * @param srcLength The number of code units from that string to compare.
519 * @return a negative/zero/positive integer corresponding to whether
520 * this string is less than/equal to/greater than the second one
521 * in code point order
524 inline int8_t compareCodePointOrder(int32_t start
,
526 const UnicodeString
& srcText
,
528 int32_t srcLength
) const;
531 * Compare two Unicode strings in code point order.
532 * The result may be different from the results of compare(), operator<, etc.
533 * if supplementary characters are present:
535 * In UTF-16, supplementary characters (with code points U+10000 and above) are
536 * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
537 * which means that they compare as less than some other BMP characters like U+feff.
538 * This function compares Unicode strings in code point order.
539 * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
541 * @param srcChars A pointer to another string to compare this one to.
542 * @param srcLength The number of code units from that string to compare.
543 * @return a negative/zero/positive integer corresponding to whether
544 * this string is less than/equal to/greater than the second one
545 * in code point order
548 inline int8_t compareCodePointOrder(const UChar
*srcChars
,
549 int32_t srcLength
) const;
552 * Compare two Unicode strings in code point order.
553 * The result may be different from the results of compare(), operator<, etc.
554 * if supplementary characters are present:
556 * In UTF-16, supplementary characters (with code points U+10000 and above) are
557 * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
558 * which means that they compare as less than some other BMP characters like U+feff.
559 * This function compares Unicode strings in code point order.
560 * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
562 * @param start The start offset in this string at which the compare operation begins.
563 * @param length The number of code units from this string to compare.
564 * @param srcChars A pointer to another string to compare this one to.
565 * @return a negative/zero/positive integer corresponding to whether
566 * this string is less than/equal to/greater than the second one
567 * in code point order
570 inline int8_t compareCodePointOrder(int32_t start
,
572 const UChar
*srcChars
) const;
575 * Compare two Unicode strings in code point order.
576 * The result may be different from the results of compare(), operator<, etc.
577 * if supplementary characters are present:
579 * In UTF-16, supplementary characters (with code points U+10000 and above) are
580 * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
581 * which means that they compare as less than some other BMP characters like U+feff.
582 * This function compares Unicode strings in code point order.
583 * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
585 * @param start The start offset in this string at which the compare operation begins.
586 * @param length The number of code units from this string to compare.
587 * @param srcChars A pointer to another string to compare this one to.
588 * @param srcStart The start offset in that string at which the compare operation begins.
589 * @param srcLength The number of code units from that string to compare.
590 * @return a negative/zero/positive integer corresponding to whether
591 * this string is less than/equal to/greater than the second one
592 * in code point order
595 inline int8_t compareCodePointOrder(int32_t start
,
597 const UChar
*srcChars
,
599 int32_t srcLength
) const;
602 * Compare two Unicode strings in code point order.
603 * The result may be different from the results of compare(), operator<, etc.
604 * if supplementary characters are present:
606 * In UTF-16, supplementary characters (with code points U+10000 and above) are
607 * stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
608 * which means that they compare as less than some other BMP characters like U+feff.
609 * This function compares Unicode strings in code point order.
610 * If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
612 * @param start The start offset in this string at which the compare operation begins.
613 * @param limit The offset after the last code unit from this string to compare.
614 * @param srcText Another string to compare this one to.
615 * @param srcStart The start offset in that string at which the compare operation begins.
616 * @param srcLimit The offset after the last code unit from that string to compare.
617 * @return a negative/zero/positive integer corresponding to whether
618 * this string is less than/equal to/greater than the second one
619 * in code point order
622 inline int8_t compareCodePointOrderBetween(int32_t start
,
624 const UnicodeString
& srcText
,
626 int32_t srcLimit
) const;
629 * Compare two strings case-insensitively using full case folding.
630 * This is equivalent to this->foldCase(options).compare(text.foldCase(options)).
632 * @param text Another string to compare this one to.
633 * @param options A bit set of options:
634 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
635 * Comparison in code unit order with default case folding.
637 * - U_COMPARE_CODE_POINT_ORDER
638 * Set to choose code point order instead of code unit order
639 * (see u_strCompare for details).
641 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
643 * @return A negative, zero, or positive integer indicating the comparison result.
646 inline int8_t caseCompare(const UnicodeString
& text
, uint32_t options
) const;
649 * Compare two strings case-insensitively using full case folding.
650 * This is equivalent to this->foldCase(options).compare(srcText.foldCase(options)).
652 * @param start The start offset in this string at which the compare operation begins.
653 * @param length The number of code units from this string to compare.
654 * @param srcText Another string to compare this one to.
655 * @param options A bit set of options:
656 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
657 * Comparison in code unit order with default case folding.
659 * - U_COMPARE_CODE_POINT_ORDER
660 * Set to choose code point order instead of code unit order
661 * (see u_strCompare for details).
663 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
665 * @return A negative, zero, or positive integer indicating the comparison result.
668 inline int8_t caseCompare(int32_t start
,
670 const UnicodeString
& srcText
,
671 uint32_t options
) const;
674 * Compare two strings case-insensitively using full case folding.
675 * This is equivalent to this->foldCase(options).compare(srcText.foldCase(options)).
677 * @param start The start offset in this string at which the compare operation begins.
678 * @param length The number of code units from this string to compare.
679 * @param srcText Another string to compare this one to.
680 * @param srcStart The start offset in that string at which the compare operation begins.
681 * @param srcLength The number of code units from that string to compare.
682 * @param options A bit set of options:
683 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
684 * Comparison in code unit order with default case folding.
686 * - U_COMPARE_CODE_POINT_ORDER
687 * Set to choose code point order instead of code unit order
688 * (see u_strCompare for details).
690 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
692 * @return A negative, zero, or positive integer indicating the comparison result.
695 inline int8_t caseCompare(int32_t start
,
697 const UnicodeString
& srcText
,
700 uint32_t options
) const;
703 * Compare two strings case-insensitively using full case folding.
704 * This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)).
706 * @param srcChars A pointer to another string to compare this one to.
707 * @param srcLength The number of code units from that string to compare.
708 * @param options A bit set of options:
709 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
710 * Comparison in code unit order with default case folding.
712 * - U_COMPARE_CODE_POINT_ORDER
713 * Set to choose code point order instead of code unit order
714 * (see u_strCompare for details).
716 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
718 * @return A negative, zero, or positive integer indicating the comparison result.
721 inline int8_t caseCompare(const UChar
*srcChars
,
723 uint32_t options
) const;
726 * Compare two strings case-insensitively using full case folding.
727 * This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)).
729 * @param start The start offset in this string at which the compare operation begins.
730 * @param length The number of code units from this string to compare.
731 * @param srcChars A pointer to another string to compare this one to.
732 * @param options A bit set of options:
733 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
734 * Comparison in code unit order with default case folding.
736 * - U_COMPARE_CODE_POINT_ORDER
737 * Set to choose code point order instead of code unit order
738 * (see u_strCompare for details).
740 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
742 * @return A negative, zero, or positive integer indicating the comparison result.
745 inline int8_t caseCompare(int32_t start
,
747 const UChar
*srcChars
,
748 uint32_t options
) const;
751 * Compare two strings case-insensitively using full case folding.
752 * This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)).
754 * @param start The start offset in this string at which the compare operation begins.
755 * @param length The number of code units from this string to compare.
756 * @param srcChars A pointer to another string to compare this one to.
757 * @param srcStart The start offset in that string at which the compare operation begins.
758 * @param srcLength The number of code units from that string to compare.
759 * @param options A bit set of options:
760 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
761 * Comparison in code unit order with default case folding.
763 * - U_COMPARE_CODE_POINT_ORDER
764 * Set to choose code point order instead of code unit order
765 * (see u_strCompare for details).
767 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
769 * @return A negative, zero, or positive integer indicating the comparison result.
772 inline int8_t caseCompare(int32_t start
,
774 const UChar
*srcChars
,
777 uint32_t options
) const;
780 * Compare two strings case-insensitively using full case folding.
781 * This is equivalent to this->foldCase(options).compareBetween(srcText.foldCase(options)).
783 * @param start The start offset in this string at which the compare operation begins.
784 * @param limit The offset after the last code unit from this string to compare.
785 * @param srcText Another string to compare this one to.
786 * @param srcStart The start offset in that string at which the compare operation begins.
787 * @param srcLimit The offset after the last code unit from that string to compare.
788 * @param options A bit set of options:
789 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
790 * Comparison in code unit order with default case folding.
792 * - U_COMPARE_CODE_POINT_ORDER
793 * Set to choose code point order instead of code unit order
794 * (see u_strCompare for details).
796 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
798 * @return A negative, zero, or positive integer indicating the comparison result.
801 inline int8_t caseCompareBetween(int32_t start
,
803 const UnicodeString
& srcText
,
806 uint32_t options
) const;
809 * Determine if this starts with the characters in <TT>text</TT>
810 * @param text The text to match.
811 * @return TRUE if this starts with the characters in <TT>text</TT>,
815 inline UBool
startsWith(const UnicodeString
& text
) const;
818 * Determine if this starts with the characters in <TT>srcText</TT>
819 * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
820 * @param srcText The text to match.
821 * @param srcStart the offset into <TT>srcText</TT> to start matching
822 * @param srcLength the number of characters in <TT>srcText</TT> to match
823 * @return TRUE if this starts with the characters in <TT>srcText</TT>,
827 inline UBool
startsWith(const UnicodeString
& srcText
,
829 int32_t srcLength
) const;
832 * Determine if this starts with the characters in <TT>srcChars</TT>
833 * @param srcChars The characters to match.
834 * @param srcLength the number of characters in <TT>srcChars</TT>
835 * @return TRUE if this starts with the characters in <TT>srcChars</TT>,
839 inline UBool
startsWith(const UChar
*srcChars
,
840 int32_t srcLength
) const;
843 * Determine if this starts with the characters in <TT>srcChars</TT>
844 * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
845 * @param srcChars The characters to match.
846 * @param srcStart the offset into <TT>srcChars</TT> to start matching
847 * @param srcLength the number of characters in <TT>srcChars</TT> to match
848 * @return TRUE if this starts with the characters in <TT>srcChars</TT>, FALSE otherwise
851 inline UBool
startsWith(const UChar
*srcChars
,
853 int32_t srcLength
) const;
856 * Determine if this ends with the characters in <TT>text</TT>
857 * @param text The text to match.
858 * @return TRUE if this ends with the characters in <TT>text</TT>,
862 inline UBool
endsWith(const UnicodeString
& text
) const;
865 * Determine if this ends with the characters in <TT>srcText</TT>
866 * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
867 * @param srcText The text to match.
868 * @param srcStart the offset into <TT>srcText</TT> to start matching
869 * @param srcLength the number of characters in <TT>srcText</TT> to match
870 * @return TRUE if this ends with the characters in <TT>srcText</TT>,
874 inline UBool
endsWith(const UnicodeString
& srcText
,
876 int32_t srcLength
) const;
879 * Determine if this ends with the characters in <TT>srcChars</TT>
880 * @param srcChars The characters to match.
881 * @param srcLength the number of characters in <TT>srcChars</TT>
882 * @return TRUE if this ends with the characters in <TT>srcChars</TT>,
886 inline UBool
endsWith(const UChar
*srcChars
,
887 int32_t srcLength
) const;
890 * Determine if this ends with the characters in <TT>srcChars</TT>
891 * in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
892 * @param srcChars The characters to match.
893 * @param srcStart the offset into <TT>srcChars</TT> to start matching
894 * @param srcLength the number of characters in <TT>srcChars</TT> to match
895 * @return TRUE if this ends with the characters in <TT>srcChars</TT>,
899 inline UBool
endsWith(const UChar
*srcChars
,
901 int32_t srcLength
) const;
904 /* Searching - bitwise only */
907 * Locate in this the first occurrence of the characters in <TT>text</TT>,
908 * using bitwise comparison.
909 * @param text The text to search for.
910 * @return The offset into this of the start of <TT>text</TT>,
911 * or -1 if not found.
914 inline int32_t indexOf(const UnicodeString
& text
) const;
917 * Locate in this the first occurrence of the characters in <TT>text</TT>
918 * starting at offset <TT>start</TT>, using bitwise comparison.
919 * @param text The text to search for.
920 * @param start The offset at which searching will start.
921 * @return The offset into this of the start of <TT>text</TT>,
922 * or -1 if not found.
925 inline int32_t indexOf(const UnicodeString
& text
,
926 int32_t start
) const;
929 * Locate in this the first occurrence in the range
930 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
931 * in <TT>text</TT>, using bitwise comparison.
932 * @param text The text to search for.
933 * @param start The offset at which searching will start.
934 * @param length The number of characters to search
935 * @return The offset into this of the start of <TT>text</TT>,
936 * or -1 if not found.
939 inline int32_t indexOf(const UnicodeString
& text
,
941 int32_t length
) const;
944 * Locate in this the first occurrence in the range
945 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
946 * in <TT>srcText</TT> in the range
947 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
948 * using bitwise comparison.
949 * @param srcText The text to search for.
950 * @param srcStart the offset into <TT>srcText</TT> at which to start matching
952 * @param srcLength the number of characters in <TT>srcText</TT> to match
953 * @param start the offset into this at which to start matching
954 * @param length the number of characters in this to search
955 * @return The offset into this of the start of <TT>srcText</TT>,
956 * or -1 if not found.
959 inline int32_t indexOf(const UnicodeString
& srcText
,
963 int32_t length
) const;
966 * Locate in this the first occurrence of the characters in <TT>srcChars</TT>
968 * starting at offset <TT>start</TT>, using bitwise comparison.
969 * @param srcChars The text to search for.
970 * @param srcLength the number of characters in <TT>srcChars</TT> to match
971 * @param start the offset into this at which to start matching
972 * @return The offset into this of the start of <TT>srcChars</TT>,
973 * or -1 if not found.
976 inline int32_t indexOf(const UChar
*srcChars
,
978 int32_t start
) const;
981 * Locate in this the first occurrence in the range
982 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
983 * in <TT>srcChars</TT>, using bitwise comparison.
984 * @param srcChars The text to search for.
985 * @param srcLength the number of characters in <TT>srcChars</TT>
986 * @param start The offset at which searching will start.
987 * @param length The number of characters to search
988 * @return The offset into this of the start of <TT>srcChars</TT>,
989 * or -1 if not found.
992 inline int32_t indexOf(const UChar
*srcChars
,
995 int32_t length
) const;
998 * Locate in this the first occurrence in the range
999 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
1000 * in <TT>srcChars</TT> in the range
1001 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
1002 * using bitwise comparison.
1003 * @param srcChars The text to search for.
1004 * @param srcStart the offset into <TT>srcChars</TT> at which to start matching
1006 * @param srcLength the number of characters in <TT>srcChars</TT> to match
1007 * @param start the offset into this at which to start matching
1008 * @param length the number of characters in this to search
1009 * @return The offset into this of the start of <TT>srcChars</TT>,
1010 * or -1 if not found.
1013 int32_t indexOf(const UChar
*srcChars
,
1017 int32_t length
) const;
1020 * Locate in this the first occurrence of the BMP code point <code>c</code>,
1021 * using bitwise comparison.
1022 * @param c The code unit to search for.
1023 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1026 inline int32_t indexOf(UChar c
) const;
1029 * Locate in this the first occurrence of the code point <TT>c</TT>,
1030 * using bitwise comparison.
1032 * @param c The code point to search for.
1033 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1036 inline int32_t indexOf(UChar32 c
) const;
1039 * Locate in this the first occurrence of the BMP code point <code>c</code>,
1040 * starting at offset <TT>start</TT>, using bitwise comparison.
1041 * @param c The code unit to search for.
1042 * @param start The offset at which searching will start.
1043 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1046 inline int32_t indexOf(UChar c
,
1047 int32_t start
) const;
1050 * Locate in this the first occurrence of the code point <TT>c</TT>
1051 * starting at offset <TT>start</TT>, using bitwise comparison.
1053 * @param c The code point to search for.
1054 * @param start The offset at which searching will start.
1055 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1058 inline int32_t indexOf(UChar32 c
,
1059 int32_t start
) const;
1062 * Locate in this the first occurrence of the BMP code point <code>c</code>
1063 * in the range [<TT>start</TT>, <TT>start + length</TT>),
1064 * using bitwise comparison.
1065 * @param c The code unit to search for.
1066 * @param start the offset into this at which to start matching
1067 * @param length the number of characters in this to search
1068 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1071 inline int32_t indexOf(UChar c
,
1073 int32_t length
) const;
1076 * Locate in this the first occurrence of the code point <TT>c</TT>
1077 * in the range [<TT>start</TT>, <TT>start + length</TT>),
1078 * using bitwise comparison.
1080 * @param c The code point to search for.
1081 * @param start the offset into this at which to start matching
1082 * @param length the number of characters in this to search
1083 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1086 inline int32_t indexOf(UChar32 c
,
1088 int32_t length
) const;
1091 * Locate in this the last occurrence of the characters in <TT>text</TT>,
1092 * using bitwise comparison.
1093 * @param text The text to search for.
1094 * @return The offset into this of the start of <TT>text</TT>,
1095 * or -1 if not found.
1098 inline int32_t lastIndexOf(const UnicodeString
& text
) const;
1101 * Locate in this the last occurrence of the characters in <TT>text</TT>
1102 * starting at offset <TT>start</TT>, using bitwise comparison.
1103 * @param text The text to search for.
1104 * @param start The offset at which searching will start.
1105 * @return The offset into this of the start of <TT>text</TT>,
1106 * or -1 if not found.
1109 inline int32_t lastIndexOf(const UnicodeString
& text
,
1110 int32_t start
) const;
1113 * Locate in this the last occurrence in the range
1114 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
1115 * in <TT>text</TT>, using bitwise comparison.
1116 * @param text The text to search for.
1117 * @param start The offset at which searching will start.
1118 * @param length The number of characters to search
1119 * @return The offset into this of the start of <TT>text</TT>,
1120 * or -1 if not found.
1123 inline int32_t lastIndexOf(const UnicodeString
& text
,
1125 int32_t length
) const;
1128 * Locate in this the last occurrence in the range
1129 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
1130 * in <TT>srcText</TT> in the range
1131 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
1132 * using bitwise comparison.
1133 * @param srcText The text to search for.
1134 * @param srcStart the offset into <TT>srcText</TT> at which to start matching
1136 * @param srcLength the number of characters in <TT>srcText</TT> to match
1137 * @param start the offset into this at which to start matching
1138 * @param length the number of characters in this to search
1139 * @return The offset into this of the start of <TT>srcText</TT>,
1140 * or -1 if not found.
1143 inline int32_t lastIndexOf(const UnicodeString
& srcText
,
1147 int32_t length
) const;
1150 * Locate in this the last occurrence of the characters in <TT>srcChars</TT>
1151 * starting at offset <TT>start</TT>, using bitwise comparison.
1152 * @param srcChars The text to search for.
1153 * @param srcLength the number of characters in <TT>srcChars</TT> to match
1154 * @param start the offset into this at which to start matching
1155 * @return The offset into this of the start of <TT>srcChars</TT>,
1156 * or -1 if not found.
1159 inline int32_t lastIndexOf(const UChar
*srcChars
,
1161 int32_t start
) const;
1164 * Locate in this the last occurrence in the range
1165 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
1166 * in <TT>srcChars</TT>, using bitwise comparison.
1167 * @param srcChars The text to search for.
1168 * @param srcLength the number of characters in <TT>srcChars</TT>
1169 * @param start The offset at which searching will start.
1170 * @param length The number of characters to search
1171 * @return The offset into this of the start of <TT>srcChars</TT>,
1172 * or -1 if not found.
1175 inline int32_t lastIndexOf(const UChar
*srcChars
,
1178 int32_t length
) const;
1181 * Locate in this the last occurrence in the range
1182 * [<TT>start</TT>, <TT>start + length</TT>) of the characters
1183 * in <TT>srcChars</TT> in the range
1184 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
1185 * using bitwise comparison.
1186 * @param srcChars The text to search for.
1187 * @param srcStart the offset into <TT>srcChars</TT> at which to start matching
1189 * @param srcLength the number of characters in <TT>srcChars</TT> to match
1190 * @param start the offset into this at which to start matching
1191 * @param length the number of characters in this to search
1192 * @return The offset into this of the start of <TT>srcChars</TT>,
1193 * or -1 if not found.
1196 int32_t lastIndexOf(const UChar
*srcChars
,
1200 int32_t length
) const;
1203 * Locate in this the last occurrence of the BMP code point <code>c</code>,
1204 * using bitwise comparison.
1205 * @param c The code unit to search for.
1206 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1209 inline int32_t lastIndexOf(UChar c
) const;
1212 * Locate in this the last occurrence of the code point <TT>c</TT>,
1213 * using bitwise comparison.
1215 * @param c The code point to search for.
1216 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1219 inline int32_t lastIndexOf(UChar32 c
) const;
1222 * Locate in this the last occurrence of the BMP code point <code>c</code>
1223 * starting at offset <TT>start</TT>, using bitwise comparison.
1224 * @param c The code unit to search for.
1225 * @param start The offset at which searching will start.
1226 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1229 inline int32_t lastIndexOf(UChar c
,
1230 int32_t start
) const;
1233 * Locate in this the last occurrence of the code point <TT>c</TT>
1234 * starting at offset <TT>start</TT>, using bitwise comparison.
1236 * @param c The code point to search for.
1237 * @param start The offset at which searching will start.
1238 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1241 inline int32_t lastIndexOf(UChar32 c
,
1242 int32_t start
) const;
1245 * Locate in this the last occurrence of the BMP code point <code>c</code>
1246 * in the range [<TT>start</TT>, <TT>start + length</TT>),
1247 * using bitwise comparison.
1248 * @param c The code unit to search for.
1249 * @param start the offset into this at which to start matching
1250 * @param length the number of characters in this to search
1251 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1254 inline int32_t lastIndexOf(UChar c
,
1256 int32_t length
) const;
1259 * Locate in this the last occurrence of the code point <TT>c</TT>
1260 * in the range [<TT>start</TT>, <TT>start + length</TT>),
1261 * using bitwise comparison.
1263 * @param c The code point to search for.
1264 * @param start the offset into this at which to start matching
1265 * @param length the number of characters in this to search
1266 * @return The offset into this of <TT>c</TT>, or -1 if not found.
1269 inline int32_t lastIndexOf(UChar32 c
,
1271 int32_t length
) const;
1274 /* Character access */
1277 * Return the code unit at offset <tt>offset</tt>.
1278 * If the offset is not valid (0..length()-1) then U+ffff is returned.
1279 * @param offset a valid offset into the text
1280 * @return the code unit at offset <tt>offset</tt>
1281 * or 0xffff if the offset is not valid for this string
1284 inline UChar
charAt(int32_t offset
) const;
1287 * Return the code unit at offset <tt>offset</tt>.
1288 * If the offset is not valid (0..length()-1) then U+ffff is returned.
1289 * @param offset a valid offset into the text
1290 * @return the code unit at offset <tt>offset</tt>
1293 inline UChar
operator[] (int32_t offset
) const;
1296 * Return the code point that contains the code unit
1297 * at offset <tt>offset</tt>.
1298 * If the offset is not valid (0..length()-1) then U+ffff is returned.
1299 * @param offset a valid offset into the text
1300 * that indicates the text offset of any of the code units
1301 * that will be assembled into a code point (21-bit value) and returned
1302 * @return the code point of text at <tt>offset</tt>
1303 * or 0xffff if the offset is not valid for this string
1306 UChar32
char32At(int32_t offset
) const;
1309 * Adjust a random-access offset so that
1310 * it points to the beginning of a Unicode character.
1311 * The offset that is passed in points to
1312 * any code unit of a code point,
1313 * while the returned offset will point to the first code unit
1314 * of the same code point.
1315 * In UTF-16, if the input offset points to a second surrogate
1316 * of a surrogate pair, then the returned offset will point
1317 * to the first surrogate.
1318 * @param offset a valid offset into one code point of the text
1319 * @return offset of the first code unit of the same code point
1320 * @see U16_SET_CP_START
1323 int32_t getChar32Start(int32_t offset
) const;
1326 * Adjust a random-access offset so that
1327 * it points behind a Unicode character.
1328 * The offset that is passed in points behind
1329 * any code unit of a code point,
1330 * while the returned offset will point behind the last code unit
1331 * of the same code point.
1332 * In UTF-16, if the input offset points behind the first surrogate
1333 * (i.e., to the second surrogate)
1334 * of a surrogate pair, then the returned offset will point
1335 * behind the second surrogate (i.e., to the code unit after the pair).
1336 * @param offset a valid offset after any code unit of a code point of the text
1337 * @return offset of the first code unit after the same code point
1338 * @see U16_SET_CP_LIMIT
1341 int32_t getChar32Limit(int32_t offset
) const;
1344 * Move the code unit index along the string by delta code points.
1345 * Interpret the input index as a code unit-based offset into the string,
1346 * move the index forward or backward by delta code points, and
1347 * return the resulting index.
1348 * The input index should point to the first code unit of a code point,
1349 * if there is more than one.
1351 * Both input and output indexes are code unit-based as for all
1352 * string indexes/offsets in ICU (and other libraries, like MBCS char*).
1353 * If delta<0 then the index is moved backward (toward the start of the string).
1354 * If delta>0 then the index is moved forward (toward the end of the string).
1356 * This behaves like CharacterIterator::move32(delta, kCurrent).
1358 * Behavior for out-of-bounds indexes:
1359 * <code>moveIndex32</code> pins the input index to 0..length(), i.e.,
1360 * if the input index<0 then it is pinned to 0;
1361 * if it is index>length() then it is pinned to length().
1362 * Afterwards, the index is moved by <code>delta</code> code points
1363 * forward or backward,
1364 * but no further backward than to 0 and no further forward than to length().
1365 * The resulting index return value will be in between 0 and length(), inclusively.
1369 * // s has code points 'a' U+10000 'b' U+10ffff U+2029
1370 * UnicodeString s=UNICODE_STRING("a\\U00010000b\\U0010ffff\\u2029", 31).unescape();
1372 * // initial index: position of U+10000
1375 * // the following examples will all result in index==4, position of U+10ffff
1377 * // skip 2 code points from some position in the string
1378 * index=s.moveIndex32(index, 2); // skips U+10000 and 'b'
1380 * // go to the 3rd code point from the start of s (0-based)
1381 * index=s.moveIndex32(0, 3); // skips 'a', U+10000, and 'b'
1383 * // go to the next-to-last code point of s
1384 * index=s.moveIndex32(s.length(), -2); // backward-skips U+2029 and U+10ffff
1387 * @param index input code unit index
1388 * @param delta (signed) code point count to move the index forward or backward
1390 * @return the resulting code unit index
1393 int32_t moveIndex32(int32_t index
, int32_t delta
) const;
1395 /* Substring extraction */
1398 * Copy the characters in the range
1399 * [<tt>start</tt>, <tt>start + length</tt>) into the array <tt>dst</tt>,
1400 * beginning at <tt>dstStart</tt>.
1401 * If the string aliases to <code>dst</code> itself as an external buffer,
1402 * then extract() will not copy the contents.
1404 * @param start offset of first character which will be copied into the array
1405 * @param length the number of characters to extract
1406 * @param dst array in which to copy characters. The length of <tt>dst</tt>
1407 * must be at least (<tt>dstStart + length</tt>).
1408 * @param dstStart the offset in <TT>dst</TT> where the first character will be extracted
1412 inline void extract(int32_t start
,
1415 int32_t dstStart
= 0) const;
1418 * Copy the contents of the string into dest.
1419 * This is a convenience function that
1420 * checks if there is enough space in dest,
1421 * extracts the entire string if possible,
1422 * and NUL-terminates dest if possible.
1424 * If the string fits into dest but cannot be NUL-terminated
1425 * (length()==destCapacity) then the error code is set to U_STRING_NOT_TERMINATED_WARNING.
1426 * If the string itself does not fit into dest
1427 * (length()>destCapacity) then the error code is set to U_BUFFER_OVERFLOW_ERROR.
1429 * If the string aliases to <code>dest</code> itself as an external buffer,
1430 * then extract() will not copy the contents.
1432 * @param dest Destination string buffer.
1433 * @param destCapacity Number of UChars available at dest.
1434 * @param errorCode ICU error code.
1439 extract(UChar
*dest
, int32_t destCapacity
,
1440 UErrorCode
&errorCode
) const;
1443 * Copy the characters in the range
1444 * [<tt>start</tt>, <tt>start + length</tt>) into the UnicodeString <tt>target</tt>.
1446 * @param start offset of first character which will be copied
1447 * @param length the number of characters to extract
1448 * @param target UnicodeString into which to copy characters.
1449 * @return A reference to <TT>target</TT>
1452 inline void extract(int32_t start
,
1454 UnicodeString
& target
) const;
1457 * Copy the characters in the range [<tt>start</tt>, <tt>limit</tt>)
1458 * into the array <tt>dst</tt>, beginning at <tt>dstStart</tt>.
1459 * @param start offset of first character which will be copied into the array
1460 * @param limit offset immediately following the last character to be copied
1461 * @param dst array in which to copy characters. The length of <tt>dst</tt>
1462 * must be at least (<tt>dstStart + (limit - start)</tt>).
1463 * @param dstStart the offset in <TT>dst</TT> where the first character will be extracted
1467 inline void extractBetween(int32_t start
,
1470 int32_t dstStart
= 0) const;
1473 * Copy the characters in the range [<tt>start</tt>, <tt>limit</tt>)
1474 * into the UnicodeString <tt>target</tt>. Replaceable API.
1475 * @param start offset of first character which will be copied
1476 * @param limit offset immediately following the last character to be copied
1477 * @param target UnicodeString into which to copy characters.
1478 * @return A reference to <TT>target</TT>
1481 virtual void extractBetween(int32_t start
,
1483 UnicodeString
& target
) const;
1486 * Copy the characters in the range
1487 * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters.
1488 * All characters must be invariant (see utypes.h).
1489 * Use US_INV as the last, signature-distinguishing parameter.
1491 * This function does not write any more than <code>targetCapacity</code>
1492 * characters but returns the length of the entire output string
1493 * so that one can allocate a larger buffer and call the function again if necessary.
1495 * The output string is NUL-terminated if possible.
1497 * @param start offset of first character which will be copied
1498 * @param startLength the number of characters to extract
1499 * @param target the target buffer for extraction, can be NULL
1500 * if targetCapacity is 0
1501 * @param targetCapacity the length of the target buffer
1502 * @param inv Signature-distinguishing parameter, use US_INV.
1503 * @return the output string length, not including the terminating NUL
1506 int32_t extract(int32_t start
,
1507 int32_t startLength
,
1509 int32_t targetCapacity
,
1510 enum EInvariant inv
) const;
1512 #if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION
1515 * Copy the characters in the range
1516 * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters
1517 * in the platform's default codepage.
1518 * This function does not write any more than <code>targetLength</code>
1519 * characters but returns the length of the entire output string
1520 * so that one can allocate a larger buffer and call the function again if necessary.
1522 * The output string is NUL-terminated if possible.
1524 * @param start offset of first character which will be copied
1525 * @param startLength the number of characters to extract
1526 * @param target the target buffer for extraction
1527 * @param targetLength the length of the target buffer
1528 * If <TT>target</TT> is NULL, then the number of bytes required for
1529 * <TT>target</TT> is returned.
1530 * @return the output string length, not including the terminating NUL
1533 int32_t extract(int32_t start
,
1534 int32_t startLength
,
1536 uint32_t targetLength
) const;
1540 #if !UCONFIG_NO_CONVERSION
1543 * Copy the characters in the range
1544 * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters
1545 * in a specified codepage.
1546 * The output string is NUL-terminated.
1548 * Recommendation: For invariant-character strings use
1549 * extract(int32_t start, int32_t length, char *target, int32_t targetCapacity, enum EInvariant inv) const
1550 * because it avoids object code dependencies of UnicodeString on
1551 * the conversion code.
1553 * @param start offset of first character which will be copied
1554 * @param startLength the number of characters to extract
1555 * @param target the target buffer for extraction
1556 * @param codepage the desired codepage for the characters. 0 has
1557 * the special meaning of the default codepage
1558 * If <code>codepage</code> is an empty string (<code>""</code>),
1559 * then a simple conversion is performed on the codepage-invariant
1560 * subset ("invariant characters") of the platform encoding. See utypes.h.
1561 * If <TT>target</TT> is NULL, then the number of bytes required for
1562 * <TT>target</TT> is returned. It is assumed that the target is big enough
1563 * to fit all of the characters.
1564 * @return the output string length, not including the terminating NUL
1567 inline int32_t extract(int32_t start
,
1568 int32_t startLength
,
1570 const char *codepage
= 0) const;
1573 * Copy the characters in the range
1574 * [<tt>start</TT>, <tt>start + length</TT>) into an array of characters
1575 * in a specified codepage.
1576 * This function does not write any more than <code>targetLength</code>
1577 * characters but returns the length of the entire output string
1578 * so that one can allocate a larger buffer and call the function again if necessary.
1580 * The output string is NUL-terminated if possible.
1582 * Recommendation: For invariant-character strings use
1583 * extract(int32_t start, int32_t length, char *target, int32_t targetCapacity, enum EInvariant inv) const
1584 * because it avoids object code dependencies of UnicodeString on
1585 * the conversion code.
1587 * @param start offset of first character which will be copied
1588 * @param startLength the number of characters to extract
1589 * @param target the target buffer for extraction
1590 * @param targetLength the length of the target buffer
1591 * @param codepage the desired codepage for the characters. 0 has
1592 * the special meaning of the default codepage
1593 * If <code>codepage</code> is an empty string (<code>""</code>),
1594 * then a simple conversion is performed on the codepage-invariant
1595 * subset ("invariant characters") of the platform encoding. See utypes.h.
1596 * If <TT>target</TT> is NULL, then the number of bytes required for
1597 * <TT>target</TT> is returned.
1598 * @return the output string length, not including the terminating NUL
1601 int32_t extract(int32_t start
,
1602 int32_t startLength
,
1604 uint32_t targetLength
,
1605 const char *codepage
) const;
1608 * Convert the UnicodeString into a codepage string using an existing UConverter.
1609 * The output string is NUL-terminated if possible.
1611 * This function avoids the overhead of opening and closing a converter if
1612 * multiple strings are extracted.
1614 * @param dest destination string buffer, can be NULL if destCapacity==0
1615 * @param destCapacity the number of chars available at dest
1616 * @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called),
1617 * or NULL for the default converter
1618 * @param errorCode normal ICU error code
1619 * @return the length of the output string, not counting the terminating NUL;
1620 * if the length is greater than destCapacity, then the string will not fit
1621 * and a buffer of the indicated length would need to be passed in
1624 int32_t extract(char *dest
, int32_t destCapacity
,
1626 UErrorCode
&errorCode
) const;
1631 * Create a temporary substring for the specified range.
1632 * Unlike the substring constructor and setTo() functions,
1633 * the object returned here will be a read-only alias (using getBuffer())
1634 * rather than copying the text.
1635 * As a result, this substring operation is much faster but requires
1636 * that the original string not be modified or deleted during the lifetime
1637 * of the returned substring object.
1638 * @param start offset of the first character visible in the substring
1639 * @param length length of the substring
1640 * @return a read-only alias UnicodeString object for the substring
1643 UnicodeString
tempSubString(int32_t start
=0, int32_t length
=INT32_MAX
) const;
1646 * Create a temporary substring for the specified range.
1647 * Same as tempSubString(start, length) except that the substring range
1648 * is specified as a (start, limit) pair (with an exclusive limit index)
1649 * rather than a (start, length) pair.
1650 * @param start offset of the first character visible in the substring
1651 * @param limit offset immediately following the last character visible in the substring
1652 * @return a read-only alias UnicodeString object for the substring
1655 inline UnicodeString
tempSubStringBetween(int32_t start
, int32_t limit
=INT32_MAX
) const;
1658 * Convert the UnicodeString to UTF-8 and write the result
1659 * to a ByteSink. This is called by toUTF8String().
1660 * Unpaired surrogates are replaced with U+FFFD.
1661 * Calls u_strToUTF8WithSub().
1663 * @param sink A ByteSink to which the UTF-8 version of the string is written.
1664 * sink.Flush() is called at the end.
1668 void toUTF8(ByteSink
&sink
) const;
1670 #if U_HAVE_STD_STRING
1673 * Convert the UnicodeString to UTF-8 and append the result
1674 * to a standard string.
1675 * Unpaired surrogates are replaced with U+FFFD.
1678 * @param result A standard string (or a compatible object)
1679 * to which the UTF-8 version of the string is appended.
1680 * @return The string object.
1684 template<typename StringClass
>
1685 StringClass
&toUTF8String(StringClass
&result
) const {
1686 StringByteSink
<StringClass
> sbs(&result
);
1694 * Convert the UnicodeString to UTF-32.
1695 * Unpaired surrogates are replaced with U+FFFD.
1696 * Calls u_strToUTF32WithSub().
1698 * @param utf32 destination string buffer, can be NULL if capacity==0
1699 * @param capacity the number of UChar32s available at utf32
1700 * @param errorCode Standard ICU error code. Its input value must
1701 * pass the U_SUCCESS() test, or else the function returns
1702 * immediately. Check for U_FAILURE() on output or use with
1703 * function chaining. (See User Guide for details.)
1704 * @return The length of the UTF-32 string.
1708 int32_t toUTF32(UChar32
*utf32
, int32_t capacity
, UErrorCode
&errorCode
) const;
1710 /* Length operations */
1713 * Return the length of the UnicodeString object.
1714 * The length is the number of UChar code units that are in the UnicodeString.
1715 * If you want the number of code points, please use countChar32().
1716 * @return the length of the UnicodeString object
1720 inline int32_t length(void) const;
1723 * Count Unicode code points in the length UChar code units of the string.
1724 * A code point may occupy either one or two UChar code units.
1725 * Counting code points involves reading all code units.
1727 * This function is basically the inverse of moveIndex32().
1729 * @param start the index of the first code unit to check
1730 * @param length the number of UChar code units to check
1731 * @return the number of code points in the specified code units
1736 countChar32(int32_t start
=0, int32_t length
=INT32_MAX
) const;
1739 * Check if the length UChar code units of the string
1740 * contain more Unicode code points than a certain number.
1741 * This is more efficient than counting all code points in this part of the string
1742 * and comparing that number with a threshold.
1743 * This function may not need to scan the string at all if the length
1744 * falls within a certain range, and
1745 * never needs to count more than 'number+1' code points.
1746 * Logically equivalent to (countChar32(start, length)>number).
1747 * A Unicode code point may occupy either one or two UChar code units.
1749 * @param start the index of the first code unit to check (0 for the entire string)
1750 * @param length the number of UChar code units to check
1751 * (use INT32_MAX for the entire string; remember that start/length
1752 * values are pinned)
1753 * @param number The number of code points in the (sub)string is compared against
1754 * the 'number' parameter.
1755 * @return Boolean value for whether the string contains more Unicode code points
1756 * than 'number'. Same as (u_countChar32(s, length)>number).
1758 * @see u_strHasMoreChar32Than
1762 hasMoreChar32Than(int32_t start
, int32_t length
, int32_t number
) const;
1765 * Determine if this string is empty.
1766 * @return TRUE if this string contains 0 characters, FALSE otherwise.
1769 inline UBool
isEmpty(void) const;
1772 * Return the capacity of the internal buffer of the UnicodeString object.
1773 * This is useful together with the getBuffer functions.
1774 * See there for details.
1776 * @return the number of UChars available in the internal buffer
1780 inline int32_t getCapacity(void) const;
1782 /* Other operations */
1785 * Generate a hash code for this object.
1786 * @return The hash code of this UnicodeString.
1789 inline int32_t hashCode(void) const;
1792 * Determine if this object contains a valid string.
1793 * A bogus string has no value. It is different from an empty string,
1794 * although in both cases isEmpty() returns TRUE and length() returns 0.
1795 * setToBogus() and isBogus() can be used to indicate that no string value is available.
1796 * For a bogus string, getBuffer() and getTerminatedBuffer() return NULL, and
1797 * length() returns 0.
1799 * @return TRUE if the string is bogus/invalid, FALSE otherwise
1803 inline UBool
isBogus(void) const;
1806 //========================================
1808 //========================================
1810 /* Assignment operations */
1813 * Assignment operator. Replace the characters in this UnicodeString
1814 * with the characters from <TT>srcText</TT>.
1815 * @param srcText The text containing the characters to replace
1816 * @return a reference to this
1819 UnicodeString
&operator=(const UnicodeString
&srcText
);
1822 * Almost the same as the assignment operator.
1823 * Replace the characters in this UnicodeString
1824 * with the characters from <code>srcText</code>.
1826 * This function works the same as the assignment operator
1827 * for all strings except for ones that are readonly aliases.
1829 * Starting with ICU 2.4, the assignment operator and the copy constructor
1830 * allocate a new buffer and copy the buffer contents even for readonly aliases.
1831 * This function implements the old, more efficient but less safe behavior
1832 * of making this string also a readonly alias to the same buffer.
1834 * The fastCopyFrom function must be used only if it is known that the lifetime of
1835 * this UnicodeString does not exceed the lifetime of the aliased buffer
1836 * including its contents, for example for strings from resource bundles
1837 * or aliases to string constants.
1839 * @param src The text containing the characters to replace.
1840 * @return a reference to this
1843 UnicodeString
&fastCopyFrom(const UnicodeString
&src
);
1846 * Assignment operator. Replace the characters in this UnicodeString
1847 * with the code unit <TT>ch</TT>.
1848 * @param ch the code unit to replace
1849 * @return a reference to this
1852 inline UnicodeString
& operator= (UChar ch
);
1855 * Assignment operator. Replace the characters in this UnicodeString
1856 * with the code point <TT>ch</TT>.
1857 * @param ch the code point to replace
1858 * @return a reference to this
1861 inline UnicodeString
& operator= (UChar32 ch
);
1864 * Set the text in the UnicodeString object to the characters
1865 * in <TT>srcText</TT> in the range
1866 * [<TT>srcStart</TT>, <TT>srcText.length()</TT>).
1867 * <TT>srcText</TT> is not modified.
1868 * @param srcText the source for the new characters
1869 * @param srcStart the offset into <TT>srcText</TT> where new characters
1871 * @return a reference to this
1874 inline UnicodeString
& setTo(const UnicodeString
& srcText
,
1878 * Set the text in the UnicodeString object to the characters
1879 * in <TT>srcText</TT> in the range
1880 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
1881 * <TT>srcText</TT> is not modified.
1882 * @param srcText the source for the new characters
1883 * @param srcStart the offset into <TT>srcText</TT> where new characters
1885 * @param srcLength the number of characters in <TT>srcText</TT> in the
1887 * @return a reference to this
1890 inline UnicodeString
& setTo(const UnicodeString
& srcText
,
1895 * Set the text in the UnicodeString object to the characters in
1897 * <TT>srcText</TT> is not modified.
1898 * @param srcText the source for the new characters
1899 * @return a reference to this
1902 inline UnicodeString
& setTo(const UnicodeString
& srcText
);
1905 * Set the characters in the UnicodeString object to the characters
1906 * in <TT>srcChars</TT>. <TT>srcChars</TT> is not modified.
1907 * @param srcChars the source for the new characters
1908 * @param srcLength the number of Unicode characters in srcChars.
1909 * @return a reference to this
1912 inline UnicodeString
& setTo(const UChar
*srcChars
,
1916 * Set the characters in the UnicodeString object to the code unit
1918 * @param srcChar the code unit which becomes the UnicodeString's character
1920 * @return a reference to this
1923 UnicodeString
& setTo(UChar srcChar
);
1926 * Set the characters in the UnicodeString object to the code point
1928 * @param srcChar the code point which becomes the UnicodeString's character
1930 * @return a reference to this
1933 UnicodeString
& setTo(UChar32 srcChar
);
1936 * Aliasing setTo() function, analogous to the readonly-aliasing UChar* constructor.
1937 * The text will be used for the UnicodeString object, but
1938 * it will not be released when the UnicodeString is destroyed.
1939 * This has copy-on-write semantics:
1940 * When the string is modified, then the buffer is first copied into
1941 * newly allocated memory.
1942 * The aliased buffer is never modified.
1944 * In an assignment to another UnicodeString, when using the copy constructor
1945 * or the assignment operator, the text will be copied.
1946 * When using fastCopyFrom(), the text will be aliased again,
1947 * so that both strings then alias the same readonly-text.
1949 * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
1950 * This must be true if <code>textLength==-1</code>.
1951 * @param text The characters to alias for the UnicodeString.
1952 * @param textLength The number of Unicode characters in <code>text</code> to alias.
1953 * If -1, then this function will determine the length
1954 * by calling <code>u_strlen()</code>.
1955 * @return a reference to this
1958 UnicodeString
&setTo(UBool isTerminated
,
1960 int32_t textLength
);
1963 * Aliasing setTo() function, analogous to the writable-aliasing UChar* constructor.
1964 * The text will be used for the UnicodeString object, but
1965 * it will not be released when the UnicodeString is destroyed.
1966 * This has write-through semantics:
1967 * For as long as the capacity of the buffer is sufficient, write operations
1968 * will directly affect the buffer. When more capacity is necessary, then
1969 * a new buffer will be allocated and the contents copied as with regularly
1970 * constructed strings.
1971 * In an assignment to another UnicodeString, the buffer will be copied.
1972 * The extract(UChar *dst) function detects whether the dst pointer is the same
1973 * as the string buffer itself and will in this case not copy the contents.
1975 * @param buffer The characters to alias for the UnicodeString.
1976 * @param buffLength The number of Unicode characters in <code>buffer</code> to alias.
1977 * @param buffCapacity The size of <code>buffer</code> in UChars.
1978 * @return a reference to this
1981 UnicodeString
&setTo(UChar
*buffer
,
1983 int32_t buffCapacity
);
1986 * Make this UnicodeString object invalid.
1987 * The string will test TRUE with isBogus().
1989 * A bogus string has no value. It is different from an empty string.
1990 * It can be used to indicate that no string value is available.
1991 * getBuffer() and getTerminatedBuffer() return NULL, and
1992 * length() returns 0.
1994 * This utility function is used throughout the UnicodeString
1995 * implementation to indicate that a UnicodeString operation failed,
1996 * and may be used in other functions,
1997 * especially but not exclusively when such functions do not
1998 * take a UErrorCode for simplicity.
2000 * The following methods, and no others, will clear a string object's bogus flag:
2002 * - remove(0, INT32_MAX)
2004 * - operator=() (assignment operator)
2007 * The simplest way to turn a bogus string into an empty one
2008 * is to use the remove() function.
2009 * Examples for other functions that are equivalent to "set to empty string":
2012 * s.remove(); // set to an empty string (remove all), or
2013 * s.remove(0, INT32_MAX); // set to an empty string (remove all), or
2014 * s.truncate(0); // set to an empty string (complete truncation), or
2015 * s=UnicodeString(); // assign an empty string, or
2016 * s.setTo((UChar32)-1); // set to a pseudo code point that is out of range, or
2017 * static const UChar nul=0;
2018 * s.setTo(&nul, 0); // set to an empty C Unicode string
2028 * Set the character at the specified offset to the specified character.
2029 * @param offset A valid offset into the text of the character to set
2030 * @param ch The new character
2031 * @return A reference to this
2034 UnicodeString
& setCharAt(int32_t offset
,
2038 /* Append operations */
2041 * Append operator. Append the code unit <TT>ch</TT> to the UnicodeString
2043 * @param ch the code unit to be appended
2044 * @return a reference to this
2047 inline UnicodeString
& operator+= (UChar ch
);
2050 * Append operator. Append the code point <TT>ch</TT> to the UnicodeString
2052 * @param ch the code point to be appended
2053 * @return a reference to this
2056 inline UnicodeString
& operator+= (UChar32 ch
);
2059 * Append operator. Append the characters in <TT>srcText</TT> to the
2060 * UnicodeString object. <TT>srcText</TT> is not modified.
2061 * @param srcText the source for the new characters
2062 * @return a reference to this
2065 inline UnicodeString
& operator+= (const UnicodeString
& srcText
);
2068 * Append the characters
2069 * in <TT>srcText</TT> in the range
2070 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) to the
2071 * UnicodeString object. <TT>srcText</TT>
2073 * @param srcText the source for the new characters
2074 * @param srcStart the offset into <TT>srcText</TT> where new characters
2076 * @param srcLength the number of characters in <TT>srcText</TT> in
2078 * @return a reference to this
2081 inline UnicodeString
& append(const UnicodeString
& srcText
,
2086 * Append the characters in <TT>srcText</TT> to the UnicodeString object.
2087 * <TT>srcText</TT> is not modified.
2088 * @param srcText the source for the new characters
2089 * @return a reference to this
2092 inline UnicodeString
& append(const UnicodeString
& srcText
);
2095 * Append the characters in <TT>srcChars</TT> in the range
2096 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) to the UnicodeString
2098 * <TT>start</TT>. <TT>srcChars</TT> is not modified.
2099 * @param srcChars the source for the new characters
2100 * @param srcStart the offset into <TT>srcChars</TT> where new characters
2102 * @param srcLength the number of characters in <TT>srcChars</TT> in
2103 * the append string; can be -1 if <TT>srcChars</TT> is NUL-terminated
2104 * @return a reference to this
2107 inline UnicodeString
& append(const UChar
*srcChars
,
2112 * Append the characters in <TT>srcChars</TT> to the UnicodeString object.
2113 * <TT>srcChars</TT> is not modified.
2114 * @param srcChars the source for the new characters
2115 * @param srcLength the number of Unicode characters in <TT>srcChars</TT>;
2116 * can be -1 if <TT>srcChars</TT> is NUL-terminated
2117 * @return a reference to this
2120 inline UnicodeString
& append(const UChar
*srcChars
,
2124 * Append the code unit <TT>srcChar</TT> to the UnicodeString object.
2125 * @param srcChar the code unit to append
2126 * @return a reference to this
2129 inline UnicodeString
& append(UChar srcChar
);
2132 * Append the code point <TT>srcChar</TT> to the UnicodeString object.
2133 * @param srcChar the code point to append
2134 * @return a reference to this
2137 UnicodeString
& append(UChar32 srcChar
);
2140 /* Insert operations */
2143 * Insert the characters in <TT>srcText</TT> in the range
2144 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) into the UnicodeString
2145 * object at offset <TT>start</TT>. <TT>srcText</TT> is not modified.
2146 * @param start the offset where the insertion begins
2147 * @param srcText the source for the new characters
2148 * @param srcStart the offset into <TT>srcText</TT> where new characters
2150 * @param srcLength the number of characters in <TT>srcText</TT> in
2152 * @return a reference to this
2155 inline UnicodeString
& insert(int32_t start
,
2156 const UnicodeString
& srcText
,
2161 * Insert the characters in <TT>srcText</TT> into the UnicodeString object
2162 * at offset <TT>start</TT>. <TT>srcText</TT> is not modified.
2163 * @param start the offset where the insertion begins
2164 * @param srcText the source for the new characters
2165 * @return a reference to this
2168 inline UnicodeString
& insert(int32_t start
,
2169 const UnicodeString
& srcText
);
2172 * Insert the characters in <TT>srcChars</TT> in the range
2173 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>) into the UnicodeString
2174 * object at offset <TT>start</TT>. <TT>srcChars</TT> is not modified.
2175 * @param start the offset at which the insertion begins
2176 * @param srcChars the source for the new characters
2177 * @param srcStart the offset into <TT>srcChars</TT> where new characters
2179 * @param srcLength the number of characters in <TT>srcChars</TT>
2180 * in the insert string
2181 * @return a reference to this
2184 inline UnicodeString
& insert(int32_t start
,
2185 const UChar
*srcChars
,
2190 * Insert the characters in <TT>srcChars</TT> into the UnicodeString object
2191 * at offset <TT>start</TT>. <TT>srcChars</TT> is not modified.
2192 * @param start the offset where the insertion begins
2193 * @param srcChars the source for the new characters
2194 * @param srcLength the number of Unicode characters in srcChars.
2195 * @return a reference to this
2198 inline UnicodeString
& insert(int32_t start
,
2199 const UChar
*srcChars
,
2203 * Insert the code unit <TT>srcChar</TT> into the UnicodeString object at
2204 * offset <TT>start</TT>.
2205 * @param start the offset at which the insertion occurs
2206 * @param srcChar the code unit to insert
2207 * @return a reference to this
2210 inline UnicodeString
& insert(int32_t start
,
2214 * Insert the code point <TT>srcChar</TT> into the UnicodeString object at
2215 * offset <TT>start</TT>.
2216 * @param start the offset at which the insertion occurs
2217 * @param srcChar the code point to insert
2218 * @return a reference to this
2221 inline UnicodeString
& insert(int32_t start
,
2225 /* Replace operations */
2228 * Replace the characters in the range
2229 * [<TT>start</TT>, <TT>start + length</TT>) with the characters in
2230 * <TT>srcText</TT> in the range
2231 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
2232 * <TT>srcText</TT> is not modified.
2233 * @param start the offset at which the replace operation begins
2234 * @param length the number of characters to replace. The character at
2235 * <TT>start + length</TT> is not modified.
2236 * @param srcText the source for the new characters
2237 * @param srcStart the offset into <TT>srcText</TT> where new characters
2239 * @param srcLength the number of characters in <TT>srcText</TT> in
2240 * the replace string
2241 * @return a reference to this
2244 UnicodeString
& replace(int32_t start
,
2246 const UnicodeString
& srcText
,
2251 * Replace the characters in the range
2252 * [<TT>start</TT>, <TT>start + length</TT>)
2253 * with the characters in <TT>srcText</TT>. <TT>srcText</TT> is
2255 * @param start the offset at which the replace operation begins
2256 * @param length the number of characters to replace. The character at
2257 * <TT>start + length</TT> is not modified.
2258 * @param srcText the source for the new characters
2259 * @return a reference to this
2262 UnicodeString
& replace(int32_t start
,
2264 const UnicodeString
& srcText
);
2267 * Replace the characters in the range
2268 * [<TT>start</TT>, <TT>start + length</TT>) with the characters in
2269 * <TT>srcChars</TT> in the range
2270 * [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>). <TT>srcChars</TT>
2272 * @param start the offset at which the replace operation begins
2273 * @param length the number of characters to replace. The character at
2274 * <TT>start + length</TT> is not modified.
2275 * @param srcChars the source for the new characters
2276 * @param srcStart the offset into <TT>srcChars</TT> where new characters
2278 * @param srcLength the number of characters in <TT>srcChars</TT>
2279 * in the replace string
2280 * @return a reference to this
2283 UnicodeString
& replace(int32_t start
,
2285 const UChar
*srcChars
,
2290 * Replace the characters in the range
2291 * [<TT>start</TT>, <TT>start + length</TT>) with the characters in
2292 * <TT>srcChars</TT>. <TT>srcChars</TT> is not modified.
2293 * @param start the offset at which the replace operation begins
2294 * @param length number of characters to replace. The character at
2295 * <TT>start + length</TT> is not modified.
2296 * @param srcChars the source for the new characters
2297 * @param srcLength the number of Unicode characters in srcChars
2298 * @return a reference to this
2301 inline UnicodeString
& replace(int32_t start
,
2303 const UChar
*srcChars
,
2307 * Replace the characters in the range
2308 * [<TT>start</TT>, <TT>start + length</TT>) with the code unit
2310 * @param start the offset at which the replace operation begins
2311 * @param length the number of characters to replace. The character at
2312 * <TT>start + length</TT> is not modified.
2313 * @param srcChar the new code unit
2314 * @return a reference to this
2317 inline UnicodeString
& replace(int32_t start
,
2322 * Replace the characters in the range
2323 * [<TT>start</TT>, <TT>start + length</TT>) with the code point
2325 * @param start the offset at which the replace operation begins
2326 * @param length the number of characters to replace. The character at
2327 * <TT>start + length</TT> is not modified.
2328 * @param srcChar the new code point
2329 * @return a reference to this
2332 UnicodeString
& replace(int32_t start
, int32_t length
, UChar32 srcChar
);
2335 * Replace the characters in the range [<TT>start</TT>, <TT>limit</TT>)
2336 * with the characters in <TT>srcText</TT>. <TT>srcText</TT> is not modified.
2337 * @param start the offset at which the replace operation begins
2338 * @param limit the offset immediately following the replace range
2339 * @param srcText the source for the new characters
2340 * @return a reference to this
2343 inline UnicodeString
& replaceBetween(int32_t start
,
2345 const UnicodeString
& srcText
);
2348 * Replace the characters in the range [<TT>start</TT>, <TT>limit</TT>)
2349 * with the characters in <TT>srcText</TT> in the range
2350 * [<TT>srcStart</TT>, <TT>srcLimit</TT>). <TT>srcText</TT> is not modified.
2351 * @param start the offset at which the replace operation begins
2352 * @param limit the offset immediately following the replace range
2353 * @param srcText the source for the new characters
2354 * @param srcStart the offset into <TT>srcText</TT> where new characters
2356 * @param srcLimit the offset immediately following the range to copy
2357 * in <TT>srcText</TT>
2358 * @return a reference to this
2361 inline UnicodeString
& replaceBetween(int32_t start
,
2363 const UnicodeString
& srcText
,
2368 * Replace a substring of this object with the given text.
2369 * @param start the beginning index, inclusive; <code>0 <= start
2371 * @param limit the ending index, exclusive; <code>start <= limit
2372 * <= length()</code>.
2373 * @param text the text to replace characters <code>start</code>
2374 * to <code>limit - 1</code>
2377 virtual void handleReplaceBetween(int32_t start
,
2379 const UnicodeString
& text
);
2383 * @return TRUE if it has MetaData
2386 virtual UBool
hasMetaData() const;
2389 * Copy a substring of this object, retaining attribute (out-of-band)
2390 * information. This method is used to duplicate or reorder substrings.
2391 * The destination index must not overlap the source range.
2393 * @param start the beginning index, inclusive; <code>0 <= start <=
2395 * @param limit the ending index, exclusive; <code>start <= limit <=
2397 * @param dest the destination index. The characters from
2398 * <code>start..limit-1</code> will be copied to <code>dest</code>.
2399 * Implementations of this method may assume that <code>dest <= start ||
2400 * dest >= limit</code>.
2403 virtual void copy(int32_t start
, int32_t limit
, int32_t dest
);
2405 /* Search and replace operations */
2408 * Replace all occurrences of characters in oldText with the characters
2410 * @param oldText the text containing the search text
2411 * @param newText the text containing the replacement text
2412 * @return a reference to this
2415 inline UnicodeString
& findAndReplace(const UnicodeString
& oldText
,
2416 const UnicodeString
& newText
);
2419 * Replace all occurrences of characters in oldText with characters
2421 * in the range [<TT>start</TT>, <TT>start + length</TT>).
2422 * @param start the start of the range in which replace will be performed
2423 * @param length the length of the range in which replace will be performed
2424 * @param oldText the text containing the search text
2425 * @param newText the text containing the replacement text
2426 * @return a reference to this
2429 inline UnicodeString
& findAndReplace(int32_t start
,
2431 const UnicodeString
& oldText
,
2432 const UnicodeString
& newText
);
2435 * Replace all occurrences of characters in oldText in the range
2436 * [<TT>oldStart</TT>, <TT>oldStart + oldLength</TT>) with the characters
2437 * in newText in the range
2438 * [<TT>newStart</TT>, <TT>newStart + newLength</TT>)
2439 * in the range [<TT>start</TT>, <TT>start + length</TT>).
2440 * @param start the start of the range in which replace will be performed
2441 * @param length the length of the range in which replace will be performed
2442 * @param oldText the text containing the search text
2443 * @param oldStart the start of the search range in <TT>oldText</TT>
2444 * @param oldLength the length of the search range in <TT>oldText</TT>
2445 * @param newText the text containing the replacement text
2446 * @param newStart the start of the replacement range in <TT>newText</TT>
2447 * @param newLength the length of the replacement range in <TT>newText</TT>
2448 * @return a reference to this
2451 UnicodeString
& findAndReplace(int32_t start
,
2453 const UnicodeString
& oldText
,
2456 const UnicodeString
& newText
,
2461 /* Remove operations */
2464 * Remove all characters from the UnicodeString object.
2465 * @return a reference to this
2468 inline UnicodeString
& remove(void);
2471 * Remove the characters in the range
2472 * [<TT>start</TT>, <TT>start + length</TT>) from the UnicodeString object.
2473 * @param start the offset of the first character to remove
2474 * @param length the number of characters to remove
2475 * @return a reference to this
2478 inline UnicodeString
& remove(int32_t start
,
2479 int32_t length
= (int32_t)INT32_MAX
);
2482 * Remove the characters in the range
2483 * [<TT>start</TT>, <TT>limit</TT>) from the UnicodeString object.
2484 * @param start the offset of the first character to remove
2485 * @param limit the offset immediately following the range to remove
2486 * @return a reference to this
2489 inline UnicodeString
& removeBetween(int32_t start
,
2490 int32_t limit
= (int32_t)INT32_MAX
);
2493 * Retain only the characters in the range
2494 * [<code>start</code>, <code>limit</code>) from the UnicodeString object.
2495 * Removes characters before <code>start</code> and at and after <code>limit</code>.
2496 * @param start the offset of the first character to retain
2497 * @param limit the offset immediately following the range to retain
2498 * @return a reference to this
2501 inline UnicodeString
&retainBetween(int32_t start
, int32_t limit
= INT32_MAX
);
2503 /* Length operations */
2506 * Pad the start of this UnicodeString with the character <TT>padChar</TT>.
2507 * If the length of this UnicodeString is less than targetLength,
2508 * targetLength - length() copies of padChar will be added to the
2509 * beginning of this UnicodeString.
2510 * @param targetLength the desired length of the string
2511 * @param padChar the character to use for padding. Defaults to
2513 * @return TRUE if the text was padded, FALSE otherwise.
2516 UBool
padLeading(int32_t targetLength
,
2517 UChar padChar
= 0x0020);
2520 * Pad the end of this UnicodeString with the character <TT>padChar</TT>.
2521 * If the length of this UnicodeString is less than targetLength,
2522 * targetLength - length() copies of padChar will be added to the
2523 * end of this UnicodeString.
2524 * @param targetLength the desired length of the string
2525 * @param padChar the character to use for padding. Defaults to
2527 * @return TRUE if the text was padded, FALSE otherwise.
2530 UBool
padTrailing(int32_t targetLength
,
2531 UChar padChar
= 0x0020);
2534 * Truncate this UnicodeString to the <TT>targetLength</TT>.
2535 * @param targetLength the desired length of this UnicodeString.
2536 * @return TRUE if the text was truncated, FALSE otherwise
2539 inline UBool
truncate(int32_t targetLength
);
2542 * Trims leading and trailing whitespace from this UnicodeString.
2543 * @return a reference to this
2546 UnicodeString
& trim(void);
2549 /* Miscellaneous operations */
2552 * Reverse this UnicodeString in place.
2553 * @return a reference to this
2556 inline UnicodeString
& reverse(void);
2559 * Reverse the range [<TT>start</TT>, <TT>start + length</TT>) in
2560 * this UnicodeString.
2561 * @param start the start of the range to reverse
2562 * @param length the number of characters to reverse
2563 * @return a reference to this
2566 inline UnicodeString
& reverse(int32_t start
,
2570 * Convert the characters in this to UPPER CASE following the conventions of
2571 * the default locale.
2572 * @return A reference to this.
2575 UnicodeString
& toUpper(void);
2578 * Convert the characters in this to UPPER CASE following the conventions of
2579 * a specific locale.
2580 * @param locale The locale containing the conventions to use.
2581 * @return A reference to this.
2584 UnicodeString
& toUpper(const Locale
& locale
);
2587 * Convert the characters in this to lower case following the conventions of
2588 * the default locale.
2589 * @return A reference to this.
2592 UnicodeString
& toLower(void);
2595 * Convert the characters in this to lower case following the conventions of
2596 * a specific locale.
2597 * @param locale The locale containing the conventions to use.
2598 * @return A reference to this.
2601 UnicodeString
& toLower(const Locale
& locale
);
2603 #if !UCONFIG_NO_BREAK_ITERATION
2606 * Titlecase this string, convenience function using the default locale.
2608 * Casing is locale-dependent and context-sensitive.
2609 * Titlecasing uses a break iterator to find the first characters of words
2610 * that are to be titlecased. It titlecases those characters and lowercases
2613 * The titlecase break iterator can be provided to customize for arbitrary
2614 * styles, using rules and dictionaries beyond the standard iterators.
2615 * It may be more efficient to always provide an iterator to avoid
2616 * opening and closing one for each string.
2617 * The standard titlecase iterator for the root locale implements the
2618 * algorithm of Unicode TR 21.
2620 * This function uses only the setText(), first() and next() methods of the
2621 * provided break iterator.
2623 * @param titleIter A break iterator to find the first characters of words
2624 * that are to be titlecased.
2625 * If none is provided (0), then a standard titlecase
2626 * break iterator is opened.
2627 * Otherwise the provided iterator is set to the string's text.
2628 * @return A reference to this.
2631 UnicodeString
&toTitle(BreakIterator
*titleIter
);
2634 * Titlecase this string.
2636 * Casing is locale-dependent and context-sensitive.
2637 * Titlecasing uses a break iterator to find the first characters of words
2638 * that are to be titlecased. It titlecases those characters and lowercases
2641 * The titlecase break iterator can be provided to customize for arbitrary
2642 * styles, using rules and dictionaries beyond the standard iterators.
2643 * It may be more efficient to always provide an iterator to avoid
2644 * opening and closing one for each string.
2645 * The standard titlecase iterator for the root locale implements the
2646 * algorithm of Unicode TR 21.
2648 * This function uses only the setText(), first() and next() methods of the
2649 * provided break iterator.
2651 * @param titleIter A break iterator to find the first characters of words
2652 * that are to be titlecased.
2653 * If none is provided (0), then a standard titlecase
2654 * break iterator is opened.
2655 * Otherwise the provided iterator is set to the string's text.
2656 * @param locale The locale to consider.
2657 * @return A reference to this.
2660 UnicodeString
&toTitle(BreakIterator
*titleIter
, const Locale
&locale
);
2663 * Titlecase this string, with options.
2665 * Casing is locale-dependent and context-sensitive.
2666 * Titlecasing uses a break iterator to find the first characters of words
2667 * that are to be titlecased. It titlecases those characters and lowercases
2668 * all others. (This can be modified with options.)
2670 * The titlecase break iterator can be provided to customize for arbitrary
2671 * styles, using rules and dictionaries beyond the standard iterators.
2672 * It may be more efficient to always provide an iterator to avoid
2673 * opening and closing one for each string.
2674 * The standard titlecase iterator for the root locale implements the
2675 * algorithm of Unicode TR 21.
2677 * This function uses only the setText(), first() and next() methods of the
2678 * provided break iterator.
2680 * @param titleIter A break iterator to find the first characters of words
2681 * that are to be titlecased.
2682 * If none is provided (0), then a standard titlecase
2683 * break iterator is opened.
2684 * Otherwise the provided iterator is set to the string's text.
2685 * @param locale The locale to consider.
2686 * @param options Options bit set, see ucasemap_open().
2687 * @return A reference to this.
2688 * @see U_TITLECASE_NO_LOWERCASE
2689 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
2690 * @see ucasemap_open
2693 UnicodeString
&toTitle(BreakIterator
*titleIter
, const Locale
&locale
, uint32_t options
);
2698 * Case-folds the characters in this string.
2700 * Case-folding is locale-independent and not context-sensitive,
2701 * but there is an option for whether to include or exclude mappings for dotted I
2702 * and dotless i that are marked with 'T' in CaseFolding.txt.
2704 * The result may be longer or shorter than the original.
2706 * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
2707 * @return A reference to this.
2710 UnicodeString
&foldCase(uint32_t options
=0 /*U_FOLD_CASE_DEFAULT*/);
2712 //========================================
2713 // Access to the internal buffer
2714 //========================================
2717 * Get a read/write pointer to the internal buffer.
2718 * The buffer is guaranteed to be large enough for at least minCapacity UChars,
2719 * writable, and is still owned by the UnicodeString object.
2720 * Calls to getBuffer(minCapacity) must not be nested, and
2721 * must be matched with calls to releaseBuffer(newLength).
2722 * If the string buffer was read-only or shared,
2723 * then it will be reallocated and copied.
2725 * An attempted nested call will return 0, and will not further modify the
2726 * state of the UnicodeString object.
2727 * It also returns 0 if the string is bogus.
2729 * The actual capacity of the string buffer may be larger than minCapacity.
2730 * getCapacity() returns the actual capacity.
2731 * For many operations, the full capacity should be used to avoid reallocations.
2733 * While the buffer is "open" between getBuffer(minCapacity)
2734 * and releaseBuffer(newLength), the following applies:
2735 * - The string length is set to 0.
2736 * - Any read API call on the UnicodeString object will behave like on a 0-length string.
2737 * - Any write API call on the UnicodeString object is disallowed and will have no effect.
2738 * - You can read from and write to the returned buffer.
2739 * - The previous string contents will still be in the buffer;
2740 * if you want to use it, then you need to call length() before getBuffer(minCapacity).
2741 * If the length() was greater than minCapacity, then any contents after minCapacity may be lost.
2743 * The buffer contents are not NUL-terminated by getBuffer().
2744 * If length()<getCapacity() then you can terminate it by writing a NUL
2745 * at index length().
2746 * - You must call releaseBuffer(newLength) in order to
2747 * return to normal UnicodeString operation.
2749 * @param minCapacity the minimum number of UChars that are to be available
2750 * in the buffer, starting at the returned pointer;
2751 * default to the current string capacity if minCapacity==-1
2752 * @return a writable pointer to the internal string buffer,
2753 * or 0 if an error occurs (nested calls, out of memory)
2755 * @see releaseBuffer
2756 * @see getTerminatedBuffer()
2759 UChar
*getBuffer(int32_t minCapacity
);
2762 * Release a read/write buffer on a UnicodeString object with an
2763 * "open" getBuffer(minCapacity).
2764 * This function must be called in a matched pair with getBuffer(minCapacity).
2765 * releaseBuffer(newLength) must be called if and only if a getBuffer(minCapacity) is "open".
2767 * It will set the string length to newLength, at most to the current capacity.
2768 * If newLength==-1 then it will set the length according to the
2769 * first NUL in the buffer, or to the capacity if there is no NUL.
2771 * After calling releaseBuffer(newLength) the UnicodeString is back to normal operation.
2773 * @param newLength the new length of the UnicodeString object;
2774 * defaults to the current capacity if newLength is greater than that;
2775 * if newLength==-1, it defaults to u_strlen(buffer) but not more than
2776 * the current capacity of the string
2778 * @see getBuffer(int32_t minCapacity)
2781 void releaseBuffer(int32_t newLength
=-1);
2784 * Get a read-only pointer to the internal buffer.
2785 * This can be called at any time on a valid UnicodeString.
2787 * It returns 0 if the string is bogus, or
2788 * during an "open" getBuffer(minCapacity).
2790 * It can be called as many times as desired.
2791 * The pointer that it returns will remain valid until the UnicodeString object is modified,
2792 * at which time the pointer is semantically invalidated and must not be used any more.
2794 * The capacity of the buffer can be determined with getCapacity().
2795 * The part after length() may or may not be initialized and valid,
2796 * depending on the history of the UnicodeString object.
2798 * The buffer contents are (probably) not NUL-terminated.
2799 * You can check if it is with
2800 * <code>(s.length()<s.getCapacity() && buffer[s.length()]==0)</code>.
2801 * (See getTerminatedBuffer().)
2803 * The buffer may reside in read-only memory. Its contents must not be modified.
2806 * @return a read-only pointer to the internal string buffer,
2807 * or 0 if the string is bogus or during an "open" getBuffer(minCapacity)
2809 * @see getBuffer(int32_t minCapacity)
2810 * @see getTerminatedBuffer()
2813 inline const UChar
*getBuffer() const;
2816 * Get a read-only pointer to the internal buffer,
2817 * making sure that it is NUL-terminated.
2818 * This can be called at any time on a valid UnicodeString.
2820 * It returns 0 if the string is bogus, or
2821 * during an "open" getBuffer(minCapacity), or if the buffer cannot
2822 * be NUL-terminated (because memory allocation failed).
2824 * It can be called as many times as desired.
2825 * The pointer that it returns will remain valid until the UnicodeString object is modified,
2826 * at which time the pointer is semantically invalidated and must not be used any more.
2828 * The capacity of the buffer can be determined with getCapacity().
2829 * The part after length()+1 may or may not be initialized and valid,
2830 * depending on the history of the UnicodeString object.
2832 * The buffer contents are guaranteed to be NUL-terminated.
2833 * getTerminatedBuffer() may reallocate the buffer if a terminating NUL is written.
2835 * For this reason, this function is not const, unlike getBuffer().
2836 * Note that a UnicodeString may also contain NUL characters as part of its contents.
2838 * The buffer may reside in read-only memory. Its contents must not be modified.
2841 * @return a read-only pointer to the internal string buffer,
2842 * or 0 if the string is bogus, during an "open" getBuffer(minCapacity), or if the buffer cannot be NUL-terminated
2844 * @see getBuffer(int32_t minCapacity)
2848 inline const UChar
*getTerminatedBuffer();
2850 //========================================
2852 //========================================
2854 /** Construct an empty UnicodeString.
2857 inline UnicodeString();
2860 * Construct a UnicodeString with capacity to hold <TT>capacity</TT> UChars
2861 * @param capacity the number of UChars this UnicodeString should hold
2862 * before a resize is necessary; if count is greater than 0 and count
2863 * code points c take up more space than capacity, then capacity is adjusted accordingly.
2865 * @param c is used to initially fill the string
2866 * @param count specifies how many code points c are to be written in the string
2870 UnicodeString(int32_t capacity
, UChar32 c
, int32_t count
);
2873 * Single UChar (code unit) constructor.
2875 * It is recommended to mark this constructor "explicit" by
2876 * <code>-DUNISTR_FROM_CHAR_EXPLICIT=explicit</code>
2877 * on the compiler command line or similar.
2878 * @param ch the character to place in the UnicodeString
2881 UNISTR_FROM_CHAR_EXPLICIT
UnicodeString(UChar ch
);
2884 * Single UChar32 (code point) constructor.
2886 * It is recommended to mark this constructor "explicit" by
2887 * <code>-DUNISTR_FROM_CHAR_EXPLICIT=explicit</code>
2888 * on the compiler command line or similar.
2889 * @param ch the character to place in the UnicodeString
2892 UNISTR_FROM_CHAR_EXPLICIT
UnicodeString(UChar32 ch
);
2895 * UChar* constructor.
2897 * It is recommended to mark this constructor "explicit" by
2898 * <code>-DUNISTR_FROM_STRING_EXPLICIT=explicit</code>
2899 * on the compiler command line or similar.
2900 * @param text The characters to place in the UnicodeString. <TT>text</TT>
2901 * must be NUL (U+0000) terminated.
2904 UNISTR_FROM_STRING_EXPLICIT
UnicodeString(const UChar
*text
);
2907 * UChar* constructor.
2908 * @param text The characters to place in the UnicodeString.
2909 * @param textLength The number of Unicode characters in <TT>text</TT> to copy.
2913 UnicodeString(const UChar
*text
,
2914 int32_t textLength
);
2917 * Readonly-aliasing UChar* constructor.
2918 * The text will be used for the UnicodeString object, but
2919 * it will not be released when the UnicodeString is destroyed.
2920 * This has copy-on-write semantics:
2921 * When the string is modified, then the buffer is first copied into
2922 * newly allocated memory.
2923 * The aliased buffer is never modified.
2925 * In an assignment to another UnicodeString, when using the copy constructor
2926 * or the assignment operator, the text will be copied.
2927 * When using fastCopyFrom(), the text will be aliased again,
2928 * so that both strings then alias the same readonly-text.
2930 * @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
2931 * This must be true if <code>textLength==-1</code>.
2932 * @param text The characters to alias for the UnicodeString.
2933 * @param textLength The number of Unicode characters in <code>text</code> to alias.
2934 * If -1, then this constructor will determine the length
2935 * by calling <code>u_strlen()</code>.
2938 UnicodeString(UBool isTerminated
,
2940 int32_t textLength
);
2943 * Writable-aliasing UChar* constructor.
2944 * The text will be used for the UnicodeString object, but
2945 * it will not be released when the UnicodeString is destroyed.
2946 * This has write-through semantics:
2947 * For as long as the capacity of the buffer is sufficient, write operations
2948 * will directly affect the buffer. When more capacity is necessary, then
2949 * a new buffer will be allocated and the contents copied as with regularly
2950 * constructed strings.
2951 * In an assignment to another UnicodeString, the buffer will be copied.
2952 * The extract(UChar *dst) function detects whether the dst pointer is the same
2953 * as the string buffer itself and will in this case not copy the contents.
2955 * @param buffer The characters to alias for the UnicodeString.
2956 * @param buffLength The number of Unicode characters in <code>buffer</code> to alias.
2957 * @param buffCapacity The size of <code>buffer</code> in UChars.
2960 UnicodeString(UChar
*buffer
, int32_t buffLength
, int32_t buffCapacity
);
2962 #if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION
2965 * char* constructor.
2966 * Uses the default converter (and thus depends on the ICU conversion code)
2967 * unless U_CHARSET_IS_UTF8 is set to 1.
2969 * For ASCII (really "invariant character") strings it is more efficient to use
2970 * the constructor that takes a US_INV (for its enum EInvariant).
2971 * For ASCII (invariant-character) string literals, see UNICODE_STRING and
2972 * UNICODE_STRING_SIMPLE.
2974 * It is recommended to mark this constructor "explicit" by
2975 * <code>-DUNISTR_FROM_STRING_EXPLICIT=explicit</code>
2976 * on the compiler command line or similar.
2977 * @param codepageData an array of bytes, null-terminated,
2978 * in the platform's default codepage.
2980 * @see UNICODE_STRING
2981 * @see UNICODE_STRING_SIMPLE
2983 UNISTR_FROM_STRING_EXPLICIT
UnicodeString(const char *codepageData
);
2986 * char* constructor.
2987 * Uses the default converter (and thus depends on the ICU conversion code)
2988 * unless U_CHARSET_IS_UTF8 is set to 1.
2989 * @param codepageData an array of bytes in the platform's default codepage.
2990 * @param dataLength The number of bytes in <TT>codepageData</TT>.
2993 UnicodeString(const char *codepageData
, int32_t dataLength
);
2997 #if !UCONFIG_NO_CONVERSION
3000 * char* constructor.
3001 * @param codepageData an array of bytes, null-terminated
3002 * @param codepage the encoding of <TT>codepageData</TT>. The special
3003 * value 0 for <TT>codepage</TT> indicates that the text is in the
3004 * platform's default codepage.
3006 * If <code>codepage</code> is an empty string (<code>""</code>),
3007 * then a simple conversion is performed on the codepage-invariant
3008 * subset ("invariant characters") of the platform encoding. See utypes.h.
3009 * Recommendation: For invariant-character strings use the constructor
3010 * UnicodeString(const char *src, int32_t length, enum EInvariant inv)
3011 * because it avoids object code dependencies of UnicodeString on
3012 * the conversion code.
3016 UnicodeString(const char *codepageData
, const char *codepage
);
3019 * char* constructor.
3020 * @param codepageData an array of bytes.
3021 * @param dataLength The number of bytes in <TT>codepageData</TT>.
3022 * @param codepage the encoding of <TT>codepageData</TT>. The special
3023 * value 0 for <TT>codepage</TT> indicates that the text is in the
3024 * platform's default codepage.
3025 * If <code>codepage</code> is an empty string (<code>""</code>),
3026 * then a simple conversion is performed on the codepage-invariant
3027 * subset ("invariant characters") of the platform encoding. See utypes.h.
3028 * Recommendation: For invariant-character strings use the constructor
3029 * UnicodeString(const char *src, int32_t length, enum EInvariant inv)
3030 * because it avoids object code dependencies of UnicodeString on
3031 * the conversion code.
3035 UnicodeString(const char *codepageData
, int32_t dataLength
, const char *codepage
);
3038 * char * / UConverter constructor.
3039 * This constructor uses an existing UConverter object to
3040 * convert the codepage string to Unicode and construct a UnicodeString from that.
3043 * The converter is reset at first.
3044 * If the error code indicates a failure before this constructor is called,
3045 * or if an error occurs during conversion or construction,
3046 * then the string will be bogus.
3048 * This function avoids the overhead of opening and closing a converter if
3049 * multiple strings are constructed.
3051 * @param src input codepage string
3052 * @param srcLength length of the input string, can be -1 for NUL-terminated strings
3053 * @param cnv converter object (ucnv_resetToUnicode() will be called),
3054 * can be NULL for the default converter
3055 * @param errorCode normal ICU error code
3059 const char *src
, int32_t srcLength
,
3061 UErrorCode
&errorCode
);
3066 * Constructs a Unicode string from an invariant-character char * string.
3067 * About invariant characters see utypes.h.
3068 * This constructor has no runtime dependency on conversion code and is
3069 * therefore recommended over ones taking a charset name string
3070 * (where the empty string "" indicates invariant-character conversion).
3072 * Use the macro US_INV as the third, signature-distinguishing parameter.
3076 * void fn(const char *s) {
3077 * UnicodeString ustr(s, -1, US_INV);
3082 * @param src String using only invariant characters.
3083 * @param length Length of src, or -1 if NUL-terminated.
3084 * @param inv Signature-distinguishing parameter, use US_INV.
3089 UnicodeString(const char *src
, int32_t length
, enum EInvariant inv
);
3094 * @param that The UnicodeString object to copy.
3097 UnicodeString(const UnicodeString
& that
);
3100 * 'Substring' constructor from tail of source string.
3101 * @param src The UnicodeString object to copy.
3102 * @param srcStart The offset into <tt>src</tt> at which to start copying.
3105 UnicodeString(const UnicodeString
& src
, int32_t srcStart
);
3108 * 'Substring' constructor from subrange of source string.
3109 * @param src The UnicodeString object to copy.
3110 * @param srcStart The offset into <tt>src</tt> at which to start copying.
3111 * @param srcLength The number of characters from <tt>src</tt> to copy.
3114 UnicodeString(const UnicodeString
& src
, int32_t srcStart
, int32_t srcLength
);
3117 * Clone this object, an instance of a subclass of Replaceable.
3118 * Clones can be used concurrently in multiple threads.
3119 * If a subclass does not implement clone(), or if an error occurs,
3120 * then NULL is returned.
3121 * The clone functions in all subclasses return a pointer to a Replaceable
3122 * because some compilers do not support covariant (same-as-this)
3123 * return types; cast to the appropriate subclass if necessary.
3124 * The caller must delete the clone.
3126 * @return a clone of this object
3128 * @see Replaceable::clone
3129 * @see getDynamicClassID
3132 virtual Replaceable
*clone() const;
3137 virtual ~UnicodeString();
3140 * Create a UnicodeString from a UTF-8 string.
3141 * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
3142 * Calls u_strFromUTF8WithSub().
3144 * @param utf8 UTF-8 input string.
3145 * Note that a StringPiece can be implicitly constructed
3146 * from a std::string or a NUL-terminated const char * string.
3147 * @return A UnicodeString with equivalent UTF-16 contents.
3152 static UnicodeString
fromUTF8(const StringPiece
&utf8
);
3155 * Create a UnicodeString from a UTF-32 string.
3156 * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
3157 * Calls u_strFromUTF32WithSub().
3159 * @param utf32 UTF-32 input string. Must not be NULL.
3160 * @param length Length of the input string, or -1 if NUL-terminated.
3161 * @return A UnicodeString with equivalent UTF-16 contents.
3165 static UnicodeString
fromUTF32(const UChar32
*utf32
, int32_t length
);
3167 /* Miscellaneous operations */
3170 * Unescape a string of characters and return a string containing
3171 * the result. The following escape sequences are recognized:
3173 * \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
3174 * \\Uhhhhhhhh 8 hex digits
3175 * \\xhh 1-2 hex digits
3176 * \\ooo 1-3 octal digits; o in [0-7]
3177 * \\cX control-X; X is masked with 0x1F
3179 * as well as the standard ANSI C escapes:
3181 * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
3182 * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
3183 * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
3185 * Anything else following a backslash is generically escaped. For
3186 * example, "[a\\-z]" returns "[a-z]".
3188 * If an escape sequence is ill-formed, this method returns an empty
3189 * string. An example of an ill-formed sequence is "\\u" followed by
3190 * fewer than 4 hex digits.
3192 * This function is similar to u_unescape() but not identical to it.
3193 * The latter takes a source char*, so it does escape recognition
3194 * and also invariant conversion.
3196 * @return a string with backslash escapes interpreted, or an
3197 * empty string on error.
3198 * @see UnicodeString#unescapeAt()
3200 * @see u_unescapeAt()
3203 UnicodeString
unescape() const;
3206 * Unescape a single escape sequence and return the represented
3207 * character. See unescape() for a listing of the recognized escape
3208 * sequences. The character at offset-1 is assumed (without
3209 * checking) to be a backslash. If the escape sequence is
3210 * ill-formed, or the offset is out of range, U_SENTINEL=-1 is returned.
3213 * @param offset an input output parameter. On input, it is the
3214 * offset into this string where the escape sequence is located,
3215 * after the initial backslash. On output, it is advanced after the
3216 * last character parsed. On error, it is not advanced at all.
3217 * @return the character represented by the escape sequence at
3218 * offset, or U_SENTINEL=-1 on error.
3219 * @see UnicodeString#unescape()
3221 * @see u_unescapeAt()
3224 UChar32
unescapeAt(int32_t &offset
) const;
3227 * ICU "poor man's RTTI", returns a UClassID for this class.
3231 static UClassID U_EXPORT2
getStaticClassID();
3234 * ICU "poor man's RTTI", returns a UClassID for the actual class.
3238 virtual UClassID
getDynamicClassID() const;
3240 //========================================
3241 // Implementation methods
3242 //========================================
3246 * Implement Replaceable::getLength() (see jitterbug 1027).
3249 virtual int32_t getLength() const;
3252 * The change in Replaceable to use virtual getCharAt() allows
3253 * UnicodeString::charAt() to be inline again (see jitterbug 709).
3256 virtual UChar
getCharAt(int32_t offset
) const;
3259 * The change in Replaceable to use virtual getChar32At() allows
3260 * UnicodeString::char32At() to be inline again (see jitterbug 709).
3263 virtual UChar32
getChar32At(int32_t offset
) const;
3266 // For char* constructors. Could be made public.
3267 UnicodeString
&setToUTF8(const StringPiece
&utf8
);
3268 // For extract(char*).
3269 // We could make a toUTF8(target, capacity, errorCode) public but not
3270 // this version: New API will be cleaner if we make callers create substrings
3271 // rather than having start+length on every method,
3272 // and it should take a UErrorCode&.
3274 toUTF8(int32_t start
, int32_t len
,
3275 char *target
, int32_t capacity
) const;
3278 * Internal string contents comparison, called by operator==.
3279 * Requires: this & text not bogus and have same lengths.
3281 UBool
doEquals(const UnicodeString
&text
, int32_t len
) const;
3284 doCompare(int32_t start
,
3286 const UnicodeString
& srcText
,
3288 int32_t srcLength
) const;
3290 int8_t doCompare(int32_t start
,
3292 const UChar
*srcChars
,
3294 int32_t srcLength
) const;
3297 doCompareCodePointOrder(int32_t start
,
3299 const UnicodeString
& srcText
,
3301 int32_t srcLength
) const;
3303 int8_t doCompareCodePointOrder(int32_t start
,
3305 const UChar
*srcChars
,
3307 int32_t srcLength
) const;
3310 doCaseCompare(int32_t start
,
3312 const UnicodeString
&srcText
,
3315 uint32_t options
) const;
3318 doCaseCompare(int32_t start
,
3320 const UChar
*srcChars
,
3323 uint32_t options
) const;
3325 int32_t doIndexOf(UChar c
,
3327 int32_t length
) const;
3329 int32_t doIndexOf(UChar32 c
,
3331 int32_t length
) const;
3333 int32_t doLastIndexOf(UChar c
,
3335 int32_t length
) const;
3337 int32_t doLastIndexOf(UChar32 c
,
3339 int32_t length
) const;
3341 void doExtract(int32_t start
,
3344 int32_t dstStart
) const;
3346 inline void doExtract(int32_t start
,
3348 UnicodeString
& target
) const;
3350 inline UChar
doCharAt(int32_t offset
) const;
3352 UnicodeString
& doReplace(int32_t start
,
3354 const UnicodeString
& srcText
,
3358 UnicodeString
& doReplace(int32_t start
,
3360 const UChar
*srcChars
,
3364 UnicodeString
& doReverse(int32_t start
,
3367 // calculate hash code
3368 int32_t doHashCode(void) const;
3370 // get pointer to start of array
3371 // these do not check for kOpenGetBuffer, unlike the public getBuffer() function
3372 inline UChar
* getArrayStart(void);
3373 inline const UChar
* getArrayStart(void) const;
3375 // A UnicodeString object (not necessarily its current buffer)
3376 // is writable unless it isBogus() or it has an "open" getBuffer(minCapacity).
3377 inline UBool
isWritable() const;
3379 // Is the current buffer writable?
3380 inline UBool
isBufferWritable() const;
3382 // None of the following does releaseArray().
3383 inline void setLength(int32_t len
); // sets only fShortLength and fLength
3384 inline void setToEmpty(); // sets fFlags=kShortString
3385 inline void setArray(UChar
*array
, int32_t len
, int32_t capacity
); // does not set fFlags
3387 // allocate the array; result may be fStackBuffer
3388 // sets refCount to 1 if appropriate
3389 // sets fArray, fCapacity, and fFlags
3390 // returns boolean for success or failure
3391 UBool
allocate(int32_t capacity
);
3393 // release the array if owned
3394 void releaseArray(void);
3396 // turn a bogus string into an empty one
3399 // Shared worker that implements the assignment operator, the copy constructor,
// and fastCopyFrom(); fastCopy (default FALSE) selects the fastCopyFrom() behavior.
3400 UnicodeString
&copyFrom(const UnicodeString
&src
, UBool fastCopy
=FALSE
);
3402 // Pin start and limit to acceptable values.
3403 inline void pinIndex(int32_t& start
) const;
3404 inline void pinIndices(int32_t& start
,
3405 int32_t& length
) const;
3407 #if !UCONFIG_NO_CONVERSION
3409 /* Internal extract() using UConverter. */
3410 int32_t doExtract(int32_t start
, int32_t length
,
3411 char *dest
, int32_t destCapacity
,
3413 UErrorCode
&errorCode
) const;
3416 * Real constructor for converting from codepage data.
3417 * It assumes that it is called with !fRefCounted.
3419 * If <code>codepage==0</code>, then the default converter
3420 * is used for the platform encoding.
3421 * If <code>codepage</code> is an empty string (<code>""</code>),
3422 * then a simple conversion is performed on the codepage-invariant
3423 * subset ("invariant characters") of the platform encoding. See utypes.h.
3425 void doCodepageCreate(const char *codepageData
,
3427 const char *codepage
);
3430 * Worker function for creating a UnicodeString from
3431 * a codepage string using a UConverter.
3434 doCodepageCreate(const char *codepageData
,
3436 UConverter
*converter
,
3437 UErrorCode
&status
);
3442 * This function is called when write access to the array is necessary.
3445 * We need to make a copy of the array if
3446 * the buffer is read-only, or
3447 * the buffer is refCounted (shared), and refCount>1, or
3448 * the buffer is too small.
3450 * Return FALSE if memory could not be allocated.
3452 UBool
cloneArrayIfNeeded(int32_t newCapacity
= -1,
3453 int32_t growCapacity
= -1,
3454 UBool doCopyArray
= TRUE
,
3455 int32_t **pBufferToDelete
= 0,
3456 UBool forceClone
= FALSE
);
3459 * Common function for UnicodeString case mappings.
3460 * The stringCaseMapper has the same type UStringCaseMapper
3461 * as in ustr_imp.h for ustrcase_map().
3464 caseMap(const UCaseMap
*csm
, UStringCaseMapper
*stringCaseMapper
);
3468 int32_t removeRef(void);
3469 int32_t refCount(void) const;
3473 // Set the stack buffer size so that sizeof(UnicodeString) is,
3474 // naturally (without padding), a multiple of sizeof(pointer).
3475 US_STACKBUF_SIZE
= sizeof(void *)==4 ? 13 : 15, // Size of stack buffer for short strings
3476 kInvalidUChar
=0xffff, // invalid UChar index
3477 kGrowSize
=128, // grow size for this buffer
3478 kInvalidHashCode
=0, // invalid hash code
3479 kEmptyHashCode
=1, // hash code for empty string
3481 // bit flag values for fFlags
3482 kIsBogus
=1, // this string is bogus, i.e., not valid or NULL
3483 kUsingStackBuffer
=2,// using fUnion.fStackBuffer instead of fUnion.fFields
3484 kRefCounted
=4, // there is a refCount field before the characters in fArray
3485 kBufferIsReadonly
=8,// do not write to this buffer
3486 kOpenGetBuffer
=16, // getBuffer(minCapacity) was called (is "open"),
3487 // and releaseBuffer(newLength) must be called
3489 // combined values for convenience
3490 kShortString
=kUsingStackBuffer
,
3491 kLongString
=kRefCounted
,
3492 kReadonlyAlias
=kBufferIsReadonly
,
3496 friend class StringThreadTest
;
3497 friend class UnicodeStringAppendable
;
3499 union StackBufferOrFields
; // forward declaration necessary before friend declaration
3500 friend union StackBufferOrFields
; // make US_STACKBUF_SIZE visible inside fUnion
3503 * The following are all the class fields that are stored
3504 * in each UnicodeString object.
3505 * Note that UnicodeString has virtual functions,
3506 * therefore there is an implicit vtable pointer
3507 * as the first real field.
3508 * The fields should be aligned such that no padding is necessary.
3509 * On 32-bit machines, the size should be 32 bytes,
3510 * on 64-bit machines (8-byte pointers), it should be 40 bytes.
3512 * We use a hack to achieve this.
3514 * With at least some compilers, each of the following is forced to
3515 * a multiple of sizeof(pointer) [the largest field base unit here is a data pointer],
3516 * rounded up with additional padding if the fields do not already fit that requirement:
3517 * - sizeof(class UnicodeString)
3518 * - offsetof(UnicodeString, fUnion)
3522 * In order to avoid padding, we make sizeof(fStackBuffer)=16 (=8 UChars)
3523 * which is at least as large as sizeof(fFields) on 32-bit and 64-bit machines.
3524 * (Padding at the end of fFields is ok:
3525 * As long as there is no padding after fStackBuffer, it is not wasted space.)
3527 * We further assume that the compiler does not reorder the fields,
3528 * so that fRestOfStackBuffer (which holds a few more UChars) immediately follows after fUnion,
3529 * with at most some padding (but no other field) in between.
3530 * (Padding there would be wasted space, but functionally harmless.)
3532 * We use a few more sizeof(pointer)'s chunks of space with
3533 * fRestOfStackBuffer, fShortLength and fFlags,
3534 * to get up exactly to the intended sizeof(UnicodeString).
3536 // (implicit) *vtable;
3537 union StackBufferOrFields
{
3538 // fStackBuffer is used iff (fFlags&kUsingStackBuffer)
3539 // else fFields is used
3540 UChar fStackBuffer
[8]; // buffer for short strings, together with fRestOfStackBuffer
3542 UChar
*fArray
; // the Unicode data
3543 int32_t fCapacity
; // capacity of fArray (in UChars)
3544 int32_t fLength
; // number of characters in fArray if >127; else undefined
3547 UChar fRestOfStackBuffer
[US_STACKBUF_SIZE
-8];
3548 int8_t fShortLength
; // 0..127: length <0: real length is in fUnion.fFields.fLength
3549 uint8_t fFlags
; // bit flags: see constants above
3553 * Create a new UnicodeString with the concatenation of two others.
3555 * @param s1 The first string to be copied to the new one.
3556 * @param s2 The second string to be copied to the new one, after s1.
3557 * @return UnicodeString(s1).append(s2)
3560 U_COMMON_API UnicodeString U_EXPORT2
3561 operator+ (const UnicodeString
&s1
, const UnicodeString
&s2
);
3563 //========================================
3565 //========================================
3567 //========================================
3569 //========================================
3572 UnicodeString::pinIndex(int32_t& start
) const
3577 } else if(start
> length()) {
3583 UnicodeString::pinIndices(int32_t& start
,
3584 int32_t& _length
) const
3587 int32_t len
= length();
3590 } else if(start
> len
) {
3595 } else if(_length
> (len
- start
)) {
3596 _length
= (len
- start
);
3601 UnicodeString::getArrayStart()
3602 { return (fFlags
&kUsingStackBuffer
) ? fUnion
.fStackBuffer
: fUnion
.fFields
.fArray
; }
3605 UnicodeString::getArrayStart() const
3606 { return (fFlags
&kUsingStackBuffer
) ? fUnion
.fStackBuffer
: fUnion
.fFields
.fArray
; }
3608 //========================================
3609 // Default constructor
3610 //========================================
3613 UnicodeString::UnicodeString()
3615 fFlags(kShortString
)
3618 //========================================
3619 // Read-only implementation methods
3620 //========================================
3622 UnicodeString::length() const
3623 { return fShortLength
>=0 ? fShortLength
: fUnion
.fFields
.fLength
; }
3626 UnicodeString::getCapacity() const
3627 { return (fFlags
&kUsingStackBuffer
) ? US_STACKBUF_SIZE
: fUnion
.fFields
.fCapacity
; }
3630 UnicodeString::hashCode() const
3631 { return doHashCode(); }
3634 UnicodeString::isBogus() const
3635 { return (UBool
)(fFlags
& kIsBogus
); }
3638 UnicodeString::isWritable() const
3639 { return (UBool
)!(fFlags
&(kOpenGetBuffer
|kIsBogus
)); }
3642 UnicodeString::isBufferWritable() const
3645 !(fFlags
&(kOpenGetBuffer
|kIsBogus
|kBufferIsReadonly
)) &&
3646 (!(fFlags
&kRefCounted
) || refCount()==1));
3649 inline const UChar
*
3650 UnicodeString::getBuffer() const {
3651 if(fFlags
&(kIsBogus
|kOpenGetBuffer
)) {
3653 } else if(fFlags
&kUsingStackBuffer
) {
3654 return fUnion
.fStackBuffer
;
3656 return fUnion
.fFields
.fArray
;
3660 //========================================
3661 // Read-only alias methods
3662 //========================================
3664 UnicodeString::doCompare(int32_t start
,
3666 const UnicodeString
& srcText
,
3668 int32_t srcLength
) const
3670 if(srcText
.isBogus()) {
3671 return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise
3673 srcText
.pinIndices(srcStart
, srcLength
);
3674 return doCompare(start
, thisLength
, srcText
.getArrayStart(), srcStart
, srcLength
);
3679 UnicodeString::operator== (const UnicodeString
& text
) const
3682 return text
.isBogus();
3684 int32_t len
= length(), textLength
= text
.length();
3685 return !text
.isBogus() && len
== textLength
&& doEquals(text
, len
);
3690 UnicodeString::operator!= (const UnicodeString
& text
) const
3691 { return (! operator==(text
)); }
3694 UnicodeString::operator> (const UnicodeString
& text
) const
3695 { return doCompare(0, length(), text
, 0, text
.length()) == 1; }
3698 UnicodeString::operator< (const UnicodeString
& text
) const
3699 { return doCompare(0, length(), text
, 0, text
.length()) == -1; }
3702 UnicodeString::operator>= (const UnicodeString
& text
) const
3703 { return doCompare(0, length(), text
, 0, text
.length()) != -1; }
3706 UnicodeString::operator<= (const UnicodeString
& text
) const
3707 { return doCompare(0, length(), text
, 0, text
.length()) != 1; }
3710 UnicodeString::compare(const UnicodeString
& text
) const
3711 { return doCompare(0, length(), text
, 0, text
.length()); }
3714 UnicodeString::compare(int32_t start
,
3716 const UnicodeString
& srcText
) const
3717 { return doCompare(start
, _length
, srcText
, 0, srcText
.length()); }
3720 UnicodeString::compare(const UChar
*srcChars
,
3721 int32_t srcLength
) const
3722 { return doCompare(0, length(), srcChars
, 0, srcLength
); }
3725 UnicodeString::compare(int32_t start
,
3727 const UnicodeString
& srcText
,
3729 int32_t srcLength
) const
3730 { return doCompare(start
, _length
, srcText
, srcStart
, srcLength
); }
3733 UnicodeString::compare(int32_t start
,
3735 const UChar
*srcChars
) const
3736 { return doCompare(start
, _length
, srcChars
, 0, _length
); }
3739 UnicodeString::compare(int32_t start
,
3741 const UChar
*srcChars
,
3743 int32_t srcLength
) const
3744 { return doCompare(start
, _length
, srcChars
, srcStart
, srcLength
); }
3747 UnicodeString::compareBetween(int32_t start
,
3749 const UnicodeString
& srcText
,
3751 int32_t srcLimit
) const
3752 { return doCompare(start
, limit
- start
,
3753 srcText
, srcStart
, srcLimit
- srcStart
); }
3756 UnicodeString::doCompareCodePointOrder(int32_t start
,
3758 const UnicodeString
& srcText
,
3760 int32_t srcLength
) const
3762 if(srcText
.isBogus()) {
3763 return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise
3765 srcText
.pinIndices(srcStart
, srcLength
);
3766 return doCompareCodePointOrder(start
, thisLength
, srcText
.getArrayStart(), srcStart
, srcLength
);
3771 UnicodeString::compareCodePointOrder(const UnicodeString
& text
) const
3772 { return doCompareCodePointOrder(0, length(), text
, 0, text
.length()); }
3775 UnicodeString::compareCodePointOrder(int32_t start
,
3777 const UnicodeString
& srcText
) const
3778 { return doCompareCodePointOrder(start
, _length
, srcText
, 0, srcText
.length()); }
3781 UnicodeString::compareCodePointOrder(const UChar
*srcChars
,
3782 int32_t srcLength
) const
3783 { return doCompareCodePointOrder(0, length(), srcChars
, 0, srcLength
); }
3786 UnicodeString::compareCodePointOrder(int32_t start
,
3788 const UnicodeString
& srcText
,
3790 int32_t srcLength
) const
3791 { return doCompareCodePointOrder(start
, _length
, srcText
, srcStart
, srcLength
); }
3794 UnicodeString::compareCodePointOrder(int32_t start
,
3796 const UChar
*srcChars
) const
3797 { return doCompareCodePointOrder(start
, _length
, srcChars
, 0, _length
); }
3800 UnicodeString::compareCodePointOrder(int32_t start
,
3802 const UChar
*srcChars
,
3804 int32_t srcLength
) const
3805 { return doCompareCodePointOrder(start
, _length
, srcChars
, srcStart
, srcLength
); }
3808 UnicodeString::compareCodePointOrderBetween(int32_t start
,
3810 const UnicodeString
& srcText
,
3812 int32_t srcLimit
) const
3813 { return doCompareCodePointOrder(start
, limit
- start
,
3814 srcText
, srcStart
, srcLimit
- srcStart
); }
3817 UnicodeString::doCaseCompare(int32_t start
,
3819 const UnicodeString
&srcText
,
3822 uint32_t options
) const
3824 if(srcText
.isBogus()) {
3825 return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise
3827 srcText
.pinIndices(srcStart
, srcLength
);
3828 return doCaseCompare(start
, thisLength
, srcText
.getArrayStart(), srcStart
, srcLength
, options
);
3833 UnicodeString::caseCompare(const UnicodeString
&text
, uint32_t options
) const {
3834 return doCaseCompare(0, length(), text
, 0, text
.length(), options
);
3838 UnicodeString::caseCompare(int32_t start
,
3840 const UnicodeString
&srcText
,
3841 uint32_t options
) const {
3842 return doCaseCompare(start
, _length
, srcText
, 0, srcText
.length(), options
);
3846 UnicodeString::caseCompare(const UChar
*srcChars
,
3848 uint32_t options
) const {
3849 return doCaseCompare(0, length(), srcChars
, 0, srcLength
, options
);
3853 UnicodeString::caseCompare(int32_t start
,
3855 const UnicodeString
&srcText
,
3858 uint32_t options
) const {
3859 return doCaseCompare(start
, _length
, srcText
, srcStart
, srcLength
, options
);
3863 UnicodeString::caseCompare(int32_t start
,
3865 const UChar
*srcChars
,
3866 uint32_t options
) const {
3867 return doCaseCompare(start
, _length
, srcChars
, 0, _length
, options
);
3871 UnicodeString::caseCompare(int32_t start
,
3873 const UChar
*srcChars
,
3876 uint32_t options
) const {
3877 return doCaseCompare(start
, _length
, srcChars
, srcStart
, srcLength
, options
);
3881 UnicodeString::caseCompareBetween(int32_t start
,
3883 const UnicodeString
&srcText
,
3886 uint32_t options
) const {
3887 return doCaseCompare(start
, limit
- start
, srcText
, srcStart
, srcLimit
- srcStart
, options
);
3891 UnicodeString::indexOf(const UnicodeString
& srcText
,
3895 int32_t _length
) const
3897 if(!srcText
.isBogus()) {
3898 srcText
.pinIndices(srcStart
, srcLength
);
3900 return indexOf(srcText
.getArrayStart(), srcStart
, srcLength
, start
, _length
);
3907 UnicodeString::indexOf(const UnicodeString
& text
) const
3908 { return indexOf(text
, 0, text
.length(), 0, length()); }
3911 UnicodeString::indexOf(const UnicodeString
& text
,
3912 int32_t start
) const {
3914 return indexOf(text
, 0, text
.length(), start
, length() - start
);
3918 UnicodeString::indexOf(const UnicodeString
& text
,
3920 int32_t _length
) const
3921 { return indexOf(text
, 0, text
.length(), start
, _length
); }
3924 UnicodeString::indexOf(const UChar
*srcChars
,
3926 int32_t start
) const {
3928 return indexOf(srcChars
, 0, srcLength
, start
, length() - start
);
3932 UnicodeString::indexOf(const UChar
*srcChars
,
3935 int32_t _length
) const
3936 { return indexOf(srcChars
, 0, srcLength
, start
, _length
); }
3939 UnicodeString::indexOf(UChar c
,
3941 int32_t _length
) const
3942 { return doIndexOf(c
, start
, _length
); }
3945 UnicodeString::indexOf(UChar32 c
,
3947 int32_t _length
) const
3948 { return doIndexOf(c
, start
, _length
); }
3951 UnicodeString::indexOf(UChar c
) const
3952 { return doIndexOf(c
, 0, length()); }
3955 UnicodeString::indexOf(UChar32 c
) const
3956 { return indexOf(c
, 0, length()); }
3959 UnicodeString::indexOf(UChar c
,
3960 int32_t start
) const {
3962 return doIndexOf(c
, start
, length() - start
);
3966 UnicodeString::indexOf(UChar32 c
,
3967 int32_t start
) const {
3969 return indexOf(c
, start
, length() - start
);
3973 UnicodeString::lastIndexOf(const UChar
*srcChars
,
3976 int32_t _length
) const
3977 { return lastIndexOf(srcChars
, 0, srcLength
, start
, _length
); }
3980 UnicodeString::lastIndexOf(const UChar
*srcChars
,
3982 int32_t start
) const {
3984 return lastIndexOf(srcChars
, 0, srcLength
, start
, length() - start
);
3988 UnicodeString::lastIndexOf(const UnicodeString
& srcText
,
3992 int32_t _length
) const
3994 if(!srcText
.isBogus()) {
3995 srcText
.pinIndices(srcStart
, srcLength
);
3997 return lastIndexOf(srcText
.getArrayStart(), srcStart
, srcLength
, start
, _length
);
4004 UnicodeString::lastIndexOf(const UnicodeString
& text
,
4006 int32_t _length
) const
4007 { return lastIndexOf(text
, 0, text
.length(), start
, _length
); }
4010 UnicodeString::lastIndexOf(const UnicodeString
& text
,
4011 int32_t start
) const {
4013 return lastIndexOf(text
, 0, text
.length(), start
, length() - start
);
4017 UnicodeString::lastIndexOf(const UnicodeString
& text
) const
4018 { return lastIndexOf(text
, 0, text
.length(), 0, length()); }
4021 UnicodeString::lastIndexOf(UChar c
,
4023 int32_t _length
) const
4024 { return doLastIndexOf(c
, start
, _length
); }
4027 UnicodeString::lastIndexOf(UChar32 c
,
4029 int32_t _length
) const {
4030 return doLastIndexOf(c
, start
, _length
);
4034 UnicodeString::lastIndexOf(UChar c
) const
4035 { return doLastIndexOf(c
, 0, length()); }
4038 UnicodeString::lastIndexOf(UChar32 c
) const {
4039 return lastIndexOf(c
, 0, length());
4043 UnicodeString::lastIndexOf(UChar c
,
4044 int32_t start
) const {
4046 return doLastIndexOf(c
, start
, length() - start
);
4050 UnicodeString::lastIndexOf(UChar32 c
,
4051 int32_t start
) const {
4053 return lastIndexOf(c
, start
, length() - start
);
4057 UnicodeString::startsWith(const UnicodeString
& text
) const
4058 { return compare(0, text
.length(), text
, 0, text
.length()) == 0; }
4061 UnicodeString::startsWith(const UnicodeString
& srcText
,
4063 int32_t srcLength
) const
4064 { return doCompare(0, srcLength
, srcText
, srcStart
, srcLength
) == 0; }
4067 UnicodeString::startsWith(const UChar
*srcChars
, int32_t srcLength
) const {
4069 srcLength
= u_strlen(srcChars
);
4071 return doCompare(0, srcLength
, srcChars
, 0, srcLength
) == 0;
4075 UnicodeString::startsWith(const UChar
*srcChars
, int32_t srcStart
, int32_t srcLength
) const {
4077 srcLength
= u_strlen(srcChars
);
4079 return doCompare(0, srcLength
, srcChars
, srcStart
, srcLength
) == 0;
4083 UnicodeString::endsWith(const UnicodeString
& text
) const
4084 { return doCompare(length() - text
.length(), text
.length(),
4085 text
, 0, text
.length()) == 0; }
4088 UnicodeString::endsWith(const UnicodeString
& srcText
,
4090 int32_t srcLength
) const {
4091 srcText
.pinIndices(srcStart
, srcLength
);
4092 return doCompare(length() - srcLength
, srcLength
,
4093 srcText
, srcStart
, srcLength
) == 0;
4097 UnicodeString::endsWith(const UChar
*srcChars
,
4098 int32_t srcLength
) const {
4100 srcLength
= u_strlen(srcChars
);
4102 return doCompare(length() - srcLength
, srcLength
,
4103 srcChars
, 0, srcLength
) == 0;
4107 UnicodeString::endsWith(const UChar
*srcChars
,
4109 int32_t srcLength
) const {
4111 srcLength
= u_strlen(srcChars
+ srcStart
);
4113 return doCompare(length() - srcLength
, srcLength
,
4114 srcChars
, srcStart
, srcLength
) == 0;
4117 //========================================
4119 //========================================
4120 inline UnicodeString
&
4121 UnicodeString::replace(int32_t start
,
4123 const UnicodeString
& srcText
)
4124 { return doReplace(start
, _length
, srcText
, 0, srcText
.length()); }
4126 inline UnicodeString
&
4127 UnicodeString::replace(int32_t start
,
4129 const UnicodeString
& srcText
,
4132 { return doReplace(start
, _length
, srcText
, srcStart
, srcLength
); }
4134 inline UnicodeString
&
4135 UnicodeString::replace(int32_t start
,
4137 const UChar
*srcChars
,
4139 { return doReplace(start
, _length
, srcChars
, 0, srcLength
); }
4141 inline UnicodeString
&
4142 UnicodeString::replace(int32_t start
,
4144 const UChar
*srcChars
,
4147 { return doReplace(start
, _length
, srcChars
, srcStart
, srcLength
); }
4149 inline UnicodeString
&
4150 UnicodeString::replace(int32_t start
,
4153 { return doReplace(start
, _length
, &srcChar
, 0, 1); }
4155 inline UnicodeString
&
4156 UnicodeString::replaceBetween(int32_t start
,
4158 const UnicodeString
& srcText
)
4159 { return doReplace(start
, limit
- start
, srcText
, 0, srcText
.length()); }
4161 inline UnicodeString
&
4162 UnicodeString::replaceBetween(int32_t start
,
4164 const UnicodeString
& srcText
,
4167 { return doReplace(start
, limit
- start
, srcText
, srcStart
, srcLimit
- srcStart
); }
4169 inline UnicodeString
&
4170 UnicodeString::findAndReplace(const UnicodeString
& oldText
,
4171 const UnicodeString
& newText
)
4172 { return findAndReplace(0, length(), oldText
, 0, oldText
.length(),
4173 newText
, 0, newText
.length()); }
4175 inline UnicodeString
&
4176 UnicodeString::findAndReplace(int32_t start
,
4178 const UnicodeString
& oldText
,
4179 const UnicodeString
& newText
)
4180 { return findAndReplace(start
, _length
, oldText
, 0, oldText
.length(),
4181 newText
, 0, newText
.length()); }
4183 // ============================
4185 // ============================
4187 UnicodeString::doExtract(int32_t start
,
4189 UnicodeString
& target
) const
4190 { target
.replace(0, target
.length(), *this, start
, _length
); }
4193 UnicodeString::extract(int32_t start
,
4196 int32_t targetStart
) const
4197 { doExtract(start
, _length
, target
, targetStart
); }
4200 UnicodeString::extract(int32_t start
,
4202 UnicodeString
& target
) const
4203 { doExtract(start
, _length
, target
); }
4205 #if !UCONFIG_NO_CONVERSION
4208 UnicodeString::extract(int32_t start
,
4211 const char *codepage
) const
4214 // This dstSize value will be checked explicitly
4215 return extract(start
, _length
, dst
, dst
!=0 ? 0xffffffff : 0, codepage
);
4221 UnicodeString::extractBetween(int32_t start
,
4224 int32_t dstStart
) const {
4227 doExtract(start
, limit
- start
, dst
, dstStart
);
4230 inline UnicodeString
4231 UnicodeString::tempSubStringBetween(int32_t start
, int32_t limit
) const {
4232 return tempSubString(start
, limit
- start
);
4236 UnicodeString::doCharAt(int32_t offset
) const
4238 if((uint32_t)offset
< (uint32_t)length()) {
4239 return getArrayStart()[offset
];
4241 return kInvalidUChar
;
4246 UnicodeString::charAt(int32_t offset
) const
4247 { return doCharAt(offset
); }
4250 UnicodeString::operator[] (int32_t offset
) const
4251 { return doCharAt(offset
); }
4254 UnicodeString::isEmpty() const {
4255 return fShortLength
== 0;
4258 //========================================
4259 // Write implementation methods
4260 //========================================
4262 UnicodeString::setLength(int32_t len
) {
4264 fShortLength
= (int8_t)len
;
4266 fShortLength
= (int8_t)-1;
4267 fUnion
.fFields
.fLength
= len
;
4272 UnicodeString::setToEmpty() {
4274 fFlags
= kShortString
;
4278 UnicodeString::setArray(UChar
*array
, int32_t len
, int32_t capacity
) {
4280 fUnion
.fFields
.fArray
= array
;
4281 fUnion
.fFields
.fCapacity
= capacity
;
4284 inline const UChar
*
4285 UnicodeString::getTerminatedBuffer() {
4289 UChar
*array
= getArrayStart();
4290 int32_t len
= length();
4291 if(len
< getCapacity() && ((fFlags
&kRefCounted
) == 0 || refCount() == 1)) {
4293 * kRefCounted: Do not write the NUL if the buffer is shared.
4294 * That is mostly safe, except when the length of one copy was modified
4295 * without copy-on-write, e.g., via truncate(newLength) or remove(void).
4296 * Then the NUL would be written into the middle of another copy's string.
4298 if(!(fFlags
&kBufferIsReadonly
)) {
4300 * We must not write to a readonly buffer, but it is known to be
4301 * NUL-terminated if len<capacity.
4302 * A shared, allocated buffer (refCount()>1) must not have its contents
4303 * modified, but the NUL at [len] is beyond the string contents,
4304 * and multiple string objects and threads writing the same NUL into the
4305 * same location is harmless.
4306 * In all other cases, the buffer is fully writable and it is anyway safe
4309 * Note: An earlier version of this code tested whether there is a NUL
4310 * at [len] already, but, while safe, it generated lots of warnings from
4311 * tools like valgrind and Purify.
4316 } else if(cloneArrayIfNeeded(len
+1)) {
4317 array
= getArrayStart();
4326 inline UnicodeString
&
4327 UnicodeString::operator= (UChar ch
)
4328 { return doReplace(0, length(), &ch
, 0, 1); }
4330 inline UnicodeString
&
4331 UnicodeString::operator= (UChar32 ch
)
4332 { return replace(0, length(), ch
); }
4334 inline UnicodeString
&
4335 UnicodeString::setTo(const UnicodeString
& srcText
,
4340 return doReplace(0, length(), srcText
, srcStart
, srcLength
);
4343 inline UnicodeString
&
4344 UnicodeString::setTo(const UnicodeString
& srcText
,
4348 srcText
.pinIndex(srcStart
);
4349 return doReplace(0, length(), srcText
, srcStart
, srcText
.length() - srcStart
);
4352 inline UnicodeString
&
4353 UnicodeString::setTo(const UnicodeString
& srcText
)
4355 return copyFrom(srcText
);
4358 inline UnicodeString
&
4359 UnicodeString::setTo(const UChar
*srcChars
,
4363 return doReplace(0, length(), srcChars
, 0, srcLength
);
4366 inline UnicodeString
&
4367 UnicodeString::setTo(UChar srcChar
)
4370 return doReplace(0, length(), &srcChar
, 0, 1);
4373 inline UnicodeString
&
4374 UnicodeString::setTo(UChar32 srcChar
)
4377 return replace(0, length(), srcChar
);
4380 inline UnicodeString
&
4381 UnicodeString::append(const UnicodeString
& srcText
,
4384 { return doReplace(length(), 0, srcText
, srcStart
, srcLength
); }
4386 inline UnicodeString
&
4387 UnicodeString::append(const UnicodeString
& srcText
)
4388 { return doReplace(length(), 0, srcText
, 0, srcText
.length()); }
4390 inline UnicodeString
&
4391 UnicodeString::append(const UChar
*srcChars
,
4394 { return doReplace(length(), 0, srcChars
, srcStart
, srcLength
); }
4396 inline UnicodeString
&
4397 UnicodeString::append(const UChar
*srcChars
,
4399 { return doReplace(length(), 0, srcChars
, 0, srcLength
); }
4401 inline UnicodeString
&
4402 UnicodeString::append(UChar srcChar
)
4403 { return doReplace(length(), 0, &srcChar
, 0, 1); }
4405 inline UnicodeString
&
4406 UnicodeString::operator+= (UChar ch
)
4407 { return doReplace(length(), 0, &ch
, 0, 1); }
4409 inline UnicodeString
&
4410 UnicodeString::operator+= (UChar32 ch
) {
4414 inline UnicodeString
&
4415 UnicodeString::operator+= (const UnicodeString
& srcText
)
4416 { return doReplace(length(), 0, srcText
, 0, srcText
.length()); }
4418 inline UnicodeString
&
4419 UnicodeString::insert(int32_t start
,
4420 const UnicodeString
& srcText
,
4423 { return doReplace(start
, 0, srcText
, srcStart
, srcLength
); }
4425 inline UnicodeString
&
4426 UnicodeString::insert(int32_t start
,
4427 const UnicodeString
& srcText
)
4428 { return doReplace(start
, 0, srcText
, 0, srcText
.length()); }
4430 inline UnicodeString
&
4431 UnicodeString::insert(int32_t start
,
4432 const UChar
*srcChars
,
4435 { return doReplace(start
, 0, srcChars
, srcStart
, srcLength
); }
4437 inline UnicodeString
&
4438 UnicodeString::insert(int32_t start
,
4439 const UChar
*srcChars
,
4441 { return doReplace(start
, 0, srcChars
, 0, srcLength
); }
4443 inline UnicodeString
&
4444 UnicodeString::insert(int32_t start
,
4446 { return doReplace(start
, 0, &srcChar
, 0, 1); }
4448 inline UnicodeString
&
4449 UnicodeString::insert(int32_t start
,
4451 { return replace(start
, 0, srcChar
); }
4454 inline UnicodeString
&
4455 UnicodeString::remove()
4457 // remove() of a bogus string makes the string empty and non-bogus
4458 // we also un-alias a read-only alias to deal with NUL-termination
4459 // issues with getTerminatedBuffer()
4460 if(fFlags
& (kIsBogus
|kBufferIsReadonly
)) {
4468 inline UnicodeString
&
4469 UnicodeString::remove(int32_t start
,
4472 if(start
<= 0 && _length
== INT32_MAX
) {
4473 // remove(guaranteed everything) of a bogus string makes the string empty and non-bogus
4476 return doReplace(start
, _length
, NULL
, 0, 0);
4479 inline UnicodeString
&
4480 UnicodeString::removeBetween(int32_t start
,
4482 { return doReplace(start
, limit
- start
, NULL
, 0, 0); }
4484 inline UnicodeString
&
4485 UnicodeString::retainBetween(int32_t start
, int32_t limit
) {
4487 return doReplace(0, start
, NULL
, 0, 0);
4491 UnicodeString::truncate(int32_t targetLength
)
4493 if(isBogus() && targetLength
== 0) {
4494 // truncate(0) of a bogus string makes the string empty and non-bogus
4497 } else if((uint32_t)targetLength
< (uint32_t)length()) {
4498 setLength(targetLength
);
4499 if(fFlags
&kBufferIsReadonly
) {
4500 fUnion
.fFields
.fCapacity
= targetLength
; // not NUL-terminated any more
4508 inline UnicodeString
&
4509 UnicodeString::reverse()
4510 { return doReverse(0, length()); }
4512 inline UnicodeString
&
4513 UnicodeString::reverse(int32_t start
,
4515 { return doReverse(start
, _length
); }