X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..3d1f044b704633e2e541231cd17ae9ecf9ad5c7a:/icuSources/common/unicode/unistr.h diff --git a/icuSources/common/unicode/unistr.h b/icuSources/common/unicode/unistr.h index 3827458b..55739ac2 100644 --- a/icuSources/common/unicode/unistr.h +++ b/icuSources/common/unicode/unistr.h @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** -* Copyright (C) 1998-2004, International Business Machines +* Copyright (C) 1998-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -21,32 +23,65 @@ #ifndef UNISTR_H #define UNISTR_H +/** + * \file + * \brief C++ API: Unicode String + */ + +#include +#include "unicode/utypes.h" +#include "unicode/char16ptr.h" #include "unicode/rep.h" +#include "unicode/std_string.h" +#include "unicode/stringpiece.h" +#include "unicode/bytestream.h" struct UConverter; // unicode/ucnv.h -class StringThreadTest; -#ifndef U_COMPARE_CODE_POINT_ORDER -/* see also ustring.h and unorm.h */ +#ifndef USTRING_H /** - * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc: - * Compare strings in code point order instead of code unit order. - * @stable ICU 2.2 + * \ingroup ustring_ustrlen */ -#define U_COMPARE_CODE_POINT_ORDER 0x8000 -#endif - -#ifndef USTRING_H -/* see ustring.h */ U_STABLE int32_t U_EXPORT2 u_strlen(const UChar *s); #endif +#if U_SHOW_CPLUSPLUS_API +U_NAMESPACE_BEGIN + +#if !UCONFIG_NO_BREAK_ITERATION +class BreakIterator; // unicode/brkiter.h +#endif +class Edits; + +U_NAMESPACE_END +#endif // U_SHOW_CPLUSPLUS_API + +#if U_SHOW_CPLUSPLUS_API +// Not #ifndef U_HIDE_INTERNAL_API because UnicodeString needs the UStringCaseMapper. +/** + * Internal string case mapping function type. + * All error checking must be done. + * src and dest must not overlap. + * @internal + */ +typedef int32_t U_CALLCONV +UStringCaseMapper(int32_t caseLocale, uint32_t options, +#if !UCONFIG_NO_BREAK_ITERATION + icu::BreakIterator *iter, +#endif + char16_t *dest, int32_t destCapacity, + const char16_t *src, int32_t srcLength, + icu::Edits *edits, + UErrorCode &errorCode); +#endif // U_SHOW_CPLUSPLUS_API + +#if U_SHOW_CPLUSPLUS_API U_NAMESPACE_BEGIN class Locale; // unicode/locid.h class StringCharacterIterator; -class BreakIterator; // unicode/brkiter.h +class UnicodeStringAppendable; // unicode/appendable.h /* The include has been moved to unicode/ustream.h */ @@ -58,33 +93,31 @@ class BreakIterator; // unicode/brkiter.h * therefore recommended over ones taking a charset name string * (where the empty string "" indicates invariant-character conversion). * - * @draft ICU 3.2 + * @stable ICU 3.2 */ -#define US_INV UnicodeString::kInvariant +#define US_INV icu::UnicodeString::kInvariant /** * Unicode String literals in C++. - * Dependent on the platform properties, different UnicodeString - * constructors should be used to create a UnicodeString object from - * a string literal. - * The macros are defined for maximum performance. + * + * Note: these macros are not recommended for new code. + * Prior to the availability of C++11 and u"unicode string literals", + * these macros were provided for portability and efficiency when + * initializing UnicodeStrings from literals. + * * They work only for strings that contain "invariant characters", i.e., * only latin letters, digits, and some punctuation. * See utypes.h for details. * * The string parameter must be a C string literal. * The length of the string, not including the terminating - * NUL, must be specified as a constant. - * The U_STRING_DECL macro should be invoked exactly once for one - * such string variable before it is used. + * `NUL`, must be specified as a constant. * @stable ICU 2.0 */ -#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY -# define UNICODE_STRING(cs, _length) UnicodeString(TRUE, (const UChar *)L ## cs, _length) -#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY -# define UNICODE_STRING(cs, _length) UnicodeString(TRUE, (const UChar *)cs, _length) +#if !U_CHAR16_IS_TYPEDEF +# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, u ## cs, _length) #else -# define UNICODE_STRING(cs, _length) UnicodeString(cs, _length, US_INV) +# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const char16_t*)u ## cs, _length) #endif /** @@ -100,48 +133,128 @@ class BreakIterator; // unicode/brkiter.h * The string parameter must be a C string literal. * @stable ICU 2.0 */ -#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY -# define UNICODE_STRING_SIMPLE(cs) UnicodeString(TRUE, (const UChar *)L ## cs, -1) -#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY -# define UNICODE_STRING_SIMPLE(cs) UnicodeString(TRUE, (const UChar *)cs, -1) -#else -# define UNICODE_STRING_SIMPLE(cs) UnicodeString(cs, -1, US_INV) +#define UNICODE_STRING_SIMPLE(cs) UNICODE_STRING(cs, -1) + +/** + * \def UNISTR_FROM_CHAR_EXPLICIT + * This can be defined to be empty or "explicit". + * If explicit, then the UnicodeString(char16_t) and UnicodeString(UChar32) + * constructors are marked as explicit, preventing their inadvertent use. + * @stable ICU 49 + */ +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) + // Auto-"explicit" in ICU library code. +# define UNISTR_FROM_CHAR_EXPLICIT explicit +# else + // Empty by default for source code compatibility. +# define UNISTR_FROM_CHAR_EXPLICIT +# endif +#endif + +/** + * \def UNISTR_FROM_STRING_EXPLICIT + * This can be defined to be empty or "explicit". + * If explicit, then the UnicodeString(const char *) and UnicodeString(const char16_t *) + * constructors are marked as explicit, preventing their inadvertent use. + * + * In particular, this helps prevent accidentally depending on ICU conversion code + * by passing a string literal into an API with a const UnicodeString & parameter. + * @stable ICU 49 + */ +#ifndef UNISTR_FROM_STRING_EXPLICIT +# if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) + // Auto-"explicit" in ICU library code. +# define UNISTR_FROM_STRING_EXPLICIT explicit +# else + // Empty by default for source code compatibility. +# define UNISTR_FROM_STRING_EXPLICIT +# endif +#endif + +/** + * \def UNISTR_OBJECT_SIZE + * Desired sizeof(UnicodeString) in bytes. + * It should be a multiple of sizeof(pointer) to avoid unusable space for padding. + * The object size may want to be a multiple of 16 bytes, + * which is a common granularity for heap allocation. + * + * Any space inside the object beyond sizeof(vtable pointer) + 2 + * is available for storing short strings inside the object. + * The bigger the object, the longer a string that can be stored inside the object, + * without additional heap allocation. + * + * Depending on a platform's pointer size, pointer alignment requirements, + * and struct padding, the compiler will usually round up sizeof(UnicodeString) + * to 4 * sizeof(pointer) (or 3 * sizeof(pointer) for P128 data models), + * to hold the fields for heap-allocated strings. + * Such a minimum size also ensures that the object is easily large enough + * to hold at least 2 char16_ts, for one supplementary code point (U16_MAX_LENGTH). + * + * sizeof(UnicodeString) >= 48 should work for all known platforms. + * + * For example, on a 64-bit machine where sizeof(vtable pointer) is 8, + * sizeof(UnicodeString) = 64 would leave space for + * (64 - sizeof(vtable pointer) - 2) / U_SIZEOF_UCHAR = (64 - 8 - 2) / 2 = 27 + * char16_ts stored inside the object. + * + * The minimum object size on a 64-bit machine would be + * 4 * sizeof(pointer) = 4 * 8 = 32 bytes, + * and the internal buffer would hold up to 11 char16_ts in that case. + * + * @see U16_MAX_LENGTH + * @stable ICU 56 + */ +#ifndef UNISTR_OBJECT_SIZE +# define UNISTR_OBJECT_SIZE 64 #endif /** * UnicodeString is a string class that stores Unicode characters directly and provides - * similar functionality as the Java String and StringBuffer classes. + * similar functionality as the Java String and StringBuffer/StringBuilder classes. * It is a concrete implementation of the abstract class Replaceable (for transliteration). * + * A UnicodeString may also "alias" an external array of characters + * (that is, point to it, rather than own the array) + * whose lifetime must then at least match the lifetime of the aliasing object. + * This aliasing may be preserved when returning a UnicodeString by value, + * depending on the compiler and the function implementation, + * via Return Value Optimization (RVO) or the move assignment operator. + * (However, the copy assignment operator does not preserve aliasing.) + * For details see the description of storage models at the end of the class API docs + * and in the User Guide chapter linked from there. + * * The UnicodeString class is not suitable for subclassing. * - *

For an overview of Unicode strings in C and C++ see the - * User Guide Strings chapter.

+ * For an overview of Unicode strings in C and C++ see the + * [User Guide Strings chapter](http://userguide.icu-project.org/strings#TOC-Strings-in-C-C-). * - *

In ICU, a Unicode string consists of 16-bit Unicode code units. - * A Unicode character may be stored with either - * one code unit — which is the most common case — or with a matched pair of - * special code units ("surrogates"). - * The data type for code units is UChar.
- * For single-character handling, a Unicode character code point is a value - * in the range 0..0x10ffff. ICU uses the UChar32 type for code points.

+ * In ICU, a Unicode string consists of 16-bit Unicode *code units*. + * A Unicode character may be stored with either one code unit + * (the most common case) or with a matched pair of special code units + * ("surrogates"). The data type for code units is char16_t. + * For single-character handling, a Unicode character code *point* is a value + * in the range 0..0x10ffff. ICU uses the UChar32 type for code points. * - *

Indexes and offsets into and lengths of strings always count code units, not code points. + * Indexes and offsets into and lengths of strings always count code units, not code points. * This is the same as with multi-byte char* strings in traditional string handling. * Operations on partial strings typically do not test for code point boundaries. * If necessary, the user needs to take care of such boundaries by testing for the code unit * values or by using functions like * UnicodeString::getChar32Start() and UnicodeString::getChar32Limit() - * (or, in C, the equivalent macros U16_SET_CP_START() and U16_SET_CP_LIMIT(), see utf.h).

+ * (or, in C, the equivalent macros U16_SET_CP_START() and U16_SET_CP_LIMIT(), see utf.h). * * UnicodeString methods are more lenient with regard to input parameter values * than other ICU APIs. In particular: * - If indexes are out of bounds for a UnicodeString object - * (<0 or >length()) then they are "pinned" to the nearest boundary. - * - If primitive string pointer values (e.g., const UChar * or char *) + * (< 0 or > length()) then they are "pinned" to the nearest boundary. + * - If the buffer passed to an insert/append/replace operation is owned by the + * target object, e.g., calling str.append(str), an extra copy may take place + * to ensure safety. + * - If primitive string pointer values (e.g., const char16_t * or char *) * for input strings are NULL, then those input string parameters are treated * as if they pointed to an empty string. - * However, this is not the case for char * parameters for charset names + * However, this is *not* the case for char * parameters for charset names * or other IDs. * - Most UnicodeString methods do not take a UErrorCode parameter because * there are usually very few opportunities for failure other than a shortage @@ -165,14 +278,14 @@ class BreakIterator; // unicode/brkiter.h * This includes the const UnicodeString & parameters for * copy construction, assignment, and cloning. * - *

UnicodeString uses several storage methods. + * UnicodeString uses several storage methods. * String contents can be stored inside the UnicodeString object itself, * in an allocated and shared buffer, or in an outside buffer that is "aliased". * Most of this is done transparently, but careful aliasing in particular provides * significant performance improvements. * Also, the internal buffer is accessible via special functions. * For details see the - * User Guide Strings chapter.

+ * [User Guide Strings chapter](http://userguide.icu-project.org/strings#TOC-Maximizing-Performance-with-the-UnicodeString-Storage-Model). * * @see utf.h * @see CharacterIterator @@ -188,12 +301,12 @@ public: * Use the macro US_INV instead of the full qualification for this value. * * @see US_INV - * @draft ICU 3.2 + * @stable ICU 3.2 */ enum EInvariant { /** * @see EInvariant - * @draft ICU 3.2 + * @stable ICU 3.2 */ kInvariant }; @@ -207,7 +320,7 @@ public: /** * Equality operator. Performs only bitwise comparison. * @param text The UnicodeString to compare to this one. - * @return TRUE if text contains the same characters as this one, + * @return TRUE if `text` contains the same characters as this one, * FALSE otherwise. * @stable ICU 2.0 */ @@ -216,7 +329,7 @@ public: /** * Inequality operator. Performs only bitwise comparison. * @param text The UnicodeString to compare to this one. - * @return FALSE if text contains the same characters as this one, + * @return FALSE if `text` contains the same characters as this one, * TRUE otherwise. * @stable ICU 2.0 */ @@ -226,7 +339,7 @@ public: * Greater than operator. Performs only bitwise comparison. * @param text The UnicodeString to compare to this one. * @return TRUE if the characters in this are bitwise - * greater than the characters in text, FALSE otherwise + * greater than the characters in `text`, FALSE otherwise * @stable ICU 2.0 */ inline UBool operator> (const UnicodeString& text) const; @@ -235,7 +348,7 @@ public: * Less than operator. Performs only bitwise comparison. * @param text The UnicodeString to compare to this one. * @return TRUE if the characters in this are bitwise - * less than the characters in text, FALSE otherwise + * less than the characters in `text`, FALSE otherwise * @stable ICU 2.0 */ inline UBool operator< (const UnicodeString& text) const; @@ -244,7 +357,7 @@ public: * Greater than or equal operator. Performs only bitwise comparison. * @param text The UnicodeString to compare to this one. * @return TRUE if the characters in this are bitwise - * greater than or equal to the characters in text, FALSE otherwise + * greater than or equal to the characters in `text`, FALSE otherwise * @stable ICU 2.0 */ inline UBool operator>= (const UnicodeString& text) const; @@ -253,36 +366,37 @@ public: * Less than or equal operator. Performs only bitwise comparison. * @param text The UnicodeString to compare to this one. * @return TRUE if the characters in this are bitwise - * less than or equal to the characters in text, FALSE otherwise + * less than or equal to the characters in `text`, FALSE otherwise * @stable ICU 2.0 */ inline UBool operator<= (const UnicodeString& text) const; /** * Compare the characters bitwise in this UnicodeString to - * the characters in text. + * the characters in `text`. * @param text The UnicodeString to compare to this one. * @return The result of bitwise character comparison: 0 if this - * contains the same characters as text, -1 if the characters in - * this are bitwise less than the characters in text, +1 if the + * contains the same characters as `text`, -1 if the characters in + * this are bitwise less than the characters in `text`, +1 if the * characters in this are bitwise greater than the characters - * in text. + * in `text`. * @stable ICU 2.0 */ inline int8_t compare(const UnicodeString& text) const; /** * Compare the characters bitwise in the range - * [start, start + length) with the characters - * in text + * [`start`, `start + length`) with the characters + * in the **entire string** `text`. + * (The parameters "start" and "length" are not applied to the other text "text".) * @param start the offset at which the compare operation begins * @param length the number of characters of text to compare. * @param text the other text to be compared against this string. * @return The result of bitwise character comparison: 0 if this - * contains the same characters as text, -1 if the characters in - * this are bitwise less than the characters in text, +1 if the + * contains the same characters as `text`, -1 if the characters in + * this are bitwise less than the characters in `text`, +1 if the * characters in this are bitwise greater than the characters - * in text. + * in `text`. * @stable ICU 2.0 */ inline int8_t compare(int32_t start, @@ -291,19 +405,19 @@ public: /** * Compare the characters bitwise in the range - * [start, start + length) with the characters - * in srcText in the range - * [srcStart, srcStart + srcLength). + * [`start`, `start + length`) with the characters + * in `srcText` in the range + * [`srcStart`, `srcStart + srcLength`). * @param start the offset at which the compare operation begins * @param length the number of characters in this to compare. * @param srcText the text to be compared - * @param srcStart the offset into srcText to start comparison - * @param srcLength the number of characters in src to compare + * @param srcStart the offset into `srcText` to start comparison + * @param srcLength the number of characters in `src` to compare * @return The result of bitwise character comparison: 0 if this - * contains the same characters as srcText, -1 if the characters in - * this are bitwise less than the characters in srcText, +1 if the + * contains the same characters as `srcText`, -1 if the characters in + * this are bitwise less than the characters in `srcText`, +1 if the * characters in this are bitwise greater than the characters - * in srcText. + * in `srcText`. * @stable ICU 2.0 */ inline int8_t compare(int32_t start, @@ -314,75 +428,75 @@ public: /** * Compare the characters bitwise in this UnicodeString with the first - * srcLength characters in srcChars. + * `srcLength` characters in `srcChars`. * @param srcChars The characters to compare to this UnicodeString. - * @param srcLength the number of characters in srcChars to compare + * @param srcLength the number of characters in `srcChars` to compare * @return The result of bitwise character comparison: 0 if this - * contains the same characters as srcChars, -1 if the characters in - * this are bitwise less than the characters in srcChars, +1 if the + * contains the same characters as `srcChars`, -1 if the characters in + * this are bitwise less than the characters in `srcChars`, +1 if the * characters in this are bitwise greater than the characters - * in srcChars. + * in `srcChars`. * @stable ICU 2.0 */ - inline int8_t compare(const UChar *srcChars, + inline int8_t compare(ConstChar16Ptr srcChars, int32_t srcLength) const; /** * Compare the characters bitwise in the range - * [start, start + length) with the first - * length characters in srcChars + * [`start`, `start + length`) with the first + * `length` characters in `srcChars` * @param start the offset at which the compare operation begins * @param length the number of characters to compare. * @param srcChars the characters to be compared * @return The result of bitwise character comparison: 0 if this - * contains the same characters as srcChars, -1 if the characters in - * this are bitwise less than the characters in srcChars, +1 if the + * contains the same characters as `srcChars`, -1 if the characters in + * this are bitwise less than the characters in `srcChars`, +1 if the * characters in this are bitwise greater than the characters - * in srcChars. + * in `srcChars`. * @stable ICU 2.0 */ inline int8_t compare(int32_t start, int32_t length, - const UChar *srcChars) const; + const char16_t *srcChars) const; /** * Compare the characters bitwise in the range - * [start, start + length) with the characters - * in srcChars in the range - * [srcStart, srcStart + srcLength). + * [`start`, `start + length`) with the characters + * in `srcChars` in the range + * [`srcStart`, `srcStart + srcLength`). * @param start the offset at which the compare operation begins * @param length the number of characters in this to compare * @param srcChars the characters to be compared - * @param srcStart the offset into srcChars to start comparison - * @param srcLength the number of characters in srcChars to compare + * @param srcStart the offset into `srcChars` to start comparison + * @param srcLength the number of characters in `srcChars` to compare * @return The result of bitwise character comparison: 0 if this - * contains the same characters as srcChars, -1 if the characters in - * this are bitwise less than the characters in srcChars, +1 if the + * contains the same characters as `srcChars`, -1 if the characters in + * this are bitwise less than the characters in `srcChars`, +1 if the * characters in this are bitwise greater than the characters - * in srcChars. + * in `srcChars`. * @stable ICU 2.0 */ inline int8_t compare(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const; /** * Compare the characters bitwise in the range - * [start, limit) with the characters - * in srcText in the range - * [srcStart, srcLimit). + * [`start`, `limit`) with the characters + * in `srcText` in the range + * [`srcStart`, `srcLimit`). * @param start the offset at which the compare operation begins * @param limit the offset immediately following the compare operation * @param srcText the text to be compared - * @param srcStart the offset into srcText to start comparison - * @param srcLimit the offset into srcText to limit comparison + * @param srcStart the offset into `srcText` to start comparison + * @param srcLimit the offset into `srcText` to limit comparison * @return The result of bitwise character comparison: 0 if this - * contains the same characters as srcText, -1 if the characters in - * this are bitwise less than the characters in srcText, +1 if the + * contains the same characters as `srcText`, -1 if the characters in + * this are bitwise less than the characters in `srcText`, +1 if the * characters in this are bitwise greater than the characters - * in srcText. + * in `srcText`. * @stable ICU 2.0 */ inline int8_t compareBetween(int32_t start, @@ -393,7 +507,7 @@ public: /** * Compare two Unicode strings in code point order. - * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work + * The result may be different from the results of compare(), operator<, etc. * if supplementary characters are present: * * In UTF-16, supplementary characters (with code points U+10000 and above) are @@ -412,7 +526,7 @@ public: /** * Compare two Unicode strings in code point order. - * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work + * The result may be different from the results of compare(), operator<, etc. * if supplementary characters are present: * * In UTF-16, supplementary characters (with code points U+10000 and above) are @@ -435,7 +549,7 @@ public: /** * Compare two Unicode strings in code point order. - * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work + * The result may be different from the results of compare(), operator<, etc. * if supplementary characters are present: * * In UTF-16, supplementary characters (with code points U+10000 and above) are @@ -462,7 +576,7 @@ public: /** * Compare two Unicode strings in code point order. - * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work + * The result may be different from the results of compare(), operator<, etc. * if supplementary characters are present: * * In UTF-16, supplementary characters (with code points U+10000 and above) are @@ -478,12 +592,12 @@ public: * in code point order * @stable ICU 2.0 */ - inline int8_t compareCodePointOrder(const UChar *srcChars, + inline int8_t compareCodePointOrder(ConstChar16Ptr srcChars, int32_t srcLength) const; /** * Compare two Unicode strings in code point order. - * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work + * The result may be different from the results of compare(), operator<, etc. * if supplementary characters are present: * * In UTF-16, supplementary characters (with code points U+10000 and above) are @@ -502,11 +616,11 @@ public: */ inline int8_t compareCodePointOrder(int32_t start, int32_t length, - const UChar *srcChars) const; + const char16_t *srcChars) const; /** * Compare two Unicode strings in code point order. - * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work + * The result may be different from the results of compare(), operator<, etc. * if supplementary characters are present: * * In UTF-16, supplementary characters (with code points U+10000 and above) are @@ -527,13 +641,13 @@ public: */ inline int8_t compareCodePointOrder(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const; /** * Compare two Unicode strings in code point order. - * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work + * The result may be different from the results of compare(), operator<, etc. * if supplementary characters are present: * * In UTF-16, supplementary characters (with code points U+10000 and above) are @@ -651,7 +765,7 @@ public: * @return A negative, zero, or positive integer indicating the comparison result. * @stable ICU 2.0 */ - inline int8_t caseCompare(const UChar *srcChars, + inline int8_t caseCompare(ConstChar16Ptr srcChars, int32_t srcLength, uint32_t options) const; @@ -677,7 +791,7 @@ public: */ inline int8_t caseCompare(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, uint32_t options) const; /** @@ -704,7 +818,7 @@ public: */ inline int8_t caseCompare(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength, uint32_t options) const; @@ -739,21 +853,21 @@ public: uint32_t options) const; /** - * Determine if this starts with the characters in text + * Determine if this starts with the characters in `text` * @param text The text to match. - * @return TRUE if this starts with the characters in text, + * @return TRUE if this starts with the characters in `text`, * FALSE otherwise * @stable ICU 2.0 */ inline UBool startsWith(const UnicodeString& text) const; /** - * Determine if this starts with the characters in srcText - * in the range [srcStart, srcStart + srcLength). + * Determine if this starts with the characters in `srcText` + * in the range [`srcStart`, `srcStart + srcLength`). * @param srcText The text to match. - * @param srcStart the offset into srcText to start matching - * @param srcLength the number of characters in srcText to match - * @return TRUE if this starts with the characters in text, + * @param srcStart the offset into `srcText` to start matching + * @param srcLength the number of characters in `srcText` to match + * @return TRUE if this starts with the characters in `text`, * FALSE otherwise * @stable ICU 2.0 */ @@ -762,45 +876,45 @@ public: int32_t srcLength) const; /** - * Determine if this starts with the characters in srcChars + * Determine if this starts with the characters in `srcChars` * @param srcChars The characters to match. - * @param srcLength the number of characters in srcChars - * @return TRUE if this starts with the characters in srcChars, + * @param srcLength the number of characters in `srcChars` + * @return TRUE if this starts with the characters in `srcChars`, * FALSE otherwise * @stable ICU 2.0 */ - inline UBool startsWith(const UChar *srcChars, + inline UBool startsWith(ConstChar16Ptr srcChars, int32_t srcLength) const; /** - * Determine if this ends with the characters in srcChars - * in the range [srcStart, srcStart + srcLength). + * Determine if this ends with the characters in `srcChars` + * in the range [`srcStart`, `srcStart + srcLength`). * @param srcChars The characters to match. - * @param srcStart the offset into srcText to start matching - * @param srcLength the number of characters in srcChars to match - * @return TRUE if this ends with the characters in srcChars, FALSE otherwise + * @param srcStart the offset into `srcText` to start matching + * @param srcLength the number of characters in `srcChars` to match + * @return TRUE if this ends with the characters in `srcChars`, FALSE otherwise * @stable ICU 2.0 */ - inline UBool startsWith(const UChar *srcChars, + inline UBool startsWith(const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const; /** - * Determine if this ends with the characters in text + * Determine if this ends with the characters in `text` * @param text The text to match. - * @return TRUE if this ends with the characters in text, + * @return TRUE if this ends with the characters in `text`, * FALSE otherwise * @stable ICU 2.0 */ inline UBool endsWith(const UnicodeString& text) const; /** - * Determine if this ends with the characters in srcText - * in the range [srcStart, srcStart + srcLength). + * Determine if this ends with the characters in `srcText` + * in the range [`srcStart`, `srcStart + srcLength`). * @param srcText The text to match. - * @param srcStart the offset into srcText to start matching - * @param srcLength the number of characters in srcText to match - * @return TRUE if this ends with the characters in text, + * @param srcStart the offset into `srcText` to start matching + * @param srcLength the number of characters in `srcText` to match + * @return TRUE if this ends with the characters in `text`, * FALSE otherwise * @stable ICU 2.0 */ @@ -809,27 +923,27 @@ public: int32_t srcLength) const; /** - * Determine if this ends with the characters in srcChars + * Determine if this ends with the characters in `srcChars` * @param srcChars The characters to match. - * @param srcLength the number of characters in srcChars - * @return TRUE if this ends with the characters in srcChars, + * @param srcLength the number of characters in `srcChars` + * @return TRUE if this ends with the characters in `srcChars`, * FALSE otherwise * @stable ICU 2.0 */ - inline UBool endsWith(const UChar *srcChars, + inline UBool endsWith(ConstChar16Ptr srcChars, int32_t srcLength) const; /** - * Determine if this ends with the characters in srcChars - * in the range [srcStart, srcStart + srcLength). + * Determine if this ends with the characters in `srcChars` + * in the range [`srcStart`, `srcStart + srcLength`). * @param srcChars The characters to match. - * @param srcStart the offset into srcText to start matching - * @param srcLength the number of characters in srcChars to match - * @return TRUE if this ends with the characters in srcChars, + * @param srcStart the offset into `srcText` to start matching + * @param srcLength the number of characters in `srcChars` to match + * @return TRUE if this ends with the characters in `srcChars`, * FALSE otherwise * @stable ICU 2.0 */ - inline UBool endsWith(const UChar *srcChars, + inline UBool endsWith(const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const; @@ -837,21 +951,21 @@ public: /* Searching - bitwise only */ /** - * Locate in this the first occurrence of the characters in text, + * Locate in this the first occurrence of the characters in `text`, * using bitwise comparison. * @param text The text to search for. - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ inline int32_t indexOf(const UnicodeString& text) const; /** - * Locate in this the first occurrence of the characters in text - * starting at offset start, using bitwise comparison. + * Locate in this the first occurrence of the characters in `text` + * starting at offset `start`, using bitwise comparison. * @param text The text to search for. * @param start The offset at which searching will start. - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ @@ -860,12 +974,12 @@ public: /** * Locate in this the first occurrence in the range - * [start, start + length) of the characters - * in text, using bitwise comparison. + * [`start`, `start + length`) of the characters + * in `text`, using bitwise comparison. * @param text The text to search for. * @param start The offset at which searching will start. * @param length The number of characters to search - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ @@ -875,17 +989,17 @@ public: /** * Locate in this the first occurrence in the range - * [start, start + length) of the characters - * in srcText in the range - * [srcStart, srcStart + srcLength), + * [`start`, `start + length`) of the characters + * in `srcText` in the range + * [`srcStart`, `srcStart + srcLength`), * using bitwise comparison. * @param srcText The text to search for. - * @param srcStart the offset into srcText at which + * @param srcStart the offset into `srcText` at which * to start matching - * @param srcLength the number of characters in srcText to match + * @param srcLength the number of characters in `srcText` to match * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ @@ -897,123 +1011,123 @@ public: /** * Locate in this the first occurrence of the characters in - * srcChars - * starting at offset start, using bitwise comparison. + * `srcChars` + * starting at offset `start`, using bitwise comparison. * @param srcChars The text to search for. - * @param srcLength the number of characters in srcChars to match + * @param srcLength the number of characters in `srcChars` to match * @param start the offset into this at which to start matching - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ - inline int32_t indexOf(const UChar *srcChars, + inline int32_t indexOf(const char16_t *srcChars, int32_t srcLength, int32_t start) const; /** * Locate in this the first occurrence in the range - * [start, start + length) of the characters - * in srcChars, using bitwise comparison. + * [`start`, `start + length`) of the characters + * in `srcChars`, using bitwise comparison. * @param srcChars The text to search for. - * @param srcLength the number of characters in srcChars + * @param srcLength the number of characters in `srcChars` * @param start The offset at which searching will start. * @param length The number of characters to search - * @return The offset into this of the start of srcChars, + * @return The offset into this of the start of `srcChars`, * or -1 if not found. * @stable ICU 2.0 */ - inline int32_t indexOf(const UChar *srcChars, + inline int32_t indexOf(ConstChar16Ptr srcChars, int32_t srcLength, int32_t start, int32_t length) const; /** * Locate in this the first occurrence in the range - * [start, start + length) of the characters - * in srcChars in the range - * [srcStart, srcStart + srcLength), + * [`start`, `start + length`) of the characters + * in `srcChars` in the range + * [`srcStart`, `srcStart + srcLength`), * using bitwise comparison. * @param srcChars The text to search for. - * @param srcStart the offset into srcChars at which + * @param srcStart the offset into `srcChars` at which * to start matching - * @param srcLength the number of characters in srcChars to match + * @param srcLength the number of characters in `srcChars` to match * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ - int32_t indexOf(const UChar *srcChars, + int32_t indexOf(const char16_t *srcChars, int32_t srcStart, int32_t srcLength, int32_t start, int32_t length) const; /** - * Locate in this the first occurrence of the BMP code point c, + * Locate in this the first occurrence of the BMP code point `c`, * using bitwise comparison. * @param c The code unit to search for. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ - inline int32_t indexOf(UChar c) const; + inline int32_t indexOf(char16_t c) const; /** - * Locate in this the first occurrence of the code point c, + * Locate in this the first occurrence of the code point `c`, * using bitwise comparison. * * @param c The code point to search for. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ inline int32_t indexOf(UChar32 c) const; /** - * Locate in this the first occurrence of the BMP code point c, - * starting at offset start, using bitwise comparison. + * Locate in this the first occurrence of the BMP code point `c`, + * starting at offset `start`, using bitwise comparison. * @param c The code unit to search for. * @param start The offset at which searching will start. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ - inline int32_t indexOf(UChar c, + inline int32_t indexOf(char16_t c, int32_t start) const; /** - * Locate in this the first occurrence of the code point c - * starting at offset start, using bitwise comparison. + * Locate in this the first occurrence of the code point `c` + * starting at offset `start`, using bitwise comparison. * * @param c The code point to search for. * @param start The offset at which searching will start. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ inline int32_t indexOf(UChar32 c, int32_t start) const; /** - * Locate in this the first occurrence of the BMP code point c - * in the range [start, start + length), + * Locate in this the first occurrence of the BMP code point `c` + * in the range [`start`, `start + length`), * using bitwise comparison. * @param c The code unit to search for. * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ - inline int32_t indexOf(UChar c, + inline int32_t indexOf(char16_t c, int32_t start, int32_t length) const; /** - * Locate in this the first occurrence of the code point c - * in the range [start, start + length), + * Locate in this the first occurrence of the code point `c` + * in the range [`start`, `start + length`), * using bitwise comparison. * * @param c The code point to search for. * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ inline int32_t indexOf(UChar32 c, @@ -1021,21 +1135,21 @@ public: int32_t length) const; /** - * Locate in this the last occurrence of the characters in text, + * Locate in this the last occurrence of the characters in `text`, * using bitwise comparison. * @param text The text to search for. - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ inline int32_t lastIndexOf(const UnicodeString& text) const; /** - * Locate in this the last occurrence of the characters in text - * starting at offset start, using bitwise comparison. + * Locate in this the last occurrence of the characters in `text` + * starting at offset `start`, using bitwise comparison. * @param text The text to search for. * @param start The offset at which searching will start. - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ @@ -1044,12 +1158,12 @@ public: /** * Locate in this the last occurrence in the range - * [start, start + length) of the characters - * in text, using bitwise comparison. + * [`start`, `start + length`) of the characters + * in `text`, using bitwise comparison. * @param text The text to search for. * @param start The offset at which searching will start. * @param length The number of characters to search - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ @@ -1059,17 +1173,17 @@ public: /** * Locate in this the last occurrence in the range - * [start, start + length) of the characters - * in srcText in the range - * [srcStart, srcStart + srcLength), + * [`start`, `start + length`) of the characters + * in `srcText` in the range + * [`srcStart`, `srcStart + srcLength`), * using bitwise comparison. * @param srcText The text to search for. - * @param srcStart the offset into srcText at which + * @param srcStart the offset into `srcText` at which * to start matching - * @param srcLength the number of characters in srcText to match + * @param srcLength the number of characters in `srcText` to match * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ @@ -1080,123 +1194,123 @@ public: int32_t length) const; /** - * Locate in this the last occurrence of the characters in srcChars - * starting at offset start, using bitwise comparison. + * Locate in this the last occurrence of the characters in `srcChars` + * starting at offset `start`, using bitwise comparison. * @param srcChars The text to search for. - * @param srcLength the number of characters in srcChars to match + * @param srcLength the number of characters in `srcChars` to match * @param start the offset into this at which to start matching - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ - inline int32_t lastIndexOf(const UChar *srcChars, + inline int32_t lastIndexOf(const char16_t *srcChars, int32_t srcLength, int32_t start) const; /** * Locate in this the last occurrence in the range - * [start, start + length) of the characters - * in srcChars, using bitwise comparison. + * [`start`, `start + length`) of the characters + * in `srcChars`, using bitwise comparison. * @param srcChars The text to search for. - * @param srcLength the number of characters in srcChars + * @param srcLength the number of characters in `srcChars` * @param start The offset at which searching will start. * @param length The number of characters to search - * @return The offset into this of the start of srcChars, + * @return The offset into this of the start of `srcChars`, * or -1 if not found. * @stable ICU 2.0 */ - inline int32_t lastIndexOf(const UChar *srcChars, + inline int32_t lastIndexOf(ConstChar16Ptr srcChars, int32_t srcLength, int32_t start, int32_t length) const; /** * Locate in this the last occurrence in the range - * [start, start + length) of the characters - * in srcChars in the range - * [srcStart, srcStart + srcLength), + * [`start`, `start + length`) of the characters + * in `srcChars` in the range + * [`srcStart`, `srcStart + srcLength`), * using bitwise comparison. * @param srcChars The text to search for. - * @param srcStart the offset into srcChars at which + * @param srcStart the offset into `srcChars` at which * to start matching - * @param srcLength the number of characters in srcChars to match + * @param srcLength the number of characters in `srcChars` to match * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of the start of text, + * @return The offset into this of the start of `text`, * or -1 if not found. * @stable ICU 2.0 */ - int32_t lastIndexOf(const UChar *srcChars, + int32_t lastIndexOf(const char16_t *srcChars, int32_t srcStart, int32_t srcLength, int32_t start, int32_t length) const; /** - * Locate in this the last occurrence of the BMP code point c, + * Locate in this the last occurrence of the BMP code point `c`, * using bitwise comparison. * @param c The code unit to search for. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ - inline int32_t lastIndexOf(UChar c) const; + inline int32_t lastIndexOf(char16_t c) const; /** - * Locate in this the last occurrence of the code point c, + * Locate in this the last occurrence of the code point `c`, * using bitwise comparison. * * @param c The code point to search for. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ inline int32_t lastIndexOf(UChar32 c) const; /** - * Locate in this the last occurrence of the BMP code point c - * starting at offset start, using bitwise comparison. + * Locate in this the last occurrence of the BMP code point `c` + * starting at offset `start`, using bitwise comparison. * @param c The code unit to search for. * @param start The offset at which searching will start. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ - inline int32_t lastIndexOf(UChar c, + inline int32_t lastIndexOf(char16_t c, int32_t start) const; /** - * Locate in this the last occurrence of the code point c - * starting at offset start, using bitwise comparison. + * Locate in this the last occurrence of the code point `c` + * starting at offset `start`, using bitwise comparison. * * @param c The code point to search for. * @param start The offset at which searching will start. - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ inline int32_t lastIndexOf(UChar32 c, int32_t start) const; /** - * Locate in this the last occurrence of the BMP code point c - * in the range [start, start + length), + * Locate in this the last occurrence of the BMP code point `c` + * in the range [`start`, `start + length`), * using bitwise comparison. * @param c The code unit to search for. * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ - inline int32_t lastIndexOf(UChar c, + inline int32_t lastIndexOf(char16_t c, int32_t start, int32_t length) const; /** - * Locate in this the last occurrence of the code point c - * in the range [start, start + length), + * Locate in this the last occurrence of the code point `c` + * in the range [`start`, `start + length`), * using bitwise comparison. * * @param c The code point to search for. * @param start the offset into this at which to start matching * @param length the number of characters in this to search - * @return The offset into this of c, or -1 if not found. + * @return The offset into this of `c`, or -1 if not found. * @stable ICU 2.0 */ inline int32_t lastIndexOf(UChar32 c, @@ -1207,36 +1321,36 @@ public: /* Character access */ /** - * Return the code unit at offset offset. + * Return the code unit at offset `offset`. * If the offset is not valid (0..length()-1) then U+ffff is returned. * @param offset a valid offset into the text - * @return the code unit at offset offset + * @return the code unit at offset `offset` * or 0xffff if the offset is not valid for this string * @stable ICU 2.0 */ - inline UChar charAt(int32_t offset) const; + inline char16_t charAt(int32_t offset) const; /** - * Return the code unit at offset offset. + * Return the code unit at offset `offset`. * If the offset is not valid (0..length()-1) then U+ffff is returned. * @param offset a valid offset into the text - * @return the code unit at offset offset + * @return the code unit at offset `offset` * @stable ICU 2.0 */ - inline UChar operator[] (int32_t offset) const; + inline char16_t operator[] (int32_t offset) const; /** * Return the code point that contains the code unit - * at offset offset. + * at offset `offset`. * If the offset is not valid (0..length()-1) then U+ffff is returned. * @param offset a valid offset into the text * that indicates the text offset of any of the code units * that will be assembled into a code point (21-bit value) and returned - * @return the code point of text at offset + * @return the code point of text at `offset` * or 0xffff if the offset is not valid for this string * @stable ICU 2.0 */ - inline UChar32 char32At(int32_t offset) const; + UChar32 char32At(int32_t offset) const; /** * Adjust a random-access offset so that @@ -1253,7 +1367,7 @@ public: * @see U16_SET_CP_START * @stable ICU 2.0 */ - inline int32_t getChar32Start(int32_t offset) const; + int32_t getChar32Start(int32_t offset) const; /** * Adjust a random-access offset so that @@ -1271,7 +1385,7 @@ public: * @see U16_SET_CP_LIMIT * @stable ICU 2.0 */ - inline int32_t getChar32Limit(int32_t offset) const; + int32_t getChar32Limit(int32_t offset) const; /** * Move the code unit index along the string by delta code points. @@ -1289,33 +1403,33 @@ public: * This behaves like CharacterIterator::move32(delta, kCurrent). * * Behavior for out-of-bounds indexes: - * moveIndex32 pins the input index to 0..length(), i.e., + * `moveIndex32` pins the input index to 0..length(), i.e., * if the input index<0 then it is pinned to 0; * if it is index>length() then it is pinned to length(). - * Afterwards, the index is moved by delta code points + * Afterwards, the index is moved by `delta` code points * forward or backward, * but no further backward than to 0 and no further forward than to length(). * The resulting index return value will be in between 0 and length(), inclusively. * * Examples: - *
-   * // s has code points 'a' U+10000 'b' U+10ffff U+2029
-   * UnicodeString s=UNICODE_STRING("a\\U00010000b\\U0010ffff\\u2029", 31).unescape();
+   * \code
+   *     // s has code points 'a' U+10000 'b' U+10ffff U+2029
+   *     UnicodeString s(u"a\U00010000b\U0010ffff\u2029");
    *
-   * // initial index: position of U+10000
-   * int32_t index=1;
+   *     // initial index: position of U+10000
+   *     int32_t index=1;
    *
-   * // the following examples will all result in index==4, position of U+10ffff
+   *     // the following examples will all result in index==4, position of U+10ffff
    *
-   * // skip 2 code points from some position in the string
-   * index=s.moveIndex32(index, 2); // skips U+10000 and 'b'
+   *     // skip 2 code points from some position in the string
+   *     index=s.moveIndex32(index, 2); // skips U+10000 and 'b'
    *
-   * // go to the 3rd code point from the start of s (0-based)
-   * index=s.moveIndex32(0, 3); // skips 'a', U+10000, and 'b'
+   *     // go to the 3rd code point from the start of s (0-based)
+   *     index=s.moveIndex32(0, 3); // skips 'a', U+10000, and 'b'
    *
-   * // go to the next-to-last code point of s
-   * index=s.moveIndex32(s.length(), -2); // backward-skips U+2029 and U+10ffff
-   * 
+ * // go to the next-to-last code point of s + * index=s.moveIndex32(s.length(), -2); // backward-skips U+2029 and U+10ffff + * \endcode * * @param index input code unit index * @param delta (signed) code point count to move the index forward or backward @@ -1329,22 +1443,22 @@ public: /** * Copy the characters in the range - * [start, start + length) into the array dst, - * beginning at dstStart. - * If the string aliases to dst itself as an external buffer, + * [`start`, `start + length`) into the array `dst`, + * beginning at `dstStart`. + * If the string aliases to `dst` itself as an external buffer, * then extract() will not copy the contents. * * @param start offset of first character which will be copied into the array * @param length the number of characters to extract - * @param dst array in which to copy characters. The length of dst - * must be at least (dstStart + length). - * @param dstStart the offset in dst where the first character + * @param dst array in which to copy characters. The length of `dst` + * must be at least (`dstStart + length`). + * @param dstStart the offset in `dst` where the first character * will be extracted * @stable ICU 2.0 */ inline void extract(int32_t start, int32_t length, - UChar *dst, + Char16Ptr dst, int32_t dstStart = 0) const; /** @@ -1359,27 +1473,26 @@ public: * If the string itself does not fit into dest * (length()>destCapacity) then the error code is set to U_BUFFER_OVERFLOW_ERROR. * - * If the string aliases to dest itself as an external buffer, + * If the string aliases to `dest` itself as an external buffer, * then extract() will not copy the contents. * * @param dest Destination string buffer. - * @param destCapacity Number of UChars available at dest. + * @param destCapacity Number of char16_ts available at dest. * @param errorCode ICU error code. * @return length() * @stable ICU 2.0 */ int32_t - extract(UChar *dest, int32_t destCapacity, + extract(Char16Ptr dest, int32_t destCapacity, UErrorCode &errorCode) const; /** * Copy the characters in the range - * [start, start + length) into the UnicodeString - * target. + * [`start`, `start + length`) into the UnicodeString + * `target`. * @param start offset of first character which will be copied * @param length the number of characters to extract * @param target UnicodeString into which to copy characters. - * @return A reference to target * @stable ICU 2.0 */ inline void extract(int32_t start, @@ -1387,28 +1500,27 @@ public: UnicodeString& target) const; /** - * Copy the characters in the range [start, limit) - * into the array dst, beginning at dstStart. + * Copy the characters in the range [`start`, `limit`) + * into the array `dst`, beginning at `dstStart`. * @param start offset of first character which will be copied into the array * @param limit offset immediately following the last character to be copied - * @param dst array in which to copy characters. The length of dst - * must be at least (dstStart + (limit - start)). - * @param dstStart the offset in dst where the first character + * @param dst array in which to copy characters. The length of `dst` + * must be at least (`dstStart + (limit - start)`). + * @param dstStart the offset in `dst` where the first character * will be extracted * @stable ICU 2.0 */ inline void extractBetween(int32_t start, int32_t limit, - UChar *dst, + char16_t *dst, int32_t dstStart = 0) const; /** - * Copy the characters in the range [start, limit) - * into the UnicodeString target. Replaceable API. + * Copy the characters in the range [`start`, `limit`) + * into the UnicodeString `target`. Replaceable API. * @param start offset of first character which will be copied * @param limit offset immediately following the last character to be copied * @param target UnicodeString into which to copy characters. - * @return A reference to target * @stable ICU 2.0 */ virtual void extractBetween(int32_t start, @@ -1416,12 +1528,12 @@ public: UnicodeString& target) const; /** - * Copy the characters in the range - * [start, start + length) into an array of characters. + * Copy the characters in the range + * [`start`, `start + startLength`) into an array of characters. * All characters must be invariant (see utypes.h). * Use US_INV as the last, signature-distinguishing parameter. * - * This function does not write any more than targetLength + * This function does not write any more than `targetCapacity` * characters but returns the length of the entire output string * so that one can allocate a larger buffer and call the function again * if necessary. @@ -1434,7 +1546,7 @@ public: * @param targetCapacity the length of the target buffer * @param inv Signature-distinguishing paramater, use US_INV. * @return the output string length, not including the terminating NUL - * @draft ICU 3.2 + * @stable ICU 3.2 */ int32_t extract(int32_t start, int32_t startLength, @@ -1442,11 +1554,39 @@ public: int32_t targetCapacity, enum EInvariant inv) const; +#if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION + + /** + * Copy the characters in the range + * [`start`, `start + length`) into an array of characters + * in the platform's default codepage. + * This function does not write any more than `targetLength` + * characters but returns the length of the entire output string + * so that one can allocate a larger buffer and call the function again + * if necessary. + * The output string is NUL-terminated if possible. + * + * @param start offset of first character which will be copied + * @param startLength the number of characters to extract + * @param target the target buffer for extraction + * @param targetLength the length of the target buffer + * If `target` is NULL, then the number of bytes required for + * `target` is returned. + * @return the output string length, not including the terminating NUL + * @stable ICU 2.0 + */ + int32_t extract(int32_t start, + int32_t startLength, + char *target, + uint32_t targetLength) const; + +#endif + #if !UCONFIG_NO_CONVERSION /** * Copy the characters in the range - * [start, start + length) into an array of characters + * [`start`, `start + length`) into an array of characters * in a specified codepage. * The output string is NUL-terminated. * @@ -1460,11 +1600,11 @@ public: * @param target the target buffer for extraction * @param codepage the desired codepage for the characters. 0 has * the special meaning of the default codepage - * If codepage is an empty string (""), + * If `codepage` is an empty string (`""`), * then a simple conversion is performed on the codepage-invariant * subset ("invariant characters") of the platform encoding. See utypes.h. - * If target is NULL, then the number of bytes required for - * target is returned. It is assumed that the target is big enough + * If `target` is NULL, then the number of bytes required for + * `target` is returned. It is assumed that the target is big enough * to fit all of the characters. * @return the output string length, not including the terminating NUL * @stable ICU 2.0 @@ -1476,9 +1616,9 @@ public: /** * Copy the characters in the range - * [start, start + length) into an array of characters + * [`start`, `start + length`) into an array of characters * in a specified codepage. - * This function does not write any more than targetLength + * This function does not write any more than `targetLength` * characters but returns the length of the entire output string * so that one can allocate a larger buffer and call the function again * if necessary. @@ -1495,11 +1635,11 @@ public: * @param targetLength the length of the target buffer * @param codepage the desired codepage for the characters. 0 has * the special meaning of the default codepage - * If codepage is an empty string (""), + * If `codepage` is an empty string (`""`), * then a simple conversion is performed on the codepage-invariant * subset ("invariant characters") of the platform encoding. See utypes.h. - * If target is NULL, then the number of bytes required for - * target is returned. + * If `target` is NULL, then the number of bytes required for + * `target` is returned. * @return the output string length, not including the terminating NUL * @stable ICU 2.0 */ @@ -1507,7 +1647,7 @@ public: int32_t startLength, char *target, uint32_t targetLength, - const char *codepage = 0) const; + const char *codepage) const; /** * Convert the UnicodeString into a codepage string using an existing UConverter. @@ -1532,11 +1672,87 @@ public: #endif + /** + * Create a temporary substring for the specified range. + * Unlike the substring constructor and setTo() functions, + * the object returned here will be a read-only alias (using getBuffer()) + * rather than copying the text. + * As a result, this substring operation is much faster but requires + * that the original string not be modified or deleted during the lifetime + * of the returned substring object. + * @param start offset of the first character visible in the substring + * @param length length of the substring + * @return a read-only alias UnicodeString object for the substring + * @stable ICU 4.4 + */ + UnicodeString tempSubString(int32_t start=0, int32_t length=INT32_MAX) const; + + /** + * Create a temporary substring for the specified range. + * Same as tempSubString(start, length) except that the substring range + * is specified as a (start, limit) pair (with an exclusive limit index) + * rather than a (start, length) pair. + * @param start offset of the first character visible in the substring + * @param limit offset immediately following the last character visible in the substring + * @return a read-only alias UnicodeString object for the substring + * @stable ICU 4.4 + */ + inline UnicodeString tempSubStringBetween(int32_t start, int32_t limit=INT32_MAX) const; + + /** + * Convert the UnicodeString to UTF-8 and write the result + * to a ByteSink. This is called by toUTF8String(). + * Unpaired surrogates are replaced with U+FFFD. + * Calls u_strToUTF8WithSub(). + * + * @param sink A ByteSink to which the UTF-8 version of the string is written. + * sink.Flush() is called at the end. + * @stable ICU 4.2 + * @see toUTF8String + */ + void toUTF8(ByteSink &sink) const; + + /** + * Convert the UnicodeString to UTF-8 and append the result + * to a standard string. + * Unpaired surrogates are replaced with U+FFFD. + * Calls toUTF8(). + * + * @param result A standard string (or a compatible object) + * to which the UTF-8 version of the string is appended. + * @return The string object. + * @stable ICU 4.2 + * @see toUTF8 + */ + template + StringClass &toUTF8String(StringClass &result) const { + StringByteSink sbs(&result, length()); + toUTF8(sbs); + return result; + } + + /** + * Convert the UnicodeString to UTF-32. + * Unpaired surrogates are replaced with U+FFFD. + * Calls u_strToUTF32WithSub(). + * + * @param utf32 destination string buffer, can be NULL if capacity==0 + * @param capacity the number of UChar32s available at utf32 + * @param errorCode Standard ICU error code. Its input value must + * pass the U_SUCCESS() test, or else the function returns + * immediately. Check for U_FAILURE() on output or use with + * function chaining. (See User Guide for details.) + * @return The length of the UTF-32 string. + * @see fromUTF32 + * @stable ICU 4.2 + */ + int32_t toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const; + /* Length operations */ /** * Return the length of the UnicodeString object. - * The length is the number of UChar code units are in the UnicodeString. + * The length is the number of char16_t code units are in the UnicodeString. * If you want the number of code points, please use countChar32(). * @return the length of the UnicodeString object * @see countChar32 @@ -1545,14 +1761,14 @@ public: inline int32_t length(void) const; /** - * Count Unicode code points in the length UChar code units of the string. - * A code point may occupy either one or two UChar code units. + * Count Unicode code points in the length char16_t code units of the string. + * A code point may occupy either one or two char16_t code units. * Counting code points involves reading all code units. * * This functions is basically the inverse of moveIndex32(). * * @param start the index of the first code unit to check - * @param length the number of UChar code units to check + * @param length the number of char16_t code units to check * @return the number of code points in the specified code units * @see length * @stable ICU 2.0 @@ -1561,7 +1777,7 @@ public: countChar32(int32_t start=0, int32_t length=INT32_MAX) const; /** - * Check if the length UChar code units of the string + * Check if the length char16_t code units of the string * contain more Unicode code points than a certain number. * This is more efficient than counting all code points in this part of the string * and comparing that number with a threshold. @@ -1569,10 +1785,10 @@ public: * falls within a certain range, and * never needs to count more than 'number+1' code points. * Logically equivalent to (countChar32(start, length)>number). - * A Unicode code point may occupy either one or two UChar code units. + * A Unicode code point may occupy either one or two char16_t code units. * * @param start the index of the first code unit to check (0 for the entire string) - * @param length the number of UChar code units to check + * @param length the number of char16_t code units to check * (use INT32_MAX for the entire string; remember that start/length * values are pinned) * @param number The number of code points in the (sub)string is compared against @@ -1598,7 +1814,7 @@ public: * This is useful together with the getBuffer functions. * See there for details. * - * @return the number of UChars available in the internal buffer + * @return the number of char16_ts available in the internal buffer * @see getBuffer * @stable ICU 2.0 */ @@ -1615,12 +1831,13 @@ public: /** * Determine if this object contains a valid string. - * A bogus string has no value. It is different from an empty string. - * It can be used to indicate that no string value is available. - * getBuffer() and getTerminatedBuffer() return NULL, and + * A bogus string has no value. It is different from an empty string, + * although in both cases isEmpty() returns TRUE and length() returns 0. + * setToBogus() and isBogus() can be used to indicate that no string value is available. + * For a bogus string, getBuffer() and getTerminatedBuffer() return NULL, and * length() returns 0. * - * @return TRUE if the string is valid, FALSE otherwise + * @return TRUE if the string is bogus/invalid, FALSE otherwise * @see setToBogus() * @stable ICU 2.0 */ @@ -1635,28 +1852,44 @@ public: /** * Assignment operator. Replace the characters in this UnicodeString - * with the characters from srcText. + * with the characters from `srcText`. + * + * Starting with ICU 2.4, the assignment operator and the copy constructor + * allocate a new buffer and copy the buffer contents even for readonly aliases. + * By contrast, the fastCopyFrom() function implements the old, + * more efficient but less safe behavior + * of making this string also a readonly alias to the same buffer. + * + * If the source object has an "open" buffer from getBuffer(minCapacity), + * then the copy is an empty string. + * * @param srcText The text containing the characters to replace * @return a reference to this * @stable ICU 2.0 + * @see fastCopyFrom */ UnicodeString &operator=(const UnicodeString &srcText); /** * Almost the same as the assignment operator. * Replace the characters in this UnicodeString - * with the characters from srcText. + * with the characters from `srcText`. + * + * This function works the same as the assignment operator + * for all strings except for ones that are readonly aliases. * - * This function works the same for all strings except for ones that - * are readonly aliases. * Starting with ICU 2.4, the assignment operator and the copy constructor * allocate a new buffer and copy the buffer contents even for readonly aliases. * This function implements the old, more efficient but less safe behavior * of making this string also a readonly alias to the same buffer. + * * The fastCopyFrom function must be used only if it is known that the lifetime of - * this UnicodeString is at least as long as the lifetime of the aliased buffer + * this UnicodeString does not exceed the lifetime of the aliased buffer * including its contents, for example for strings from resource bundles - * or aliases to string contents. + * or aliases to string constants. + * + * If the source object has an "open" buffer from getBuffer(minCapacity), + * then the copy is an empty string. * * @param src The text containing the characters to replace. * @return a reference to this @@ -1664,18 +1897,46 @@ public: */ UnicodeString &fastCopyFrom(const UnicodeString &src); + /** + * Move assignment operator; might leave src in bogus state. + * This string will have the same contents and state that the source string had. + * The behavior is undefined if *this and src are the same object. + * @param src source string + * @return *this + * @stable ICU 56 + */ + UnicodeString &operator=(UnicodeString &&src) U_NOEXCEPT; + + /** + * Swap strings. + * @param other other string + * @stable ICU 56 + */ + void swap(UnicodeString &other) U_NOEXCEPT; + + /** + * Non-member UnicodeString swap function. + * @param s1 will get s2's contents and state + * @param s2 will get s1's contents and state + * @stable ICU 56 + */ + friend inline void U_EXPORT2 + swap(UnicodeString &s1, UnicodeString &s2) U_NOEXCEPT { + s1.swap(s2); + } + /** * Assignment operator. Replace the characters in this UnicodeString - * with the code unit ch. + * with the code unit `ch`. * @param ch the code unit to replace * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& operator= (UChar ch); + inline UnicodeString& operator= (char16_t ch); /** * Assignment operator. Replace the characters in this UnicodeString - * with the code point ch. + * with the code point `ch`. * @param ch the code point to replace * @return a reference to this * @stable ICU 2.0 @@ -1684,11 +1945,11 @@ public: /** * Set the text in the UnicodeString object to the characters - * in srcText in the range - * [srcStart, srcText.length()). - * srcText is not modified. + * in `srcText` in the range + * [`srcStart`, `srcText.length()`). + * `srcText` is not modified. * @param srcText the source for the new characters - * @param srcStart the offset into srcText where new characters + * @param srcStart the offset into `srcText` where new characters * will be obtained * @return a reference to this * @stable ICU 2.2 @@ -1698,13 +1959,13 @@ public: /** * Set the text in the UnicodeString object to the characters - * in srcText in the range - * [srcStart, srcStart + srcLength). - * srcText is not modified. + * in `srcText` in the range + * [`srcStart`, `srcStart + srcLength`). + * `srcText` is not modified. * @param srcText the source for the new characters - * @param srcStart the offset into srcText where new characters + * @param srcStart the offset into `srcText` where new characters * will be obtained - * @param srcLength the number of characters in srcText in the + * @param srcLength the number of characters in `srcText` in the * replace string. * @return a reference to this * @stable ICU 2.0 @@ -1715,8 +1976,8 @@ public: /** * Set the text in the UnicodeString object to the characters in - * srcText. - * srcText is not modified. + * `srcText`. + * `srcText` is not modified. * @param srcText the source for the new characters * @return a reference to this * @stable ICU 2.0 @@ -1725,61 +1986,64 @@ public: /** * Set the characters in the UnicodeString object to the characters - * in srcChars. srcChars is not modified. + * in `srcChars`. `srcChars` is not modified. * @param srcChars the source for the new characters * @param srcLength the number of Unicode characters in srcChars. * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& setTo(const UChar *srcChars, + inline UnicodeString& setTo(const char16_t *srcChars, int32_t srcLength); /** * Set the characters in the UnicodeString object to the code unit - * srcChar. + * `srcChar`. * @param srcChar the code unit which becomes the UnicodeString's character * content * @return a reference to this * @stable ICU 2.0 */ - UnicodeString& setTo(UChar srcChar); + inline UnicodeString& setTo(char16_t srcChar); /** * Set the characters in the UnicodeString object to the code point - * srcChar. + * `srcChar`. * @param srcChar the code point which becomes the UnicodeString's character * content * @return a reference to this * @stable ICU 2.0 */ - UnicodeString& setTo(UChar32 srcChar); + inline UnicodeString& setTo(UChar32 srcChar); /** - * Aliasing setTo() function, analogous to the readonly-aliasing UChar* constructor. + * Aliasing setTo() function, analogous to the readonly-aliasing char16_t* constructor. * The text will be used for the UnicodeString object, but * it will not be released when the UnicodeString is destroyed. * This has copy-on-write semantics: * When the string is modified, then the buffer is first copied into * newly allocated memory. * The aliased buffer is never modified. - * In an assignment to another UnicodeString, the text will be aliased again, + * + * In an assignment to another UnicodeString, when using the copy constructor + * or the assignment operator, the text will be copied. + * When using fastCopyFrom(), the text will be aliased again, * so that both strings then alias the same readonly-text. * - * @param isTerminated specifies if text is NUL-terminated. - * This must be true if textLength==-1. + * @param isTerminated specifies if `text` is `NUL`-terminated. + * This must be true if `textLength==-1`. * @param text The characters to alias for the UnicodeString. - * @param textLength The number of Unicode characters in text to alias. + * @param textLength The number of Unicode characters in `text` to alias. * If -1, then this constructor will determine the length - * by calling u_strlen(). + * by calling `u_strlen()`. * @return a reference to this * @stable ICU 2.0 */ UnicodeString &setTo(UBool isTerminated, - const UChar *text, + ConstChar16Ptr text, int32_t textLength); /** - * Aliasing setTo() function, analogous to the writable-aliasing UChar* constructor. + * Aliasing setTo() function, analogous to the writable-aliasing char16_t* constructor. * The text will be used for the UnicodeString object, but * it will not be released when the UnicodeString is destroyed. * This has write-through semantics: @@ -1788,16 +2052,16 @@ public: * a new buffer will be allocated and the contents copied as with regularly * constructed strings. * In an assignment to another UnicodeString, the buffer will be copied. - * The extract(UChar *dst) function detects whether the dst pointer is the same + * The extract(Char16Ptr dst) function detects whether the dst pointer is the same * as the string buffer itself and will in this case not copy the contents. * * @param buffer The characters to alias for the UnicodeString. - * @param buffLength The number of Unicode characters in buffer to alias. - * @param buffCapacity The size of buffer in UChars. + * @param buffLength The number of Unicode characters in `buffer` to alias. + * @param buffCapacity The size of `buffer` in char16_ts. * @return a reference to this * @stable ICU 2.0 */ - UnicodeString &setTo(UChar *buffer, + UnicodeString &setTo(char16_t *buffer, int32_t buffLength, int32_t buffCapacity); @@ -1833,7 +2097,7 @@ public: * s.truncate(0); // set to an empty string (complete truncation), or * s=UnicodeString(); // assign an empty string, or * s.setTo((UChar32)-1); // set to a pseudo code point that is out of range, or - * static const UChar nul=0; + * static const char16_t nul=0; * s.setTo(&nul, 0); // set to an empty C Unicode string * } * \endcode @@ -1851,22 +2115,22 @@ public: * @stable ICU 2.0 */ UnicodeString& setCharAt(int32_t offset, - UChar ch); + char16_t ch); /* Append operations */ /** - * Append operator. Append the code unit ch to the UnicodeString + * Append operator. Append the code unit `ch` to the UnicodeString * object. * @param ch the code unit to be appended * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& operator+= (UChar ch); + inline UnicodeString& operator+= (char16_t ch); /** - * Append operator. Append the code point ch to the UnicodeString + * Append operator. Append the code point `ch` to the UnicodeString * object. * @param ch the code point to be appended * @return a reference to this @@ -1875,9 +2139,8 @@ public: inline UnicodeString& operator+= (UChar32 ch); /** - * Append operator. Append the characters in srcText to the - * UnicodeString object at offset start. srcText is - * not modified. + * Append operator. Append the characters in `srcText` to the + * UnicodeString object. `srcText` is not modified. * @param srcText the source for the new characters * @return a reference to this * @stable ICU 2.0 @@ -1886,14 +2149,14 @@ public: /** * Append the characters - * in srcText in the range - * [srcStart, srcStart + srcLength) to the - * UnicodeString object at offset start. srcText + * in `srcText` in the range + * [`srcStart`, `srcStart + srcLength`) to the + * UnicodeString object at offset `start`. `srcText` * is not modified. * @param srcText the source for the new characters - * @param srcStart the offset into srcText where new characters + * @param srcStart the offset into `srcText` where new characters * will be obtained - * @param srcLength the number of characters in srcText in + * @param srcLength the number of characters in `srcText` in * the append string * @return a reference to this * @stable ICU 2.0 @@ -1903,8 +2166,8 @@ public: int32_t srcLength); /** - * Append the characters in srcText to the UnicodeString object at - * offset start. srcText is not modified. + * Append the characters in `srcText` to the UnicodeString object. + * `srcText` is not modified. * @param srcText the source for the new characters * @return a reference to this * @stable ICU 2.0 @@ -1912,61 +2175,62 @@ public: inline UnicodeString& append(const UnicodeString& srcText); /** - * Append the characters in srcChars in the range - * [srcStart, srcStart + srcLength) to the UnicodeString + * Append the characters in `srcChars` in the range + * [`srcStart`, `srcStart + srcLength`) to the UnicodeString * object at offset - * start. srcChars is not modified. + * `start`. `srcChars` is not modified. * @param srcChars the source for the new characters - * @param srcStart the offset into srcChars where new characters + * @param srcStart the offset into `srcChars` where new characters * will be obtained - * @param srcLength the number of characters in srcChars in - * the append string + * @param srcLength the number of characters in `srcChars` in + * the append string; can be -1 if `srcChars` is NUL-terminated * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& append(const UChar *srcChars, + inline UnicodeString& append(const char16_t *srcChars, int32_t srcStart, int32_t srcLength); /** - * Append the characters in srcChars to the UnicodeString object - * at offset start. srcChars is not modified. + * Append the characters in `srcChars` to the UnicodeString object + * at offset `start`. `srcChars` is not modified. * @param srcChars the source for the new characters - * @param srcLength the number of Unicode characters in srcChars + * @param srcLength the number of Unicode characters in `srcChars`; + * can be -1 if `srcChars` is NUL-terminated * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& append(const UChar *srcChars, + inline UnicodeString& append(ConstChar16Ptr srcChars, int32_t srcLength); /** - * Append the code unit srcChar to the UnicodeString object. + * Append the code unit `srcChar` to the UnicodeString object. * @param srcChar the code unit to append * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& append(UChar srcChar); + inline UnicodeString& append(char16_t srcChar); /** - * Append the code point srcChar to the UnicodeString object. + * Append the code point `srcChar` to the UnicodeString object. * @param srcChar the code point to append * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& append(UChar32 srcChar); + UnicodeString& append(UChar32 srcChar); /* Insert operations */ /** - * Insert the characters in srcText in the range - * [srcStart, srcStart + srcLength) into the UnicodeString - * object at offset start. srcText is not modified. + * Insert the characters in `srcText` in the range + * [`srcStart`, `srcStart + srcLength`) into the UnicodeString + * object at offset `start`. `srcText` is not modified. * @param start the offset where the insertion begins * @param srcText the source for the new characters - * @param srcStart the offset into srcText where new characters + * @param srcStart the offset into `srcText` where new characters * will be obtained - * @param srcLength the number of characters in srcText in + * @param srcLength the number of characters in `srcText` in * the insert string * @return a reference to this * @stable ICU 2.0 @@ -1977,8 +2241,8 @@ public: int32_t srcLength); /** - * Insert the characters in srcText into the UnicodeString object - * at offset start. srcText is not modified. + * Insert the characters in `srcText` into the UnicodeString object + * at offset `start`. `srcText` is not modified. * @param start the offset where the insertion begins * @param srcText the source for the new characters * @return a reference to this @@ -1988,26 +2252,26 @@ public: const UnicodeString& srcText); /** - * Insert the characters in srcChars in the range - * [srcStart, srcStart + srcLength) into the UnicodeString - * object at offset start. srcChars is not modified. + * Insert the characters in `srcChars` in the range + * [`srcStart`, `srcStart + srcLength`) into the UnicodeString + * object at offset `start`. `srcChars` is not modified. * @param start the offset at which the insertion begins * @param srcChars the source for the new characters - * @param srcStart the offset into srcChars where new characters + * @param srcStart the offset into `srcChars` where new characters * will be obtained - * @param srcLength the number of characters in srcChars + * @param srcLength the number of characters in `srcChars` * in the insert string * @return a reference to this * @stable ICU 2.0 */ inline UnicodeString& insert(int32_t start, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength); /** - * Insert the characters in srcChars into the UnicodeString object - * at offset start. srcChars is not modified. + * Insert the characters in `srcChars` into the UnicodeString object + * at offset `start`. `srcChars` is not modified. * @param start the offset where the insertion begins * @param srcChars the source for the new characters * @param srcLength the number of Unicode characters in srcChars. @@ -2015,23 +2279,23 @@ public: * @stable ICU 2.0 */ inline UnicodeString& insert(int32_t start, - const UChar *srcChars, + ConstChar16Ptr srcChars, int32_t srcLength); /** - * Insert the code unit srcChar into the UnicodeString object at - * offset start. + * Insert the code unit `srcChar` into the UnicodeString object at + * offset `start`. * @param start the offset at which the insertion occurs * @param srcChar the code unit to insert * @return a reference to this * @stable ICU 2.0 */ inline UnicodeString& insert(int32_t start, - UChar srcChar); + char16_t srcChar); /** - * Insert the code point srcChar into the UnicodeString object at - * offset start. + * Insert the code point `srcChar` into the UnicodeString object at + * offset `start`. * @param start the offset at which the insertion occurs * @param srcChar the code point to insert * @return a reference to this @@ -2045,22 +2309,22 @@ public: /** * Replace the characters in the range - * [start, start + length) with the characters in - * srcText in the range - * [srcStart, srcStart + srcLength). - * srcText is not modified. + * [`start`, `start + length`) with the characters in + * `srcText` in the range + * [`srcStart`, `srcStart + srcLength`). + * `srcText` is not modified. * @param start the offset at which the replace operation begins * @param length the number of characters to replace. The character at - * start + length is not modified. + * `start + length` is not modified. * @param srcText the source for the new characters - * @param srcStart the offset into srcText where new characters + * @param srcStart the offset into `srcText` where new characters * will be obtained - * @param srcLength the number of characters in srcText in + * @param srcLength the number of characters in `srcText` in * the replace string * @return a reference to this * @stable ICU 2.0 */ - UnicodeString& replace(int32_t start, + inline UnicodeString& replace(int32_t start, int32_t length, const UnicodeString& srcText, int32_t srcStart, @@ -2068,50 +2332,50 @@ public: /** * Replace the characters in the range - * [start, start + length) - * with the characters in srcText. srcText is + * [`start`, `start + length`) + * with the characters in `srcText`. `srcText` is * not modified. * @param start the offset at which the replace operation begins * @param length the number of characters to replace. The character at - * start + length is not modified. + * `start + length` is not modified. * @param srcText the source for the new characters * @return a reference to this * @stable ICU 2.0 */ - UnicodeString& replace(int32_t start, + inline UnicodeString& replace(int32_t start, int32_t length, const UnicodeString& srcText); /** * Replace the characters in the range - * [start, start + length) with the characters in - * srcChars in the range - * [srcStart, srcStart + srcLength). srcChars + * [`start`, `start + length`) with the characters in + * `srcChars` in the range + * [`srcStart`, `srcStart + srcLength`). `srcChars` * is not modified. * @param start the offset at which the replace operation begins * @param length the number of characters to replace. The character at - * start + length is not modified. + * `start + length` is not modified. * @param srcChars the source for the new characters - * @param srcStart the offset into srcChars where new characters + * @param srcStart the offset into `srcChars` where new characters * will be obtained - * @param srcLength the number of characters in srcChars + * @param srcLength the number of characters in `srcChars` * in the replace string * @return a reference to this * @stable ICU 2.0 */ - UnicodeString& replace(int32_t start, + inline UnicodeString& replace(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength); /** * Replace the characters in the range - * [start, start + length) with the characters in - * srcChars. srcChars is not modified. + * [`start`, `start + length`) with the characters in + * `srcChars`. `srcChars` is not modified. * @param start the offset at which the replace operation begins * @param length number of characters to replace. The character at - * start + length is not modified. + * `start + length` is not modified. * @param srcChars the source for the new characters * @param srcLength the number of Unicode characters in srcChars * @return a reference to this @@ -2119,42 +2383,40 @@ public: */ inline UnicodeString& replace(int32_t start, int32_t length, - const UChar *srcChars, + ConstChar16Ptr srcChars, int32_t srcLength); /** * Replace the characters in the range - * [start, start + length) with the code unit - * srcChar. + * [`start`, `start + length`) with the code unit + * `srcChar`. * @param start the offset at which the replace operation begins * @param length the number of characters to replace. The character at - * start + length is not modified. + * `start + length` is not modified. * @param srcChar the new code unit * @return a reference to this * @stable ICU 2.0 */ inline UnicodeString& replace(int32_t start, int32_t length, - UChar srcChar); + char16_t srcChar); /** * Replace the characters in the range - * [start, start + length) with the code point - * srcChar. + * [`start`, `start + length`) with the code point + * `srcChar`. * @param start the offset at which the replace operation begins * @param length the number of characters to replace. The character at - * start + length is not modified. + * `start + length` is not modified. * @param srcChar the new code point * @return a reference to this * @stable ICU 2.0 */ - inline UnicodeString& replace(int32_t start, - int32_t length, - UChar32 srcChar); + UnicodeString& replace(int32_t start, int32_t length, UChar32 srcChar); /** - * Replace the characters in the range [start, limit) - * with the characters in srcText. srcText is not modified. + * Replace the characters in the range [`start`, `limit`) + * with the characters in `srcText`. `srcText` is not modified. * @param start the offset at which the replace operation begins * @param limit the offset immediately following the replace range * @param srcText the source for the new characters @@ -2166,16 +2428,16 @@ public: const UnicodeString& srcText); /** - * Replace the characters in the range [start, limit) - * with the characters in srcText in the range - * [srcStart, srcLimit). srcText is not modified. + * Replace the characters in the range [`start`, `limit`) + * with the characters in `srcText` in the range + * [`srcStart`, `srcLimit`). `srcText` is not modified. * @param start the offset at which the replace operation begins * @param limit the offset immediately following the replace range * @param srcText the source for the new characters - * @param srcStart the offset into srcChars where new characters + * @param srcStart the offset into `srcChars` where new characters * will be obtained * @param srcLimit the offset immediately following the range to copy - * in srcText + * in `srcText` * @return a reference to this * @stable ICU 2.0 */ @@ -2187,12 +2449,9 @@ public: /** * Replace a substring of this object with the given text. - * @param start the beginning index, inclusive; 0 <= start - * <= limit. - * @param limit the ending index, exclusive; start <= limit - * <= length(). - * @param text the text to replace characters start - * to limit - 1 + * @param start the beginning index, inclusive; `0 <= start <= limit`. + * @param limit the ending index, exclusive; `start <= limit <= length()`. + * @param text the text to replace characters `start` to `limit - 1` * @stable ICU 2.0 */ virtual void handleReplaceBetween(int32_t start, @@ -2211,14 +2470,12 @@ public: * information. This method is used to duplicate or reorder substrings. * The destination index must not overlap the source range. * - * @param start the beginning index, inclusive; 0 <= start <= - * limit. - * @param limit the ending index, exclusive; start <= limit <= - * length(). + * @param start the beginning index, inclusive; `0 <= start <= limit`. + * @param limit the ending index, exclusive; `start <= limit <= length()`. * @param dest the destination index. The characters from - * start..limit-1 will be copied to dest. - * Implementations of this method may assume that dest <= start || - * dest >= limit. + * `start..limit-1` will be copied to `dest`. + * Implementations of this method may assume that `dest <= start || + * dest >= limit`. * @stable ICU 2.0 */ virtual void copy(int32_t start, int32_t limit, int32_t dest); @@ -2239,7 +2496,7 @@ public: /** * Replace all occurrences of characters in oldText with characters * in newText - * in the range [start, start + length). + * in the range [`start`, `start + length`). * @param start the start of the range in which replace will performed * @param length the length of the range in which replace will be performed * @param oldText the text containing the search text @@ -2254,18 +2511,18 @@ public: /** * Replace all occurrences of characters in oldText in the range - * [oldStart, oldStart + oldLength) with the characters + * [`oldStart`, `oldStart + oldLength`) with the characters * in newText in the range - * [newStart, newStart + newLength) - * in the range [start, start + length). + * [`newStart`, `newStart + newLength`) + * in the range [`start`, `start + length`). * @param start the start of the range in which replace will performed * @param length the length of the range in which replace will be performed * @param oldText the text containing the search text - * @param oldStart the start of the search range in oldText - * @param oldLength the length of the search range in oldText + * @param oldStart the start of the search range in `oldText` + * @param oldLength the length of the search range in `oldText` * @param newText the text containing the replacement text - * @param newStart the start of the replacement range in newText - * @param newLength the length of the replacement range in newText + * @param newStart the start of the replacement range in `newText` + * @param newLength the length of the replacement range in `newText` * @return a reference to this * @stable ICU 2.0 */ @@ -2290,7 +2547,7 @@ public: /** * Remove the characters in the range - * [start, start + length) from the UnicodeString object. + * [`start`, `start + length`) from the UnicodeString object. * @param start the offset of the first character to remove * @param length the number of characters to remove * @return a reference to this @@ -2301,7 +2558,7 @@ public: /** * Remove the characters in the range - * [start, limit) from the UnicodeString object. + * [`start`, `limit`) from the UnicodeString object. * @param start the offset of the first character to remove * @param limit the offset immediately following the range to remove * @return a reference to this @@ -2310,11 +2567,21 @@ public: inline UnicodeString& removeBetween(int32_t start, int32_t limit = (int32_t)INT32_MAX); + /** + * Retain only the characters in the range + * [`start`, `limit`) from the UnicodeString object. + * Removes characters before `start` and at and after `limit`. + * @param start the offset of the first character to retain + * @param limit the offset immediately following the range to retain + * @return a reference to this + * @stable ICU 4.4 + */ + inline UnicodeString &retainBetween(int32_t start, int32_t limit = INT32_MAX); /* Length operations */ /** - * Pad the start of this UnicodeString with the character padChar. + * Pad the start of this UnicodeString with the character `padChar`. * If the length of this UnicodeString is less than targetLength, * length() - targetLength copies of padChar will be added to the * beginning of this UnicodeString. @@ -2325,10 +2592,10 @@ public: * @stable ICU 2.0 */ UBool padLeading(int32_t targetLength, - UChar padChar = 0x0020); + char16_t padChar = 0x0020); /** - * Pad the end of this UnicodeString with the character padChar. + * Pad the end of this UnicodeString with the character `padChar`. * If the length of this UnicodeString is less than targetLength, * length() - targetLength copies of padChar will be added to the * end of this UnicodeString. @@ -2339,10 +2606,10 @@ public: * @stable ICU 2.0 */ UBool padTrailing(int32_t targetLength, - UChar padChar = 0x0020); + char16_t padChar = 0x0020); /** - * Truncate this UnicodeString to the targetLength. + * Truncate this UnicodeString to the `targetLength`. * @param targetLength the desired length of this UnicodeString. * @return TRUE if the text was truncated, FALSE otherwise * @stable ICU 2.0 @@ -2367,7 +2634,7 @@ public: inline UnicodeString& reverse(void); /** - * Reverse the range [start, start + length) in + * Reverse the range [`start`, `start + length`) in * this UnicodeString. * @param start the start of the range to reverse * @param length the number of characters to to reverse @@ -2428,7 +2695,7 @@ public: * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * - * This function uses only the first() and next() methods of the + * This function uses only the setText(), first() and next() methods of the * provided break iterator. * * @param titleIter A break iterator to find the first characters of words @@ -2456,7 +2723,7 @@ public: * The standard titlecase iterator for the root locale implements the * algorithm of Unicode TR 21. * - * This function uses only the first() and next() methods of the + * This function uses only the setText(), first() and next() methods of the * provided break iterator. * * @param titleIter A break iterator to find the first characters of words @@ -2470,13 +2737,48 @@ public: */ UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale); + /** + * Titlecase this string, with options. + * + * Casing is locale-dependent and context-sensitive. + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with options.) + * + * The titlecase break iterator can be provided to customize for arbitrary + * styles, using rules and dictionaries beyond the standard iterators. + * It may be more efficient to always provide an iterator to avoid + * opening and closing one for each string. + * The standard titlecase iterator for the root locale implements the + * algorithm of Unicode TR 21. + * + * This function uses only the setText(), first() and next() methods of the + * provided break iterator. + * + * @param titleIter A break iterator to find the first characters of words + * that are to be titlecased. + * If none is provided (0), then a standard titlecase + * break iterator is opened. + * Otherwise the provided iterator is set to the string's text. + * @param locale The locale to consider. + * @param options Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE, + * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED, + * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES. + * @param options Options bit set, see ucasemap_open(). + * @return A reference to this. + * @stable ICU 3.8 + */ + UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options); + #endif /** - * Case-fold the characters in this string. + * Case-folds the characters in this string. + * * Case-folding is locale-independent and not context-sensitive, * but there is an option for whether to include or exclude mappings for dotted I - * and dotless i that are marked with 'I' in CaseFolding.txt. + * and dotless i that are marked with 'T' in CaseFolding.txt. + * * The result may be longer or shorter than the original. * * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I @@ -2491,7 +2793,7 @@ public: /** * Get a read/write pointer to the internal buffer. - * The buffer is guaranteed to be large enough for at least minCapacity UChars, + * The buffer is guaranteed to be large enough for at least minCapacity char16_ts, * writable, and is still owned by the UnicodeString object. * Calls to getBuffer(minCapacity) must not be nested, and * must be matched with calls to releaseBuffer(newLength). @@ -2517,22 +2819,22 @@ public: * If the length() was greater than minCapacity, then any contents after minCapacity * may be lost. * The buffer contents is not NUL-terminated by getBuffer(). - * If length()(s.length(). + * `(s.length() < s.getCapacity() && buffer[s.length()]==0)`. * (See getTerminatedBuffer().) * * The buffer may reside in read-only memory. Its contents must not * be modified. * * @return a read-only pointer to the internal string buffer, - * or 0 if the string is empty or bogus + * or nullptr if the string is empty or bogus * * @see getBuffer(int32_t minCapacity) * @see getTerminatedBuffer() * @stable ICU 2.0 */ - inline const UChar *getBuffer() const; + inline const char16_t *getBuffer() const; /** * Get a read-only pointer to the internal buffer, @@ -2621,7 +2923,7 @@ public: * @see getBuffer() * @stable ICU 2.2 */ - inline const UChar *getTerminatedBuffer(); + const char16_t *getTerminatedBuffer(); //======================================== // Constructors @@ -2630,11 +2932,11 @@ public: /** Construct an empty UnicodeString. * @stable ICU 2.0 */ - UnicodeString(); + inline UnicodeString(); /** - * Construct a UnicodeString with capacity to hold capacity UChars - * @param capacity the number of UChars this UnicodeString should hold + * Construct a UnicodeString with capacity to hold `capacity` char16_ts + * @param capacity the number of char16_ts this UnicodeString should hold * before a resize is necessary; if count is greater than 0 and count * code points c take up more space than capacity, then capacity is adjusted * accordingly. @@ -2646,62 +2948,154 @@ public: UnicodeString(int32_t capacity, UChar32 c, int32_t count); /** - * Single UChar (code unit) constructor. + * Single char16_t (code unit) constructor. + * + * It is recommended to mark this constructor "explicit" by + * `-DUNISTR_FROM_CHAR_EXPLICIT=explicit` + * on the compiler command line or similar. * @param ch the character to place in the UnicodeString * @stable ICU 2.0 */ - UnicodeString(UChar ch); + UNISTR_FROM_CHAR_EXPLICIT UnicodeString(char16_t ch); /** * Single UChar32 (code point) constructor. + * + * It is recommended to mark this constructor "explicit" by + * `-DUNISTR_FROM_CHAR_EXPLICIT=explicit` + * on the compiler command line or similar. * @param ch the character to place in the UnicodeString * @stable ICU 2.0 */ - UnicodeString(UChar32 ch); + UNISTR_FROM_CHAR_EXPLICIT UnicodeString(UChar32 ch); /** - * UChar* constructor. - * @param text The characters to place in the UnicodeString. text + * char16_t* constructor. + * + * It is recommended to mark this constructor "explicit" by + * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` + * on the compiler command line or similar. + * @param text The characters to place in the UnicodeString. `text` * must be NULL (U+0000) terminated. * @stable ICU 2.0 */ - UnicodeString(const UChar *text); + UNISTR_FROM_STRING_EXPLICIT UnicodeString(const char16_t *text); + +#if !U_CHAR16_IS_TYPEDEF + /** + * uint16_t * constructor. + * Delegates to UnicodeString(const char16_t *). + * + * It is recommended to mark this constructor "explicit" by + * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` + * on the compiler command line or similar. + * @param text NUL-terminated UTF-16 string + * @stable ICU 59 + */ + UNISTR_FROM_STRING_EXPLICIT UnicodeString(const uint16_t *text) : + UnicodeString(ConstChar16Ptr(text)) {} +#endif + +#if U_SIZEOF_WCHAR_T==2 || defined(U_IN_DOXYGEN) + /** + * wchar_t * constructor. + * (Only defined if U_SIZEOF_WCHAR_T==2.) + * Delegates to UnicodeString(const char16_t *). + * + * It is recommended to mark this constructor "explicit" by + * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` + * on the compiler command line or similar. + * @param text NUL-terminated UTF-16 string + * @stable ICU 59 + */ + UNISTR_FROM_STRING_EXPLICIT UnicodeString(const wchar_t *text) : + UnicodeString(ConstChar16Ptr(text)) {} +#endif /** - * UChar* constructor. + * nullptr_t constructor. + * Effectively the same as the default constructor, makes an empty string object. + * + * It is recommended to mark this constructor "explicit" by + * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` + * on the compiler command line or similar. + * @param text nullptr + * @stable ICU 59 + */ + UNISTR_FROM_STRING_EXPLICIT inline UnicodeString(const std::nullptr_t text); + + /** + * char16_t* constructor. * @param text The characters to place in the UnicodeString. - * @param textLength The number of Unicode characters in text + * @param textLength The number of Unicode characters in `text` * to copy. * @stable ICU 2.0 */ - UnicodeString(const UChar *text, + UnicodeString(const char16_t *text, int32_t textLength); +#if !U_CHAR16_IS_TYPEDEF + /** + * uint16_t * constructor. + * Delegates to UnicodeString(const char16_t *, int32_t). + * @param text UTF-16 string + * @param length string length + * @stable ICU 59 + */ + UnicodeString(const uint16_t *text, int32_t length) : + UnicodeString(ConstChar16Ptr(text), length) {} +#endif + +#if U_SIZEOF_WCHAR_T==2 || defined(U_IN_DOXYGEN) + /** + * wchar_t * constructor. + * (Only defined if U_SIZEOF_WCHAR_T==2.) + * Delegates to UnicodeString(const char16_t *, int32_t). + * @param text NUL-terminated UTF-16 string + * @param length string length + * @stable ICU 59 + */ + UnicodeString(const wchar_t *text, int32_t length) : + UnicodeString(ConstChar16Ptr(text), length) {} +#endif + /** - * Readonly-aliasing UChar* constructor. + * nullptr_t constructor. + * Effectively the same as the default constructor, makes an empty string object. + * @param text nullptr + * @param length ignored + * @stable ICU 59 + */ + inline UnicodeString(const std::nullptr_t text, int32_t length); + + /** + * Readonly-aliasing char16_t* constructor. * The text will be used for the UnicodeString object, but * it will not be released when the UnicodeString is destroyed. * This has copy-on-write semantics: * When the string is modified, then the buffer is first copied into * newly allocated memory. * The aliased buffer is never modified. - * In an assignment to another UnicodeString, the text will be aliased again, + * + * In an assignment to another UnicodeString, when using the copy constructor + * or the assignment operator, the text will be copied. + * When using fastCopyFrom(), the text will be aliased again, * so that both strings then alias the same readonly-text. * - * @param isTerminated specifies if text is NUL-terminated. - * This must be true if textLength==-1. + * @param isTerminated specifies if `text` is `NUL`-terminated. + * This must be true if `textLength==-1`. * @param text The characters to alias for the UnicodeString. - * @param textLength The number of Unicode characters in text to alias. + * @param textLength The number of Unicode characters in `text` to alias. * If -1, then this constructor will determine the length - * by calling u_strlen(). + * by calling `u_strlen()`. * @stable ICU 2.0 */ UnicodeString(UBool isTerminated, - const UChar *text, + ConstChar16Ptr text, int32_t textLength); /** - * Writable-aliasing UChar* constructor. + * Writable-aliasing char16_t* constructor. * The text will be used for the UnicodeString object, but * it will not be released when the UnicodeString is destroyed. * This has write-through semantics: @@ -2710,26 +3104,98 @@ public: * a new buffer will be allocated and the contents copied as with regularly * constructed strings. * In an assignment to another UnicodeString, the buffer will be copied. - * The extract(UChar *dst) function detects whether the dst pointer is the same + * The extract(Char16Ptr dst) function detects whether the dst pointer is the same * as the string buffer itself and will in this case not copy the contents. * * @param buffer The characters to alias for the UnicodeString. - * @param buffLength The number of Unicode characters in buffer to alias. - * @param buffCapacity The size of buffer in UChars. + * @param buffLength The number of Unicode characters in `buffer` to alias. + * @param buffCapacity The size of `buffer` in char16_ts. * @stable ICU 2.0 */ - UnicodeString(UChar *buffer, int32_t buffLength, int32_t buffCapacity); + UnicodeString(char16_t *buffer, int32_t buffLength, int32_t buffCapacity); + +#if !U_CHAR16_IS_TYPEDEF + /** + * Writable-aliasing uint16_t * constructor. + * Delegates to UnicodeString(const char16_t *, int32_t, int32_t). + * @param buffer writable buffer of/for UTF-16 text + * @param buffLength length of the current buffer contents + * @param buffCapacity buffer capacity + * @stable ICU 59 + */ + UnicodeString(uint16_t *buffer, int32_t buffLength, int32_t buffCapacity) : + UnicodeString(Char16Ptr(buffer), buffLength, buffCapacity) {} +#endif + +#if U_SIZEOF_WCHAR_T==2 || defined(U_IN_DOXYGEN) + /** + * Writable-aliasing wchar_t * constructor. + * (Only defined if U_SIZEOF_WCHAR_T==2.) + * Delegates to UnicodeString(const char16_t *, int32_t, int32_t). + * @param buffer writable buffer of/for UTF-16 text + * @param buffLength length of the current buffer contents + * @param buffCapacity buffer capacity + * @stable ICU 59 + */ + UnicodeString(wchar_t *buffer, int32_t buffLength, int32_t buffCapacity) : + UnicodeString(Char16Ptr(buffer), buffLength, buffCapacity) {} +#endif + + /** + * Writable-aliasing nullptr_t constructor. + * Effectively the same as the default constructor, makes an empty string object. + * @param buffer nullptr + * @param buffLength ignored + * @param buffCapacity ignored + * @stable ICU 59 + */ + inline UnicodeString(std::nullptr_t buffer, int32_t buffLength, int32_t buffCapacity); + +#if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION + + /** + * char* constructor. + * Uses the default converter (and thus depends on the ICU conversion code) + * unless U_CHARSET_IS_UTF8 is set to 1. + * + * For ASCII (really "invariant character") strings it is more efficient to use + * the constructor that takes a US_INV (for its enum EInvariant). + * For ASCII (invariant-character) string literals, see UNICODE_STRING and + * UNICODE_STRING_SIMPLE. + * + * It is recommended to mark this constructor "explicit" by + * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` + * on the compiler command line or similar. + * @param codepageData an array of bytes, null-terminated, + * in the platform's default codepage. + * @stable ICU 2.0 + * @see UNICODE_STRING + * @see UNICODE_STRING_SIMPLE + */ + UNISTR_FROM_STRING_EXPLICIT UnicodeString(const char *codepageData); + + /** + * char* constructor. + * Uses the default converter (and thus depends on the ICU conversion code) + * unless U_CHARSET_IS_UTF8 is set to 1. + * @param codepageData an array of bytes in the platform's default codepage. + * @param dataLength The number of bytes in `codepageData`. + * @stable ICU 2.0 + */ + UnicodeString(const char *codepageData, int32_t dataLength); + +#endif #if !UCONFIG_NO_CONVERSION /** * char* constructor. * @param codepageData an array of bytes, null-terminated - * @param codepage the encoding of codepageData. The special - * value 0 for codepage indicates that the text is in the + * @param codepage the encoding of `codepageData`. The special + * value 0 for `codepage` indicates that the text is in the * platform's default codepage. * - * If codepage is an empty string (""), + * If `codepage` is an empty string (`""`), * then a simple conversion is performed on the codepage-invariant * subset ("invariant characters") of the platform encoding. See utypes.h. * Recommendation: For invariant-character strings use the constructor @@ -2739,17 +3205,16 @@ public: * * @stable ICU 2.0 */ - UnicodeString(const char *codepageData, - const char *codepage = 0); + UnicodeString(const char *codepageData, const char *codepage); /** * char* constructor. * @param codepageData an array of bytes. - * @param dataLength The number of bytes in codepageData. - * @param codepage the encoding of codepageData. The special - * value 0 for codepage indicates that the text is in the + * @param dataLength The number of bytes in `codepageData`. + * @param codepage the encoding of `codepageData`. The special + * value 0 for `codepage` indicates that the text is in the * platform's default codepage. - * If codepage is an empty string (""), + * If `codepage` is an empty string (`""`), * then a simple conversion is performed on the codepage-invariant * subset ("invariant characters") of the platform encoding. See utypes.h. * Recommendation: For invariant-character strings use the constructor @@ -2759,9 +3224,7 @@ public: * * @stable ICU 2.0 */ - UnicodeString(const char *codepageData, - int32_t dataLength, - const char *codepage = 0); + UnicodeString(const char *codepageData, int32_t dataLength, const char *codepage); /** * char * / UConverter constructor. @@ -2802,33 +3265,51 @@ public: * * For example: * \code - * void fn(const char *s) { - * UnicodeString ustr(s, -1, US_INV); - * // use ustr ... - * } + * void fn(const char *s) { + * UnicodeString ustr(s, -1, US_INV); + * // use ustr ... + * } * \endcode - * * @param src String using only invariant characters. * @param length Length of src, or -1 if NUL-terminated. * @param inv Signature-distinguishing paramater, use US_INV. * * @see US_INV - * @draft ICU 3.2 + * @stable ICU 3.2 */ UnicodeString(const char *src, int32_t length, enum EInvariant inv); /** * Copy constructor. + * + * Starting with ICU 2.4, the assignment operator and the copy constructor + * allocate a new buffer and copy the buffer contents even for readonly aliases. + * By contrast, the fastCopyFrom() function implements the old, + * more efficient but less safe behavior + * of making this string also a readonly alias to the same buffer. + * + * If the source object has an "open" buffer from getBuffer(minCapacity), + * then the copy is an empty string. + * * @param that The UnicodeString object to copy. * @stable ICU 2.0 + * @see fastCopyFrom */ UnicodeString(const UnicodeString& that); + /** + * Move constructor; might leave src in bogus state. + * This string will have the same contents and state that the source string had. + * @param src source string + * @stable ICU 56 + */ + UnicodeString(UnicodeString &&src) U_NOEXCEPT; + /** * 'Substring' constructor from tail of source string. * @param src The UnicodeString object to copy. - * @param srcStart The offset into src at which to start copying. + * @param srcStart The offset into `src` at which to start copying. * @stable ICU 2.2 */ UnicodeString(const UnicodeString& src, int32_t srcStart); @@ -2836,8 +3317,8 @@ public: /** * 'Substring' constructor from subrange of source string. * @param src The UnicodeString object to copy. - * @param srcStart The offset into src at which to start copying. - * @param srcLength The number of characters from src to copy. + * @param srcStart The offset into `src` at which to start copying. + * @param srcLength The number of characters from `src` to copy. * @stable ICU 2.2 */ UnicodeString(const UnicodeString& src, int32_t srcStart, int32_t srcLength); @@ -2865,6 +3346,33 @@ public: */ virtual ~UnicodeString(); + /** + * Create a UnicodeString from a UTF-8 string. + * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string. + * Calls u_strFromUTF8WithSub(). + * + * @param utf8 UTF-8 input string. + * Note that a StringPiece can be implicitly constructed + * from a std::string or a NUL-terminated const char * string. + * @return A UnicodeString with equivalent UTF-16 contents. + * @see toUTF8 + * @see toUTF8String + * @stable ICU 4.2 + */ + static UnicodeString fromUTF8(StringPiece utf8); + + /** + * Create a UnicodeString from a UTF-32 string. + * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string. + * Calls u_strFromUTF32WithSub(). + * + * @param utf32 UTF-32 input string. Must not be NULL. + * @param length Length of the input string, or -1 if NUL-terminated. + * @return A UnicodeString with equivalent UTF-16 contents. + * @see toUTF32 + * @stable ICU 4.2 + */ + static UnicodeString fromUTF32(const UChar32 *utf32, int32_t length); /* Miscellaneous operations */ @@ -2909,7 +3417,7 @@ public: * character. See unescape() for a listing of the recognized escape * sequences. The character at offset-1 is assumed (without * checking) to be a backslash. If the escape sequence is - * ill-formed, or the offset is out of range, (UChar32)0xFFFFFFFF is + * ill-formed, or the offset is out of range, U_SENTINEL=-1 is * returned. * * @param offset an input output parameter. On input, it is the @@ -2917,7 +3425,7 @@ public: * after the initial backslash. On output, it is advanced after the * last character parsed. On error, it is not advanced at all. * @return the character represented by the escape sequence at - * offset, or (UChar32)0xFFFFFFFF on error. + * offset, or U_SENTINEL=-1 on error. * @see UnicodeString#unescape() * @see u_unescape() * @see u_unescapeAt() @@ -2955,7 +3463,7 @@ protected: * UnicodeString::charAt() to be inline again (see jitterbug 709). * @stable ICU 2.4 */ - virtual UChar getCharAt(int32_t offset) const; + virtual char16_t getCharAt(int32_t offset) const; /** * The change in Replaceable to use virtual getChar32At() allows @@ -2965,6 +3473,22 @@ protected: virtual UChar32 getChar32At(int32_t offset) const; private: + // For char* constructors. Could be made public. + UnicodeString &setToUTF8(StringPiece utf8); + // For extract(char*). + // We could make a toUTF8(target, capacity, errorCode) public but not + // this version: New API will be cleaner if we make callers create substrings + // rather than having start+length on every method, + // and it should take a UErrorCode&. + int32_t + toUTF8(int32_t start, int32_t len, + char *target, int32_t capacity) const; + + /** + * Internal string contents comparison, called by operator==. + * Requires: this & text not bogus and have same lengths. + */ + UBool doEquals(const UnicodeString &text, int32_t len) const; inline int8_t doCompare(int32_t start, @@ -2975,7 +3499,7 @@ private: int8_t doCompare(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const; @@ -2988,7 +3512,7 @@ private: int8_t doCompareCodePointOrder(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const; @@ -3003,12 +3527,12 @@ private: int8_t doCaseCompare(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength, uint32_t options) const; - int32_t doIndexOf(UChar c, + int32_t doIndexOf(char16_t c, int32_t start, int32_t length) const; @@ -3016,7 +3540,7 @@ private: int32_t start, int32_t length) const; - int32_t doLastIndexOf(UChar c, + int32_t doLastIndexOf(char16_t c, int32_t start, int32_t length) const; @@ -3026,14 +3550,14 @@ private: void doExtract(int32_t start, int32_t length, - UChar *dst, + char16_t *dst, int32_t dstStart) const; inline void doExtract(int32_t start, int32_t length, UnicodeString& target) const; - inline UChar doCharAt(int32_t offset) const; + inline char16_t doCharAt(int32_t offset) const; UnicodeString& doReplace(int32_t start, int32_t length, @@ -3043,10 +3567,13 @@ private: UnicodeString& doReplace(int32_t start, int32_t length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength); + UnicodeString& doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength); + UnicodeString& doAppend(const char16_t *srcChars, int32_t srcStart, int32_t srcLength); + UnicodeString& doReverse(int32_t start, int32_t length); @@ -3054,12 +3581,31 @@ private: int32_t doHashCode(void) const; // get pointer to start of array - inline UChar* getArrayStart(void); - inline const UChar* getArrayStart(void) const; + // these do not check for kOpenGetBuffer, unlike the public getBuffer() function + inline char16_t* getArrayStart(void); + inline const char16_t* getArrayStart(void) const; + + inline UBool hasShortLength() const; + inline int32_t getShortLength() const; + + // A UnicodeString object (not necessarily its current buffer) + // is writable unless it isBogus() or it has an "open" getBuffer(minCapacity). + inline UBool isWritable() const; + + // Is the current buffer writable? + inline UBool isBufferWritable() const; - // allocate the array; result may be fStackBuffer + // None of the following does releaseArray(). + inline void setZeroLength(); + inline void setShortLength(int32_t len); + inline void setLength(int32_t len); + inline void setToEmpty(); + inline void setArray(char16_t *array, int32_t len, int32_t capacity); // sets length but not flags + + // allocate the array; result may be the stack buffer // sets refCount to 1 if appropriate - // sets fArray, fCapacity, and fFlags + // sets fArray, fCapacity, and flags + // sets length to 0 // returns boolean for success or failure UBool allocate(int32_t capacity); @@ -3072,6 +3618,9 @@ private: // implements assigment operator, copy constructor, and fastCopyFrom() UnicodeString ©From(const UnicodeString &src, UBool fastCopy=FALSE); + // Copies just the fields without memory management. + void copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT; + // Pin start and limit to acceptable values. inline void pinIndex(int32_t& start) const; inline void pinIndices(int32_t& start, @@ -3089,9 +3638,9 @@ private: * Real constructor for converting from codepage data. * It assumes that it is called with !fRefCounted. * - * If codepage==0, then the default converter + * If `codepage==0`, then the default converter * is used for the platform encoding. - * If codepage is an empty string (""), + * If `codepage` is an empty string (`""`), * then a simple conversion is performed on the codepage-invariant * subset ("invariant characters") of the platform encoding. See utypes.h. */ @@ -3128,12 +3677,17 @@ private: int32_t **pBufferToDelete = 0, UBool forceClone = FALSE); - // common function for case mappings + /** + * Common function for UnicodeString case mappings. + * The stringCaseMapper has the same type UStringCaseMapper + * as in ustr_imp.h for ustrcase_map(). + */ UnicodeString & - caseMap(BreakIterator *titleIter, - const char *locale, - uint32_t options, - int32_t toWhichCase); + caseMap(int32_t caseLocale, uint32_t options, +#if !UCONFIG_NO_BREAK_ITERATION + BreakIterator *iter, +#endif + UStringCaseMapper *stringCaseMapper); // ref counting void addRef(void); @@ -3142,19 +3696,29 @@ private: // constants enum { - US_STACKBUF_SIZE=7, // Size of stack buffer for small strings - kInvalidUChar=0xffff, // invalid UChar index - kGrowSize=128, // grow size for this buffer + /** + * Size of stack buffer for short strings. + * Must be at least U16_MAX_LENGTH for the single-code point constructor to work. + * @see UNISTR_OBJECT_SIZE + */ + US_STACKBUF_SIZE=(int32_t)(UNISTR_OBJECT_SIZE-sizeof(void *)-2)/U_SIZEOF_UCHAR, + kInvalidUChar=0xffff, // U+FFFF returned by charAt(invalid index) kInvalidHashCode=0, // invalid hash code kEmptyHashCode=1, // hash code for empty string - // bit flag values for fFlags + // bit flag values for fLengthAndFlags kIsBogus=1, // this string is bogus, i.e., not valid or NULL - kUsingStackBuffer=2,// fArray==fStackBuffer + kUsingStackBuffer=2,// using fUnion.fStackFields instead of fUnion.fFields kRefCounted=4, // there is a refCount field before the characters in fArray kBufferIsReadonly=8,// do not write to this buffer kOpenGetBuffer=16, // getBuffer(minCapacity) was called (is "open"), // and releaseBuffer(newLength) must be called + kAllStorageFlags=0x1f, + + kLengthShift=5, // remaining 11 bits for non-negative short length, or negative if long + kLength1=1<127; else undefined + int32_t fCapacity; // capacity of fArray (in char16_ts) + // array pointer last to minimize padding for machines with P128 data model + // or pointer sizes that are not a power of 2 + char16_t *fArray; // the Unicode data + } fFields; + } fUnion; }; /** @@ -3192,37 +3797,11 @@ private: * @param s1 The first string to be copied to the new one. * @param s2 The second string to be copied to the new one, after s1. * @return UnicodeString(s1).append(s2) - * @draft ICU 2.8 + * @stable ICU 2.8 */ U_COMMON_API UnicodeString U_EXPORT2 operator+ (const UnicodeString &s1, const UnicodeString &s2); -U_NAMESPACE_END - -// inline implementations -------------------------------------------------- *** - -//======================================== -// Array copying -//======================================== -/** - * Copy an array of UnicodeString OBJECTS (not pointers). - * @internal - */ -inline void -uprv_arrayCopy(const U_NAMESPACE_QUALIFIER UnicodeString *src, U_NAMESPACE_QUALIFIER UnicodeString *dst, int32_t count) -{ while(count-- > 0) *dst++ = *src++; } - -/** - * Copy an array of UnicodeString OBJECTS (not pointers). - * @internal - */ -inline void -uprv_arrayCopy(const U_NAMESPACE_QUALIFIER UnicodeString *src, int32_t srcStart, - U_NAMESPACE_QUALIFIER UnicodeString *dst, int32_t dstStart, int32_t count) -{ uprv_arrayCopy(src+srcStart, dst+dstStart, count); } - -U_NAMESPACE_BEGIN - //======================================== // Inline members //======================================== @@ -3237,8 +3816,8 @@ UnicodeString::pinIndex(int32_t& start) const // pin index if(start < 0) { start = 0; - } else if(start > fLength) { - start = fLength; + } else if(start > length()) { + start = length(); } } @@ -3247,36 +3826,77 @@ UnicodeString::pinIndices(int32_t& start, int32_t& _length) const { // pin indices + int32_t len = length(); if(start < 0) { start = 0; - } else if(start > fLength) { - start = fLength; + } else if(start > len) { + start = len; } if(_length < 0) { _length = 0; - } else if(_length > (fLength - start)) { - _length = (fLength - start); + } else if(_length > (len - start)) { + _length = (len - start); } } -inline UChar* -UnicodeString::getArrayStart() -{ return fArray; } +inline char16_t* +UnicodeString::getArrayStart() { + return (fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) ? + fUnion.fStackFields.fBuffer : fUnion.fFields.fArray; +} + +inline const char16_t* +UnicodeString::getArrayStart() const { + return (fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) ? + fUnion.fStackFields.fBuffer : fUnion.fFields.fArray; +} + +//======================================== +// Default constructor +//======================================== + +inline +UnicodeString::UnicodeString() { + fUnion.fStackFields.fLengthAndFlags=kShortString; +} + +inline UnicodeString::UnicodeString(const std::nullptr_t /*text*/) { + fUnion.fStackFields.fLengthAndFlags=kShortString; +} -inline const UChar* -UnicodeString::getArrayStart() const -{ return fArray; } +inline UnicodeString::UnicodeString(const std::nullptr_t /*text*/, int32_t /*length*/) { + fUnion.fStackFields.fLengthAndFlags=kShortString; +} + +inline UnicodeString::UnicodeString(std::nullptr_t /*buffer*/, int32_t /*buffLength*/, int32_t /*buffCapacity*/) { + fUnion.fStackFields.fLengthAndFlags=kShortString; +} //======================================== // Read-only implementation methods //======================================== +inline UBool +UnicodeString::hasShortLength() const { + return fUnion.fFields.fLengthAndFlags>=0; +} + inline int32_t -UnicodeString::length() const -{ return fLength; } +UnicodeString::getShortLength() const { + // fLengthAndFlags must be non-negative -> short length >= 0 + // and arithmetic or logical shift does not matter. + return fUnion.fFields.fLengthAndFlags>>kLengthShift; +} + +inline int32_t +UnicodeString::length() const { + return hasShortLength() ? getShortLength() : fUnion.fFields.fLength; +} inline int32_t -UnicodeString::getCapacity() const -{ return fCapacity; } +UnicodeString::getCapacity() const { + return (fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) ? + US_STACKBUF_SIZE : fUnion.fFields.fCapacity; +} inline int32_t UnicodeString::hashCode() const @@ -3284,14 +3904,28 @@ UnicodeString::hashCode() const inline UBool UnicodeString::isBogus() const -{ return (UBool)(fFlags & kIsBogus); } +{ return (UBool)(fUnion.fFields.fLengthAndFlags & kIsBogus); } + +inline UBool +UnicodeString::isWritable() const +{ return (UBool)!(fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kIsBogus)); } + +inline UBool +UnicodeString::isBufferWritable() const +{ + return (UBool)( + !(fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kIsBogus|kBufferIsReadonly)) && + (!(fUnion.fFields.fLengthAndFlags&kRefCounted) || refCount()==1)); +} -inline const UChar * +inline const char16_t * UnicodeString::getBuffer() const { - if(!(fFlags&(kIsBogus|kOpenGetBuffer))) { - return fArray; + if(fUnion.fFields.fLengthAndFlags&(kIsBogus|kOpenGetBuffer)) { + return nullptr; + } else if(fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) { + return fUnion.fStackFields.fBuffer; } else { - return 0; + return fUnion.fFields.fArray; } } @@ -3300,7 +3934,7 @@ UnicodeString::getBuffer() const { //======================================== inline int8_t UnicodeString::doCompare(int32_t start, - int32_t length, + int32_t thisLength, const UnicodeString& srcText, int32_t srcStart, int32_t srcLength) const @@ -3309,7 +3943,7 @@ UnicodeString::doCompare(int32_t start, return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise } else { srcText.pinIndices(srcStart, srcLength); - return doCompare(start, length, srcText.fArray, srcStart, srcLength); + return doCompare(start, thisLength, srcText.getArrayStart(), srcStart, srcLength); } } @@ -3319,10 +3953,8 @@ UnicodeString::operator== (const UnicodeString& text) const if(isBogus()) { return text.isBogus(); } else { - return - !text.isBogus() && - fLength == text.fLength && - doCompare(0, fLength, text, 0, text.fLength) == 0; + int32_t len = length(), textLength = text.length(); + return !text.isBogus() && len == textLength && doEquals(text, len); } } @@ -3332,34 +3964,34 @@ UnicodeString::operator!= (const UnicodeString& text) const inline UBool UnicodeString::operator> (const UnicodeString& text) const -{ return doCompare(0, fLength, text, 0, text.fLength) == 1; } +{ return doCompare(0, length(), text, 0, text.length()) == 1; } inline UBool UnicodeString::operator< (const UnicodeString& text) const -{ return doCompare(0, fLength, text, 0, text.fLength) == -1; } +{ return doCompare(0, length(), text, 0, text.length()) == -1; } inline UBool UnicodeString::operator>= (const UnicodeString& text) const -{ return doCompare(0, fLength, text, 0, text.fLength) != -1; } +{ return doCompare(0, length(), text, 0, text.length()) != -1; } inline UBool UnicodeString::operator<= (const UnicodeString& text) const -{ return doCompare(0, fLength, text, 0, text.fLength) != 1; } +{ return doCompare(0, length(), text, 0, text.length()) != 1; } inline int8_t UnicodeString::compare(const UnicodeString& text) const -{ return doCompare(0, fLength, text, 0, text.fLength); } +{ return doCompare(0, length(), text, 0, text.length()); } inline int8_t UnicodeString::compare(int32_t start, int32_t _length, const UnicodeString& srcText) const -{ return doCompare(start, _length, srcText, 0, srcText.fLength); } +{ return doCompare(start, _length, srcText, 0, srcText.length()); } inline int8_t -UnicodeString::compare(const UChar *srcChars, +UnicodeString::compare(ConstChar16Ptr srcChars, int32_t srcLength) const -{ return doCompare(0, fLength, srcChars, 0, srcLength); } +{ return doCompare(0, length(), srcChars, 0, srcLength); } inline int8_t UnicodeString::compare(int32_t start, @@ -3372,13 +4004,13 @@ UnicodeString::compare(int32_t start, inline int8_t UnicodeString::compare(int32_t start, int32_t _length, - const UChar *srcChars) const + const char16_t *srcChars) const { return doCompare(start, _length, srcChars, 0, _length); } inline int8_t UnicodeString::compare(int32_t start, int32_t _length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const { return doCompare(start, _length, srcChars, srcStart, srcLength); } @@ -3394,7 +4026,7 @@ UnicodeString::compareBetween(int32_t start, inline int8_t UnicodeString::doCompareCodePointOrder(int32_t start, - int32_t length, + int32_t thisLength, const UnicodeString& srcText, int32_t srcStart, int32_t srcLength) const @@ -3403,24 +4035,24 @@ UnicodeString::doCompareCodePointOrder(int32_t start, return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise } else { srcText.pinIndices(srcStart, srcLength); - return doCompareCodePointOrder(start, length, srcText.fArray, srcStart, srcLength); + return doCompareCodePointOrder(start, thisLength, srcText.getArrayStart(), srcStart, srcLength); } } inline int8_t UnicodeString::compareCodePointOrder(const UnicodeString& text) const -{ return doCompareCodePointOrder(0, fLength, text, 0, text.fLength); } +{ return doCompareCodePointOrder(0, length(), text, 0, text.length()); } inline int8_t UnicodeString::compareCodePointOrder(int32_t start, int32_t _length, const UnicodeString& srcText) const -{ return doCompareCodePointOrder(start, _length, srcText, 0, srcText.fLength); } +{ return doCompareCodePointOrder(start, _length, srcText, 0, srcText.length()); } inline int8_t -UnicodeString::compareCodePointOrder(const UChar *srcChars, +UnicodeString::compareCodePointOrder(ConstChar16Ptr srcChars, int32_t srcLength) const -{ return doCompareCodePointOrder(0, fLength, srcChars, 0, srcLength); } +{ return doCompareCodePointOrder(0, length(), srcChars, 0, srcLength); } inline int8_t UnicodeString::compareCodePointOrder(int32_t start, @@ -3433,13 +4065,13 @@ UnicodeString::compareCodePointOrder(int32_t start, inline int8_t UnicodeString::compareCodePointOrder(int32_t start, int32_t _length, - const UChar *srcChars) const + const char16_t *srcChars) const { return doCompareCodePointOrder(start, _length, srcChars, 0, _length); } inline int8_t UnicodeString::compareCodePointOrder(int32_t start, int32_t _length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const { return doCompareCodePointOrder(start, _length, srcChars, srcStart, srcLength); } @@ -3455,7 +4087,7 @@ UnicodeString::compareCodePointOrderBetween(int32_t start, inline int8_t UnicodeString::doCaseCompare(int32_t start, - int32_t length, + int32_t thisLength, const UnicodeString &srcText, int32_t srcStart, int32_t srcLength, @@ -3465,13 +4097,13 @@ UnicodeString::doCaseCompare(int32_t start, return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise } else { srcText.pinIndices(srcStart, srcLength); - return doCaseCompare(start, length, srcText.fArray, srcStart, srcLength, options); + return doCaseCompare(start, thisLength, srcText.getArrayStart(), srcStart, srcLength, options); } } inline int8_t UnicodeString::caseCompare(const UnicodeString &text, uint32_t options) const { - return doCaseCompare(0, fLength, text, 0, text.fLength, options); + return doCaseCompare(0, length(), text, 0, text.length(), options); } inline int8_t @@ -3479,14 +4111,14 @@ UnicodeString::caseCompare(int32_t start, int32_t _length, const UnicodeString &srcText, uint32_t options) const { - return doCaseCompare(start, _length, srcText, 0, srcText.fLength, options); + return doCaseCompare(start, _length, srcText, 0, srcText.length(), options); } inline int8_t -UnicodeString::caseCompare(const UChar *srcChars, +UnicodeString::caseCompare(ConstChar16Ptr srcChars, int32_t srcLength, uint32_t options) const { - return doCaseCompare(0, fLength, srcChars, 0, srcLength, options); + return doCaseCompare(0, length(), srcChars, 0, srcLength, options); } inline int8_t @@ -3502,7 +4134,7 @@ UnicodeString::caseCompare(int32_t start, inline int8_t UnicodeString::caseCompare(int32_t start, int32_t _length, - const UChar *srcChars, + const char16_t *srcChars, uint32_t options) const { return doCaseCompare(start, _length, srcChars, 0, _length, options); } @@ -3510,7 +4142,7 @@ UnicodeString::caseCompare(int32_t start, inline int8_t UnicodeString::caseCompare(int32_t start, int32_t _length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength, uint32_t options) const { @@ -3545,38 +4177,38 @@ UnicodeString::indexOf(const UnicodeString& srcText, inline int32_t UnicodeString::indexOf(const UnicodeString& text) const -{ return indexOf(text, 0, text.fLength, 0, fLength); } +{ return indexOf(text, 0, text.length(), 0, length()); } inline int32_t UnicodeString::indexOf(const UnicodeString& text, int32_t start) const { pinIndex(start); - return indexOf(text, 0, text.fLength, start, fLength - start); + return indexOf(text, 0, text.length(), start, length() - start); } inline int32_t UnicodeString::indexOf(const UnicodeString& text, int32_t start, int32_t _length) const -{ return indexOf(text, 0, text.fLength, start, _length); } +{ return indexOf(text, 0, text.length(), start, _length); } inline int32_t -UnicodeString::indexOf(const UChar *srcChars, +UnicodeString::indexOf(const char16_t *srcChars, int32_t srcLength, int32_t start) const { pinIndex(start); - return indexOf(srcChars, 0, srcLength, start, fLength - start); + return indexOf(srcChars, 0, srcLength, start, length() - start); } inline int32_t -UnicodeString::indexOf(const UChar *srcChars, +UnicodeString::indexOf(ConstChar16Ptr srcChars, int32_t srcLength, int32_t start, int32_t _length) const { return indexOf(srcChars, 0, srcLength, start, _length); } inline int32_t -UnicodeString::indexOf(UChar c, +UnicodeString::indexOf(char16_t c, int32_t start, int32_t _length) const { return doIndexOf(c, start, _length); } @@ -3588,40 +4220,40 @@ UnicodeString::indexOf(UChar32 c, { return doIndexOf(c, start, _length); } inline int32_t -UnicodeString::indexOf(UChar c) const -{ return doIndexOf(c, 0, fLength); } +UnicodeString::indexOf(char16_t c) const +{ return doIndexOf(c, 0, length()); } inline int32_t UnicodeString::indexOf(UChar32 c) const -{ return indexOf(c, 0, fLength); } +{ return indexOf(c, 0, length()); } inline int32_t -UnicodeString::indexOf(UChar c, +UnicodeString::indexOf(char16_t c, int32_t start) const { pinIndex(start); - return doIndexOf(c, start, fLength - start); + return doIndexOf(c, start, length() - start); } inline int32_t UnicodeString::indexOf(UChar32 c, int32_t start) const { pinIndex(start); - return indexOf(c, start, fLength - start); + return indexOf(c, start, length() - start); } inline int32_t -UnicodeString::lastIndexOf(const UChar *srcChars, +UnicodeString::lastIndexOf(ConstChar16Ptr srcChars, int32_t srcLength, int32_t start, int32_t _length) const { return lastIndexOf(srcChars, 0, srcLength, start, _length); } inline int32_t -UnicodeString::lastIndexOf(const UChar *srcChars, +UnicodeString::lastIndexOf(const char16_t *srcChars, int32_t srcLength, int32_t start) const { pinIndex(start); - return lastIndexOf(srcChars, 0, srcLength, start, fLength - start); + return lastIndexOf(srcChars, 0, srcLength, start, length() - start); } inline int32_t @@ -3644,21 +4276,21 @@ inline int32_t UnicodeString::lastIndexOf(const UnicodeString& text, int32_t start, int32_t _length) const -{ return lastIndexOf(text, 0, text.fLength, start, _length); } +{ return lastIndexOf(text, 0, text.length(), start, _length); } inline int32_t UnicodeString::lastIndexOf(const UnicodeString& text, int32_t start) const { pinIndex(start); - return lastIndexOf(text, 0, text.fLength, start, fLength - start); + return lastIndexOf(text, 0, text.length(), start, length() - start); } inline int32_t UnicodeString::lastIndexOf(const UnicodeString& text) const -{ return lastIndexOf(text, 0, text.fLength, 0, fLength); } +{ return lastIndexOf(text, 0, text.length(), 0, length()); } inline int32_t -UnicodeString::lastIndexOf(UChar c, +UnicodeString::lastIndexOf(char16_t c, int32_t start, int32_t _length) const { return doLastIndexOf(c, start, _length); } @@ -3671,31 +4303,31 @@ UnicodeString::lastIndexOf(UChar32 c, } inline int32_t -UnicodeString::lastIndexOf(UChar c) const -{ return doLastIndexOf(c, 0, fLength); } +UnicodeString::lastIndexOf(char16_t c) const +{ return doLastIndexOf(c, 0, length()); } inline int32_t UnicodeString::lastIndexOf(UChar32 c) const { - return lastIndexOf(c, 0, fLength); + return lastIndexOf(c, 0, length()); } inline int32_t -UnicodeString::lastIndexOf(UChar c, +UnicodeString::lastIndexOf(char16_t c, int32_t start) const { pinIndex(start); - return doLastIndexOf(c, start, fLength - start); + return doLastIndexOf(c, start, length() - start); } inline int32_t UnicodeString::lastIndexOf(UChar32 c, int32_t start) const { pinIndex(start); - return lastIndexOf(c, start, fLength - start); + return lastIndexOf(c, start, length() - start); } inline UBool UnicodeString::startsWith(const UnicodeString& text) const -{ return compare(0, text.fLength, text, 0, text.fLength) == 0; } +{ return compare(0, text.length(), text, 0, text.length()) == 0; } inline UBool UnicodeString::startsWith(const UnicodeString& srcText, @@ -3704,48 +4336,53 @@ UnicodeString::startsWith(const UnicodeString& srcText, { return doCompare(0, srcLength, srcText, srcStart, srcLength) == 0; } inline UBool -UnicodeString::startsWith(const UChar *srcChars, - int32_t srcLength) const -{ return doCompare(0, srcLength, srcChars, 0, srcLength) == 0; } +UnicodeString::startsWith(ConstChar16Ptr srcChars, int32_t srcLength) const { + if(srcLength < 0) { + srcLength = u_strlen(toUCharPtr(srcChars)); + } + return doCompare(0, srcLength, srcChars, 0, srcLength) == 0; +} inline UBool -UnicodeString::startsWith(const UChar *srcChars, - int32_t srcStart, - int32_t srcLength) const -{ return doCompare(0, srcLength, srcChars, srcStart, srcLength) == 0;} +UnicodeString::startsWith(const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const { + if(srcLength < 0) { + srcLength = u_strlen(toUCharPtr(srcChars)); + } + return doCompare(0, srcLength, srcChars, srcStart, srcLength) == 0; +} inline UBool UnicodeString::endsWith(const UnicodeString& text) const -{ return doCompare(fLength - text.fLength, text.fLength, - text, 0, text.fLength) == 0; } +{ return doCompare(length() - text.length(), text.length(), + text, 0, text.length()) == 0; } inline UBool UnicodeString::endsWith(const UnicodeString& srcText, int32_t srcStart, int32_t srcLength) const { srcText.pinIndices(srcStart, srcLength); - return doCompare(fLength - srcLength, srcLength, + return doCompare(length() - srcLength, srcLength, srcText, srcStart, srcLength) == 0; } inline UBool -UnicodeString::endsWith(const UChar *srcChars, +UnicodeString::endsWith(ConstChar16Ptr srcChars, int32_t srcLength) const { if(srcLength < 0) { - srcLength = u_strlen(srcChars); + srcLength = u_strlen(toUCharPtr(srcChars)); } - return doCompare(fLength - srcLength, srcLength, + return doCompare(length() - srcLength, srcLength, srcChars, 0, srcLength) == 0; } inline UBool -UnicodeString::endsWith(const UChar *srcChars, +UnicodeString::endsWith(const char16_t *srcChars, int32_t srcStart, int32_t srcLength) const { if(srcLength < 0) { - srcLength = u_strlen(srcChars + srcStart); + srcLength = u_strlen(toUCharPtr(srcChars + srcStart)); } - return doCompare(fLength - srcLength, srcLength, + return doCompare(length() - srcLength, srcLength, srcChars, srcStart, srcLength) == 0; } @@ -3756,7 +4393,7 @@ inline UnicodeString& UnicodeString::replace(int32_t start, int32_t _length, const UnicodeString& srcText) -{ return doReplace(start, _length, srcText, 0, srcText.fLength); } +{ return doReplace(start, _length, srcText, 0, srcText.length()); } inline UnicodeString& UnicodeString::replace(int32_t start, @@ -3769,14 +4406,14 @@ UnicodeString::replace(int32_t start, inline UnicodeString& UnicodeString::replace(int32_t start, int32_t _length, - const UChar *srcChars, + ConstChar16Ptr srcChars, int32_t srcLength) { return doReplace(start, _length, srcChars, 0, srcLength); } inline UnicodeString& UnicodeString::replace(int32_t start, int32_t _length, - const UChar *srcChars, + const char16_t *srcChars, int32_t srcStart, int32_t srcLength) { return doReplace(start, _length, srcChars, srcStart, srcLength); } @@ -3784,25 +4421,14 @@ UnicodeString::replace(int32_t start, inline UnicodeString& UnicodeString::replace(int32_t start, int32_t _length, - UChar srcChar) + char16_t srcChar) { return doReplace(start, _length, &srcChar, 0, 1); } -inline UnicodeString& -UnicodeString::replace(int32_t start, - int32_t _length, - UChar32 srcChar) { - UChar buffer[U16_MAX_LENGTH]; - int32_t count = 0; - UBool isError = FALSE; - U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); - return doReplace(start, _length, buffer, 0, count); -} - inline UnicodeString& UnicodeString::replaceBetween(int32_t start, int32_t limit, const UnicodeString& srcText) -{ return doReplace(start, limit - start, srcText, 0, srcText.fLength); } +{ return doReplace(start, limit - start, srcText, 0, srcText.length()); } inline UnicodeString& UnicodeString::replaceBetween(int32_t start, @@ -3815,16 +4441,16 @@ UnicodeString::replaceBetween(int32_t start, inline UnicodeString& UnicodeString::findAndReplace(const UnicodeString& oldText, const UnicodeString& newText) -{ return findAndReplace(0, fLength, oldText, 0, oldText.fLength, - newText, 0, newText.fLength); } +{ return findAndReplace(0, length(), oldText, 0, oldText.length(), + newText, 0, newText.length()); } inline UnicodeString& UnicodeString::findAndReplace(int32_t start, int32_t _length, const UnicodeString& oldText, const UnicodeString& newText) -{ return findAndReplace(start, _length, oldText, 0, oldText.fLength, - newText, 0, newText.fLength); } +{ return findAndReplace(start, _length, oldText, 0, oldText.length(), + newText, 0, newText.length()); } // ============================ // extract @@ -3833,12 +4459,12 @@ inline void UnicodeString::doExtract(int32_t start, int32_t _length, UnicodeString& target) const -{ target.replace(0, target.fLength, *this, start, _length); } +{ target.replace(0, target.length(), *this, start, _length); } inline void UnicodeString::extract(int32_t start, int32_t _length, - UChar *target, + Char16Ptr target, int32_t targetStart) const { doExtract(start, _length, target, targetStart); } @@ -3866,92 +4492,86 @@ UnicodeString::extract(int32_t start, inline void UnicodeString::extractBetween(int32_t start, int32_t limit, - UChar *dst, + char16_t *dst, int32_t dstStart) const { pinIndex(start); pinIndex(limit); doExtract(start, limit - start, dst, dstStart); } -inline UChar +inline UnicodeString +UnicodeString::tempSubStringBetween(int32_t start, int32_t limit) const { + return tempSubString(start, limit - start); +} + +inline char16_t UnicodeString::doCharAt(int32_t offset) const { - if((uint32_t)offset < (uint32_t)fLength) { - return fArray[offset]; + if((uint32_t)offset < (uint32_t)length()) { + return getArrayStart()[offset]; } else { return kInvalidUChar; } } -inline UChar +inline char16_t UnicodeString::charAt(int32_t offset) const { return doCharAt(offset); } -inline UChar +inline char16_t UnicodeString::operator[] (int32_t offset) const { return doCharAt(offset); } -inline UChar32 -UnicodeString::char32At(int32_t offset) const -{ - if((uint32_t)offset < (uint32_t)fLength) { - UChar32 c; - U16_GET(fArray, 0, offset, fLength, c); - return c; - } else { - return kInvalidUChar; - } +inline UBool +UnicodeString::isEmpty() const { + // Arithmetic or logical right shift does not matter: only testing for 0. + return (fUnion.fFields.fLengthAndFlags>>kLengthShift) == 0; } -inline int32_t -UnicodeString::getChar32Start(int32_t offset) const { - if((uint32_t)offset < (uint32_t)fLength) { - U16_SET_CP_START(fArray, 0, offset); - return offset; - } else { - return 0; - } +//======================================== +// Write implementation methods +//======================================== +inline void +UnicodeString::setZeroLength() { + fUnion.fFields.fLengthAndFlags &= kAllStorageFlags; } -inline int32_t -UnicodeString::getChar32Limit(int32_t offset) const { - if((uint32_t)offset < (uint32_t)fLength) { - U16_SET_CP_LIMIT(fArray, 0, offset, fLength); - return offset; +inline void +UnicodeString::setShortLength(int32_t len) { + // requires 0 <= len <= kMaxShortLength + fUnion.fFields.fLengthAndFlags = + (int16_t)((fUnion.fFields.fLengthAndFlags & kAllStorageFlags) | (len << kLengthShift)); +} + +inline void +UnicodeString::setLength(int32_t len) { + if(len <= kMaxShortLength) { + setShortLength(len); } else { - return fLength; + fUnion.fFields.fLengthAndFlags |= kLengthIsLarge; + fUnion.fFields.fLength = len; } } -inline UBool -UnicodeString::isEmpty() const { - return fLength == 0; +inline void +UnicodeString::setToEmpty() { + fUnion.fFields.fLengthAndFlags = kShortString; } -//======================================== -// Write implementation methods -//======================================== -inline const UChar * -UnicodeString::getTerminatedBuffer() { - if(fFlags&(kIsBogus|kOpenGetBuffer)) { - return 0; - } else if(fLength