/*
**********************************************************************
-* Copyright (C) 1998-2004, International Business Machines
+* Copyright (C) 1998-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
#ifndef UNISTR_H
#define UNISTR_H
+/**
+ * \file
+ * \brief C++ API: Unicode String
+ */
+
+#include "unicode/utypes.h"
#include "unicode/rep.h"
+#include "unicode/std_string.h"
+#include "unicode/stringpiece.h"
+#include "unicode/bytestream.h"
+#include "unicode/ucasemap.h"
struct UConverter; // unicode/ucnv.h
class StringThreadTest;
#endif
#ifndef USTRING_H
-/* see ustring.h */
+/**
+ * \ingroup ustring_ustrlen
+ */
U_STABLE int32_t U_EXPORT2
u_strlen(const UChar *s);
#endif
+#ifndef U_HIDE_INTERNAL_API
+/**
+ * \def U_STRING_CASE_MAPPER_DEFINED
+ * @internal
+ */
+
+#ifndef U_STRING_CASE_MAPPER_DEFINED
+#define U_STRING_CASE_MAPPER_DEFINED
+
+/**
+ * Internal string case mapping function type.
+ * A function of this type receives a UCaseMap, a destination buffer with its
+ * capacity, a source string with its length, and a pointer to a UErrorCode,
+ * and returns an int32_t.
+ * UnicodeString::caseMap() takes a pointer to a function of this type.
+ * @internal
+ */
+typedef int32_t U_CALLCONV
+UStringCaseMapper(const UCaseMap *csm,
+ UChar *dest, int32_t destCapacity,
+ const UChar *src, int32_t srcLength,
+ UErrorCode *pErrorCode);
+
+#endif
+#endif /* U_HIDE_INTERNAL_API */
+
U_NAMESPACE_BEGIN
+class BreakIterator; // unicode/brkiter.h
class Locale; // unicode/locid.h
class StringCharacterIterator;
-class BreakIterator; // unicode/brkiter.h
+class UnicodeStringAppendable; // unicode/appendable.h
/* The <iostream> include has been moved to unicode/ustream.h */
* therefore recommended over ones taking a charset name string
* (where the empty string "" indicates invariant-character conversion).
*
- * @draft ICU 3.2
+ * @stable ICU 3.2
*/
-#define US_INV UnicodeString::kInvariant
+#define US_INV icu::UnicodeString::kInvariant
/**
* Unicode String literals in C++.
* such string variable before it is used.
* @stable ICU 2.0
*/
-#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
-# define UNICODE_STRING(cs, _length) UnicodeString(TRUE, (const UChar *)L ## cs, _length)
+#if defined(U_DECLARE_UTF16)
+# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)U_DECLARE_UTF16(cs), _length)
+#elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16)))
+# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)L ## cs, _length)
#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
-# define UNICODE_STRING(cs, _length) UnicodeString(TRUE, (const UChar *)cs, _length)
+# define UNICODE_STRING(cs, _length) icu::UnicodeString(TRUE, (const UChar *)cs, _length)
#else
-# define UNICODE_STRING(cs, _length) UnicodeString(cs, _length, US_INV)
+# define UNICODE_STRING(cs, _length) icu::UnicodeString(cs, _length, US_INV)
#endif
/**
* The string parameter must be a C string literal.
* @stable ICU 2.0
*/
-#if U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && U_CHARSET_FAMILY==U_ASCII_FAMILY
-# define UNICODE_STRING_SIMPLE(cs) UnicodeString(TRUE, (const UChar *)L ## cs, -1)
-#elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
-# define UNICODE_STRING_SIMPLE(cs) UnicodeString(TRUE, (const UChar *)cs, -1)
-#else
-# define UNICODE_STRING_SIMPLE(cs) UnicodeString(cs, -1, US_INV)
+#define UNICODE_STRING_SIMPLE(cs) UNICODE_STRING(cs, -1)
+
+/**
+ * \def UNISTR_FROM_CHAR_EXPLICIT
+ * This can be defined to be empty or "explicit".
+ * If explicit, then the UnicodeString(UChar) and UnicodeString(UChar32)
+ * constructors are marked as explicit, preventing their inadvertent use.
+ * @stable ICU 49
+ */
+#ifndef UNISTR_FROM_CHAR_EXPLICIT
+# if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
+ // Auto-"explicit" in ICU library code.
+# define UNISTR_FROM_CHAR_EXPLICIT explicit
+# else
+ // Empty by default for source code compatibility.
+# define UNISTR_FROM_CHAR_EXPLICIT
+# endif
+#endif
+
+/**
+ * \def UNISTR_FROM_STRING_EXPLICIT
+ * This can be defined to be empty or "explicit".
+ * If explicit, then the UnicodeString(const char *) and UnicodeString(const UChar *)
+ * constructors are marked as explicit, preventing their inadvertent use.
+ *
+ * In particular, this helps prevent accidentally depending on ICU conversion code
+ * by passing a string literal into an API with a const UnicodeString & parameter.
+ * @stable ICU 49
+ */
+#ifndef UNISTR_FROM_STRING_EXPLICIT
+# if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
+ // Auto-"explicit" in ICU library code.
+# define UNISTR_FROM_STRING_EXPLICIT explicit
+# else
+ // Empty by default for source code compatibility.
+# define UNISTR_FROM_STRING_EXPLICIT
+# endif
#endif
/**
* The UnicodeString class is not suitable for subclassing.
*
* <p>For an overview of Unicode strings in C and C++ see the
- * <a href="http://oss.software.ibm.com/icu/userguide/strings.html">User Guide Strings chapter</a>.</p>
+ * <a href="http://icu-project.org/userguide/strings.html">User Guide Strings chapter</a>.</p>
*
* <p>In ICU, a Unicode string consists of 16-bit Unicode <em>code units</em>.
- * A Unicode character may be stored with either
- * one code unit — which is the most common case — or with a matched pair of
- * special code units ("surrogates").
- * The data type for code units is UChar.<br>
+ * A Unicode character may be stored with either one code unit
+ * (the most common case) or with a matched pair of special code units
+ * ("surrogates"). The data type for code units is UChar.
* For single-character handling, a Unicode character code <em>point</em> is a value
* in the range 0..0x10ffff. ICU uses the UChar32 type for code points.</p>
*
* significant performance improvements.
* Also, the internal buffer is accessible via special functions.
* For details see the
- * <a href="http://oss.software.ibm.com/icu/userguide/strings.html">User Guide Strings chapter</a>.</p>
+ * <a href="http://icu-project.org/userguide/strings.html">User Guide Strings chapter</a>.</p>
*
* @see utf.h
* @see CharacterIterator
* Use the macro US_INV instead of the full qualification for this value.
*
* @see US_INV
- * @draft ICU 3.2
+ * @stable ICU 3.2
*/
enum EInvariant {
/**
* @see EInvariant
- * @draft ICU 3.2
+ * @stable ICU 3.2
*/
kInvariant
};
/**
* Compare two Unicode strings in code point order.
- * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
+ * The result may be different from the results of compare(), operator<, etc.
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
/**
* Compare two Unicode strings in code point order.
- * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
+ * The result may be different from the results of compare(), operator<, etc.
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
/**
* Compare two Unicode strings in code point order.
- * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
+ * The result may be different from the results of compare(), operator<, etc.
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
/**
* Compare two Unicode strings in code point order.
- * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
+ * The result may be different from the results of compare(), operator<, etc.
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
/**
* Compare two Unicode strings in code point order.
- * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
+ * The result may be different from the results of compare(), operator<, etc.
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
/**
* Compare two Unicode strings in code point order.
- * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
+ * The result may be different from the results of compare(), operator<, etc.
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
/**
* Compare two Unicode strings in code point order.
- * This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
+ * The result may be different from the results of compare(), operator<, etc.
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* or 0xffff if the offset is not valid for this string
* @stable ICU 2.0
*/
- inline UChar32 char32At(int32_t offset) const;
+ UChar32 char32At(int32_t offset) const;
/**
* Adjust a random-access offset so that
* @see U16_SET_CP_START
* @stable ICU 2.0
*/
- inline int32_t getChar32Start(int32_t offset) const;
+ int32_t getChar32Start(int32_t offset) const;
/**
* Adjust a random-access offset so that
* @see U16_SET_CP_LIMIT
* @stable ICU 2.0
*/
- inline int32_t getChar32Limit(int32_t offset) const;
+ int32_t getChar32Limit(int32_t offset) const;
/**
* Move the code unit index along the string by delta code points.
* @param targetCapacity the length of the target buffer
* @param inv Signature-distinguishing parameter, use US_INV.
* @return the output string length, not including the terminating NUL
- * @draft ICU 3.2
+ * @stable ICU 3.2
*/
int32_t extract(int32_t start,
int32_t startLength,
int32_t targetCapacity,
enum EInvariant inv) const;
+#if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION
+
+ /**
+ * Copy the characters in the range
+ * [<tt>start</tt>, <tt>start + startLength</tt>) into an array of characters
+ * in the platform's default codepage.
+ * This function does not write any more than <code>targetLength</code>
+ * characters but returns the length of the entire output string
+ * so that one can allocate a larger buffer and call the function again
+ * if necessary.
+ * The output string is NUL-terminated if possible.
+ *
+ * @param start offset of first character which will be copied
+ * @param startLength the number of characters to extract
+ * @param target the target buffer for extraction
+ * @param targetLength the length of the target buffer
+ * If <TT>target</TT> is NULL, then the number of bytes required for
+ * <TT>target</TT> is returned.
+ * @return the output string length, not including the terminating NUL
+ * @stable ICU 2.0
+ */
+ int32_t extract(int32_t start,
+ int32_t startLength,
+ char *target,
+ uint32_t targetLength) const;
+
+#endif
+
#if !UCONFIG_NO_CONVERSION
/**
int32_t startLength,
char *target,
uint32_t targetLength,
- const char *codepage = 0) const;
+ const char *codepage) const;
/**
* Convert the UnicodeString into a codepage string using an existing UConverter.
#endif
+ /**
+ * Create a temporary substring for the specified range.
+ * Unlike the substring constructor and setTo() functions,
+ * the object returned here will be a read-only alias (using getBuffer())
+ * rather than copying the text.
+ * As a result, this substring operation is much faster but requires
+ * that the original string not be modified or deleted during the lifetime
+ * of the returned substring object.
+ * @param start offset of the first character visible in the substring
+ * @param length length of the substring
+ * @return a read-only alias UnicodeString object for the substring
+ * @stable ICU 4.4
+ */
+ UnicodeString tempSubString(int32_t start=0, int32_t length=INT32_MAX) const;
+
+ /**
+ * Create a temporary substring for the specified range.
+ * Same as tempSubString(start, length) except that the substring range
+ * is specified as a (start, limit) pair (with an exclusive limit index)
+ * rather than a (start, length) pair.
+ * @param start offset of the first character visible in the substring
+ * @param limit offset immediately following the last character visible in the substring
+ * @return a read-only alias UnicodeString object for the substring
+ * @stable ICU 4.4
+ */
+ inline UnicodeString tempSubStringBetween(int32_t start, int32_t limit=INT32_MAX) const;
+
+ /**
+ * Convert the UnicodeString to UTF-8 and write the result
+ * to a ByteSink. This is called by toUTF8String().
+ * Unpaired surrogates are replaced with U+FFFD.
+ * Calls u_strToUTF8WithSub().
+ *
+ * @param sink A ByteSink to which the UTF-8 version of the string is written.
+ * sink.Flush() is called at the end.
+ * @stable ICU 4.2
+ * @see toUTF8String
+ */
+ void toUTF8(ByteSink &sink) const;
+
+#if U_HAVE_STD_STRING
+
+ /**
+ * Append the UTF-8 form of this UnicodeString to a standard string.
+ * Unpaired surrogates are replaced with U+FFFD.
+ * The conversion is delegated to toUTF8() through a StringByteSink
+ * that wraps <code>result</code>.
+ *
+ * @param result A standard string (or a compatible object)
+ * to which the UTF-8 version of the string is appended.
+ * @return The string object, that is, <code>result</code>.
+ * @stable ICU 4.2
+ * @see toUTF8
+ */
+ template<typename StringClass>
+ StringClass &toUTF8String(StringClass &result) const {
+ StringByteSink<StringClass> utf8Sink(&result);
+ toUTF8(utf8Sink);
+ return result;
+ }
+
+#endif
+
+ /**
+ * Convert the UnicodeString to UTF-32.
+ * Unpaired surrogates are replaced with U+FFFD.
+ * Calls u_strToUTF32WithSub().
+ *
+ * @param utf32 destination string buffer, can be NULL if capacity==0
+ * @param capacity the number of UChar32s available at utf32
+ * @param errorCode Standard ICU error code. Its input value must
+ * pass the U_SUCCESS() test, or else the function returns
+ * immediately. Check for U_FAILURE() on output or use with
+ * function chaining. (See User Guide for details.)
+ * @return The length of the UTF-32 string.
+ * @see fromUTF32
+ * @stable ICU 4.2
+ */
+ int32_t toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const;
+
/* Length operations */
/**
/**
* Determine if this object contains a valid string.
- * A bogus string has no value. It is different from an empty string.
- * It can be used to indicate that no string value is available.
- * getBuffer() and getTerminatedBuffer() return NULL, and
+ * A bogus string has no value. It is different from an empty string,
+ * although in both cases isEmpty() returns TRUE and length() returns 0.
+ * setToBogus() and isBogus() can be used to indicate that no string value is available.
+ * For a bogus string, getBuffer() and getTerminatedBuffer() return NULL, and
* length() returns 0.
*
* @return TRUE if the string is valid, FALSE otherwise
* Replace the characters in this UnicodeString
* with the characters from <code>srcText</code>.
*
- * This function works the same for all strings except for ones that
- * are readonly aliases.
+ * This function works the same as the assignment operator
+ * for all strings except for ones that are readonly aliases.
+ *
* Starting with ICU 2.4, the assignment operator and the copy constructor
* allocate a new buffer and copy the buffer contents even for readonly aliases.
* This function implements the old, more efficient but less safe behavior
* of making this string also a readonly alias to the same buffer.
+ *
* The fastCopyFrom function must be used only if it is known that the lifetime of
- * this UnicodeString is at least as long as the lifetime of the aliased buffer
+ * this UnicodeString does not exceed the lifetime of the aliased buffer
* including its contents, for example for strings from resource bundles
- * or aliases to string contents.
+ * or aliases to string constants.
*
* @param src The text containing the characters to replace.
* @return a reference to this
* When the string is modified, then the buffer is first copied into
* newly allocated memory.
* The aliased buffer is never modified.
- * In an assignment to another UnicodeString, the text will be aliased again,
+ *
+ * In an assignment to another UnicodeString, when using the copy constructor
+ * or the assignment operator, the text will be copied.
+ * When using fastCopyFrom(), the text will be aliased again,
* so that both strings then alias the same readonly-text.
*
* @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
/**
* Append operator. Append the characters in <TT>srcText</TT> to the
- * UnicodeString object at offset <TT>start</TT>. <TT>srcText</TT> is
- * not modified.
+ * UnicodeString object. <TT>srcText</TT> is not modified.
* @param srcText the source for the new characters
* @return a reference to this
* @stable ICU 2.0
int32_t srcLength);
/**
- * Append the characters in <TT>srcText</TT> to the UnicodeString object at
- * offset <TT>start</TT>. <TT>srcText</TT> is not modified.
+ * Append the characters in <TT>srcText</TT> to the UnicodeString object.
+ * <TT>srcText</TT> is not modified.
* @param srcText the source for the new characters
* @return a reference to this
* @stable ICU 2.0
* @param srcStart the offset into <TT>srcChars</TT> where new characters
* will be obtained
* @param srcLength the number of characters in <TT>srcChars</TT> in
- * the append string
+ * the append string; can be -1 if <TT>srcChars</TT> is NUL-terminated
* @return a reference to this
* @stable ICU 2.0
*/
* Append the characters in <TT>srcChars</TT> to the UnicodeString object.
* <TT>srcChars</TT> is not modified.
* @param srcChars the source for the new characters
- * @param srcLength the number of Unicode characters in <TT>srcChars</TT>
+ * @param srcLength the number of Unicode characters in <TT>srcChars</TT>;
+ * can be -1 if <TT>srcChars</TT> is NUL-terminated
* @return a reference to this
* @stable ICU 2.0
*/
* @return a reference to this
* @stable ICU 2.0
*/
- inline UnicodeString& append(UChar32 srcChar);
+ UnicodeString& append(UChar32 srcChar);
/* Insert operations */
* @return a reference to this
* @stable ICU 2.0
*/
- inline UnicodeString& replace(int32_t start,
- int32_t length,
- UChar32 srcChar);
+ UnicodeString& replace(int32_t start, int32_t length, UChar32 srcChar);
/**
* Replace the characters in the range [<TT>start</TT>, <TT>limit</TT>)
inline UnicodeString& removeBetween(int32_t start,
int32_t limit = (int32_t)INT32_MAX);
+ /**
+ * Retain only the characters in the range
+ * [<code>start</code>, <code>limit</code>) from the UnicodeString object.
+ * Removes characters before <code>start</code> and at and after <code>limit</code>.
+ * @param start the offset of the first character to retain
+ * @param limit the offset immediately following the range to retain
+ * @return a reference to this
+ * @stable ICU 4.4
+ */
+ inline UnicodeString &retainBetween(int32_t start, int32_t limit = INT32_MAX);
/* Length operations */
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
- * This function uses only the first() and next() methods of the
+ * This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* @param titleIter A break iterator to find the first characters of words
* The standard titlecase iterator for the root locale implements the
* algorithm of Unicode TR 21.
*
- * This function uses only the first() and next() methods of the
+ * This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* @param titleIter A break iterator to find the first characters of words
*/
UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale);
+ /**
+ * Titlecase this string, with options.
+ *
+ * Casing is locale-dependent and context-sensitive.
+ * Titlecasing uses a break iterator to find the first characters of words
+ * that are to be titlecased. It titlecases those characters and lowercases
+ * all others. (This can be modified with options.)
+ *
+ * The titlecase break iterator can be provided to customize for arbitrary
+ * styles, using rules and dictionaries beyond the standard iterators.
+ * It may be more efficient to always provide an iterator to avoid
+ * opening and closing one for each string.
+ * The standard titlecase iterator for the root locale implements the
+ * algorithm of Unicode TR 21.
+ *
+ * This function uses only the setText(), first() and next() methods of the
+ * provided break iterator.
+ *
+ * @param titleIter A break iterator to find the first characters of words
+ * that are to be titlecased.
+ * If none is provided (0), then a standard titlecase
+ * break iterator is opened.
+ * Otherwise the provided iterator is set to the string's text.
+ * @param locale The locale to consider.
+ * @param options Options bit set, see ucasemap_open().
+ * @return A reference to this.
+ * @see U_TITLECASE_NO_LOWERCASE
+ * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
+ * @see ucasemap_open
+ * @stable ICU 3.8
+ */
+ UnicodeString &toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options);
+
#endif
/**
- * Case-fold the characters in this string.
+ * Case-folds the characters in this string.
+ *
* Case-folding is locale-independent and not context-sensitive,
* but there is an option for whether to include or exclude mappings for dotted I
- * and dotless i that are marked with 'I' in CaseFolding.txt.
+ * and dotless i that are marked with 'T' in CaseFolding.txt.
+ *
* The result may be longer or shorter than the original.
*
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
/** Construct an empty UnicodeString.
* @stable ICU 2.0
*/
- UnicodeString();
+ inline UnicodeString();
/**
* Construct a UnicodeString with capacity to hold <TT>capacity</TT> UChars
/**
* Single UChar (code unit) constructor.
+ *
+ * It is recommended to mark this constructor "explicit" by
+ * <code>-DUNISTR_FROM_CHAR_EXPLICIT=explicit</code>
+ * on the compiler command line or similar.
* @param ch the character to place in the UnicodeString
* @stable ICU 2.0
*/
- UnicodeString(UChar ch);
+ UNISTR_FROM_CHAR_EXPLICIT UnicodeString(UChar ch);
/**
* Single UChar32 (code point) constructor.
+ *
+ * It is recommended to mark this constructor "explicit" by
+ * <code>-DUNISTR_FROM_CHAR_EXPLICIT=explicit</code>
+ * on the compiler command line or similar.
* @param ch the character to place in the UnicodeString
* @stable ICU 2.0
*/
- UnicodeString(UChar32 ch);
+ UNISTR_FROM_CHAR_EXPLICIT UnicodeString(UChar32 ch);
/**
* UChar* constructor.
+ *
+ * It is recommended to mark this constructor "explicit" by
+ * <code>-DUNISTR_FROM_STRING_EXPLICIT=explicit</code>
+ * on the compiler command line or similar.
* @param text The characters to place in the UnicodeString. <TT>text</TT>
* must be NUL (U+0000) terminated.
* @stable ICU 2.0
*/
- UnicodeString(const UChar *text);
+ UNISTR_FROM_STRING_EXPLICIT UnicodeString(const UChar *text);
/**
* UChar* constructor.
* When the string is modified, then the buffer is first copied into
* newly allocated memory.
* The aliased buffer is never modified.
- * In an assignment to another UnicodeString, the text will be aliased again,
+ *
+ * In an assignment to another UnicodeString, when using the copy constructor
+ * or the assignment operator, the text will be copied.
+ * When using fastCopyFrom(), the text will be aliased again,
* so that both strings then alias the same readonly-text.
*
* @param isTerminated specifies if <code>text</code> is <code>NUL</code>-terminated.
*/
UnicodeString(UChar *buffer, int32_t buffLength, int32_t buffCapacity);
+#if U_CHARSET_IS_UTF8 || !UCONFIG_NO_CONVERSION
+
+ /**
+ * char* constructor.
+ * Uses the default converter (and thus depends on the ICU conversion code)
+ * unless U_CHARSET_IS_UTF8 is set to 1.
+ *
+ * For ASCII (really "invariant character") strings it is more efficient to use
+ * the constructor that takes a US_INV (for its enum EInvariant).
+ * For ASCII (invariant-character) string literals, see UNICODE_STRING and
+ * UNICODE_STRING_SIMPLE.
+ *
+ * It is recommended to mark this constructor "explicit" by
+ * <code>-DUNISTR_FROM_STRING_EXPLICIT=explicit</code>
+ * on the compiler command line or similar.
+ * @param codepageData an array of bytes, null-terminated,
+ * in the platform's default codepage.
+ * @stable ICU 2.0
+ * @see UNICODE_STRING
+ * @see UNICODE_STRING_SIMPLE
+ */
+ UNISTR_FROM_STRING_EXPLICIT UnicodeString(const char *codepageData);
+
+ /**
+ * char* constructor.
+ * Uses the default converter (and thus depends on the ICU conversion code)
+ * unless U_CHARSET_IS_UTF8 is set to 1.
+ * @param codepageData an array of bytes in the platform's default codepage.
+ * @param dataLength The number of bytes in <TT>codepageData</TT>.
+ * @stable ICU 2.0
+ */
+ UnicodeString(const char *codepageData, int32_t dataLength);
+
+#endif
+
#if !UCONFIG_NO_CONVERSION
/**
*
* @stable ICU 2.0
*/
- UnicodeString(const char *codepageData,
- const char *codepage = 0);
+ UnicodeString(const char *codepageData, const char *codepage);
/**
* char* constructor.
*
* @stable ICU 2.0
*/
- UnicodeString(const char *codepageData,
- int32_t dataLength,
- const char *codepage = 0);
+ UnicodeString(const char *codepageData, int32_t dataLength, const char *codepage);
/**
* char * / UConverter constructor.
* @param inv Signature-distinguishing parameter, use US_INV.
*
* @see US_INV
- * @draft ICU 3.2
+ * @stable ICU 3.2
*/
UnicodeString(const char *src, int32_t length, enum EInvariant inv);
*/
virtual ~UnicodeString();
+ /**
+ * Create a UnicodeString from a UTF-8 string.
+ * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
+ * Calls u_strFromUTF8WithSub().
+ *
+ * @param utf8 UTF-8 input string.
+ * Note that a StringPiece can be implicitly constructed
+ * from a std::string or a NUL-terminated const char * string.
+ * @return A UnicodeString with equivalent UTF-16 contents.
+ * @see toUTF8
+ * @see toUTF8String
+ * @stable ICU 4.2
+ */
+ static UnicodeString fromUTF8(const StringPiece &utf8);
+
+ /**
+ * Create a UnicodeString from a UTF-32 string.
+ * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string.
+ * Calls u_strFromUTF32WithSub().
+ *
+ * @param utf32 UTF-32 input string. Must not be NULL.
+ * @param length Length of the input string, or -1 if NUL-terminated.
+ * @return A UnicodeString with equivalent UTF-16 contents.
+ * @see toUTF32
+ * @stable ICU 4.2
+ */
+ static UnicodeString fromUTF32(const UChar32 *utf32, int32_t length);
/* Miscellaneous operations */
*
* \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
* \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
- * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
+ * \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
*
* Anything else following a backslash is generically escaped. For
* example, "[a\\-z]" returns "[a-z]".
* character. See unescape() for a listing of the recognized escape
* sequences. The character at offset-1 is assumed (without
* checking) to be a backslash. If the escape sequence is
- * ill-formed, or the offset is out of range, (UChar32)0xFFFFFFFF is
+ * ill-formed, or the offset is out of range, U_SENTINEL=-1 is
* returned.
*
* @param offset an input output parameter. On input, it is the
* after the initial backslash. On output, it is advanced after the
* last character parsed. On error, it is not advanced at all.
* @return the character represented by the escape sequence at
- * offset, or (UChar32)0xFFFFFFFF on error.
+ * offset, or U_SENTINEL=-1 on error.
* @see UnicodeString#unescape()
* @see u_unescape()
* @see u_unescapeAt()
virtual UChar32 getChar32At(int32_t offset) const;
private:
+ // For char* constructors. Could be made public.
+ UnicodeString &setToUTF8(const StringPiece &utf8);
+ // For extract(char*).
+ // We could make a toUTF8(target, capacity, errorCode) public but not
+ // this version: New API will be cleaner if we make callers create substrings
+ // rather than having start+length on every method,
+ // and it should take a UErrorCode&.
+ int32_t
+ toUTF8(int32_t start, int32_t len,
+ char *target, int32_t capacity) const;
+
+ /**
+ * Internal string contents comparison, called by operator==.
+ * Requires: this & text not bogus and have same lengths.
+ */
+ UBool doEquals(const UnicodeString &text, int32_t len) const;
inline int8_t
doCompare(int32_t start,
int32_t doHashCode(void) const;
// get pointer to start of array
+ // these do not check for kOpenGetBuffer, unlike the public getBuffer() function
inline UChar* getArrayStart(void);
inline const UChar* getArrayStart(void) const;
+ // A UnicodeString object (not necessarily its current buffer)
+ // is writable unless it isBogus() or it has an "open" getBuffer(minCapacity).
+ inline UBool isWritable() const;
+
+ // Is the current buffer writable?
+ inline UBool isBufferWritable() const;
+
+ // None of the following does releaseArray().
+ inline void setLength(int32_t len); // sets only fShortLength and fLength
+ inline void setToEmpty(); // sets fFlags=kShortString
+ inline void setArray(UChar *array, int32_t len, int32_t capacity); // does not set fFlags
+
// allocate the array; result may be fStackBuffer
// sets refCount to 1 if appropriate
// sets fArray, fCapacity, and fFlags
int32_t **pBufferToDelete = 0,
UBool forceClone = FALSE);
- // common function for case mappings
+ /**
+ * Common function for UnicodeString case mappings.
+ * The stringCaseMapper has the same type UStringCaseMapper
+ * as in ustr_imp.h for ustrcase_map().
+ */
UnicodeString &
- caseMap(BreakIterator *titleIter,
- const char *locale,
- uint32_t options,
- int32_t toWhichCase);
+ caseMap(const UCaseMap *csm, UStringCaseMapper *stringCaseMapper);
// ref counting
void addRef(void);
// constants
enum {
- US_STACKBUF_SIZE=7, // Size of stack buffer for small strings
+ // Set the stack buffer size so that sizeof(UnicodeString) is,
+ // naturally (without padding), a multiple of sizeof(pointer).
+ US_STACKBUF_SIZE= sizeof(void *)==4 ? 13 : 15, // Size of stack buffer for short strings
kInvalidUChar=0xffff, // invalid UChar index
kGrowSize=128, // grow size for this buffer
kInvalidHashCode=0, // invalid hash code
// bit flag values for fFlags
kIsBogus=1, // this string is bogus, i.e., not valid or NULL
- kUsingStackBuffer=2,// fArray==fStackBuffer
+ kUsingStackBuffer=2,// using fUnion.fStackBuffer instead of fUnion.fFields
kRefCounted=4, // there is a refCount field before the characters in fArray
kBufferIsReadonly=8,// do not write to this buffer
kOpenGetBuffer=16, // getBuffer(minCapacity) was called (is "open"),
kWritableAlias=0
};
- friend class StringCharacterIterator;
friend class StringThreadTest;
+ friend class UnicodeStringAppendable;
+
+ union StackBufferOrFields; // forward declaration necessary before friend declaration
+ friend union StackBufferOrFields; // make US_STACKBUF_SIZE visible inside fUnion
/*
* The following are all the class fields that are stored
* Note that UnicodeString has virtual functions,
* therefore there is an implicit vtable pointer
* as the first real field.
- * The fields should be aligned such that no padding is
- * necessary, mostly by having larger types first.
+ * The fields should be aligned such that no padding is necessary.
* On 32-bit machines, the size should be 32 bytes,
* on 64-bit machines (8-byte pointers), it should be 40 bytes.
+ *
+ * We use a hack to achieve this.
+ *
+ * With at least some compilers, each of the following is forced to
+ * a multiple of sizeof(pointer) [the largest field base unit here is a data pointer],
+ * rounded up with additional padding if the fields do not already fit that requirement:
+ * - sizeof(class UnicodeString)
+ * - offsetof(UnicodeString, fUnion)
+ * - sizeof(fUnion)
+ * - sizeof(fFields)
+ *
+ * In order to avoid padding, we make sizeof(fStackBuffer)=16 (=8 UChars)
+ * which is at least as large as sizeof(fFields) on 32-bit and 64-bit machines.
+ * (Padding at the end of fFields is ok:
+ * As long as there is no padding after fStackBuffer, it is not wasted space.)
+ *
+ * We further assume that the compiler does not reorder the fields,
+ * so that fRestOfStackBuffer (which holds a few more UChars) immediately follows after fUnion,
+ * with at most some padding (but no other field) in between.
+ * (Padding there would be wasted space, but functionally harmless.)
+ *
+ * We use a few more sizeof(pointer)-sized chunks of space with
+ * fRestOfStackBuffer, fShortLength and fFlags,
+ * to get up exactly to the intended sizeof(UnicodeString).
*/
// (implicit) *vtable;
- int32_t fLength; // number of characters in fArray
- int32_t fCapacity; // sizeof fArray
- UChar *fArray; // the Unicode data
- uint16_t fFlags; // bit flags: see constants above
- UChar fStackBuffer [ US_STACKBUF_SIZE ]; // buffer for small strings
-
+ union StackBufferOrFields {
+ // fStackBuffer is used iff (fFlags&kUsingStackBuffer)
+ // else fFields is used
+ UChar fStackBuffer[8]; // buffer for short strings, together with fRestOfStackBuffer
+ struct {
+ UChar *fArray; // the Unicode data
+ int32_t fCapacity; // capacity of fArray (in UChars)
+ int32_t fLength; // number of characters in fArray if >127; else undefined
+ } fFields;
+ } fUnion;
+ UChar fRestOfStackBuffer[US_STACKBUF_SIZE-8];
+ int8_t fShortLength; // 0..127: length <0: real length is in fUnion.fFields.fLength
+ uint8_t fFlags; // bit flags: see constants above
};
/**
* @param s1 The first string to be copied to the new one.
* @param s2 The second string to be copied to the new one, after s1.
* @return UnicodeString(s1).append(s2)
- * @draft ICU 2.8
+ * @stable ICU 2.8
*/
U_COMMON_API UnicodeString U_EXPORT2
operator+ (const UnicodeString &s1, const UnicodeString &s2);
-U_NAMESPACE_END
-
-// inline implementations -------------------------------------------------- ***
-
-//========================================
-// Array copying
-//========================================
-/**
- * Copy an array of UnicodeString OBJECTS (not pointers).
- * @internal
- */
-inline void
-uprv_arrayCopy(const U_NAMESPACE_QUALIFIER UnicodeString *src, U_NAMESPACE_QUALIFIER UnicodeString *dst, int32_t count)
-{ while(count-- > 0) *dst++ = *src++; }
-
-/**
- * Copy an array of UnicodeString OBJECTS (not pointers).
- * @internal
- */
-inline void
-uprv_arrayCopy(const U_NAMESPACE_QUALIFIER UnicodeString *src, int32_t srcStart,
- U_NAMESPACE_QUALIFIER UnicodeString *dst, int32_t dstStart, int32_t count)
-{ uprv_arrayCopy(src+srcStart, dst+dstStart, count); }
-
-U_NAMESPACE_BEGIN
-
//========================================
// Inline members
//========================================
// pin index
if(start < 0) {
start = 0;
- } else if(start > fLength) {
- start = fLength;
+ } else if(start > length()) {
+ start = length();
}
}
int32_t& _length) const
{
// pin indices
+ int32_t len = length();
if(start < 0) {
start = 0;
- } else if(start > fLength) {
- start = fLength;
+ } else if(start > len) {
+ start = len;
}
if(_length < 0) {
_length = 0;
- } else if(_length > (fLength - start)) {
- _length = (fLength - start);
+ } else if(_length > (len - start)) {
+ _length = (len - start);
}
}
inline UChar*
UnicodeString::getArrayStart()
-{ return fArray; }
+{ return (fFlags&kUsingStackBuffer) ? fUnion.fStackBuffer : fUnion.fFields.fArray; }
inline const UChar*
UnicodeString::getArrayStart() const
-{ return fArray; }
+{ return (fFlags&kUsingStackBuffer) ? fUnion.fStackBuffer : fUnion.fFields.fArray; }
+
+//========================================
+// Default constructor
+//========================================
+
+inline
+UnicodeString::UnicodeString()
+ : fShortLength(0),
+ fFlags(kShortString)
+{}
//========================================
// Read-only implementation methods
//========================================
inline int32_t
UnicodeString::length() const
-{ return fLength; }
+{ return fShortLength>=0 ? fShortLength : fUnion.fFields.fLength; }
inline int32_t
UnicodeString::getCapacity() const
-{ return fCapacity; }
+{ return (fFlags&kUsingStackBuffer) ? US_STACKBUF_SIZE : fUnion.fFields.fCapacity; }
inline int32_t
UnicodeString::hashCode() const
UnicodeString::isBogus() const
{ return (UBool)(fFlags & kIsBogus); }
+inline UBool
+UnicodeString::isWritable() const
+{
+  // Not writable while either kOpenGetBuffer or kIsBogus is set in fFlags.
+  return (UBool)((fFlags&(kOpenGetBuffer|kIsBogus)) == 0);
+}
+
+inline UBool
+UnicodeString::isBufferWritable() const
+{
+  // Any of these flags rules out writing to the buffer, regardless of sharing.
+  if(fFlags&(kOpenGetBuffer|kIsBogus|kBufferIsReadonly)) {
+    return FALSE;
+  }
+  // A reference-counted buffer may be written only while refCount() is exactly 1.
+  if(fFlags&kRefCounted) {
+    return (UBool)(refCount()==1);
+  }
+  return TRUE;
+}
+
inline const UChar *
UnicodeString::getBuffer() const {
- if(!(fFlags&(kIsBogus|kOpenGetBuffer))) {
- return fArray;
- } else {
+ if(fFlags&(kIsBogus|kOpenGetBuffer)) {
return 0;
+ } else if(fFlags&kUsingStackBuffer) {
+ return fUnion.fStackBuffer;
+ } else {
+ return fUnion.fFields.fArray;
}
}
//========================================
inline int8_t
UnicodeString::doCompare(int32_t start,
- int32_t length,
+ int32_t thisLength,
const UnicodeString& srcText,
int32_t srcStart,
int32_t srcLength) const
return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise
} else {
srcText.pinIndices(srcStart, srcLength);
- return doCompare(start, length, srcText.fArray, srcStart, srcLength);
+ return doCompare(start, thisLength, srcText.getArrayStart(), srcStart, srcLength);
}
}
if(isBogus()) {
return text.isBogus();
} else {
- return
- !text.isBogus() &&
- fLength == text.fLength &&
- doCompare(0, fLength, text, 0, text.fLength) == 0;
+ int32_t len = length(), textLength = text.length();
+ return !text.isBogus() && len == textLength && doEquals(text, len);
}
}
inline UBool
UnicodeString::operator> (const UnicodeString& text) const
-{ return doCompare(0, fLength, text, 0, text.fLength) == 1; }
+{ return doCompare(0, length(), text, 0, text.length()) == 1; }
inline UBool
UnicodeString::operator< (const UnicodeString& text) const
-{ return doCompare(0, fLength, text, 0, text.fLength) == -1; }
+{ return doCompare(0, length(), text, 0, text.length()) == -1; }
inline UBool
UnicodeString::operator>= (const UnicodeString& text) const
-{ return doCompare(0, fLength, text, 0, text.fLength) != -1; }
+{ return doCompare(0, length(), text, 0, text.length()) != -1; }
inline UBool
UnicodeString::operator<= (const UnicodeString& text) const
-{ return doCompare(0, fLength, text, 0, text.fLength) != 1; }
+{ return doCompare(0, length(), text, 0, text.length()) != 1; }
inline int8_t
UnicodeString::compare(const UnicodeString& text) const
-{ return doCompare(0, fLength, text, 0, text.fLength); }
+{ return doCompare(0, length(), text, 0, text.length()); }
inline int8_t
UnicodeString::compare(int32_t start,
int32_t _length,
const UnicodeString& srcText) const
-{ return doCompare(start, _length, srcText, 0, srcText.fLength); }
+{ return doCompare(start, _length, srcText, 0, srcText.length()); }
inline int8_t
UnicodeString::compare(const UChar *srcChars,
int32_t srcLength) const
-{ return doCompare(0, fLength, srcChars, 0, srcLength); }
+{ return doCompare(0, length(), srcChars, 0, srcLength); }
inline int8_t
UnicodeString::compare(int32_t start,
inline int8_t
UnicodeString::doCompareCodePointOrder(int32_t start,
- int32_t length,
+ int32_t thisLength,
const UnicodeString& srcText,
int32_t srcStart,
int32_t srcLength) const
return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise
} else {
srcText.pinIndices(srcStart, srcLength);
- return doCompareCodePointOrder(start, length, srcText.fArray, srcStart, srcLength);
+ return doCompareCodePointOrder(start, thisLength, srcText.getArrayStart(), srcStart, srcLength);
}
}
inline int8_t
UnicodeString::compareCodePointOrder(const UnicodeString& text) const
-{ return doCompareCodePointOrder(0, fLength, text, 0, text.fLength); }
+{ return doCompareCodePointOrder(0, length(), text, 0, text.length()); }
inline int8_t
UnicodeString::compareCodePointOrder(int32_t start,
int32_t _length,
const UnicodeString& srcText) const
-{ return doCompareCodePointOrder(start, _length, srcText, 0, srcText.fLength); }
+{ return doCompareCodePointOrder(start, _length, srcText, 0, srcText.length()); }
inline int8_t
UnicodeString::compareCodePointOrder(const UChar *srcChars,
int32_t srcLength) const
-{ return doCompareCodePointOrder(0, fLength, srcChars, 0, srcLength); }
+{ return doCompareCodePointOrder(0, length(), srcChars, 0, srcLength); }
inline int8_t
UnicodeString::compareCodePointOrder(int32_t start,
inline int8_t
UnicodeString::doCaseCompare(int32_t start,
- int32_t length,
+ int32_t thisLength,
const UnicodeString &srcText,
int32_t srcStart,
int32_t srcLength,
return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise
} else {
srcText.pinIndices(srcStart, srcLength);
- return doCaseCompare(start, length, srcText.fArray, srcStart, srcLength, options);
+ return doCaseCompare(start, thisLength, srcText.getArrayStart(), srcStart, srcLength, options);
}
}
inline int8_t
UnicodeString::caseCompare(const UnicodeString &text, uint32_t options) const {
- return doCaseCompare(0, fLength, text, 0, text.fLength, options);
+ return doCaseCompare(0, length(), text, 0, text.length(), options);
}
inline int8_t
int32_t _length,
const UnicodeString &srcText,
uint32_t options) const {
- return doCaseCompare(start, _length, srcText, 0, srcText.fLength, options);
+ return doCaseCompare(start, _length, srcText, 0, srcText.length(), options);
}
inline int8_t
UnicodeString::caseCompare(const UChar *srcChars,
int32_t srcLength,
uint32_t options) const {
- return doCaseCompare(0, fLength, srcChars, 0, srcLength, options);
+ return doCaseCompare(0, length(), srcChars, 0, srcLength, options);
}
inline int8_t
inline int32_t
UnicodeString::indexOf(const UnicodeString& text) const
-{ return indexOf(text, 0, text.fLength, 0, fLength); }
+{ return indexOf(text, 0, text.length(), 0, length()); }
inline int32_t
UnicodeString::indexOf(const UnicodeString& text,
int32_t start) const {
pinIndex(start);
- return indexOf(text, 0, text.fLength, start, fLength - start);
+ return indexOf(text, 0, text.length(), start, length() - start);
}
inline int32_t
UnicodeString::indexOf(const UnicodeString& text,
int32_t start,
int32_t _length) const
-{ return indexOf(text, 0, text.fLength, start, _length); }
+{ return indexOf(text, 0, text.length(), start, _length); }
inline int32_t
UnicodeString::indexOf(const UChar *srcChars,
int32_t srcLength,
int32_t start) const {
pinIndex(start);
- return indexOf(srcChars, 0, srcLength, start, fLength - start);
+ return indexOf(srcChars, 0, srcLength, start, length() - start);
}
inline int32_t
inline int32_t
UnicodeString::indexOf(UChar c) const
-{ return doIndexOf(c, 0, fLength); }
+{ return doIndexOf(c, 0, length()); }
inline int32_t
UnicodeString::indexOf(UChar32 c) const
-{ return indexOf(c, 0, fLength); }
+{ return indexOf(c, 0, length()); }
inline int32_t
UnicodeString::indexOf(UChar c,
int32_t start) const {
pinIndex(start);
- return doIndexOf(c, start, fLength - start);
+ return doIndexOf(c, start, length() - start);
}
inline int32_t
UnicodeString::indexOf(UChar32 c,
int32_t start) const {
pinIndex(start);
- return indexOf(c, start, fLength - start);
+ return indexOf(c, start, length() - start);
}
inline int32_t
int32_t srcLength,
int32_t start) const {
pinIndex(start);
- return lastIndexOf(srcChars, 0, srcLength, start, fLength - start);
+ return lastIndexOf(srcChars, 0, srcLength, start, length() - start);
}
inline int32_t
UnicodeString::lastIndexOf(const UnicodeString& text,
int32_t start,
int32_t _length) const
-{ return lastIndexOf(text, 0, text.fLength, start, _length); }
+{ return lastIndexOf(text, 0, text.length(), start, _length); }
inline int32_t
UnicodeString::lastIndexOf(const UnicodeString& text,
int32_t start) const {
pinIndex(start);
- return lastIndexOf(text, 0, text.fLength, start, fLength - start);
+ return lastIndexOf(text, 0, text.length(), start, length() - start);
}
inline int32_t
UnicodeString::lastIndexOf(const UnicodeString& text) const
-{ return lastIndexOf(text, 0, text.fLength, 0, fLength); }
+{ return lastIndexOf(text, 0, text.length(), 0, length()); }
inline int32_t
UnicodeString::lastIndexOf(UChar c,
inline int32_t
UnicodeString::lastIndexOf(UChar c) const
-{ return doLastIndexOf(c, 0, fLength); }
+{ return doLastIndexOf(c, 0, length()); }
inline int32_t
UnicodeString::lastIndexOf(UChar32 c) const {
- return lastIndexOf(c, 0, fLength);
+ return lastIndexOf(c, 0, length());
}
inline int32_t
UnicodeString::lastIndexOf(UChar c,
int32_t start) const {
pinIndex(start);
- return doLastIndexOf(c, start, fLength - start);
+ return doLastIndexOf(c, start, length() - start);
}
inline int32_t
UnicodeString::lastIndexOf(UChar32 c,
int32_t start) const {
pinIndex(start);
- return lastIndexOf(c, start, fLength - start);
+ return lastIndexOf(c, start, length() - start);
}
inline UBool
UnicodeString::startsWith(const UnicodeString& text) const
-{ return compare(0, text.fLength, text, 0, text.fLength) == 0; }
+{ return compare(0, text.length(), text, 0, text.length()) == 0; }
inline UBool
UnicodeString::startsWith(const UnicodeString& srcText,
{ return doCompare(0, srcLength, srcText, srcStart, srcLength) == 0; }
inline UBool
-UnicodeString::startsWith(const UChar *srcChars,
- int32_t srcLength) const
-{ return doCompare(0, srcLength, srcChars, 0, srcLength) == 0; }
+UnicodeString::startsWith(const UChar *srcChars, int32_t srcLength) const {
+ if(srcLength < 0) {
+ srcLength = u_strlen(srcChars);
+ }
+ return doCompare(0, srcLength, srcChars, 0, srcLength) == 0;
+}
inline UBool
-UnicodeString::startsWith(const UChar *srcChars,
- int32_t srcStart,
- int32_t srcLength) const
-{ return doCompare(0, srcLength, srcChars, srcStart, srcLength) == 0;}
+UnicodeString::startsWith(const UChar *srcChars, int32_t srcStart, int32_t srcLength) const {
+  // Returns whether this string begins with the text at srcChars + srcStart.
+  // A negative srcLength means the text is NUL-terminated. Its length must be
+  // measured from srcChars + srcStart, where the comparison actually begins
+  // (the matching endsWith() overload does the same); measuring from srcChars
+  // would overstate the length by srcStart and compare past the terminating NUL.
+  if(srcLength < 0) {
+    srcLength = u_strlen(srcChars + srcStart);
+  }
+  return doCompare(0, srcLength, srcChars, srcStart, srcLength) == 0;
+}
inline UBool
UnicodeString::endsWith(const UnicodeString& text) const
-{ return doCompare(fLength - text.fLength, text.fLength,
- text, 0, text.fLength) == 0; }
+{ return doCompare(length() - text.length(), text.length(),
+ text, 0, text.length()) == 0; }
inline UBool
UnicodeString::endsWith(const UnicodeString& srcText,
int32_t srcStart,
int32_t srcLength) const {
srcText.pinIndices(srcStart, srcLength);
- return doCompare(fLength - srcLength, srcLength,
+ return doCompare(length() - srcLength, srcLength,
srcText, srcStart, srcLength) == 0;
}
if(srcLength < 0) {
srcLength = u_strlen(srcChars);
}
- return doCompare(fLength - srcLength, srcLength,
+ return doCompare(length() - srcLength, srcLength,
srcChars, 0, srcLength) == 0;
}
if(srcLength < 0) {
srcLength = u_strlen(srcChars + srcStart);
}
- return doCompare(fLength - srcLength, srcLength,
+ return doCompare(length() - srcLength, srcLength,
srcChars, srcStart, srcLength) == 0;
}
UnicodeString::replace(int32_t start,
int32_t _length,
const UnicodeString& srcText)
-{ return doReplace(start, _length, srcText, 0, srcText.fLength); }
+{ return doReplace(start, _length, srcText, 0, srcText.length()); }
inline UnicodeString&
UnicodeString::replace(int32_t start,
UChar srcChar)
{ return doReplace(start, _length, &srcChar, 0, 1); }
-inline UnicodeString&
-UnicodeString::replace(int32_t start,
- int32_t _length,
- UChar32 srcChar) {
- UChar buffer[U16_MAX_LENGTH];
- int32_t count = 0;
- UBool isError = FALSE;
- U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
- return doReplace(start, _length, buffer, 0, count);
-}
-
inline UnicodeString&
UnicodeString::replaceBetween(int32_t start,
int32_t limit,
const UnicodeString& srcText)
-{ return doReplace(start, limit - start, srcText, 0, srcText.fLength); }
+{ return doReplace(start, limit - start, srcText, 0, srcText.length()); }
inline UnicodeString&
UnicodeString::replaceBetween(int32_t start,
inline UnicodeString&
UnicodeString::findAndReplace(const UnicodeString& oldText,
const UnicodeString& newText)
-{ return findAndReplace(0, fLength, oldText, 0, oldText.fLength,
- newText, 0, newText.fLength); }
+{ return findAndReplace(0, length(), oldText, 0, oldText.length(),
+ newText, 0, newText.length()); }
inline UnicodeString&
UnicodeString::findAndReplace(int32_t start,
int32_t _length,
const UnicodeString& oldText,
const UnicodeString& newText)
-{ return findAndReplace(start, _length, oldText, 0, oldText.fLength,
- newText, 0, newText.fLength); }
+{ return findAndReplace(start, _length, oldText, 0, oldText.length(),
+ newText, 0, newText.length()); }
// ============================
// extract
UnicodeString::doExtract(int32_t start,
int32_t _length,
UnicodeString& target) const
-{ target.replace(0, target.fLength, *this, start, _length); }
+{ target.replace(0, target.length(), *this, start, _length); }
inline void
UnicodeString::extract(int32_t start,
doExtract(start, limit - start, dst, dstStart);
}
+inline UnicodeString
+UnicodeString::tempSubStringBetween(int32_t start, int32_t limit) const {
+  // tempSubString() takes (start, length), so turn the limit into a length first.
+  const int32_t subLength = limit - start;
+  return tempSubString(start, subLength);
+}
+
inline UChar
UnicodeString::doCharAt(int32_t offset) const
{
- if((uint32_t)offset < (uint32_t)fLength) {
- return fArray[offset];
+ if((uint32_t)offset < (uint32_t)length()) {
+ return getArrayStart()[offset];
} else {
return kInvalidUChar;
}
UnicodeString::operator[] (int32_t offset) const
{ return doCharAt(offset); }
-inline UChar32
-UnicodeString::char32At(int32_t offset) const
-{
- if((uint32_t)offset < (uint32_t)fLength) {
- UChar32 c;
- U16_GET(fArray, 0, offset, fLength, c);
- return c;
- } else {
- return kInvalidUChar;
- }
+inline UBool
+UnicodeString::isEmpty() const {
+ return fShortLength == 0;
}
-inline int32_t
-UnicodeString::getChar32Start(int32_t offset) const {
- if((uint32_t)offset < (uint32_t)fLength) {
- U16_SET_CP_START(fArray, 0, offset);
- return offset;
+//========================================
+// Write implementation methods
+//========================================
+inline void
+UnicodeString::setLength(int32_t len) {
+ if(len <= 127) {
+ fShortLength = (int8_t)len;
} else {
- return 0;
+ fShortLength = (int8_t)-1;
+ fUnion.fFields.fLength = len;
}
}
-inline int32_t
-UnicodeString::getChar32Limit(int32_t offset) const {
- if((uint32_t)offset < (uint32_t)fLength) {
- U16_SET_CP_LIMIT(fArray, 0, offset, fLength);
- return offset;
- } else {
- return fLength;
- }
+inline void
+UnicodeString::setToEmpty() {
+ fShortLength = 0;
+ fFlags = kShortString;
}
-inline UBool
-UnicodeString::isEmpty() const {
- return fLength == 0;
+inline void
+UnicodeString::setArray(UChar *array, int32_t len, int32_t capacity) {
+ setLength(len);
+ fUnion.fFields.fArray = array;
+ fUnion.fFields.fCapacity = capacity;
}
-//========================================
-// Write implementation methods
-//========================================
inline const UChar *
UnicodeString::getTerminatedBuffer() {
- if(fFlags&(kIsBogus|kOpenGetBuffer)) {
+ if(!isWritable()) {
return 0;
- } else if(fLength<fCapacity && fArray[fLength]==0) {
- return fArray;
- } else if(cloneArrayIfNeeded(fLength+1)) {
- fArray[fLength]=0;
- return fArray;
} else {
- return 0;
+ UChar *array = getArrayStart();
+ int32_t len = length();
+ if(len < getCapacity() && ((fFlags&kRefCounted) == 0 || refCount() == 1)) {
+ /*
+ * kRefCounted: Do not write the NUL if the buffer is shared.
+ * That is mostly safe, except when the length of one copy was modified
+ * without copy-on-write, e.g., via truncate(newLength) or remove(void).
+ * Then the NUL would be written into the middle of another copy's string.
+ */
+ if(!(fFlags&kBufferIsReadonly)) {
+ /*
+ * We must not write to a readonly buffer, but it is known to be
+ * NUL-terminated if len<capacity.
+ * A shared, allocated buffer (refCount()>1) never reaches this point:
+ * the enclosing condition requires an unshared buffer, because a NUL
+ * written at [len] could land in the middle of another copy's string
+ * whose length differs from ours.
+ * In all other cases, the buffer is fully writable and it is anyway safe
+ * to write the NUL.
+ *
+ * Note: An earlier version of this code tested whether there is a NUL
+ * at [len] already, but, while safe, it generated lots of warnings from
+ * tools like valgrind and Purify.
+ */
+ array[len] = 0;
+ }
+ return array;
+ } else if(cloneArrayIfNeeded(len+1)) {
+ array = getArrayStart();
+ array[len] = 0;
+ return array;
+ } else {
+ return 0;
+ }
}
}
inline UnicodeString&
UnicodeString::operator= (UChar ch)
-{ return doReplace(0, fLength, &ch, 0, 1); }
+{ return doReplace(0, length(), &ch, 0, 1); }
inline UnicodeString&
UnicodeString::operator= (UChar32 ch)
-{ return replace(0, fLength, ch); }
+{ return replace(0, length(), ch); }
inline UnicodeString&
UnicodeString::setTo(const UnicodeString& srcText,
int32_t srcLength)
{
unBogus();
- return doReplace(0, fLength, srcText, srcStart, srcLength);
+ return doReplace(0, length(), srcText, srcStart, srcLength);
}
inline UnicodeString&
{
unBogus();
srcText.pinIndex(srcStart);
- return doReplace(0, fLength, srcText, srcStart, srcText.fLength - srcStart);
+ return doReplace(0, length(), srcText, srcStart, srcText.length() - srcStart);
}
inline UnicodeString&
UnicodeString::setTo(const UnicodeString& srcText)
{
- unBogus();
- return doReplace(0, fLength, srcText, 0, srcText.fLength);
+ return copyFrom(srcText);
}
inline UnicodeString&
int32_t srcLength)
{
unBogus();
- return doReplace(0, fLength, srcChars, 0, srcLength);
+ return doReplace(0, length(), srcChars, 0, srcLength);
}
inline UnicodeString&
UnicodeString::setTo(UChar srcChar)
{
unBogus();
- return doReplace(0, fLength, &srcChar, 0, 1);
+ return doReplace(0, length(), &srcChar, 0, 1);
}
inline UnicodeString&
UnicodeString::setTo(UChar32 srcChar)
{
unBogus();
- return replace(0, fLength, srcChar);
-}
-
-inline UnicodeString&
-UnicodeString::operator+= (UChar ch)
-{ return doReplace(fLength, 0, &ch, 0, 1); }
-
-inline UnicodeString&
-UnicodeString::operator+= (UChar32 ch) {
- UChar buffer[U16_MAX_LENGTH];
- int32_t _length = 0;
- UBool isError = FALSE;
- U16_APPEND(buffer, _length, U16_MAX_LENGTH, ch, isError);
- return doReplace(fLength, 0, buffer, 0, _length);
+ return replace(0, length(), srcChar);
}
-inline UnicodeString&
-UnicodeString::operator+= (const UnicodeString& srcText)
-{ return doReplace(fLength, 0, srcText, 0, srcText.fLength); }
-
inline UnicodeString&
UnicodeString::append(const UnicodeString& srcText,
int32_t srcStart,
int32_t srcLength)
-{ return doReplace(fLength, 0, srcText, srcStart, srcLength); }
+{ return doReplace(length(), 0, srcText, srcStart, srcLength); }
inline UnicodeString&
UnicodeString::append(const UnicodeString& srcText)
-{ return doReplace(fLength, 0, srcText, 0, srcText.fLength); }
+{ return doReplace(length(), 0, srcText, 0, srcText.length()); }
inline UnicodeString&
UnicodeString::append(const UChar *srcChars,
int32_t srcStart,
int32_t srcLength)
-{ return doReplace(fLength, 0, srcChars, srcStart, srcLength); }
+{ return doReplace(length(), 0, srcChars, srcStart, srcLength); }
inline UnicodeString&
UnicodeString::append(const UChar *srcChars,
int32_t srcLength)
-{ return doReplace(fLength, 0, srcChars, 0, srcLength); }
+{ return doReplace(length(), 0, srcChars, 0, srcLength); }
inline UnicodeString&
UnicodeString::append(UChar srcChar)
-{ return doReplace(fLength, 0, &srcChar, 0, 1); }
+{ return doReplace(length(), 0, &srcChar, 0, 1); }
+
+inline UnicodeString&
+UnicodeString::operator+= (UChar ch)
+{ return doReplace(length(), 0, &ch, 0, 1); }
inline UnicodeString&
-UnicodeString::append(UChar32 srcChar) {
- UChar buffer[U16_MAX_LENGTH];
- int32_t _length = 0;
- UBool isError = FALSE;
- U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
- return doReplace(fLength, 0, buffer, 0, _length);
+UnicodeString::operator+= (UChar32 ch) {
+ return append(ch);
}
+inline UnicodeString&
+UnicodeString::operator+= (const UnicodeString& srcText)
+{ return doReplace(length(), 0, srcText, 0, srcText.length()); }
+
inline UnicodeString&
UnicodeString::insert(int32_t start,
const UnicodeString& srcText,
inline UnicodeString&
UnicodeString::insert(int32_t start,
const UnicodeString& srcText)
-{ return doReplace(start, 0, srcText, 0, srcText.fLength); }
+{ return doReplace(start, 0, srcText, 0, srcText.length()); }
inline UnicodeString&
UnicodeString::insert(int32_t start,
UnicodeString::remove()
{
// remove() of a bogus string makes the string empty and non-bogus
- if(isBogus()) {
- unBogus();
+ // we also un-alias a read-only alias to deal with NUL-termination
+ // issues with getTerminatedBuffer()
+ if(fFlags & (kIsBogus|kBufferIsReadonly)) {
+ setToEmpty();
} else {
- fLength = 0;
+ fShortLength = 0;
}
return *this;
}
UnicodeString::remove(int32_t start,
int32_t _length)
{
- if(start <= 0 && _length == INT32_MAX) {
- // remove(guaranteed everything) of a bogus string makes the string empty and non-bogus
- return remove();
- } else {
+ if(start <= 0 && _length == INT32_MAX) {
+ // remove(guaranteed everything) of a bogus string makes the string empty and non-bogus
+ return remove();
+ }
return doReplace(start, _length, NULL, 0, 0);
- }
}
inline UnicodeString&
int32_t limit)
{ return doReplace(start, limit - start, NULL, 0, 0); }
+inline UnicodeString &
+UnicodeString::retainBetween(int32_t start, int32_t limit) {
+ truncate(limit);
+ return doReplace(0, start, NULL, 0, 0);
+}
+
inline UBool
UnicodeString::truncate(int32_t targetLength)
{
// truncate(0) of a bogus string makes the string empty and non-bogus
unBogus();
return FALSE;
- } else if((uint32_t)targetLength < (uint32_t)fLength) {
- fLength = targetLength;
+ } else if((uint32_t)targetLength < (uint32_t)length()) {
+ setLength(targetLength);
+ if(fFlags&kBufferIsReadonly) {
+ fUnion.fFields.fCapacity = targetLength; // not NUL-terminated any more
+ }
return TRUE;
} else {
return FALSE;
inline UnicodeString&
UnicodeString::reverse()
-{ return doReverse(0, fLength); }
+{ return doReverse(0, length()); }
inline UnicodeString&
UnicodeString::reverse(int32_t start,