X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..57a6839dcb3bba09e8228b822b290604668416fe:/icuSources/common/unicode/uniset.h diff --git a/icuSources/common/unicode/uniset.h b/icuSources/common/unicode/uniset.h index 48464cfd..fa7cc7ca 100644 --- a/icuSources/common/unicode/uniset.h +++ b/icuSources/common/unicode/uniset.h @@ -1,10 +1,11 @@ /* -********************************************************************** -* Copyright (C) 1999-2004, International Business Machines Corporation and others. All Rights Reserved. -********************************************************************** +*************************************************************************** +* Copyright (C) 1999-2013, International Business Machines Corporation +* and others. All Rights Reserved. +*************************************************************************** * Date Name Description * 10/20/99 alan Creation. -********************************************************************** +*************************************************************************** */ #ifndef UNICODESET_H @@ -14,12 +15,22 @@ #include "unicode/unistr.h" #include "unicode/uset.h" +/** + * \file + * \brief C++ API: Unicode Set + */ + U_NAMESPACE_BEGIN +// Forward Declarations. +void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status); /**< @internal */ + +class BMPSet; class ParsePosition; +class RBBIRuleScanner; class SymbolTable; +class UnicodeSetStringSpan; class UVector; -class CaseEquivClass; class RuleCharacterIterator; /** @@ -108,8 +119,8 @@ class RuleCharacterIterator; * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a * complete list of supported property patterns, see the User's Guide * for UnicodeSet at - * - * http://oss.software.ibm.com/icu/userguide/unicodeSet.html. + * + * http://icu-project.org/userguide/unicodeSet.html. 
* Actual determination of property data is defined by the underlying * Unicode database as implemented by UCharacter. * @@ -249,6 +260,15 @@ class RuleCharacterIterator; * * * \htmlonly\endhtmlonly + * + *

Note: + * - Most UnicodeSet methods do not take a UErrorCode parameter because + * there are usually very few opportunities for failure other than a shortage + * of memory, error codes in low-level C++ string methods would be inconvenient, + * and the error code as the last parameter (ICU convention) would prevent + * the use of default parameter values. + * Instead, such methods set the UnicodeSet into a "bogus" state + * (see isBogus()) if an error occurs. * * @author Alan Liu * @stable ICU 2.0 @@ -257,11 +277,11 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter { int32_t len; // length of list used; 0 <= len <= capacity int32_t capacity; // capacity of list - int32_t bufferCapacity; // capacity of buffer UChar32* list; // MUST be terminated with HIGH + BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. UChar32* buffer; // internal buffer, may be NULL - - UVector* strings; // maintained in sorted order + int32_t bufferCapacity; // capacity of buffer + int32_t patLen; /** * The pattern representation of this set. This may not be the @@ -272,29 +292,60 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter { * indicating that toPattern() must generate a pattern * representation from the inversion list. */ - UnicodeString pat; + UChar *pat; + UVector* strings; // maintained in sorted order + UnicodeSetStringSpan *stringSpan; +private: + enum { // constants + kIsBogus = 1 // This set is bogus (i.e. not valid) + }; + uint8_t fFlags; // Bit flag (see constants above) public: - /** - * Minimum value that can be stored in a UnicodeSet. - * @stable ICU 2.4 + * Determine if this object contains a valid set. + * A bogus set has no value. It is different from an empty set. + * It can be used to indicate that no set value is available. 
+ * + * @return TRUE if the set is valid, FALSE otherwise + * @see setToBogus() + * @stable ICU 4.0 */ -#ifdef U_CYGWIN - static U_COMMON_API const UChar32 MIN_VALUE; -#else - static const UChar32 MIN_VALUE; -#endif - + inline UBool isBogus(void) const; + /** - * Maximum value that can be stored in a UnicodeSet. - * @stable ICU 2.4 + * Make this UnicodeSet object invalid. + * The string will test TRUE with isBogus(). + * + * A bogus set has no value. It is different from an empty set. + * It can be used to indicate that no set value is available. + * + * This utility function is used throughout the UnicodeSet + * implementation to indicate that a UnicodeSet operation failed, + * and may be used in other functions, + * especially but not exclusively when such functions do not + * take a UErrorCode for simplicity. + * + * @see isBogus() + * @stable ICU 4.0 */ -#ifdef U_CYGWIN - static U_COMMON_API const UChar32 MAX_VALUE; -#else - static const UChar32 MAX_VALUE; -#endif + void setToBogus(); + +public: + + enum { + /** + * Minimum value that can be stored in a UnicodeSet. + * @stable ICU 2.4 + */ + MIN_VALUE = 0, + + /** + * Maximum value that can be stored in a UnicodeSet. + * @stable ICU 2.4 + */ + MAX_VALUE = 0x10ffff + }; //---------------------------------------------------------------- // Constructors &c @@ -329,6 +380,7 @@ public: UnicodeSet(const UnicodeString& pattern, UErrorCode& status); +#ifndef U_HIDE_INTERNAL_API /** * Constructs a set from the given pattern. See the class * description for the syntax of the pattern language. @@ -345,6 +397,7 @@ public: uint32_t options, const SymbolTable* symbols, UErrorCode& status); +#endif /* U_HIDE_INTERNAL_API */ /** * Constructs a set from the given pattern. 
See the class description @@ -357,23 +410,13 @@ public: * @param symbols a symbol table mapping variable names to values * and stand-in characters to UnicodeSets; may be NULL * @param status input-output error code - * @draft ICU 2.8 + * @stable ICU 2.8 */ UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, uint32_t options, const SymbolTable* symbols, UErrorCode& status); -#ifdef U_USE_UNICODESET_DEPRECATES - /** - * Obsolete: Constructs a set from the given Unicode character category. - * @param category an integer indicating the character category as - * defined in uchar.h. - * @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release. - */ - UnicodeSet(int8_t category, UErrorCode& status); -#endif - /** * Constructs a set that is identical to the given UnicodeSet. * @stable ICU 2.0 @@ -388,6 +431,7 @@ public: /** * Assigns this object to be a copy of another. + * A frozen set will not be modified. * @stable ICU 2.0 */ UnicodeSet& operator=(const UnicodeSet& o); @@ -416,6 +460,9 @@ public: * Returns a copy of this object. All UnicodeFunctor objects have * to support cloning in order to allow classes using * UnicodeFunctors, such as Transliterator, to implement cloning. + * If this set is frozen, then the clone will be frozen as well. + * Use cloneAsThawed() for a mutable clone of a frozen set. + * @see cloneAsThawed * @stable ICU 2.0 */ virtual UnicodeFunctor* clone() const; @@ -429,6 +476,85 @@ public: */ virtual int32_t hashCode(void) const; + /** + * Get a UnicodeSet pointer from a USet + * + * @param uset a USet (the ICU plain C type for UnicodeSet) + * @return the corresponding UnicodeSet pointer. + * + * @stable ICU 4.2 + */ + inline static UnicodeSet *fromUSet(USet *uset); + + /** + * Get a UnicodeSet pointer from a const USet + * + * @param uset a const USet (the ICU plain C type for UnicodeSet) + * @return the corresponding UnicodeSet pointer. 
+ * + * @stable ICU 4.2 + */ + inline static const UnicodeSet *fromUSet(const USet *uset); + + /** + * Produce a USet * pointer for this UnicodeSet. + * USet is the plain C type for UnicodeSet + * + * @return a USet pointer for this UnicodeSet + * @stable ICU 4.2 + */ + inline USet *toUSet(); + + + /** + * Produce a const USet * pointer for this UnicodeSet. + * USet is the plain C type for UnicodeSet + * + * @return a const USet pointer for this UnicodeSet + * @stable ICU 4.2 + */ + inline const USet * toUSet() const; + + + //---------------------------------------------------------------- + // Freezable API + //---------------------------------------------------------------- + + /** + * Determines whether the set has been frozen (made immutable) or not. + * See the ICU4J Freezable interface for details. + * @return TRUE/FALSE for whether the set has been frozen + * @see freeze + * @see cloneAsThawed + * @stable ICU 3.8 + */ + inline UBool isFrozen() const; + + /** + * Freeze the set (make it immutable). + * Once frozen, it cannot be unfrozen and is therefore thread-safe + * until it is deleted. + * See the ICU4J Freezable interface for details. + * Freezing the set may also make some operations faster, for example + * contains() and span(). + * A frozen set will not be modified. (It remains frozen.) + * @return this set. + * @see isFrozen + * @see cloneAsThawed + * @stable ICU 3.8 + */ + UnicodeFunctor *freeze(); + + /** + * Clone the set and make the clone mutable. + * See the ICU4J Freezable interface for details. + * @return the mutable clone + * @see freeze + * @see isFrozen + * @stable ICU 3.8 + */ + UnicodeFunctor *cloneAsThawed() const; + //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- @@ -437,6 +563,7 @@ public: * Make this object represent the range start - end. * If end > start then this object is set to an * an empty range. 
+ * A frozen set will not be modified. * * @param start first character in the set, inclusive * @param end last character in the set, inclusive @@ -454,8 +581,9 @@ public: /** * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. + * pattern, ignoring Unicode Pattern_White_Space characters. + * See the class description for the syntax of the pattern language. + * A frozen set will not be modified. * @param pattern a string specifying what characters are in the set * @param status returns U_ILLEGAL_ARGUMENT_ERROR if the pattern * contains a syntax error. @@ -466,10 +594,12 @@ public: UnicodeSet& applyPattern(const UnicodeString& pattern, UErrorCode& status); +#ifndef U_HIDE_INTERNAL_API /** * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. + * pattern, optionally ignoring Unicode Pattern_White_Space characters. + * See the class description for the syntax of the pattern language. + * A frozen set will not be modified. * @param pattern a string specifying what characters are in the set * @param options bitmask for options to apply to the pattern. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. @@ -485,6 +615,7 @@ public: uint32_t options, const SymbolTable* symbols, UErrorCode& status); +#endif /* U_HIDE_INTERNAL_API */ /** * Parses the given pattern, starting at the given position. The @@ -497,6 +628,7 @@ public: * pairs list for the parsed pattern is returned. This method calls * itself recursively to parse embedded subpatterns. * Empties the set passed before applying the pattern. + * A frozen set will not be modified. * * @param pattern the string containing the pattern to be parsed. 
* The portion of the string from pos.getIndex(), which must be a @@ -514,7 +646,7 @@ public: * @param status returns U_ILLEGAL_ARGUMENT_ERROR if the pattern * contains a syntax error. * @return a reference to this - * @draft ICU 2.8 + * @stable ICU 2.8 */ UnicodeSet& applyPattern(const UnicodeString& pattern, ParsePosition& pos, @@ -526,6 +658,7 @@ public: * Returns a string representation of this set. If the result of * calling this function is passed to a UnicodeSet constructor, it * will produce another set that is equal to this one. + * A frozen set will not be modified. * @param result the string to receive the rules. Previous * contents will be deleted. * @param escapeUnprintable if TRUE then convert unprintable @@ -541,6 +674,7 @@ public: * Modifies this set to contain those code points which have the given value * for the given binary or enumerated property, as returned by * u_getIntPropertyValue. Prior contents of this set are lost. + * A frozen set will not be modified. * * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 @@ -566,6 +700,7 @@ public: * Modifies this set to contain those code points which have the * given value for the given property. Prior contents of this * set are lost. + * A frozen set will not be modified. * * @param prop a property alias, either short or long. The name is matched * loosely. See PropertyAliases.txt for names and a description of loose @@ -575,7 +710,8 @@ public: * correspond to the following sets: * * "ANY" = [\\u0000-\\U0010FFFF], - * "ASCII" = [\\u0000-\\u007F]. + * "ASCII" = [\\u0000-\\u007F], + * "Assigned" = [:^Cn:]. * * @param value a value alias, either short or long. The name is matched * loosely. See PropertyValueAliases.txt for names and a description of @@ -613,6 +749,7 @@ public: /** * Returns true if this set contains the given character. + * This function works faster with a frozen set. 
* @param c character to be checked for containment * @return true if the test condition is met * @stable ICU 2.0 @@ -712,6 +849,113 @@ public: */ inline UBool containsSome(const UnicodeString& s) const; + /** + * Returns the length of the initial substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Similar to the strspn() C library function. + * Unpaired surrogates are treated according to contains() of their surrogate code points. + * This function works faster with a frozen set and with a non-negative string length argument. + * @param s start of the string + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the length of the initial substring according to the spanCondition; + * 0 if the start of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ + int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; + + /** + * Returns the end of the substring of the input string according to the USetSpanCondition. + * Same as start+span(s.getBuffer()+start, s.length()-start, spanCondition) + * after pinning start to 0<=start<=s.length(). 
+ * @param s the string + * @param start the start index in the string for the span operation + * @param spanCondition specifies the containment condition + * @return the exclusive end of the substring according to the spanCondition; + * the substring s.tempSubStringBetween(start, end) fulfills the spanCondition + * @stable ICU 4.4 + * @see USetSpanCondition + */ + inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const; + + /** + * Returns the start of the trailing substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Unpaired surrogates are treated according to contains() of their surrogate code points. + * This function works faster with a frozen set and with a non-negative string length argument. + * @param s start of the string + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the start of the trailing substring according to the spanCondition; + * the string length if the end of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ + int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; + + /** + * Returns the start of the substring of the input string according to the USetSpanCondition. + * Same as spanBack(s.getBuffer(), limit, spanCondition) + * after pinning limit to 0<=end<=s.length(). 
+ * @param s the string + * @param limit the exclusive-end index in the string for the span operation + * (use s.length() or INT32_MAX for spanning back from the end of the string) + * @param spanCondition specifies the containment condition + * @return the start of the substring according to the spanCondition; + * the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition + * @stable ICU 4.4 + * @see USetSpanCondition + */ + inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const; + + /** + * Returns the length of the initial substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Similar to the strspn() C library function. + * Malformed byte sequences are treated according to contains(0xfffd). + * This function works faster with a frozen set and with a non-negative string length argument. + * @param s start of the string (UTF-8) + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the length of the initial substring according to the spanCondition; + * 0 if the start of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ + int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; + + /** + * Returns the start of the trailing substring of the input string which + * consists only of characters and strings that are contained in this set + * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), + * or only of characters and strings that are not contained + * in this set (USET_SPAN_NOT_CONTAINED). + * See USetSpanCondition for details. + * Malformed byte sequences are treated according to contains(0xfffd). 
+ * This function works faster with a frozen set and with a non-negative string length argument. + * @param s start of the string (UTF-8) + * @param length of the string; can be -1 for NUL-terminated + * @param spanCondition specifies the containment condition + * @return the start of the trailing substring according to the spanCondition; + * the string length if the end of the string does not fit the spanCondition + * @stable ICU 3.8 + * @see USetSpanCondition + */ + int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; + /** * Implement UnicodeMatcher::matches() * @stable ICU 2.4 @@ -738,6 +982,7 @@ private: * @param limit the limit offset for matching, either last+1 in * the forward direction, or last-1 in the reverse direction, * where last is the index of the last character to match. + * @param s * @return If part of s matches up to the limit, return |limit - * start|. If all of s matches before reaching the limit, return * s.length(). If there is a mismatch between s and text, return @@ -796,6 +1041,7 @@ public: * the call leaves this set unchanged. If end > start * then an empty range is added, leaving the set unchanged. * This is equivalent to a boolean logic OR, or a set UNION. + * A frozen set will not be modified. * * @param start first character, inclusive, of range to be added * to this set. @@ -809,6 +1055,7 @@ public: * Adds the specified character to this set if it is not already * present. If this set already contains the specified character, * the call leaves this set unchanged. + * A frozen set will not be modified. * @stable ICU 2.0 */ UnicodeSet& add(UChar32 c); @@ -819,6 +1066,7 @@ public: * the call leaves this set unchanged. * Thus "ch" => {"ch"} *
Warning: you cannot add an empty string ("") to a UnicodeSet. + * A frozen set will not be modified. * @param s the source string * @return this object, for chaining * @stable ICU 2.4 @@ -829,7 +1077,7 @@ public: /** * @return a code point IF the string consists of a single one. * otherwise returns -1. - * @param string to test + * @param s string to test */ static int32_t getSingleCP(const UnicodeString& s); @@ -839,6 +1087,7 @@ public: /** * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} * If this set already any particular character, it has no effect on that character. + * A frozen set will not be modified. * @param s the source string * @return this object, for chaining * @stable ICU 2.4 @@ -848,6 +1097,7 @@ public: /** * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} * If this set already any particular character, it has no effect on that character. + * A frozen set will not be modified. * @param s the source string * @return this object, for chaining * @stable ICU 2.4 @@ -857,6 +1107,7 @@ public: /** * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} * If this set already any particular character, it has no effect on that character. + * A frozen set will not be modified. * @param s the source string * @return this object, for chaining * @stable ICU 2.4 @@ -866,6 +1117,7 @@ public: /** * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} * If this set already any particular character, it has no effect on that character. + * A frozen set will not be modified. * @param s the source string * @return this object, for chaining * @stable ICU 2.4 @@ -897,6 +1149,7 @@ public: * specified range. If end > start then an empty range is * retained, leaving the set empty. This is equivalent to * a boolean logic AND, or a set INTERSECTION. + * A frozen set will not be modified. * * @param start first character, inclusive, of range to be retained * to this set. 
@@ -909,6 +1162,7 @@ public: /** * Retain the specified character from this set if it is present. + * A frozen set will not be modified. * @stable ICU 2.0 */ UnicodeSet& retain(UChar32 c); @@ -918,6 +1172,7 @@ public: * The set will not contain the specified range once the call * returns. If end > start then an empty range is * removed, leaving the set unchanged. + * A frozen set will not be modified. * * @param start first character, inclusive, of range to be removed * from this set. @@ -931,6 +1186,7 @@ public: * Removes the specified character from this set if it is present. * The set will not contain the specified range once the call * returns. + * A frozen set will not be modified. * @stable ICU 2.0 */ UnicodeSet& remove(UChar32 c); @@ -939,6 +1195,7 @@ public: * Removes the specified string from this set if it is present. * The set will not contain the specified character once the call * returns. + * A frozen set will not be modified. * @param s the source string * @return this object, for chaining * @stable ICU 2.4 @@ -949,6 +1206,7 @@ public: * Inverts this set. This operation modifies this set so that * its value is its complement. This is equivalent to * complement(MIN_VALUE, MAX_VALUE). + * A frozen set will not be modified. * @stable ICU 2.0 */ virtual UnicodeSet& complement(void); @@ -959,6 +1217,7 @@ public: * added if it is not in this set. If end > start * then an empty range is complemented, leaving the set unchanged. * This is equivalent to a boolean logic XOR. + * A frozen set will not be modified. * * @param start first character, inclusive, of range to be removed * from this set. @@ -972,6 +1231,7 @@ public: * Complements the specified character in this set. The character * will be removed if it is in this set, or will be added if it is * not in this set. + * A frozen set will not be modified. 
* @stable ICU 2.0 */ UnicodeSet& complement(UChar32 c); @@ -981,6 +1241,7 @@ public: * The set will not contain the specified string once the call * returns. *
Warning: you cannot add an empty string ("") to a UnicodeSet. + * A frozen set will not be modified. * @param s the string to complement * @return this object, for chaining * @stable ICU 2.4 @@ -993,9 +1254,10 @@ public: * modifies this set so that its value is the union of the two * sets. The behavior of this operation is unspecified if the specified * collection is modified while the operation is in progress. + * A frozen set will not be modified. * * @param c set whose elements are to be added to this set. - * @see #add(char, char) + * @see #add(UChar32, UChar32) * @stable ICU 2.0 */ virtual UnicodeSet& addAll(const UnicodeSet& c); @@ -1006,6 +1268,7 @@ public: * its elements that are not contained in the specified set. This * operation effectively modifies this set so that its value is * the intersection of the two sets. + * A frozen set will not be modified. * * @param c set that defines which elements this set will retain. * @stable ICU 2.0 @@ -1017,6 +1280,7 @@ public: * specified set. This operation effectively modifies this * set so that its value is the asymmetric set difference of * the two sets. + * A frozen set will not be modified. * * @param c set that defines which elements will be removed from * this set. @@ -1028,6 +1292,7 @@ public: * Complements in this set all elements contained in the specified * set. Any character in the other set will be removed if it is * in this set, or will be added if it is not in this set. + * A frozen set will not be modified. * * @param c set that defines which elements will be xor'ed from * this set. @@ -1038,6 +1303,7 @@ public: /** * Removes all of the elements from this set. This set will be * empty after this call returns. + * A frozen set will not be modified. * @stable ICU 2.0 */ virtual UnicodeSet& clear(void); @@ -1059,14 +1325,24 @@ public: * == b denotes that the contents are the same, not pointer * comparison.) * + * A frozen set will not be modified. 
+ * * @param attribute bitmask for attributes to close over. * Currently only the USET_CASE bit is supported. Any undefined bits * are ignored. * @return a reference to this set. - * @internal + * @stable ICU 4.2 */ UnicodeSet& closeOver(int32_t attribute); + /** + * Remove all strings from this set. + * + * @return a reference to this set. + * @stable ICU 4.2 + */ + virtual UnicodeSet &removeAllStrings(); + /** * Iteration method that returns the number of ranges contained in * this set. @@ -1147,6 +1423,7 @@ public: /** * Reallocate this objects internal structures to take up the least * possible space, without changing this object's value. + * A frozen set will not be modified. * @stable ICU 2.4 */ virtual UnicodeSet& compact(); @@ -1198,28 +1475,41 @@ private: virtual UBool matchesIndexValue(uint8_t v) const; private: + friend class RBBIRuleScanner; + + //---------------------------------------------------------------- + // Implementation: Clone as thawed (see ICU4J Freezable) + //---------------------------------------------------------------- + + UnicodeSet(const UnicodeSet& o, UBool /* asThawed */); //---------------------------------------------------------------- // Implementation: Pattern parsing //---------------------------------------------------------------- + void applyPatternIgnoreSpace(const UnicodeString& pattern, + ParsePosition& pos, + const SymbolTable* symbols, + UErrorCode& status); + void applyPattern(RuleCharacterIterator& chars, const SymbolTable* symbols, UnicodeString& rebuiltPat, uint32_t options, + UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), UErrorCode& ec); //---------------------------------------------------------------- // Implementation: Utility methods //---------------------------------------------------------------- - void ensureCapacity(int32_t newLen); + void ensureCapacity(int32_t newLen, UErrorCode& ec); - void ensureBufferCapacity(int32_t newLen); + void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); 
void swapBuffers(void); - UBool allocateStrings(); + UBool allocateStrings(UErrorCode &status); UnicodeString& _toPattern(UnicodeString& result, UBool escapeUnprintable) const; @@ -1258,7 +1548,7 @@ private: * * The original design document is out of date, but still useful. * Ignore the property and value names: - * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html + * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/unicodeset_properties.html * * Recognized syntax: * @@ -1266,8 +1556,8 @@ private: * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P" * \\N{name} - white space not allowed within "\\N" * - * Other than the above restrictions, white space is ignored. Case - * is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading + * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored. + * Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading * and trailing space is deleted, and internal runs of whitespace * are collapsed to a single space. * @@ -1288,6 +1578,7 @@ private: * On return, the position after the last character parsed, that is, * the locations marked '%'. If the parse fails, ppos is returned * unchanged. + * @param ec status * @return a reference to this. */ UnicodeSet& applyPropertyPattern(const UnicodeString& pattern, @@ -1298,6 +1589,9 @@ private: UnicodeString& rebuiltPat, UErrorCode& ec); + friend void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status); + static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); + /** * A filter that returns TRUE if the given code point should be * included in the UnicodeSet being constructed. @@ -1319,31 +1613,27 @@ private: UErrorCode &status); /** - * Return a cached copy of the inclusions list for the property source. + * Set the new pattern to cache. 
*/ - static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode); + void setPattern(const UnicodeString& newPat); + /** + * Release existing cached pattern. + */ + void releasePattern(); friend class UnicodeSetIterator; +}; - //---------------------------------------------------------------- - // Implementation: closeOver - //---------------------------------------------------------------- - - void caseCloseOne(const UnicodeString& folded); - - void caseCloseOne(const CaseEquivClass& c); - - void caseCloseOne(UChar folded); - - static const CaseEquivClass* getCaseMapOf(const UnicodeString& folded); - static const CaseEquivClass* getCaseMapOf(UChar folded); -}; inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const { return !operator==(o); } +inline UBool UnicodeSet::isFrozen() const { + return (UBool)(bmpSet!=NULL || stringSpan!=NULL); +} + inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const { return !containsNone(start, end); } @@ -1356,6 +1646,46 @@ inline UBool UnicodeSet::containsSome(const UnicodeString& s) const { return !containsNone(s); } +inline UBool UnicodeSet::isBogus() const { + return (UBool)(fFlags & kIsBogus); +} + +inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) { + return reinterpret_cast<UnicodeSet *>(uset); +} + +inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) { + return reinterpret_cast<const UnicodeSet *>(uset); +} + +inline USet *UnicodeSet::toUSet() { + return reinterpret_cast<USet *>(this); +} + +inline const USet *UnicodeSet::toUSet() const { + return reinterpret_cast<const USet *>(this); +} + +inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const { + int32_t sLength=s.length(); + if(start<0) { + start=0; + } else if(start>sLength) { + start=sLength; + } + return start+span(s.getBuffer()+start, sLength-start, spanCondition); +} + +inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const { + int32_t sLength=s.length(); +
if(limit<0) { + limit=0; + } else if(limit>sLength) { + limit=sLength; + } + return spanBack(s.getBuffer(), limit, spanCondition); +} + U_NAMESPACE_END #endif