ICU-531.30.tar.gz

[apple/icu.git] / icuSources / common / unicode / uniset.h
diff --git a/icuSources/common/unicode/uniset.h b/icuSources/common/unicode/uniset.h

index 80762e95b25884a37428c6b8bd1c8973a6b895e0..fa7cc7ca1c2cdc884884bc2c02cdacae475f7fe8 100644 (file)
--- a/icuSources/common/unicode/uniset.h
+++ b/icuSources/common/unicode/uniset.h
@@ -1,29 +1,38 @@
  /*
-**********************************************************************
-* Copyright (C) 1999-2003, International Business Machines Corporation and others. All Rights Reserved.
-**********************************************************************
+***************************************************************************
+* Copyright (C) 1999-2013, International Business Machines Corporation
+* and others. All Rights Reserved.
+***************************************************************************
  *   Date        Name        Description
  *   10/20/99    alan        Creation.
-**********************************************************************
+***************************************************************************
  */
  
  #ifndef UNICODESET_H
  #define UNICODESET_H
  
  #include "unicode/unifilt.h"
-#include "unicode/utypes.h"
  #include "unicode/unistr.h"
-#include "unicode/uchar.h"
  #include "unicode/uset.h"
  
+/**
+ * \file
+ * \brief C++ API: Unicode Set
+ */
+
  U_NAMESPACE_BEGIN
  
+// Forward Declarations.
+void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status); /**< @internal */
+
+class BMPSet;
  class ParsePosition;
+class RBBIRuleScanner;
  class SymbolTable;
+class UnicodeSetStringSpan;
  class UVector;
-class CaseEquivClass;
+class RuleCharacterIterator;
  
-    
  /**
   * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
   * represent <em>character classes</em> used in regular expressions.
@@ -68,7 +77,7 @@ class CaseEquivClass;
   * similar to that employed by version 8 regular expression character
   * classes.  Here are some simple examples:
   *
- * <blockquote>
+ * \htmlonly<blockquote>\endhtmlonly
   *   <table>
   *     <tr align="top">
   *       <td nowrap valign="top" align="left"><code>[]</code></td>
@@ -82,36 +91,36 @@ class CaseEquivClass;
   *     </tr>
   *     <tr>
   *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
- *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code 
+ *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
   *       point order</td>
   *     </tr>
   *     <tr>
- *       <td nowrap valign="top" align="left"><code>[\u4E01]</code></td>
+ *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
   *       <td valign="top">The character U+4E01</td>
   *     </tr>
   *     <tr>
   *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
- *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and 
+ *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
   *       &quot;ac&quot;</td>
   *     </tr>
   *     <tr>
- *       <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
+ *       <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
   *       <td valign="top">All characters in the general category Uppercase Letter</td>
   *     </tr>
   *   </table>
- * </blockquote>
- * 
+ * \htmlonly</blockquote>\endhtmlonly
+ *
   * Any character may be preceded by a backslash in order to remove any special
   * meaning.  White space characters, as defined by UCharacter.isWhitespace(), are
   * ignored, unless they are escaped.
   *
   * <p>Property patterns specify a set of characters having a certain
   * property as defined by the Unicode standard.  Both the POSIX-like
- * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
+ * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.  For a
   * complete list of supported property patterns, see the User's Guide
   * for UnicodeSet at
- * <a href="http://oss.software.ibm.com/icu/userguide/unicodeSet.html">
- * http://oss.software.ibm.com/icu/userguide/unicodeSet.html</a>.
+ * <a href="http://icu-project.org/userguide/unicodeSet.html">
+ * http://icu-project.org/userguide/unicodeSet.html</a>.
   * Actual determination of property data is defined by the underlying
   * Unicode database as implemented by UCharacter.
   *
@@ -119,7 +128,7 @@ class CaseEquivClass;
   * Unicode property sets.  When elements are concatenated, they
   * specify their union.  To complement a set, place a '^' immediately
   * after the opening '['.  Property patterns are inverted by modifying
- * their delimiters; "[:^foo]" and "\P{foo}".  In any other location,
+ * their delimiters; "[:^foo]" and "\\P{foo}".  In any other location,
   * '^' has no special meaning.
   *
  * <p>Ranges are indicated by placing a '-' between two
@@ -129,16 +138,16 @@ class CaseEquivClass;
   * right character it is a syntax error.  If a '-' occurs as the first
   * character after the opening '[' or '[^', or if it occurs as the
   * last character before the closing ']', then it is taken as a
- * literal.  Thus "[a\u005C-b]", "[-ab]", and "[ab-]" all indicate the same
+ * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
   * set of three characters, 'a', 'b', and '-'.
   *
   * <p>Sets may be intersected using the '&' operator or the asymmetric
   * set difference may be taken using the '-' operator, for example,
- * "[[:L:]&[\u005Cu0000-\u005Cu0FFF]]" indicates the set of all Unicode letters
+ * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
  * with values less than 4096.  Operators ('&' and '-') have equal
   * precedence and bind left-to-right.  Thus
- * "[[:L:]-[a-z]-[\u005Cu0100-\u005Cu01FF]]" is equivalent to
- * "[[[:L:]-[a-z]]-[\u005Cu0100-\u005Cu01FF]]".  This only really matters for
+ * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
+ * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
   * difference; intersection is commutative.
   *
   * <table>
@@ -155,11 +164,11 @@ class CaseEquivClass;
   * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
   * <td>The asymmetric difference of sets specified by <em>pat1</em> and
   * <em>pat2</em>
- * <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
+ * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
   * <td>The set of characters having the specified
   * Unicode property; in
   * this case, Unicode uppercase letters
- * <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
+ * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
   * <td>The set of characters <em>not</em> having the given
   * Unicode property
   * </table>
@@ -168,7 +177,7 @@ class CaseEquivClass;
   *
   * <p><b>Formal syntax</b></p>
   *
- * <blockquote>
+ * \htmlonly<blockquote>\endhtmlonly
   *   <table>
   *     <tr align="top">
   *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
@@ -200,7 +209,7 @@ class CaseEquivClass;
   *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
   *       <td valign="top"><em>any character that is not</em><code> special<br>
   *       | ('\' </code><em>any character</em><code>)<br>
- *       | ('\u' hex hex hex hex)<br>
+ *       | ('\\u' hex hex hex hex)<br>
   *       </code></td>
   *     </tr>
   *     <tr align="top">
@@ -250,7 +259,16 @@ class CaseEquivClass;
   *       </td>
   *     </tr>
   *   </table>
- * </blockquote>
+ * \htmlonly</blockquote>\endhtmlonly
+ * 
+ * <p>Note:
+ *  - Most UnicodeSet methods do not take a UErrorCode parameter because
+ *   there are usually very few opportunities for failure other than a shortage
+ *   of memory, error codes in low-level C++ string methods would be inconvenient,
+ *   and the error code as the last parameter (ICU convention) would prevent
+ *   the use of default parameter values.
+ *   Instead, such methods set the UnicodeSet into a "bogus" state
+ *   (see isBogus()) if an error occurs.
   *
   * @author Alan Liu
   * @stable ICU 2.0
@@ -259,11 +277,11 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter {
  
      int32_t len; // length of list used; 0 <= len <= capacity
      int32_t capacity; // capacity of list
-    int32_t bufferCapacity; // capacity of buffer
      UChar32* list; // MUST be terminated with HIGH
+    BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
      UChar32* buffer; // internal buffer, may be NULL
-
-    UVector* strings; // maintained in sorted order
+    int32_t bufferCapacity; // capacity of buffer
+    int32_t patLen;
  
      /**
       * The pattern representation of this set.  This may not be the
@@ -274,21 +292,60 @@ class U_COMMON_API UnicodeSet : public UnicodeFilter {
       * indicating that toPattern() must generate a pattern
       * representation from the inversion list.
       */
-    UnicodeString pat;
+    UChar *pat;
+    UVector* strings; // maintained in sorted order
+    UnicodeSetStringSpan *stringSpan;
  
+private:
+    enum { // constants
+        kIsBogus = 1       // This set is bogus (i.e. not valid)
+    };
+    uint8_t fFlags;         // Bit flag (see constants above)
  public:
-
      /**
-     * Minimum value that can be stored in a UnicodeSet.
-     * @draft ICU 2.4
+     * Determine if this object contains a valid set.
+     * A bogus set has no value. It is different from an empty set.
+     * It can be used to indicate that no set value is available.
+     *
+     * @return TRUE if the set is bogus/invalid, FALSE otherwise
+     * @see setToBogus()
+     * @stable ICU 4.0
       */
-    static const UChar32 MIN_VALUE;
-
+    inline UBool isBogus(void) const;
+    
      /**
-     * Maximum value that can be stored in a UnicodeSet.
-     * @draft ICU 2.4
+     * Make this UnicodeSet object invalid.
+     * The set will test TRUE with isBogus().
+     *
+     * A bogus set has no value. It is different from an empty set.
+     * It can be used to indicate that no set value is available.
+     *
+     * This utility function is used throughout the UnicodeSet
+     * implementation to indicate that a UnicodeSet operation failed,
+     * and may be used in other functions,
+     * especially but not exclusively when such functions do not
+     * take a UErrorCode for simplicity.
+     *
+     * @see isBogus()
+     * @stable ICU 4.0
       */
-    static const UChar32 MAX_VALUE;
+    void setToBogus();
+
+public:
+
+    enum {
+        /**
+         * Minimum value that can be stored in a UnicodeSet.
+         * @stable ICU 2.4
+         */
+        MIN_VALUE = 0,
+
+        /**
+         * Maximum value that can be stored in a UnicodeSet.
+         * @stable ICU 2.4
+         */
+        MAX_VALUE = 0x10ffff
+    };
  
      //----------------------------------------------------------------
      // Constructors &c
@@ -308,7 +365,7 @@ public:
       *
       * @param start first character, inclusive, of range
       * @param end last character, inclusive, of range
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet(UChar32 start, UChar32 end);
  
@@ -323,29 +380,42 @@ public:
      UnicodeSet(const UnicodeString& pattern,
                 UErrorCode& status);
  
+#ifndef U_HIDE_INTERNAL_API
      /**
       * Constructs a set from the given pattern.  See the class
       * description for the syntax of the pattern language.
       * @param pattern a string specifying what characters are in the set
       * @param options bitmask for options to apply to the pattern.
       * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * @param symbols a symbol table mapping variable names to values
+     * and stand-in characters to UnicodeSets; may be NULL
       * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
       * contains a syntax error.
       * @internal
       */
      UnicodeSet(const UnicodeString& pattern,
                 uint32_t options,
+               const SymbolTable* symbols,
                 UErrorCode& status);
+#endif  /* U_HIDE_INTERNAL_API */
  
-#ifdef U_USE_UNICODESET_DEPRECATES
      /**
-     * Obsolete: Constructs a set from the given Unicode character category.
-     * @param category an integer indicating the character category as
-     * defined in uchar.h.
-     * @obsolete ICU 2.6. Use a pattern with the category instead since this API will be removed in that release.
+     * Constructs a set from the given pattern.  See the class description
+     * for the syntax of the pattern language.
+     * @param pattern a string specifying what characters are in the set
+     * @param pos on input, the position in pattern at which to start parsing.
+     * On output, the position after the last character parsed.
+     * @param options bitmask for options to apply to the pattern.
+     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * @param symbols a symbol table mapping variable names to values
+     * and stand-in characters to UnicodeSets; may be NULL
+     * @param status input-output error code
+     * @stable ICU 2.8
       */
-    UnicodeSet(int8_t category, UErrorCode& status);
-#endif
+    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
+               uint32_t options,
+               const SymbolTable* symbols,
+               UErrorCode& status);
  
      /**
       * Constructs a set that is identical to the given UnicodeSet.
@@ -361,6 +431,7 @@ public:
  
      /**
       * Assigns this object to be a copy of another.
+     * A frozen set will not be modified.
       * @stable ICU 2.0
       */
      UnicodeSet& operator=(const UnicodeSet& o);
@@ -389,6 +460,9 @@ public:
       * Returns a copy of this object.  All UnicodeFunctor objects have
       * to support cloning in order to allow classes using
       * UnicodeFunctors, such as Transliterator, to implement cloning.
+     * If this set is frozen, then the clone will be frozen as well.
+     * Use cloneAsThawed() for a mutable clone of a frozen set.
+     * @see cloneAsThawed
       * @stable ICU 2.0
       */
      virtual UnicodeFunctor* clone() const;
@@ -402,6 +476,85 @@ public:
       */
      virtual int32_t hashCode(void) const;
  
+    /**
+     * Get a UnicodeSet pointer from a USet
+     *
+     * @param uset a USet (the ICU plain C type for UnicodeSet)
+     * @return the corresponding UnicodeSet pointer.
+     *
+     * @stable ICU 4.2
+     */
+    inline static UnicodeSet *fromUSet(USet *uset);
+
+    /**
+     * Get a UnicodeSet pointer from a const USet
+     *
+     * @param uset a const USet (the ICU plain C type for UnicodeSet)
+     * @return the corresponding UnicodeSet pointer.
+     *
+     * @stable ICU 4.2
+     */
+    inline static const UnicodeSet *fromUSet(const USet *uset);
+    
+    /**
+     * Produce a USet * pointer for this UnicodeSet.
+     * USet is the plain C type for UnicodeSet
+     *
+     * @return a USet pointer for this UnicodeSet
+     * @stable ICU 4.2
+     */
+    inline USet *toUSet();
+
+
+    /**
+     * Produce a const USet * pointer for this UnicodeSet.
+     * USet is the plain C type for UnicodeSet
+     *
+     * @return a const USet pointer for this UnicodeSet
+     * @stable ICU 4.2
+     */
+    inline const USet * toUSet() const;
+
+
+    //----------------------------------------------------------------
+    // Freezable API
+    //----------------------------------------------------------------
+
+    /**
+     * Determines whether the set has been frozen (made immutable) or not.
+     * See the ICU4J Freezable interface for details.
+     * @return TRUE/FALSE for whether the set has been frozen
+     * @see freeze
+     * @see cloneAsThawed
+     * @stable ICU 3.8
+     */
+    inline UBool isFrozen() const;
+
+    /**
+     * Freeze the set (make it immutable).
+     * Once frozen, it cannot be unfrozen and is therefore thread-safe
+     * until it is deleted.
+     * See the ICU4J Freezable interface for details.
+     * Freezing the set may also make some operations faster, for example
+     * contains() and span().
+     * A frozen set will not be modified. (It remains frozen.)
+     * @return this set.
+     * @see isFrozen
+     * @see cloneAsThawed
+     * @stable ICU 3.8
+     */
+    UnicodeFunctor *freeze();
+
+    /**
+     * Clone the set and make the clone mutable.
+     * See the ICU4J Freezable interface for details.
+     * @return the mutable clone
+     * @see freeze
+     * @see isFrozen
+     * @stable ICU 3.8
+     */
+    UnicodeFunctor *cloneAsThawed() const;
+
      //----------------------------------------------------------------
      // Public API
      //----------------------------------------------------------------
@@ -410,67 +563,118 @@ public:
       * Make this object represent the range <code>start - end</code>.
      * If <code>start > end</code> then this object is set to
      * an empty range.
+     * A frozen set will not be modified.
       *
       * @param start first character in the set, inclusive
       * @param end last character in the set, inclusive
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& set(UChar32 start, UChar32 end);
  
      /**
       * Return true if the given position, in the given pattern, appears
       * to be the start of a UnicodeSet pattern.
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      static UBool resemblesPattern(const UnicodeString& pattern,
                                    int32_t pos);
  
      /**
       * Modifies this set to represent the set specified by the given
-     * pattern, optionally ignoring white space.  See the class
-     * description for the syntax of the pattern language.
+     * pattern, ignoring Unicode Pattern_White_Space characters.
+     * See the class description for the syntax of the pattern language.
+     * A frozen set will not be modified.
       * @param pattern a string specifying what characters are in the set
       * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
       * contains a syntax error.
+     * <em> Empties the set passed before applying the pattern.</em>
+     * @return a reference to this
       * @stable ICU 2.0
       */
-    virtual UnicodeSet& applyPattern(const UnicodeString& pattern,
-                                     UErrorCode& status);
+    UnicodeSet& applyPattern(const UnicodeString& pattern,
+                             UErrorCode& status);
  
+#ifndef U_HIDE_INTERNAL_API
      /**
       * Modifies this set to represent the set specified by the given
-     * pattern, optionally ignoring white space.  See the class
-     * description for the syntax of the pattern language.
+     * pattern, optionally ignoring Unicode Pattern_White_Space characters.
+     * See the class description for the syntax of the pattern language.
+     * A frozen set will not be modified.
       * @param pattern a string specifying what characters are in the set
       * @param options bitmask for options to apply to the pattern.
       * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * @param symbols a symbol table mapping variable names to
+     * values and stand-ins to UnicodeSets; may be NULL
       * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
       * contains a syntax error.
+     *<em> Empties the set passed before applying the pattern.</em>
+     * @return a reference to this
       * @internal
       */
      UnicodeSet& applyPattern(const UnicodeString& pattern,
                               uint32_t options,
+                             const SymbolTable* symbols,
+                             UErrorCode& status);
+#endif  /* U_HIDE_INTERNAL_API */
+
+    /**
+     * Parses the given pattern, starting at the given position.  The
+     * character at pattern.charAt(pos.getIndex()) must be '[', or the
+     * parse fails.  Parsing continues until the corresponding closing
+     * ']'.  If a syntax error is encountered between the opening and
+     * closing brace, the parse fails.  Upon return from a successful
+     * parse, the ParsePosition is updated to point to the character
+     * following the closing ']', and a StringBuffer containing a
+     * pairs list for the parsed pattern is returned.  This method calls
+     * itself recursively to parse embedded subpatterns.
+     *<em> Empties the set passed before applying the pattern.</em>
+     * A frozen set will not be modified.
+     *
+     * @param pattern the string containing the pattern to be parsed.
+     * The portion of the string from pos.getIndex(), which must be a
+     * '[', to the corresponding closing ']', is parsed.
+     * @param pos upon entry, the position at which to begin parsing.
+     * The character at pattern.charAt(pos.getIndex()) must be a '['.
+     * Upon return from a successful parse, pos.getIndex() is either
+     * the character after the closing ']' of the parsed pattern, or
+     * pattern.length() if the closing ']' is the last character of
+     * the pattern string.
+     * @param options bitmask for options to apply to the pattern.
+     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
+     * @param symbols a symbol table mapping variable names to
+     * values and stand-ins to UnicodeSets; may be NULL
+     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
+     * contains a syntax error.
+     * @return a reference to this
+     * @stable ICU 2.8
+     */
+    UnicodeSet& applyPattern(const UnicodeString& pattern,
+                             ParsePosition& pos,
+                             uint32_t options,
+                             const SymbolTable* symbols,
                               UErrorCode& status);
  
      /**
       * Returns a string representation of this set.  If the result of
       * calling this function is passed to a UnicodeSet constructor, it
       * will produce another set that is equal to this one.
+     * A frozen set will not be modified.
       * @param result the string to receive the rules.  Previous
       * contents will be deleted.
       * @param escapeUnprintable if TRUE then convert unprintable
-     * character to their hex escape representations, \uxxxx or
-     * \Uxxxxxxxx.  Unprintable characters are those other than
+     * character to their hex escape representations, \\uxxxx or
+     * \\Uxxxxxxxx.  Unprintable characters are those other than
       * U+000A, U+0020..U+007E.
       * @stable ICU 2.0
       */
      virtual UnicodeString& toPattern(UnicodeString& result,
-                                     UBool escapeUnprintable = FALSE) const;
+                             UBool escapeUnprintable = FALSE) const;
  
      /**
       * Modifies this set to contain those code points which have the given value
       * for the given binary or enumerated property, as returned by
       * u_getIntPropertyValue.  Prior contents of this set are lost.
+     * A frozen set will not be modified.
       *
       * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
       * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
@@ -486,7 +690,7 @@ public:
       *
       * @return a reference to this set
       *
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& applyIntPropertyValue(UProperty prop,
                                        int32_t value,
@@ -496,6 +700,7 @@ public:
       * Modifies this set to contain those code points which have the
       * given value for the given property.  Prior contents of this
       * set are lost.
+     * A frozen set will not be modified.
       *
       * @param prop a property alias, either short or long.  The name is matched
       * loosely.  See PropertyAliases.txt for names and a description of loose
@@ -504,8 +709,9 @@ public:
       * property alias, or a special ID.  Special IDs are matched loosely and
       * correspond to the following sets:
       *
-     * "ANY" = [\u0000-\U0010FFFF],
-     * "ASCII" = [\u0000-\u007F].
+     * "ANY" = [\\u0000-\\U0010FFFF],
+     * "ASCII" = [\\u0000-\\u007F],
+     * "Assigned" = [:^Cn:].
       *
       * @param value a value alias, either short or long.  The name is matched
       * loosely.  See PropertyValueAliases.txt for names and a description of
@@ -517,15 +723,16 @@ public:
       *
       * @return a reference to this set
       *
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
                                     const UnicodeString& value,
                                     UErrorCode& ec);
  
      /**
-     * Returns the number of elements in this set (its cardinality),
-     * <em>n</em>, where <code>0 <= </code><em>n</em><code> <= 65536</code>.
+     * Returns the number of elements in this set (its cardinality).
+     * Note that the elements of a set may include both individual
+     * codepoints and strings.
       *
       * @return the number of elements in this set (its cardinality).
       * @stable ICU 2.0
@@ -542,12 +749,13 @@ public:
  
      /**
       * Returns true if this set contains the given character.
+     * This function works faster with a frozen set.
       * @param c character to be checked for containment
       * @return true if the test condition is met
       * @stable ICU 2.0
       */
      virtual UBool contains(UChar32 c) const;
-    
+
      /**
       * Returns true if this set contains every character
       * of the given range.
@@ -563,35 +771,35 @@ public:
       * multicharacter string.
       * @param s string to be checked for containment
       * @return <tt>true</tt> if this set contains the specified string
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UBool contains(const UnicodeString& s) const;
-    
+
      /**
       * Returns true if this set contains all the characters and strings
       * of the given set.
       * @param c set to be checked for containment
       * @return true if the test condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      virtual UBool containsAll(const UnicodeSet& c) const;
-    
+
      /**
       * Returns true if this set contains all the characters
       * of the given string.
       * @param s string containing characters to be checked for containment
       * @return true if the test condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UBool containsAll(const UnicodeString& s) const;
-    
+
      /**
       * Returns true if this set contains none of the characters
       * of the given range.
       * @param start first character, inclusive, of the range
       * @param end last character, inclusive, of the range
       * @return true if the test condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UBool containsNone(UChar32 start, UChar32 end) const;
  
@@ -600,57 +808,164 @@ public:
       * of the given set.
       * @param c set to be checked for containment
       * @return true if the test condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UBool containsNone(const UnicodeSet& c) const;
-    
+
      /**
       * Returns true if this set contains none of the characters
       * of the given string.
       * @param s string containing characters to be checked for containment
       * @return true if the test condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UBool containsNone(const UnicodeString& s) const;
-        
+
      /**
       * Returns true if this set contains one or more of the characters
       * in the given range.
       * @param start first character, inclusive, of the range
       * @param end last character, inclusive, of the range
       * @return true if the condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      inline UBool containsSome(UChar32 start, UChar32 end) const;
-        
+
      /**
       * Returns true if this set contains one or more of the characters
       * and strings of the given set.
       * @param s The set to be checked for containment
       * @return true if the condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      inline UBool containsSome(const UnicodeSet& s) const;
-        
+
      /**
       * Returns true if this set contains one or more of the characters
       * of the given string.
       * @param s string containing characters to be checked for containment
       * @return true if the condition is met
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      inline UBool containsSome(const UnicodeString& s) const;
-        
+
+    /**
+     * Returns the length of the initial substring of the input string which
+     * consists only of characters and strings that are contained in this set
+     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
+     * or only of characters and strings that are not contained
+     * in this set (USET_SPAN_NOT_CONTAINED).
+     * See USetSpanCondition for details.
+     * Similar to the strspn() C library function.
+     * Unpaired surrogates are treated according to contains() of their surrogate code points.
+     * This function works faster with a frozen set and with a non-negative string length argument.
+     * @param s start of the string
+     * @param length of the string; can be -1 for NUL-terminated
+     * @param spanCondition specifies the containment condition
+     * @return the length of the initial substring according to the spanCondition;
+     *         0 if the start of the string does not fit the spanCondition
+     * @stable ICU 3.8
+     * @see USetSpanCondition
+     */
+    int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
+
+    /**
+     * Returns the end of the substring of the input string according to the USetSpanCondition.
+     * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
+     * after pinning start to 0<=start<=s.length().
+     * @param s the string
+     * @param start the start index in the string for the span operation
+     * @param spanCondition specifies the containment condition
+     * @return the exclusive end of the substring according to the spanCondition;
+     *         the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
+     * @stable ICU 4.4
+     * @see USetSpanCondition
+     */
+    inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
+
+    /**
+     * Returns the start of the trailing substring of the input string which
+     * consists only of characters and strings that are contained in this set
+     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
+     * or only of characters and strings that are not contained
+     * in this set (USET_SPAN_NOT_CONTAINED).
+     * See USetSpanCondition for details.
+     * Unpaired surrogates are treated according to contains() of their surrogate code points.
+     * This function works faster with a frozen set and with a non-negative string length argument.
+     * @param s start of the string
+     * @param length of the string; can be -1 for NUL-terminated
+     * @param spanCondition specifies the containment condition
+     * @return the start of the trailing substring according to the spanCondition;
+     *         the string length if the end of the string does not fit the spanCondition
+     * @stable ICU 3.8
+     * @see USetSpanCondition
+     */
+    int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
+
+    /**
+     * Returns the start of the substring of the input string according to the USetSpanCondition.
+     * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
+     * after pinning limit to 0<=limit<=s.length().
+     * @param s the string
+     * @param limit the exclusive-end index in the string for the span operation
+     *              (use s.length() or INT32_MAX for spanning back from the end of the string)
+     * @param spanCondition specifies the containment condition
+     * @return the start of the substring according to the spanCondition;
+     *         the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
+     * @stable ICU 4.4
+     * @see USetSpanCondition
+     */
+    inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
+
+    /**
+     * Returns the length of the initial substring of the input string which
+     * consists only of characters and strings that are contained in this set
+     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
+     * or only of characters and strings that are not contained
+     * in this set (USET_SPAN_NOT_CONTAINED).
+     * See USetSpanCondition for details.
+     * Similar to the strspn() C library function.
+     * Malformed byte sequences are treated according to contains(0xfffd).
+     * This function works faster with a frozen set and with a non-negative string length argument.
+     * @param s start of the string (UTF-8)
+     * @param length of the string; can be -1 for NUL-terminated
+     * @param spanCondition specifies the containment condition
+     * @return the length of the initial substring according to the spanCondition;
+     *         0 if the start of the string does not fit the spanCondition
+     * @stable ICU 3.8
+     * @see USetSpanCondition
+     */
+    int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
+
+    /**
+     * Returns the start of the trailing substring of the input string which
+     * consists only of characters and strings that are contained in this set
+     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
+     * or only of characters and strings that are not contained
+     * in this set (USET_SPAN_NOT_CONTAINED).
+     * See USetSpanCondition for details.
+     * Malformed byte sequences are treated according to contains(0xfffd).
+     * This function works faster with a frozen set and with a non-negative string length argument.
+     * @param s start of the string (UTF-8)
+     * @param length of the string; can be -1 for NUL-terminated
+     * @param spanCondition specifies the containment condition
+     * @return the start of the trailing substring according to the spanCondition;
+     *         the string length if the end of the string does not fit the spanCondition
+     * @stable ICU 3.8
+     * @see USetSpanCondition
+     */
+    int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
+
      /**
       * Implement UnicodeMatcher::matches()
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
-    UMatchDegree matches(const Replaceable& text,
+    virtual UMatchDegree matches(const Replaceable& text,
                           int32_t& offset,
                           int32_t limit,
                           UBool incremental);
  
- private:    
+private:
      /**
       * Returns the longest match for s in text at the given position.
       * If limit > start then match forward from start+1 to limit
@@ -667,6 +982,7 @@ public:
       * @param limit the limit offset for matching, either last+1 in
       * the forward direction, or last-1 in the reverse direction,
       * where last is the index of the last character to match.
+     * @param s the string to be matched against text
       * @return If part of s matches up to the limit, return |limit -
       * start|.  If all of s matches before reaching the limit, return
       * s.length().  If there is a mismatch between s and text, return
@@ -675,7 +991,7 @@ public:
      static int32_t matchRest(const Replaceable& text,
                               int32_t start, int32_t limit,
                               const UnicodeString& s);
-    
+
      /**
       * Returns the smallest value i such that c < list[i].  Caller
       * must ensure that c is a legal value or this method will enter
@@ -687,16 +1003,16 @@ public:
       */
      int32_t findCodePoint(UChar32 c) const;
  
- public:
+public:
  
      /**
       * Implementation of UnicodeMatcher API.  Union the set of all
       * characters that may be matched by this object into the given
       * set.
       * @param toUnionTo the set into which to union the source characters
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
-    void addMatchSetTo(UnicodeSet& toUnionTo) const;
+    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
  
      /**
       * Returns the index of the given character within this set, where
@@ -704,7 +1020,7 @@ public:
       * is not in this set, return -1.  The inverse of this method is
       * <code>charAt()</code>.
       * @return an index from 0..size()-1, or -1
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      int32_t indexOf(UChar32 c) const;
  
@@ -715,7 +1031,7 @@ public:
       * <code>indexOf()</code>.
       * @param index an index from 0..size()-1
       * @return the character at the given index, or (UChar32)-1.
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UChar32 charAt(int32_t index) const;
  
@@ -725,6 +1041,7 @@ public:
       * the call leaves this set unchanged.  If <code>end > start</code>
       * then an empty range is added, leaving the set unchanged.
       * This is equivalent to a boolean logic OR, or a set UNION.
+     * A frozen set will not be modified.
       *
       * @param start first character, inclusive, of range to be added
       * to this set.
@@ -738,6 +1055,7 @@ public:
       * Adds the specified character to this set if it is not already
       * present.  If this set already contains the specified character,
       * the call leaves this set unchanged.
+     * A frozen set will not be modified.
       * @stable ICU 2.0
       */
      UnicodeSet& add(UChar32 c);
@@ -748,56 +1066,61 @@ public:
       * the call leaves this set unchanged.
       * Thus "ch" => {"ch"}
       * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+     * A frozen set will not be modified.
       * @param s the source string
       * @return this object, for chaining
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& add(const UnicodeString& s);
  
- private:    
+ private:
      /**
       * @return a code point IF the string consists of a single one.
       * otherwise returns -1.
-     * @param string to test
+     * @param s string to test
       */
      static int32_t getSingleCP(const UnicodeString& s);
  
      void _add(const UnicodeString& s);
-    
+
   public:
      /**
       * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
       * If this set already any particular character, it has no effect on that character.
+     * A frozen set will not be modified.
       * @param s the source string
       * @return this object, for chaining
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& addAll(const UnicodeString& s);
  
      /**
       * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
       * If this set already any particular character, it has no effect on that character.
+     * A frozen set will not be modified.
       * @param s the source string
       * @return this object, for chaining
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& retainAll(const UnicodeString& s);
  
      /**
       * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
       * If this set already any particular character, it has no effect on that character.
+     * A frozen set will not be modified.
       * @param s the source string
       * @return this object, for chaining
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& complementAll(const UnicodeString& s);
  
      /**
       * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
       * If this set already any particular character, it has no effect on that character.
+     * A frozen set will not be modified.
       * @param s the source string
       * @return this object, for chaining
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& removeAll(const UnicodeString& s);
  
@@ -807,25 +1130,26 @@ public:
       * @param s the source string
       * @return a newly created set containing the given string.
       * The caller owns the return object and is responsible for deleting it.
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
-    static UnicodeSet* createFrom(const UnicodeString& s);
+    static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
+
  
-    
      /**
       * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
       * @param s the source string
       * @return a newly created set containing the given characters
       * The caller owns the return object and is responsible for deleting it.
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
-    static UnicodeSet* createFromAll(const UnicodeString& s);
+    static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
  
      /**
       * Retain only the elements in this set that are contained in the
       * specified range.  If <code>end > start</code> then an empty range is
       * retained, leaving the set empty.  This is equivalent to
       * a boolean logic AND, or a set INTERSECTION.
+     * A frozen set will not be modified.
       *
       * @param start first character, inclusive, of range to be retained
       * to this set.
@@ -838,6 +1162,7 @@ public:
  
      /**
       * Retain the specified character from this set if it is present.
+     * A frozen set will not be modified.
       * @stable ICU 2.0
       */
      UnicodeSet& retain(UChar32 c);
@@ -847,6 +1172,7 @@ public:
       * The set will not contain the specified range once the call
       * returns.  If <code>end > start</code> then an empty range is
       * removed, leaving the set unchanged.
+     * A frozen set will not be modified.
       *
       * @param start first character, inclusive, of range to be removed
       * from this set.
@@ -860,6 +1186,7 @@ public:
       * Removes the specified character from this set if it is present.
       * The set will not contain the specified range once the call
       * returns.
+     * A frozen set will not be modified.
       * @stable ICU 2.0
       */
      UnicodeSet& remove(UChar32 c);
@@ -868,9 +1195,10 @@ public:
       * Removes the specified string from this set if it is present.
       * The set will not contain the specified character once the call
       * returns.
+     * A frozen set will not be modified.
       * @param s the source string
       * @return this object, for chaining
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& remove(const UnicodeString& s);
  
@@ -878,6 +1206,7 @@ public:
       * Inverts this set.  This operation modifies this set so that
       * its value is its complement.  This is equivalent to
       * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
+     * A frozen set will not be modified.
       * @stable ICU 2.0
       */
      virtual UnicodeSet& complement(void);
@@ -888,6 +1217,7 @@ public:
       * added if it is not in this set.  If <code>end > start</code>
       * then an empty range is complemented, leaving the set unchanged.
       * This is equivalent to a boolean logic XOR.
+     * A frozen set will not be modified.
       *
       * @param start first character, inclusive, of range to be removed
       * from this set.
@@ -901,6 +1231,7 @@ public:
       * Complements the specified character in this set.  The character
       * will be removed if it is in this set, or will be added if it is
       * not in this set.
+     * A frozen set will not be modified.
       * @stable ICU 2.0
       */
      UnicodeSet& complement(UChar32 c);
@@ -910,9 +1241,10 @@ public:
       * The set will not contain the specified string once the call
       * returns.
       * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+     * A frozen set will not be modified.
       * @param s the string to complement
       * @return this object, for chaining
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      UnicodeSet& complement(const UnicodeString& s);
  
@@ -922,9 +1254,10 @@ public:
       * modifies this set so that its value is the <i>union</i> of the two
       * sets.  The behavior of this operation is unspecified if the specified
       * collection is modified while the operation is in progress.
+     * A frozen set will not be modified.
       *
       * @param c set whose elements are to be added to this set.
-     * @see #add(char, char)
+     * @see #add(UChar32, UChar32)
       * @stable ICU 2.0
       */
      virtual UnicodeSet& addAll(const UnicodeSet& c);
@@ -935,6 +1268,7 @@ public:
       * its elements that are not contained in the specified set.  This
       * operation effectively modifies this set so that its value is
       * the <i>intersection</i> of the two sets.
+     * A frozen set will not be modified.
       *
       * @param c set that defines which elements this set will retain.
       * @stable ICU 2.0
@@ -946,6 +1280,7 @@ public:
       * specified set.  This operation effectively modifies this
       * set so that its value is the <i>asymmetric set difference</i> of
       * the two sets.
+     * A frozen set will not be modified.
       *
       * @param c set that defines which elements will be removed from
       *          this set.
@@ -957,16 +1292,18 @@ public:
       * Complements in this set all elements contained in the specified
       * set.  Any character in the other set will be removed if it is
       * in this set, or will be added if it is not in this set.
+     * A frozen set will not be modified.
       *
       * @param c set that defines which elements will be xor'ed from
       *          this set.
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      virtual UnicodeSet& complementAll(const UnicodeSet& c);
  
      /**
       * Removes all of the elements from this set.  This set will be
       * empty after this call returns.
+     * A frozen set will not be modified.
       * @stable ICU 2.0
       */
      virtual UnicodeSet& clear(void);
@@ -982,26 +1319,36 @@ public:
       * 2. For each string 'e' in the resulting set, if e !=
       * foldCase(e), 'e' will be removed.
       *
-     * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
+     * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
       *
       * (Here foldCase(x) refers to the operation u_strFoldCase, and a
       * == b denotes that the contents are the same, not pointer
       * comparison.)
       *
+     * A frozen set will not be modified.
+     *
       * @param attribute bitmask for attributes to close over.
       * Currently only the USET_CASE bit is supported.  Any undefined bits
       * are ignored.
       * @return a reference to this set.
-     * @internal
+     * @stable ICU 4.2
       */
      UnicodeSet& closeOver(int32_t attribute);
  
+    /**
+     * Remove all strings from this set.
+     *
+     * @return a reference to this set.
+     * @stable ICU 4.2
+     */
+    virtual UnicodeSet &removeAllStrings();
+
      /**
       * Iteration method that returns the number of ranges contained in
       * this set.
       * @see #getRangeStart
       * @see #getRangeEnd
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      virtual int32_t getRangeCount(void) const;
  
@@ -1010,7 +1357,7 @@ public:
       * specified range of this set.
       * @see #getRangeCount
       * @see #getRangeEnd
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      virtual UChar32 getRangeStart(int32_t index) const;
  
@@ -1019,7 +1366,7 @@ public:
       * specified range of this set.
       * @see #getRangeStart
       * @see #getRangeEnd
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      virtual UChar32 getRangeEnd(int32_t index) const;
  
@@ -1069,14 +1416,15 @@ public:
       * @return the total length of the serialized format, including
       * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
       * than U_BUFFER_OVERFLOW_ERROR.
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
  
      /**
       * Reallocate this objects internal structures to take up the least
       * possible space, without changing this object's value.
-     * @draft ICU 2.4
+     * A frozen set will not be modified.
+     * @stable ICU 2.4
       */
      virtual UnicodeSet& compact();
  
@@ -1091,7 +1439,7 @@ public:
       * @return          The class ID for all objects of this class.
       * @stable ICU 2.0
       */
-    static UClassID getStaticClassID(void);
+    static UClassID U_EXPORT2 getStaticClassID(void);
  
      /**
       * Implement UnicodeFunctor API.
@@ -1099,7 +1447,7 @@ public:
       * @return The class ID for this object. All objects of a given
       * class have the same class ID.  Objects of other classes have
       * different class IDs.
-     * @draft ICU 2.4
+     * @stable ICU 2.4
       */
      virtual UClassID getDynamicClassID(void) const;
  
@@ -1113,49 +1461,11 @@ private:
  
      const UnicodeString* getString(int32_t index) const;
  
-private:
-
-    static const char fgClassID;
-
      //----------------------------------------------------------------
      // RuleBasedTransliterator support
      //----------------------------------------------------------------
  
-    friend class TransliteratorParser;
-    friend class TransliteratorIDParser;
-
-    friend class RBBIRuleScanner;
-    friend class RegexCompile;
-
-    /**
-     * Constructs a set from the given pattern.  See the class description
-     * for the syntax of the pattern language.
-
-     * @param pattern a string specifying what characters are in the set
-     * @param pos on input, the position in pattern at which to start parsing.
-     * On output, the position after the last character parsed.
-     * @param varNameToChar a mapping from variable names (String) to characters
-     * (Character).  May be null.  If varCharToSet is non-null, then names may
-     * map to either single characters or sets, depending on whether a mapping
-     * exists in varCharToSet.  If varCharToSet is null then all names map to
-     * single characters.
-     * @param varCharToSet a mapping from characters (Character objects from
-     * varNameToChar) to UnicodeSet objects.  May be null.  Is only used if
-     * varNameToChar is also non-null.
-     * @exception <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
-     * contains a syntax error.
-     */
-    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
-               const SymbolTable& symbols,
-               UErrorCode& status);
-
-    /**
-     * Constructs a set from the given pattern.  Identical to the
-     * 4-parameter ParsePosition contstructor, but does not take a
-     * SymbolTable, and does not recognize embedded variables.
-     */
-    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
-               uint32_t options, UErrorCode& status);
+private:
  
      /**
       * Returns <tt>true</tt> if this set contains any character whose low byte
@@ -1165,59 +1475,41 @@ private:
      virtual UBool matchesIndexValue(uint8_t v) const;
  
  private:
+    friend class RBBIRuleScanner;
+
+    //----------------------------------------------------------------
+    // Implementation: Clone as thawed (see ICU4J Freezable)
+    //----------------------------------------------------------------
+
+    UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
  
      //----------------------------------------------------------------
      // Implementation: Pattern parsing
      //----------------------------------------------------------------
  
-    /**
-     * Parses the given pattern, starting at the given position.  The
-     * character at pattern.charAt(pos.getIndex()) must be '[', or the
-     * parse fails.  Parsing continues until the corresponding closing
-     * ']'.  If a syntax error is encountered between the opening and
-     * closing brace, the parse fails.  Upon return from a successful
-     * parse, the ParsePosition is updated to point to the character
-     * following the closing ']', and a StringBuffer containing a
-     * pairs list for the parsed pattern is returned.  This method calls
-     * itself recursively to parse embedded subpatterns.
-     *
-     * @param pattern the string containing the pattern to be parsed.
-     * The portion of the string from pos.getIndex(), which must be a
-     * '[', to the corresponding closing ']', is parsed.
-     * @param pos upon entry, the position at which to being parsing.
-     * The character at pattern.charAt(pos.getIndex()) must be a '['.
-     * Upon return from a successful parse, pos.getIndex() is either
-     * the character after the closing ']' of the parsed pattern, or
-     * pattern.length() if the closing ']' is the last character of
-     * the pattern string.
-     * @return a StringBuffer containing a pairs list for the parsed
-     * substring of <code>pattern</code>
-     * @exception U_ILLEGAL_ARGUMENT_ERROR if the parse fails.
-     */
-    void applyPattern(const UnicodeString& pattern,
-                      ParsePosition& pos,
-                      uint32_t options,
+    void applyPatternIgnoreSpace(const UnicodeString& pattern,
+                                 ParsePosition& pos,
+                                 const SymbolTable* symbols,
+                                 UErrorCode& status);
+
+    void applyPattern(RuleCharacterIterator& chars,
                        const SymbolTable* symbols,
-                      UErrorCode& status);
+                      UnicodeString& rebuiltPat,
+                      uint32_t options,
+                      UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
+                      UErrorCode& ec);
  
      //----------------------------------------------------------------
      // Implementation: Utility methods
      //----------------------------------------------------------------
  
-    void ensureCapacity(int32_t newLen);
+    void ensureCapacity(int32_t newLen, UErrorCode& ec);
  
-    void ensureBufferCapacity(int32_t newLen);
+    void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
  
      void swapBuffers(void);
  
-    UBool allocateStrings();
-
-    void _applyPattern(const UnicodeString& pattern,
-                       ParsePosition& pos,
-                       uint32_t options,
-                       const SymbolTable* symbols,
-                       UnicodeString& rebuiltPat,
-                       UErrorCode& status);
+    UBool allocateStrings(UErrorCode &status);
  
      UnicodeString& _toPattern(UnicodeString& result,
                                UBool escapeUnprintable) const;
@@ -1241,28 +1533,31 @@ private:
  
      /**
       * Return true if the given position, in the given pattern, appears
-     * to be the start of a property set pattern [:foo:], \p{foo}, or
-     * \P{foo}, or \N{name}.
+     * to be the start of a property set pattern [:foo:], \\p{foo}, or
+     * \\P{foo}, or \\N{name}.
       */
      static UBool resemblesPropertyPattern(const UnicodeString& pattern,
                                            int32_t pos);
  
+    static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
+                                          int32_t iterOpts);
+
      /**
       * Parse the given property pattern at the given parse position
       * and set this UnicodeSet to the result.
       *
       * The original design document is out of date, but still useful.
       * Ignore the property and value names:
-     * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
+     * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/unicodeset_properties.html
       *
       * Recognized syntax:
       *
       * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
-     * \p{foo} \P{foo}  - white space not allowed within "\p" or "\P"
-     * \N{name}         - white space not allowed within "\N"
+     * \\p{foo} \\P{foo}  - white space not allowed within "\\p" or "\\P"
+     * \\N{name}         - white space not allowed within "\\N"
       *
-     * Other than the above restrictions, white space is ignored.  Case
-     * is ignored except in "\p" and "\P" and "\N".  In 'name' leading
+     * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored.
+     * Case is ignored except in "\\p" and "\\P" and "\\N".  In 'name' leading
       * and trailing space is deleted, and internal runs of whitespace
       * are collapsed to a single space.
       *
@@ -1277,18 +1572,26 @@ private:
       * @param ppos on entry, the position at which to begin parsing.
       * This should be one of the locations marked '^':
       *
-     *   [:blah:]     \p{blah}     \P{blah}     \N{name}
+     *   [:blah:]     \\p{blah}     \\P{blah}     \\N{name}
       *   ^       %    ^       %    ^       %    ^       %
       *
       * On return, the position after the last character parsed, that is,
       * the locations marked '%'.  If the parse fails, ppos is returned
       * unchanged.
+     * @param ec status
       * @return a reference to this.
       */
      UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
                                       ParsePosition& ppos,
                                       UErrorCode &ec);
  
+    void applyPropertyPattern(RuleCharacterIterator& chars,
+                              UnicodeString& rebuiltPat,
+                              UErrorCode& ec);
+
+    friend void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
+    static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
+
      /**
       * A filter that returns TRUE if the given code point should be
       * included in the UnicodeSet being constructed.
@@ -1301,47 +1604,36 @@ private:
       * property-conformant.  That is, if it returns value v for one
       * code point, then it must return v for all affiliated code
       * points, as defined by the inclusions list.  See
-     * uprv_getInclusions().
+     * getInclusions().
+     * src is a UPropertySource value.
       */
      void applyFilter(Filter filter,
                       void* context,
+                     int32_t src,
                       UErrorCode &status);
  
      /**
-     * Return a cached copy of the inclusions list that
-     * uprv_getInclusions() produces.
+     * Set the new pattern to cache.
       */
-    static const UnicodeSet* getInclusions(UErrorCode &errorCode);
+    void setPattern(const UnicodeString& newPat);
+    /**
+     * Release existing cached pattern.
+     */
+    void releasePattern();
  
      friend class UnicodeSetIterator;
-
-    //----------------------------------------------------------------
-    // Implementation: closeOver
-    //----------------------------------------------------------------
-
-    void caseCloseOne(const UnicodeString& folded);
-
-    void caseCloseOne(const CaseEquivClass& c);
-
-    void caseCloseOne(UChar folded);
-
-    static const CaseEquivClass* getCaseMapOf(const UnicodeString& folded);
-
-    static const CaseEquivClass* getCaseMapOf(UChar folded);
  };
  
-inline UClassID
-UnicodeSet::getStaticClassID(void)
-{ return (UClassID)&fgClassID; }
  
-inline UClassID
-UnicodeSet::getDynamicClassID(void) const
-{ return UnicodeSet::getStaticClassID(); }
  
  inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
      return !operator==(o);
  }
  
+inline UBool UnicodeSet::isFrozen() const {  // frozen status is derived from the two pointers tested below
+    return (UBool)(bmpSet!=NULL || stringSpan!=NULL);  // TRUE when either bmpSet or stringSpan is non-NULL
+}
+
  inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
      return !containsNone(start, end);
  }
@@ -1354,6 +1646,46 @@ inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
      return !containsNone(s);
  }
  
+inline UBool UnicodeSet::isBogus() const {  // reports the bogus marker kept in fFlags
+    return (UBool)(fFlags & kIsBogus);  // masks fFlags with kIsBogus; nonzero result means the set is bogus
+}
+
+inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {  // view a USet pointer as a UnicodeSet pointer
+    return reinterpret_cast<UnicodeSet *>(uset);  // pointer reinterpretation only: nothing is created or copied
+}
+
+inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {  // const overload: view a const USet pointer as a const UnicodeSet pointer
+    return reinterpret_cast<const UnicodeSet *>(uset);  // pointer reinterpretation only; constness is preserved
+}
+
+inline USet *UnicodeSet::toUSet() {  // view this object as a USet pointer
+    return reinterpret_cast<USet *>(this);  // pointer reinterpretation only: the result refers to this same object
+}
+
+inline const USet *UnicodeSet::toUSet() const {  // const overload: view this object as a const USet pointer
+    return reinterpret_cast<const USet *>(this);  // pointer reinterpretation only; constness is preserved
+}
+
+inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {  // UnicodeString convenience wrapper around the buffer-based span()
+    int32_t sLength=s.length();
+    if(start<0) {  // clamp start into [0, sLength] so the pointer arithmetic below stays inside s
+        start=0;
+    } else if(start>sLength) {
+        start=sLength;
+    }
+    return start+span(s.getBuffer()+start, sLength-start, spanCondition);  // span the remainder from start, then add start back so the result is an index into s
+}
+
+inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {  // UnicodeString convenience wrapper around the buffer-based spanBack()
+    int32_t sLength=s.length();
+    if(limit<0) {  // clamp limit into [0, sLength]
+        limit=0;
+    } else if(limit>sLength) {
+        limit=sLength;
+    }
+    return spanBack(s.getBuffer(), limit, spanCondition);  // buffer starts at index 0 of s, so the result needs no offset; limit is passed in the length position (presumably bounding the text examined)
+}
+
  U_NAMESPACE_END
  
  #endif