icuSources/common/unicode/uniset.h

   1 /*
   2 ***************************************************************************
   3 * Copyright (C) 1999-2006, International Business Machines Corporation
   4 * and others. All Rights Reserved.
   5 ***************************************************************************
   6 *   Date        Name        Description
   7 *   10/20/99    alan        Creation.
   8 ***************************************************************************
   9 */
  10
  11 #ifndef UNICODESET_H
  12 #define UNICODESET_H
  13
  14 #include "unicode/unifilt.h"
  15 #include "unicode/unistr.h"
  16 #include "unicode/uset.h"
  17
  18 /**
  19  * \file
  20  * \brief C++ API: Unicode Set
  21  */
  22
  23 U_NAMESPACE_BEGIN
  24
  25 class ParsePosition;
  26 class SymbolTable;
  27 class UVector;
  28 class RuleCharacterIterator;
  29
  30 /**
  31  * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
  32  * represent <em>character classes</em> used in regular expressions.
  33  * A character class specifies a subset of Unicode code points.  Legal
  34  * code points are U+0000 to U+10FFFF, inclusive.
  35  *
  36  * <p>The UnicodeSet class is not designed to be subclassed.
  37  *
  38  * <p><code>UnicodeSet</code> supports two APIs. The first is the
  39  * <em>operand</em> API that allows the caller to modify the value of
  40  * a <code>UnicodeSet</code> object. It conforms to Java 2's
  41  * <code>java.util.Set</code> interface, although
  42  * <code>UnicodeSet</code> does not actually implement that
  43  * interface. All methods of <code>Set</code> are supported, with the
  44  * modification that they take a character range or single character
  45  * instead of an <code>Object</code>, and they take a
  46  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
  47  * operand API may be thought of in terms of boolean logic: a boolean
  48  * OR is implemented by <code>add</code>, a boolean AND is implemented
  49  * by <code>retain</code>, a boolean XOR is implemented by
  50  * <code>complement</code> taking an argument, and a boolean NOT is
  51  * implemented by <code>complement</code> with no argument.  In terms
  52  * of traditional set theory function names, <code>add</code> is a
  53  * union, <code>retain</code> is an intersection, <code>remove</code>
  54  * is an asymmetric difference, and <code>complement</code> with no
  55  * argument is a set complement with respect to the superset range
  56  * <code>MIN_VALUE-MAX_VALUE</code>.
  57  *
  58  * <p>The second API is the
  59  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
  60  * <code>java.text.Format</code>-derived classes.  Unlike the
  61  * methods that add characters, add categories, and control the logic
  62  * of the set, the method <code>applyPattern()</code> sets all
  63  * attributes of a <code>UnicodeSet</code> at once, based on a
  64  * string pattern.
  65  *
  66  * <p><b>Pattern syntax</b></p>
  67  *
  68  * Patterns are accepted by the constructors and the
  69  * <code>applyPattern()</code> methods and returned by the
  70  * <code>toPattern()</code> method.  These patterns follow a syntax
  71  * similar to that employed by version 8 regular expression character
  72  * classes.  Here are some simple examples:
  73  *
  74  * \htmlonly<blockquote>\endhtmlonly
  75  *   <table>
  76  *     <tr align="top">
  77  *       <td nowrap valign="top" align="left"><code>[]</code></td>
  78  *       <td valign="top">No characters</td>
  79  *     </tr><tr align="top">
  80  *       <td nowrap valign="top" align="left"><code>[a]</code></td>
  81  *       <td valign="top">The character 'a'</td>
  82  *     </tr><tr align="top">
  83  *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
  84  *       <td valign="top">The characters 'a' and 'e'</td>
  85  *     </tr>
  86  *     <tr>
  87  *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
  88  *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
  89  *       point order</td>
  90  *     </tr>
  91  *     <tr>
  92  *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
  93  *       <td valign="top">The character U+4E01</td>
  94  *     </tr>
  95  *     <tr>
  96  *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
  97  *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
  98  *       &quot;ac&quot;</td>
  99  *     </tr>
 100  *     <tr>
 101  *       <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
 102  *       <td valign="top">All characters in the general category Uppercase Letter</td>
 103  *     </tr>
 104  *   </table>
 105  * \htmlonly</blockquote>\endhtmlonly
 106  *
 107  * Any character may be preceded by a backslash in order to remove any special
 108  * meaning.  White space characters, as defined by UCharacter.isWhitespace(), are
 109  * ignored, unless they are escaped.
 110  *
 111  * <p>Property patterns specify a set of characters having a certain
 112  * property as defined by the Unicode standard.  Both the POSIX-like
 113  * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.  For a
 114  * complete list of supported property patterns, see the User's Guide
 115  * for UnicodeSet at
 116  * <a href="http://icu.sourceforge.net/userguide/unicodeSet.html">
 117  * http://icu.sourceforge.net/userguide/unicodeSet.html</a>.
 118  * Actual determination of property data is defined by the underlying
 119  * Unicode database as implemented by UCharacter.
 120  *
 121  * <p>Patterns specify individual characters, ranges of characters, and
 122  * Unicode property sets.  When elements are concatenated, they
 123  * specify their union.  To complement a set, place a '^' immediately
 124  * after the opening '['.  Property patterns are inverted by modifying
 125  * their delimiters; "[:^foo:]" and "\\P{foo}".  In any other location,
 126  * '^' has no special meaning.
 127  *
 128  * <p>Ranges are indicated by placing a '-' between two
 129  * characters, as in "a-z".  This specifies the range of all
 130  * characters from the left to the right, in Unicode order.  If the
 131  * left character is greater than or equal to the
 132  * right character it is a syntax error.  If a '-' occurs as the first
 133  * character after the opening '[' or '[^', or if it occurs as the
 134  * last character before the closing ']', then it is taken as a
 135  * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
 136  * set of three characters, 'a', 'b', and '-'.
 137  *
 138  * <p>Sets may be intersected using the '&' operator or the asymmetric
 139  * set difference may be taken using the '-' operator, for example,
 140  * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
 141  * with values less than 4096.  Operators ('&' and '-') have equal
 142  * precedence and bind left-to-right.  Thus
 143  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 144  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
 145  * difference; intersection is commutative.
 146  *
 147  * <table>
 148  * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
 149  * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
 150  * through 'z' and all letters in between, in Unicode order
 151  * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
 152  * all characters but 'a' through 'z',
 153  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 154  * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 155  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 156  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
 157  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
 158  * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
 159  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
 160  * <em>pat2</em>
 161  * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
 162  * <td>The set of characters having the specified
 163  * Unicode property; in
 164  * this case, Unicode uppercase letters
 165  * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
 166  * <td>The set of characters <em>not</em> having the given
 167  * Unicode property
 168  * </table>
 169  *
 170  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
 171  *
 172  * <p><b>Formal syntax</b></p>
 173  *
 174  * \htmlonly<blockquote>\endhtmlonly
 175  *   <table>
 176  *     <tr align="top">
 177  *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
 178  *       <td valign="top"><code>('[' '^'? item* ']') |
 179  *       property</code></td>
 180  *     </tr>
 181  *     <tr align="top">
 182  *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
 183  *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
 184  *       </code></td>
 185  *     </tr>
 186  *     <tr align="top">
 187  *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
 188  *       <td valign="top"><code>pattern | pattern-expr pattern |
 189  *       pattern-expr op pattern<br>
 190  *       </code></td>
 191  *     </tr>
 192  *     <tr align="top">
 193  *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
 194  *       <td valign="top"><code>'&amp;' | '-'<br>
 195  *       </code></td>
 196  *     </tr>
 197  *     <tr align="top">
 198  *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
 199  *       <td valign="top"><code>'[' | ']' | '-'<br>
 200  *       </code></td>
 201  *     </tr>
 202  *     <tr align="top">
 203  *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
 204  *       <td valign="top"><em>any character that is not</em><code> special<br>
 205  *       | ('\' </code><em>any character</em><code>)<br>
 206  *       | ('\\u' hex hex hex hex)<br>
 207  *       </code></td>
 208  *     </tr>
 209  *     <tr align="top">
 210  *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
 211  *       <td valign="top"><em>any character for which
 212  *       </em><code>Character.digit(c, 16)</code><em>
 213  *       returns a non-negative result</em></td>
 214  *     </tr>
 215  *     <tr>
 216  *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
 217  *       <td valign="top"><em>a Unicode property set pattern</em></td>
 218  *     </tr>
 219  *   </table>
 220  *   <br>
 221  *   <table border="1">
 222  *     <tr>
 223  *       <td>Legend: <table>
 224  *         <tr>
 225  *           <td nowrap valign="top"><code>a := b</code></td>
 226  *           <td width="20" valign="top">&nbsp; </td>
 227  *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
 228  *         </tr>
 229  *         <tr>
 230  *           <td nowrap valign="top"><code>a?</code></td>
 231  *           <td valign="top"></td>
 232  *           <td valign="top">zero or one instance of <code>a</code><br>
 233  *           </td>
 234  *         </tr>
 235  *         <tr>
 236  *           <td nowrap valign="top"><code>a*</code></td>
 237  *           <td valign="top"></td>
 238  *           <td valign="top">zero or more instances of <code>a</code><br>
 239  *           </td>
 240  *         </tr>
 241  *         <tr>
 242  *           <td nowrap valign="top"><code>a | b</code></td>
 243  *           <td valign="top"></td>
 244  *           <td valign="top">either <code>a</code> or <code>b</code><br>
 245  *           </td>
 246  *         </tr>
 247  *         <tr>
 248  *           <td nowrap valign="top"><code>'a'</code></td>
 249  *           <td valign="top"></td>
 250  *           <td valign="top">the literal string between the quotes </td>
 251  *         </tr>
 252  *       </table>
 253  *       </td>
 254  *     </tr>
 255  *   </table>
 256  * \htmlonly</blockquote>\endhtmlonly
 257  *
 258  * @author Alan Liu
 259  * @stable ICU 2.0
 260  */
 261 class U_COMMON_API UnicodeSet : public UnicodeFilter {
 262
 263     int32_t len; // length of list used; 0 <= len <= capacity
 264     int32_t capacity; // capacity of list
 265     int32_t bufferCapacity; // capacity of buffer
 266     UChar32* list; // MUST be terminated with HIGH
 267     UChar32* buffer; // internal buffer, may be NULL
 268
 269     UVector* strings; // maintained in sorted order
 270
 271     /**
 272      * The pattern representation of this set.  This may not be the
 273      * most economical pattern.  It is the pattern supplied to
 274      * applyPattern(), with variables substituted and whitespace
 275      * removed.  For sets constructed without applyPattern(), or
 276      * modified using the non-pattern API, this string will be empty,
 277      * indicating that toPattern() must generate a pattern
 278      * representation from the inversion list.
 279      */
 280     UnicodeString pat;
 281
 282 public:
 283
 284     enum {
 285         /**
 286          * Minimum value that can be stored in a UnicodeSet.
 287          * @stable ICU 2.4
 288          */
 289         MIN_VALUE = 0,
 290
 291         /**
 292          * Maximum value that can be stored in a UnicodeSet.
 293          * @stable ICU 2.4
 294          */
 295         MAX_VALUE = 0x10ffff
 296     };
 297
 298     //----------------------------------------------------------------
 299     // Constructors &c
 300     //----------------------------------------------------------------
 301
 302 public:
 303
 304     /**
 305      * Constructs an empty set.
 306      * @stable ICU 2.0
 307      */
 308     UnicodeSet();
 309
 310     /**
 311      * Constructs a set containing the given range. If <code>start >
 312      * end</code> then an empty set is created.
 313      *
 314      * @param start first character, inclusive, of range
 315      * @param end last character, inclusive, of range
 316      * @stable ICU 2.4
 317      */
 318     UnicodeSet(UChar32 start, UChar32 end);
 319
 320     /**
 321      * Constructs a set from the given pattern.  See the class
 322      * description for the syntax of the pattern language.
 323      * @param pattern a string specifying what characters are in the set
 324      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
 325      * contains a syntax error.
 326      * @stable ICU 2.0
 327      */
 328     UnicodeSet(const UnicodeString& pattern,
 329                UErrorCode& status);
 330
 331     /**
 332      * Constructs a set from the given pattern.  See the class
 333      * description for the syntax of the pattern language.
 334      * @param pattern a string specifying what characters are in the set
 335      * @param options bitmask for options to apply to the pattern.
 336      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 337      * @param symbols a symbol table mapping variable names to values
 338      * and stand-in characters to UnicodeSets; may be NULL
 339      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
 340      * contains a syntax error.
 341      * @internal
 342      */
 343     UnicodeSet(const UnicodeString& pattern,
 344                uint32_t options,
 345                const SymbolTable* symbols,
 346                UErrorCode& status);
 347
 348     /**
 349      * Constructs a set from the given pattern.  See the class description
 350      * for the syntax of the pattern language.
 351      * @param pattern a string specifying what characters are in the set
 352      * @param pos on input, the position in pattern at which to start parsing.
 353      * On output, the position after the last character parsed.
 354      * @param options bitmask for options to apply to the pattern.
 355      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 356      * @param symbols a symbol table mapping variable names to values
 357      * and stand-in characters to UnicodeSets; may be NULL
 358      * @param status input-output error code
 359      * @stable ICU 2.8
 360      */
 361     UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
 362                uint32_t options,
 363                const SymbolTable* symbols,
 364                UErrorCode& status);
 365
 366     /**
 367      * Constructs a set that is identical to the given UnicodeSet.
 368      * @stable ICU 2.0
 369      */
 370     UnicodeSet(const UnicodeSet& o);
 371
 372     /**
 373      * Destructs the set.
 374      * @stable ICU 2.0
 375      */
 376     virtual ~UnicodeSet();
 377
 378     /**
 379      * Assigns this object to be a copy of another.
 380      * @stable ICU 2.0
 381      */
 382     UnicodeSet& operator=(const UnicodeSet& o);
 383
 384     /**
 385      * Compares the specified object with this set for equality.  Returns
 386      * <tt>true</tt> if the two sets
 387      * have the same size, and every member of the specified set is
 388      * contained in this set (or equivalently, every member of this set is
 389      * contained in the specified set).
 390      *
 391      * @param o set to be compared for equality with this set.
 392      * @return <tt>true</tt> if the specified set is equal to this set.
 393      * @stable ICU 2.0
 394      */
 395     virtual UBool operator==(const UnicodeSet& o) const;
 396
 397     /**
 398      * Compares the specified object with this set for inequality.  Returns
 399      * <tt>true</tt> if the specified set is not equal to this set.
 400      * @stable ICU 2.0
 401      */
 402     UBool operator!=(const UnicodeSet& o) const;
 403
 404     /**
 405      * Returns a copy of this object.  All UnicodeFunctor objects have
 406      * to support cloning in order to allow classes using
 407      * UnicodeFunctors, such as Transliterator, to implement cloning.
 408      * @stable ICU 2.0
 409      */
 410     virtual UnicodeFunctor* clone() const;
 411
 412     /**
 413      * Returns the hash code value for this set.
 414      *
 415      * @return the hash code value for this set.
 416      * @see Object#hashCode()
 417      * @stable ICU 2.0
 418      */
 419     virtual int32_t hashCode(void) const;
 420
 421     //----------------------------------------------------------------
 422     // Public API
 423     //----------------------------------------------------------------
 424
 425     /**
 426      * Make this object represent the range <code>start - end</code>.
 427      * If <code>start > end</code> then this object is set to
 428      * an empty range.
 429      *
 430      * @param start first character in the set, inclusive
 431      * @param end last character in the set, inclusive
 432      * @stable ICU 2.4
 433      */
 434     UnicodeSet& set(UChar32 start, UChar32 end);
 435
 436     /**
 437      * Return true if the given position, in the given pattern, appears
 438      * to be the start of a UnicodeSet pattern.
 439      * @stable ICU 2.4
 440      */
 441     static UBool resemblesPattern(const UnicodeString& pattern,
 442                                   int32_t pos);
 443
 444     /**
 445      * Modifies this set to represent the set specified by the given
 446      * pattern, optionally ignoring white space.  See the class
 447      * description for the syntax of the pattern language.
 448      * @param pattern a string specifying what characters are in the set
 449      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
 450      * contains a syntax error.
 451      * <em> Empties the set passed before applying the pattern.</em>
 452      * @return a reference to this
 453      * @stable ICU 2.0
 454      */
 455     UnicodeSet& applyPattern(const UnicodeString& pattern,
 456                              UErrorCode& status);
 457
 458     /**
 459      * Modifies this set to represent the set specified by the given
 460      * pattern, optionally ignoring white space.  See the class
 461      * description for the syntax of the pattern language.
 462      * @param pattern a string specifying what characters are in the set
 463      * @param options bitmask for options to apply to the pattern.
 464      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 465      * @param symbols a symbol table mapping variable names to
 466      * values and stand-ins to UnicodeSets; may be NULL
 467      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
 468      * contains a syntax error.
 469      *<em> Empties the set passed before applying the pattern.</em>
 470      * @return a reference to this
 471      * @internal
 472      */
 473     UnicodeSet& applyPattern(const UnicodeString& pattern,
 474                              uint32_t options,
 475                              const SymbolTable* symbols,
 476                              UErrorCode& status);
 477
 478     /**
 479      * Parses the given pattern, starting at the given position.  The
 480      * character at pattern.charAt(pos.getIndex()) must be '[', or the
 481      * parse fails.  Parsing continues until the corresponding closing
 482      * ']'.  If a syntax error is encountered between the opening and
 483      * closing brace, the parse fails.  Upon return from a successful
 484      * parse, the ParsePosition is updated to point to the character
 485      * following the closing ']', and a reference to this set, which now
 486      * represents the parsed pattern, is returned.  This method calls
 487      * itself recursively to parse embedded subpatterns.
 488      *<em> Empties the set passed before applying the pattern.</em>
 489      *
 490      * @param pattern the string containing the pattern to be parsed.
 491      * The portion of the string from pos.getIndex(), which must be a
 492      * '[', to the corresponding closing ']', is parsed.
 493      * @param pos upon entry, the position at which to begin parsing.
 494      * The character at pattern.charAt(pos.getIndex()) must be a '['.
 495      * Upon return from a successful parse, pos.getIndex() is either
 496      * the character after the closing ']' of the parsed pattern, or
 497      * pattern.length() if the closing ']' is the last character of
 498      * the pattern string.
 499      * @param options bitmask for options to apply to the pattern.
 500      * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 501      * @param symbols a symbol table mapping variable names to
 502      * values and stand-ins to UnicodeSets; may be NULL
 503      * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
 504      * contains a syntax error.
 505      * @return a reference to this
 506      * @stable ICU 2.8
 507      */
 508     UnicodeSet& applyPattern(const UnicodeString& pattern,
 509                              ParsePosition& pos,
 510                              uint32_t options,
 511                              const SymbolTable* symbols,
 512                              UErrorCode& status);
 513
 514     /**
 515      * Returns a string representation of this set.  If the result of
 516      * calling this function is passed to a UnicodeSet constructor, it
 517      * will produce another set that is equal to this one.
 518      * @param result the string to receive the rules.  Previous
 519      * contents will be deleted.
 520      * @param escapeUnprintable if TRUE then convert unprintable
 521      * character to their hex escape representations, \\uxxxx or
 522      * \\Uxxxxxxxx.  Unprintable characters are those other than
 523      * U+000A, U+0020..U+007E.
 524      * @stable ICU 2.0
 525      */
 526     virtual UnicodeString& toPattern(UnicodeString& result,
 527                              UBool escapeUnprintable = FALSE) const;
 528
 529     /**
 530      * Modifies this set to contain those code points which have the given value
 531      * for the given binary or enumerated property, as returned by
 532      * u_getIntPropertyValue.  Prior contents of this set are lost.
 533      *
 534      * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
 535      * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
 536      * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
 537      *
 538      * @param value a value in the range u_getIntPropertyMinValue(prop)..
 539      * u_getIntPropertyMaxValue(prop), with one exception.  If prop is
 540      * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
 541      * rather a mask value produced by U_GET_GC_MASK().  This allows grouped
 542      * categories such as [:L:] to be represented.
 543      *
 544      * @param ec error code input/output parameter
 545      *
 546      * @return a reference to this set
 547      *
 548      * @stable ICU 2.4
 549      */
 550     UnicodeSet& applyIntPropertyValue(UProperty prop,
 551                                       int32_t value,
 552                                       UErrorCode& ec);
 553
 554     /**
 555      * Modifies this set to contain those code points which have the
 556      * given value for the given property.  Prior contents of this
 557      * set are lost.
 558      *
 559      * @param prop a property alias, either short or long.  The name is matched
 560      * loosely.  See PropertyAliases.txt for names and a description of loose
 561      * matching.  If the value string is empty, then this string is interpreted
 562      * as either a General_Category value alias, a Script value alias, a binary
 563      * property alias, or a special ID.  Special IDs are matched loosely and
 564      * correspond to the following sets:
 565      *
 566      * "ANY" = [\\u0000-\\U0010FFFF],
 567      * "ASCII" = [\\u0000-\\u007F],
 568      * "Assigned" = [:^Cn:].
 569      *
 570      * @param value a value alias, either short or long.  The name is matched
 571      * loosely.  See PropertyValueAliases.txt for names and a description of
 572      * loose matching.  In addition to aliases listed, numeric values and
 573      * canonical combining classes may be expressed numerically, e.g., ("nv",
 574      * "0.5") or ("ccc", "220").  The value string may also be empty.
 575      *
 576      * @param ec error code input/output parameter
 577      *
 578      * @return a reference to this set
 579      *
 580      * @stable ICU 2.4
 581      */
 582     UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
 583                                    const UnicodeString& value,
 584                                    UErrorCode& ec);
 585
 586     /**
 587      * Returns the number of elements in this set (its cardinality).
 588      * Note that the elements of a set may include both individual
 589      * codepoints and strings.
 590      *
 591      * @return the number of elements in this set (its cardinality).
 592      * @stable ICU 2.0
 593      */
 594     virtual int32_t size(void) const;
 595
 596     /**
 597      * Returns <tt>true</tt> if this set contains no elements.
 598      *
 599      * @return <tt>true</tt> if this set contains no elements.
 600      * @stable ICU 2.0
 601      */
 602     virtual UBool isEmpty(void) const;
 603
 604     /**
 605      * Returns true if this set contains the given character.
 606      * @param c character to be checked for containment
 607      * @return true if the test condition is met
 608      * @stable ICU 2.0
 609      */
 610     virtual UBool contains(UChar32 c) const;
 611
 612     /**
 613      * Returns true if this set contains every character
 614      * of the given range.
 615      * @param start first character, inclusive, of the range
 616      * @param end last character, inclusive, of the range
 617      * @return true if the test condition is met
 618      * @stable ICU 2.0
 619      */
 620     virtual UBool contains(UChar32 start, UChar32 end) const;
 621
 622     /**
 623      * Returns <tt>true</tt> if this set contains the given
 624      * multicharacter string.
 625      * @param s string to be checked for containment
 626      * @return <tt>true</tt> if this set contains the specified string
 627      * @stable ICU 2.4
 628      */
 629     UBool contains(const UnicodeString& s) const;
 630
 631     /**
 632      * Returns true if this set contains all the characters and strings
 633      * of the given set.
 634      * @param c set to be checked for containment
 635      * @return true if the test condition is met
 636      * @stable ICU 2.4
 637      */
 638     virtual UBool containsAll(const UnicodeSet& c) const;
 639
 640     /**
 641      * Returns true if this set contains all the characters
 642      * of the given string.
 643      * @param s string containing characters to be checked for containment
 644      * @return true if the test condition is met
 645      * @stable ICU 2.4
 646      */
 647     UBool containsAll(const UnicodeString& s) const;
 648
 649     /**
 650      * Returns true if this set contains none of the characters
 651      * of the given range.
 652      * @param start first character, inclusive, of the range
 653      * @param end last character, inclusive, of the range
 654      * @return true if the test condition is met
 655      * @stable ICU 2.4
 656      */
 657     UBool containsNone(UChar32 start, UChar32 end) const;
 658
 659     /**
 660      * Returns true if this set contains none of the characters and strings
 661      * of the given set.
 662      * @param c set to be checked for containment
 663      * @return true if the test condition is met
 664      * @stable ICU 2.4
 665      */
 666     UBool containsNone(const UnicodeSet& c) const;
 667
 668     /**
 669      * Returns true if this set contains none of the characters
 670      * of the given string.
 671      * @param s string containing characters to be checked for containment
 672      * @return true if the test condition is met
 673      * @stable ICU 2.4
 674      */
 675     UBool containsNone(const UnicodeString& s) const;
 676
 677     /**
 678      * Returns true if this set contains one or more of the characters
 679      * in the given range.
 680      * @param start first character, inclusive, of the range
 681      * @param end last character, inclusive, of the range
 682      * @return true if the condition is met
 683      * @stable ICU 2.4
 684      */
 685     inline UBool containsSome(UChar32 start, UChar32 end) const;
 686
 687     /**
 688      * Returns true if this set contains one or more of the characters
 689      * and strings of the given set.
 690      * @param s The set to be checked for containment
 691      * @return true if the condition is met
 692      * @stable ICU 2.4
 693      */
 694     inline UBool containsSome(const UnicodeSet& s) const;
 695
 696     /**
 697      * Returns true if this set contains one or more of the characters
 698      * of the given string.
 699      * @param s string containing characters to be checked for containment
 700      * @return true if the condition is met
 701      * @stable ICU 2.4
 702      */
 703     inline UBool containsSome(const UnicodeString& s) const;
 704
 705     /**
 706      * Implement UnicodeMatcher::matches()
 707      * @stable ICU 2.4
 708      */
 709     virtual UMatchDegree matches(const Replaceable& text,
 710                          int32_t& offset,
 711                          int32_t limit,
 712                          UBool incremental);
 713
 714 private:
 715     /**
 716      * Returns the longest match for s in text at the given position.
 717      * If limit > start then match forward from start+1 to limit
 718      * matching all characters except s.charAt(0).  If limit < start,
 719      * go backward starting from start-1 matching all characters
 720      * except s.charAt(s.length()-1).  This method assumes that the
 721      * first character, text.charAt(start), matches s, so it does not
 722      * check it.
 723      * @param text the text to match
 724      * @param start the first character to match.  In the forward
 725      * direction, text.charAt(start) is matched against s.charAt(0).
 726      * In the reverse direction, it is matched against
 727      * s.charAt(s.length()-1).
 728      * @param limit the limit offset for matching, either last+1 in
 729      * the forward direction, or last-1 in the reverse direction,
 730      * where last is the index of the last character to match.
 731      * @return If part of s matches up to the limit, return |limit -
 732      * start|.  If all of s matches before reaching the limit, return
 733      * s.length().  If there is a mismatch between s and text, return
 734      * 0
 735      */
 736     static int32_t matchRest(const Replaceable& text,
 737                              int32_t start, int32_t limit,
 738                              const UnicodeString& s);
 739
 740     /**
 741      * Returns the smallest value i such that c < list[i].  Caller
 742      * must ensure that c is a legal value or this method will enter
 743      * an infinite loop.  This method performs a binary search.
 744      * @param c a character in the range MIN_VALUE..MAX_VALUE
 745      * inclusive
 746      * @return the smallest integer i in the range 0..len-1,
 747      * inclusive, such that c < list[i]
 748      */
 749     int32_t findCodePoint(UChar32 c) const;
 750
 751 public:
 752
 753     /**
 754      * Implementation of UnicodeMatcher API.  Union the set of all
 755      * characters that may be matched by this object into the given
 756      * set.
 757      * @param toUnionTo the set into which to union the source characters
 758      * @stable ICU 2.4
 759      */
 760     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
 761
 762     /**
 763      * Returns the index of the given character within this set, where
 764      * the set is ordered by ascending code point.  If the character
 765      * is not in this set, return -1.  The inverse of this method is
 766      * <code>charAt()</code>.
 767      * @return an index from 0..size()-1, or -1
 768      * @stable ICU 2.4
 769      */
 770     int32_t indexOf(UChar32 c) const;
 771
 772     /**
 773      * Returns the character at the given index within this set, where
 774      * the set is ordered by ascending code point.  If the index is
 775      * out of range, return (UChar32)-1.  The inverse of this method is
 776      * <code>indexOf()</code>.
 777      * @param index an index from 0..size()-1
 778      * @return the character at the given index, or (UChar32)-1.
 779      * @stable ICU 2.4
 780      */
 781     UChar32 charAt(int32_t index) const;
 782
 783     /**
 784      * Adds the specified range to this set if it is not already
 785      * present.  If this set already contains the specified range,
 786      * the call leaves this set unchanged.  If <code>start > end</code>
 787      * then an empty range is added, leaving the set unchanged.
 788      * This is equivalent to a boolean logic OR, or a set UNION.
 789      *
 790      * @param start first character, inclusive, of range to be added
 791      * to this set.
 792      * @param end last character, inclusive, of range to be added
 793      * to this set.
 794      * @stable ICU 2.0
 795      */
 796     virtual UnicodeSet& add(UChar32 start, UChar32 end);
 797
 798     /**
 799      * Adds the specified character to this set if it is not already
 800      * present.  If this set already contains the specified character,
 801      * the call leaves this set unchanged.
 802      * @stable ICU 2.0
 803      */
 804     UnicodeSet& add(UChar32 c);
 805
 806     /**
 807      * Adds the specified multicharacter to this set if it is not already
 808      * present.  If this set already contains the multicharacter,
 809      * the call leaves this set unchanged.
 810      * Thus "ch" => {"ch"}
 811      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 812      * @param s the source string
 813      * @return this object, for chaining
 814      * @stable ICU 2.4
 815      */
 816     UnicodeSet& add(const UnicodeString& s);
 817
 818  private:
 819     /**
 820      * @return a code point IF the string consists of a single one;
 821      * otherwise returns -1.
 822      * @param s the string to test
 823      */
 824     static int32_t getSingleCP(const UnicodeString& s);
 825
 826     void _add(const UnicodeString& s);
 827
 828  public:
 829     /**
 830      * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
 831      * If this set already contains any particular character, it has no effect on that character.
 832      * @param s the source string
 833      * @return this object, for chaining
 834      * @stable ICU 2.4
 835      */
 836     UnicodeSet& addAll(const UnicodeString& s);
 837
 838     /**
 839      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
 840      * Characters of this set that do not occur in the string are removed.
 841      * @param s the source string
 842      * @return this object, for chaining
 843      * @stable ICU 2.4
 844      */
 845     UnicodeSet& retainAll(const UnicodeString& s);
 846
 847     /**
 848      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
 849      * Each character is removed if this set contains it, or added if it does not.
 850      * @param s the source string
 851      * @return this object, for chaining
 852      * @stable ICU 2.4
 853      */
 854     UnicodeSet& complementAll(const UnicodeString& s);
 855
 856     /**
 857      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
 858      * Characters of the string that this set does not contain have no effect.
 859      * @param s the source string
 860      * @return this object, for chaining
 861      * @stable ICU 2.4
 862      */
 863     UnicodeSet& removeAll(const UnicodeString& s);
 864
 865     /**
 866      * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
 867      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 868      * @param s the source string
 869      * @return a newly created set containing the given string.
 870      * The caller owns the return object and is responsible for deleting it.
 871      * @stable ICU 2.4
 872      */
 873     static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
 874
 875
 876     /**
 877      * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
 878      * @param s the source string
 879      * @return a newly created set containing the given characters
 880      * The caller owns the return object and is responsible for deleting it.
 881      * @stable ICU 2.4
 882      */
 883     static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
 884
 885     /**
 886      * Retain only the elements in this set that are contained in the
 887      * specified range.  If <code>start > end</code> then an empty range is
 888      * retained, leaving the set empty.  This is equivalent to
 889      * a boolean logic AND, or a set INTERSECTION.
 890      *
 891      * @param start first character, inclusive, of range to be retained
 892      * in this set.
 893      * @param end last character, inclusive, of range to be retained
 894      * in this set.
 895      * @stable ICU 2.0
 896      */
 897     virtual UnicodeSet& retain(UChar32 start, UChar32 end);
 898
 899
 900     /**
 901      * Retain the specified character from this set if it is present.
 902      * @stable ICU 2.0
 903      */
 904     UnicodeSet& retain(UChar32 c);
 905
 906     /**
 907      * Removes the specified range from this set if it is present.
 908      * The set will not contain the specified range once the call
 909      * returns.  If <code>start > end</code> then an empty range is
 910      * removed, leaving the set unchanged.
 911      *
 912      * @param start first character, inclusive, of range to be removed
 913      * from this set.
 914      * @param end last character, inclusive, of range to be removed
 915      * from this set.
 916      * @stable ICU 2.0
 917      */
 918     virtual UnicodeSet& remove(UChar32 start, UChar32 end);
 919
 920     /**
 921      * Removes the specified character from this set if it is present.
 922      * The set will not contain the specified character once the call
 923      * returns.
 924      * @stable ICU 2.0
 925      */
 926     UnicodeSet& remove(UChar32 c);
 927
 928     /**
 929      * Removes the specified string from this set if it is present.
 930      * The set will not contain the specified string once the call
 931      * returns.
 932      * @param s the source string
 933      * @return this object, for chaining
 934      * @stable ICU 2.4
 935      */
 936     UnicodeSet& remove(const UnicodeString& s);
 937
 938     /**
 939      * Inverts this set.  This operation modifies this set so that
 940      * its value is its complement.  This is equivalent to
 941      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
 942      * @stable ICU 2.0
 943      */
 944     virtual UnicodeSet& complement(void);
 945
 946     /**
 947      * Complements the specified range in this set.  Any character in
 948      * the range will be removed if it is in this set, or will be
 949      * added if it is not in this set.  If <code>start > end</code>
 950      * then an empty range is complemented, leaving the set unchanged.
 951      * This is equivalent to a boolean logic XOR.
 952      *
 953      * @param start first character, inclusive, of range to be complemented
 954      * in this set.
 955      * @param end last character, inclusive, of range to be complemented
 956      * in this set.
 957      * @stable ICU 2.0
 958      */
 959     virtual UnicodeSet& complement(UChar32 start, UChar32 end);
 960
 961     /**
 962      * Complements the specified character in this set.  The character
 963      * will be removed if it is in this set, or will be added if it is
 964      * not in this set.
 965      * @stable ICU 2.0
 966      */
 967     UnicodeSet& complement(UChar32 c);
 968
 969     /**
 970      * Complement the specified string in this set.
 971      * The string will be removed if it is in this set, or will be
 972      * added if it is not in this set.
 973      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
 974      * @param s the string to complement
 975      * @return this object, for chaining
 976      * @stable ICU 2.4
 977      */
 978     UnicodeSet& complement(const UnicodeString& s);
 979
 980     /**
 981      * Adds all of the elements in the specified set to this set if
 982      * they're not already present.  This operation effectively
 983      * modifies this set so that its value is the <i>union</i> of the two
 984      * sets.  The behavior of this operation is unspecified if the specified
 985      * collection is modified while the operation is in progress.
 986      *
 987      * @param c set whose elements are to be added to this set.
 988      * @see #add(UChar32, UChar32)
 989      * @stable ICU 2.0
 990      */
 991     virtual UnicodeSet& addAll(const UnicodeSet& c);
 992
 993     /**
 994      * Retains only the elements in this set that are contained in the
 995      * specified set.  In other words, removes from this set all of
 996      * its elements that are not contained in the specified set.  This
 997      * operation effectively modifies this set so that its value is
 998      * the <i>intersection</i> of the two sets.
 999      *
1000      * @param c set that defines which elements this set will retain.
1001      * @stable ICU 2.0
1002      */
1003     virtual UnicodeSet& retainAll(const UnicodeSet& c);
1004
1005     /**
1006      * Removes from this set all of its elements that are contained in the
1007      * specified set.  This operation effectively modifies this
1008      * set so that its value is the <i>asymmetric set difference</i> of
1009      * the two sets.
1010      *
1011      * @param c set that defines which elements will be removed from
1012      *          this set.
1013      * @stable ICU 2.0
1014      */
1015     virtual UnicodeSet& removeAll(const UnicodeSet& c);
1016
1017     /**
1018      * Complements in this set all elements contained in the specified
1019      * set.  Any character in the other set will be removed if it is
1020      * in this set, or will be added if it is not in this set.
1021      *
1022      * @param c set that defines which elements will be xor'ed from
1023      *          this set.
1024      * @stable ICU 2.4
1025      */
1026     virtual UnicodeSet& complementAll(const UnicodeSet& c);
1027
1028     /**
1029      * Removes all of the elements from this set.  This set will be
1030      * empty after this call returns.
1031      * @stable ICU 2.0
1032      */
1033     virtual UnicodeSet& clear(void);
1034
1035     /**
1036      * Close this set over the given attribute.  For the attribute
1037      * USET_CASE, the result is to modify this set so that:
1038      *
1039      * 1. For each character or string 'a' in this set, all strings or
1040      * characters 'b' such that foldCase(a) == foldCase(b) are added
1041      * to this set.
1042      *
1043      * 2. For each string 'e' in the resulting set, if e !=
1044      * foldCase(e), 'e' will be removed.
1045      *
1046      * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1047      *
1048      * (Here foldCase(x) refers to the operation u_strFoldCase, and a
1049      * == b denotes that the contents are the same, not pointer
1050      * comparison.)
1051      *
1052      * @param attribute bitmask for attributes to close over.
1053      * Currently only the USET_CASE bit is supported.  Any undefined bits
1054      * are ignored.
1055      * @return a reference to this set.
1056      * @internal
1057      */
1058     UnicodeSet& closeOver(int32_t attribute);
1059
1060     /**
1061      * Iteration method that returns the number of ranges contained in
1062      * this set.
1063      * @see #getRangeStart
1064      * @see #getRangeEnd
1065      * @stable ICU 2.4
1066      */
1067     virtual int32_t getRangeCount(void) const;
1068
1069     /**
1070      * Iteration method that returns the first character in the
1071      * specified range of this set.
1072      * @see #getRangeCount
1073      * @see #getRangeEnd
1074      * @stable ICU 2.4
1075      */
1076     virtual UChar32 getRangeStart(int32_t index) const;
1077
1078     /**
1079      * Iteration method that returns the last character in the
1080      * specified range of this set.
1081      * @see #getRangeCount
1082      * @see #getRangeStart
1083      * @stable ICU 2.4
1084      */
1085     virtual UChar32 getRangeEnd(int32_t index) const;
1086
1087     /**
1088      * Serializes this set into an array of 16-bit integers.  Serialization
1089      * (currently) only records the characters in the set; multicharacter
1090      * strings are ignored.
1091      *
1092      * The array has following format (each line is one 16-bit
1093      * integer):
1094      *
1095      *  length     = (n+2*m) | (m!=0?0x8000:0)
1096      *  bmpLength  = n; present if m!=0
1097      *  bmp[0]
1098      *  bmp[1]
1099      *  ...
1100      *  bmp[n-1]
1101      *  supp-high[0]
1102      *  supp-low[0]
1103      *  supp-high[1]
1104      *  supp-low[1]
1105      *  ...
1106      *  supp-high[m-1]
1107      *  supp-low[m-1]
1108      *
1109      * The array starts with a header.  After the header are n bmp
1110      * code points, then m supplementary code points.  Either n or m
1111      * or both may be zero.  n+2*m is always <= 0x7FFF.
1112      *
1113      * If there are no supplementary characters (if m==0) then the
1114      * header is one 16-bit integer, 'length', with value n.
1115      *
1116      * If there are supplementary characters (if m!=0) then the header
1117      * is two 16-bit integers.  The first, 'length', has value
1118      * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
1119      *
1120      * After the header the code points are stored in ascending order.
1121      * Supplementary code points are stored as most significant 16
1122      * bits followed by least significant 16 bits.
1123      *
1124      * @param dest pointer to buffer of destCapacity 16-bit integers.
1125      * May be NULL only if destCapacity is zero.
1126      * @param destCapacity size of dest, or zero.  Must not be negative.
1127      * @param ec error code.  Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1128      * if n+2*m > 0x7FFF.  Will be set to U_BUFFER_OVERFLOW_ERROR if
1129      * n+2*m+(m!=0?2:1) > destCapacity.
1130      * @return the total length of the serialized format, including
1131      * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1132      * than U_BUFFER_OVERFLOW_ERROR.
1133      * @stable ICU 2.4
1134      */
1135     int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1136
1137     /**
1138      * Reallocate this object's internal structures to take up the least
1139      * possible space, without changing this object's value.
1140      * @stable ICU 2.4
1141      */
1142     virtual UnicodeSet& compact();
1143
1144     /**
1145      * Return the class ID for this class.  This is useful only for
1146      * comparing to a return value from getDynamicClassID().  For example:
1147      * <pre>
1148      * .      Base* polymorphic_pointer = createPolymorphicObject();
1149      * .      if (polymorphic_pointer->getDynamicClassID() ==
1150      * .          Derived::getStaticClassID()) ...
1151      * </pre>
1152      * @return          The class ID for all objects of this class.
1153      * @stable ICU 2.0
1154      */
1155     static UClassID U_EXPORT2 getStaticClassID(void);
1156
1157     /**
1158      * Implement UnicodeFunctor API.
1159      *
1160      * @return The class ID for this object. All objects of a given
1161      * class have the same class ID.  Objects of other classes have
1162      * different class IDs.
1163      * @stable ICU 2.4
1164      */
1165     virtual UClassID getDynamicClassID(void) const;
1166
1167 private:
1168
1169     // Private API for the USet API
1170
1171     friend class USetAccess;
1172
1173     int32_t getStringCount() const;
1174
1175     const UnicodeString* getString(int32_t index) const;
1176
1177     //----------------------------------------------------------------
1178     // RuleBasedTransliterator support
1179     //----------------------------------------------------------------
1180
1181 private:
1182
1183     /**
1184      * Returns <tt>true</tt> if this set contains any character whose low byte
1185      * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
1186      * indexing.
1187      */
1188     virtual UBool matchesIndexValue(uint8_t v) const;
1189
1190 private:
1191
1192     //----------------------------------------------------------------
1193     // Implementation: Pattern parsing
1194     //----------------------------------------------------------------
1195
1196     void applyPattern(RuleCharacterIterator& chars,
1197                       const SymbolTable* symbols,
1198                       UnicodeString& rebuiltPat,
1199                       uint32_t options,
1200                       UErrorCode& ec);
1201
1202     //----------------------------------------------------------------
1203     // Implementation: Utility methods
1204     //----------------------------------------------------------------
1205
1206     void ensureCapacity(int32_t newLen);
1207
1208     void ensureBufferCapacity(int32_t newLen);
1209
1210     void swapBuffers(void);
1211
1212     UBool allocateStrings();
1213
1214     UnicodeString& _toPattern(UnicodeString& result,
1215                               UBool escapeUnprintable) const;
1216
1217     UnicodeString& _generatePattern(UnicodeString& result,
1218                                     UBool escapeUnprintable) const;
1219
1220     static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1221
1222     static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1223
1224     //----------------------------------------------------------------
1225     // Implementation: Fundamental operators
1226     //----------------------------------------------------------------
1227
1228     void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1229
1230     void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1231
1232     void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1233
1234     /**
1235      * Return true if the given position, in the given pattern, appears
1236      * to be the start of a property set pattern [:foo:], \\p{foo}, or
1237      * \\P{foo}, or \\N{name}.
1238      */
1239     static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1240                                           int32_t pos);
1241
1242     static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1243                                           int32_t iterOpts);
1244
1245     /**
1246      * Parse the given property pattern at the given parse position
1247      * and set this UnicodeSet to the result.
1248      *
1249      * The original design document is out of date, but still useful.
1250      * Ignore the property and value names:
1251      * http://dev.icu-project.org/cgi-bin/viewcvs.cgi/~checkout~/icuhtml/design/unicodeset_properties.html
1252      *
1253      * Recognized syntax:
1254      *
1255      * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1256      * \\p{foo} \\P{foo}  - white space not allowed within "\\p" or "\\P"
1257      * \\N{name}         - white space not allowed within "\\N"
1258      *
1259      * Other than the above restrictions, white space is ignored.  Case
1260      * is ignored except in "\\p" and "\\P" and "\\N".  In 'name' leading
1261      * and trailing space is deleted, and internal runs of whitespace
1262      * are collapsed to a single space.
1263      *
1264      * We support binary properties, enumerated properties, and the
1265      * following non-enumerated properties:
1266      *
1267      *  Numeric_Value
1268      *  Name
1269      *  Unicode_1_Name
1270      *
1271      * @param pattern the pattern string
1272      * @param ppos on entry, the position at which to begin parsing.
1273      * This should be one of the locations marked '^':
1274      *
1275      *   [:blah:]     \\p{blah}     \\P{blah}     \\N{name}
1276      *   ^       %    ^       %    ^       %    ^       %
1277      *
1278      * On return, the position after the last character parsed, that is,
1279      * the locations marked '%'.  If the parse fails, ppos is returned
1280      * unchanged.
1281      * @return a reference to this.
1282      */
1283     UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1284                                      ParsePosition& ppos,
1285                                      UErrorCode &ec);
1286
1287     void applyPropertyPattern(RuleCharacterIterator& chars,
1288                               UnicodeString& rebuiltPat,
1289                               UErrorCode& ec);
1290
1291     /**
1292      * A filter that returns TRUE if the given code point should be
1293      * included in the UnicodeSet being constructed.
1294      */
1295     typedef UBool (*Filter)(UChar32 codePoint, void* context);
1296
1297     /**
1298      * Given a filter, set this UnicodeSet to the code points
1299      * contained by that filter.  The filter MUST be
1300      * property-conformant.  That is, if it returns value v for one
1301      * code point, then it must return v for all affiliated code
1302      * points, as defined by the inclusions list.  See
1303      * getInclusions().
1304      * src is a UPropertySource value.
1305      */
1306     void applyFilter(Filter filter,
1307                      void* context,
1308                      int32_t src,
1309                      UErrorCode &status);
1310
1311     /**
1312      * Return a cached copy of the inclusions list for the property source.
1313      */
1314     static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
1315
1316     friend class UnicodeSetIterator;
1317 };
1318
1319 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1320     return !this->operator==(o);  // inequality is the negation of operator==
1321 }
1322
1323 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1324     return !this->containsNone(start, end);  // "some" is the negation of "none" for the range
1325 }
1326
1327 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1328     return !this->containsNone(s);  // "some" is the negation of "none" for the given set
1329 }
1330
1331 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1332     return !this->containsNone(s);  // "some" is the negation of "none" for the given string
1333 }
1334
1335 U_NAMESPACE_END
1336
1337 #endif