2 ******************************************************************************* 
   4 *   Copyright (C) 2002-2010, International Business Machines 
   5 *   Corporation and others.  All Rights Reserved. 
   7 ******************************************************************************* 
  10 *   tab size:   8 (not used) 
  13 *   created on: 2002mar07 
  14 *   created by: Markus W. Scherer 
  16 *   C version of UnicodeSet. 
  22  * \brief C API: Unicode Set 
  24  * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 
  30 #include "unicode/utypes.h" 
  31 #include "unicode/uchar.h" 
  32 #include "unicode/localpointer.h" 
  37  * A UnicodeSet.  Use the uset_* API to manipulate.  Create with 
  38  * uset_open*, and destroy with uset_close. 
  41 typedef struct USet USet
; 
  45  * Bitmask values to be passed to uset_openPatternOptions() or 
  46  * uset_applyPattern() taking an option parameter. 
  51      * Ignore white space within patterns unless quoted or escaped. 
  54     USET_IGNORE_SPACE 
= 1,   
  57      * Enable case insensitive matching.  E.g., "[ab]" with this flag 
  58      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will 
  59      * match all except 'a', 'A', 'b', and 'B'. This performs a full 
  60      * closure over case mappings, e.g. U+017F for s. 
  62      * The resulting set is a superset of the input for the code points but 
  63      * not for the strings. 
  64      * It performs a case mapping closure of the code points and adds 
  65      * full case folding strings for the code points, and reduces strings of 
  66      * the original set to their full case folding equivalents. 
  68      * This is designed for case-insensitive matches, for example 
  69      * in regular expressions. The full code point case closure allows checking of 
  70      * an input character directly against the closure set. 
  71      * Strings are matched by comparing the case-folded form from the closure 
  72      * set with an incremental case folding of the string in question. 
  74      * The closure set will also contain single code points if the original 
  75      * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 
  76      * This is not necessary (that is, redundant) for the above matching method 
  77      * but results in the same closure sets regardless of whether the original 
  78      * set contained the code point or a string. 
  82     USET_CASE_INSENSITIVE 
= 2,   
  85      * Enable case insensitive matching.  E.g., "[ab]" with this flag 
  86      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will 
  87      * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 
  88      * title-, and uppercase mappings as well as the case folding 
  89      * of each existing element in the set. 
  92     USET_ADD_CASE_MAPPINGS 
= 4, 
  95      * Enough for any single-code point set 
  98     USET_SERIALIZED_STATIC_ARRAY_CAPACITY
=8 
 102  * Argument values for whether span() and similar functions continue while 
 103  * the current character is contained vs. not contained in the set. 
 105  * The functionality is straightforward for sets with only single code points, 
 106  * without strings (which is the common case): 
 107  * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. 
 109  * - span() and spanBack() partition any string the same way when 
 110  *   alternating between span(USET_SPAN_NOT_CONTAINED) and 
 111  *   span(either "contained" condition). 
 112  * - Using a complemented (inverted) set and the opposite span conditions 
 113  *   yields the same results. 
 115  * When a set contains multi-code point strings, then these statements may not 
 116  * be true, depending on the strings in the set (for example, whether they 
 117  * overlap with each other) and the string that is processed. 
 118  * For a set with strings: 
 119  * - The complement of the set contains the opposite set of code points, 
 120  *   but the same set of strings. 
 121  *   Therefore, complementing both the set and the span conditions 
 122  *   may yield different results. 
 123  * - When starting spans at different positions in a string 
 124  *   (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 
 125  *   because a set string may start before the later position. 
 126  * - span(USET_SPAN_SIMPLE) may be shorter than 
 127  *   span(USET_SPAN_CONTAINED) because it will not recursively try 
 128  *   all possible paths. 
 129  *   For example, with a set which contains the three strings "xy", "xya" and "ax", 
 130  *   span("xyax", USET_SPAN_CONTAINED) will return 4 but 
 131  *   span("xyax", USET_SPAN_SIMPLE) will return 3. 
 132  *   span(USET_SPAN_SIMPLE) will never be longer than 
 133  *   span(USET_SPAN_CONTAINED). 
 134  * - With either "contained" condition, span() and spanBack() may partition 
 135  *   a string in different ways. 
 136  *   For example, with a set which contains the two strings "ab" and "ba", 
 137  *   and when processing the string "aba", 
 138  *   span() will yield contained/not-contained boundaries of { 0, 2, 3 } 
 139  *   while spanBack() will yield boundaries of { 0, 1, 3 }. 
 141  * Note: If it is important to get the same boundaries whether iterating forward 
 142  * or backward through a string, then either only span() should be used and 
 143  * the boundaries cached for backward operation, or an ICU BreakIterator could be used. 
 146  * Note: Unpaired surrogates are treated like surrogate code points. 
 147  * Similarly, set strings match only on code point boundaries, 
 148  * never in the middle of a surrogate pair. 
 149  * Illegal UTF-8 sequences are treated like U+FFFD. 
 150  * When processing UTF-8 strings, malformed set strings 
 151  * (strings with unpaired surrogates which cannot be converted to UTF-8) 
 156 typedef enum USetSpanCondition 
{ 
 158      * Continue a span() while there is no set element at the current position. 
 159      * Stops before the first set element (character or string). 
 160      * (For code points only, this is like while contains(current)==FALSE). 
 162      * When span() returns, the substring between where it started and the position 
 163      * it returned consists only of characters that are not in the set, 
 164      * and none of its strings overlap with the span. 
 168     USET_SPAN_NOT_CONTAINED 
= 0, 
 170      * Continue a span() while there is a set element at the current position. 
 171      * (For characters only, this is like while contains(current)==TRUE). 
 173      * When span() returns, the substring between where it started and the position 
 174      * it returned consists only of set elements (characters or strings) that are in the set. 
 176      * If a set contains strings, then the span will be the longest substring 
 177      * matching any of the possible concatenations of set elements (characters or strings). 
 178      * (There must be a single, non-overlapping concatenation of characters or strings.) 
 179      * This is equivalent to a POSIX regular expression for (OR of each set element)*. 
 183     USET_SPAN_CONTAINED 
= 1, 
 185      * Continue a span() while there is a set element at the current position. 
 186      * (For characters only, this is like while contains(current)==TRUE). 
 188      * When span() returns, the substring between where it started and the position 
 189      * it returned consists only of set elements (characters or strings) that are in the set. 
 191      * If a set only contains single characters, then this is the same 
 192      * as USET_SPAN_CONTAINED. 
 194      * If a set contains strings, then the span will be the longest substring 
 195      * with a match at each position with the longest single set element (character or string). 
 197      * Use this span condition together with other longest-match algorithms, 
 198      * such as ICU converters (ucnv_getUnicodeSet()). 
 202     USET_SPAN_SIMPLE 
= 2, 
 204      * One more than the last span condition. 
 207     USET_SPAN_CONDITION_COUNT
 
 211  * A serialized form of a Unicode set.  Limited manipulations are 
 212  * possible directly on a serialized set.  See below. 
 215 typedef struct USerializedSet 
{ 
 217      * The serialized Unicode Set. 
 220     const uint16_t *array
; 
 222      * The length of the array that contains BMP characters. 
 227      * The total length of the array. 
 232      * A small buffer for the array to reduce memory allocations. 
 235     uint16_t staticArray
[USET_SERIALIZED_STATIC_ARRAY_CAPACITY
]; 
 238 /********************************************************************* 
 240  *********************************************************************/ 
 243  * Create an empty USet object. 
 244  * Equivalent to uset_open(1, 0). 
 245  * @return a newly created USet.  The caller must call uset_close() on it when done. 
 249 U_STABLE USet
* U_EXPORT2
 
 253  * Creates a USet object that contains the range of characters 
 254  * start..end, inclusive.  If <code>start > end</code>  
 255  * then an empty set is created (same as using uset_openEmpty()). 
 256  * @param start first character of the range, inclusive 
 257  * @param end last character of the range, inclusive 
 258  * @return a newly created USet.  The caller must call uset_close() on it when done. 
 262 U_STABLE USet
* U_EXPORT2
 
 263 uset_open(UChar32 start
, UChar32 end
); 
 266  * Creates a set from the given pattern.  See the UnicodeSet class 
 267  * description for the syntax of the pattern language. 
 268  * @param pattern a string specifying what characters are in the set 
 269  * @param patternLength the length of the pattern, or -1 if NUL-terminated 
 271  * @param ec the error code 
 274 U_STABLE USet
* U_EXPORT2
 
 275 uset_openPattern(const UChar
* pattern
, int32_t patternLength
, 
 279  * Creates a set from the given pattern.  See the UnicodeSet class 
 280  * description for the syntax of the pattern language. 
 281  * @param pattern a string specifying what characters are in the set 
 282  * @param patternLength the length of the pattern, or -1 if NUL-terminated 
 284  * @param options bitmask for options to apply to the pattern. 
 285  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 
 286  * @param ec the error code 
 289 U_STABLE USet
* U_EXPORT2
 
 290 uset_openPatternOptions(const UChar
* pattern
, int32_t patternLength
, 
 295  * Disposes of the storage used by a USet object.  This function should 
 296  * be called exactly once for objects returned by uset_open(). 
 297  * @param set the object to dispose of 
 300 U_STABLE 
void U_EXPORT2
 
 301 uset_close(USet
* set
); 
 303 #if U_SHOW_CPLUSPLUS_API 
 308  * \class LocalUSetPointer 
 309  * "Smart pointer" class, closes a USet via uset_close(). 
 310  * For most methods see the LocalPointerBase base class. 
 312  * @see LocalPointerBase 
 316 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer
, USet
, uset_close
); 
 323  * Returns a copy of this object. 
 324  * If this set is frozen, then the clone will be frozen as well. 
 325  * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 
 326  * @param set the original set 
 327  * @return the newly allocated copy of the set 
 328  * @see uset_cloneAsThawed 
 331 U_STABLE USet 
* U_EXPORT2
 
 332 uset_clone(const USet 
*set
); 
 335  * Determines whether the set has been frozen (made immutable) or not. 
 336  * See the ICU4J Freezable interface for details. 
 338  * @return TRUE/FALSE for whether the set has been frozen 
 340  * @see uset_cloneAsThawed 
 343 U_STABLE UBool U_EXPORT2
 
 344 uset_isFrozen(const USet 
*set
); 
 347  * Freeze the set (make it immutable). 
 348  * Once frozen, it cannot be unfrozen and is therefore thread-safe 
 349  * until it is deleted. 
 350  * See the ICU4J Freezable interface for details. 
 351  * Freezing the set may also make some operations faster, for example 
 352  * uset_contains() and uset_span(). 
 353  * A frozen set will not be modified. (It remains frozen.) 
 355  * @return the same set, now frozen 
 357  * @see uset_cloneAsThawed 
 360 U_STABLE 
void U_EXPORT2
 
 361 uset_freeze(USet 
*set
); 
 364  * Clone the set and make the clone mutable. 
 365  * See the ICU4J Freezable interface for details. 
 367  * @return the mutable clone 
 373 U_STABLE USet 
* U_EXPORT2
 
 374 uset_cloneAsThawed(const USet 
*set
); 
 377  * Causes the USet object to represent the range <code>start - end</code>. 
 378  * If <code>start > end</code> then this USet is set to an empty range. 
 379  * A frozen set will not be modified. 
 380  * @param set the object to set to the given range 
 381  * @param start first character in the set, inclusive 
 382  * @param end last character in the set, inclusive 
 385 U_STABLE 
void U_EXPORT2
 
 387          UChar32 start
, UChar32 end
); 
 390  * Modifies the set to represent the set specified by the given 
 391  * pattern. See the UnicodeSet class description for the syntax of  
 392  * the pattern language. See also the User Guide chapter about UnicodeSet. 
 393  * <em>Empties the set passed before applying the pattern.</em> 
 394  * A frozen set will not be modified. 
 395  * @param set               The set to which the pattern is to be applied.  
 396  * @param pattern           A pointer to UChar string specifying what characters are in the set. 
 397  *                          The character at pattern[0] must be a '['. 
 398  * @param patternLength     The length of the UChar string. -1 if NUL terminated. 
 399  * @param options           A bitmask for options to apply to the pattern. 
 400  *                          Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 
 401  * @param status            Returns an error if the pattern cannot be parsed. 
 402  * @return                  Upon successful parse, the value is 
 403  *                          the index of the character after the closing ']'  
 404  *                          of the parsed pattern. 
 405  *                          If the status code indicates failure, then the return value  
 406  *                          is the index of the error in the source. 
 410 U_STABLE 
int32_t U_EXPORT2 
 
 411 uset_applyPattern(USet 
*set
, 
 412                   const UChar 
*pattern
, int32_t patternLength
, 
 417  * Modifies the set to contain those code points which have the given value 
 418  * for the given binary or enumerated property, as returned by 
 419  * u_getIntPropertyValue.  Prior contents of this set are lost. 
 420  * A frozen set will not be modified. 
 422  * @param set the object to contain the code points defined by the property 
 424  * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 
 425  * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 
 426  * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 
 428  * @param value a value in the range u_getIntPropertyMinValue(prop).. 
 429  * u_getIntPropertyMaxValue(prop), with one exception.  If prop is 
 430  * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 
 431  * rather a mask value produced by U_GET_GC_MASK().  This allows grouped 
 432  * categories such as [:L:] to be represented. 
 434  * @param ec error code input/output parameter 
 438 U_STABLE 
void U_EXPORT2
 
 439 uset_applyIntPropertyValue(USet
* set
, 
 440                            UProperty prop
, int32_t value
, UErrorCode
* ec
); 
 443  * Modifies the set to contain those code points which have the 
 444  * given value for the given property.  Prior contents of this set are lost. 
 446  * A frozen set will not be modified. 
 448  * @param set the object to contain the code points defined by the given 
 449  * property and value alias 
 451  * @param prop a string specifying a property alias, either short or long. 
 452  * The name is matched loosely.  See PropertyAliases.txt for names and a 
 453  * description of loose matching.  If the value string is empty, then this 
 454  * string is interpreted as either a General_Category value alias, a Script 
 455  * value alias, a binary property alias, or a special ID.  Special IDs are 
 456  * matched loosely and correspond to the following sets: 
 458  * "ANY" = [\\u0000-\\U0010FFFF], 
 459  * "ASCII" = [\\u0000-\\u007F], 
 460  * "Assigned" = [:^Cn:]. 
 462  * @param propLength the length of the prop, or -1 if NUL-terminated 
 464  * @param value a string specifying a value alias, either short or long. 
 465  * The name is matched loosely.  See PropertyValueAliases.txt for names 
 466  * and a description of loose matching.  In addition to aliases listed, 
 467  * numeric values and canonical combining classes may be expressed 
 468  * numerically, e.g., ("nv", "0.5") or ("ccc", "220").  The value string 
 471  * @param valueLength the length of the value, or -1 if NUL-terminated 
 473  * @param ec error code input/output parameter 
 477 U_STABLE 
void U_EXPORT2
 
 478 uset_applyPropertyAlias(USet
* set
, 
 479                         const UChar 
*prop
, int32_t propLength
, 
 480                         const UChar 
*value
, int32_t valueLength
, 
 484  * Return true if the given position, in the given pattern, appears 
 485  * to be the start of a UnicodeSet pattern. 
 487  * @param pattern a string specifying the pattern 
 488  * @param patternLength the length of the pattern, or -1 if NUL-terminated 
 489  * @param pos the given position 
 492 U_STABLE UBool U_EXPORT2
 
 493 uset_resemblesPattern(const UChar 
*pattern
, int32_t patternLength
, 
 497  * Returns a string representation of this set.  If the result of 
 498  * calling this function is passed to a uset_openPattern(), it 
 499  * will produce another set that is equal to this one. 
 501  * @param result the string to receive the rules, may be NULL 
 502  * @param resultCapacity the capacity of result, may be 0 if result is NULL 
 503  * @param escapeUnprintable if TRUE then convert unprintable 
 504  * characters to their hex escape representations, \\uxxxx or 
 505  * \\Uxxxxxxxx.  Unprintable characters are those other than 
 506  * U+000A, U+0020..U+007E. 
 507  * @param ec error code. 
 508  * @return length of string, possibly larger than resultCapacity 
 511 U_STABLE 
int32_t U_EXPORT2
 
 512 uset_toPattern(const USet
* set
, 
 513                UChar
* result
, int32_t resultCapacity
, 
 514                UBool escapeUnprintable
, 
 518  * Adds the given character to the given USet.  After this call, 
 519  * uset_contains(set, c) will return TRUE. 
 520  * A frozen set will not be modified. 
 521  * @param set the object to which to add the character 
 522  * @param c the character to add 
 525 U_STABLE 
void U_EXPORT2
 
 526 uset_add(USet
* set
, UChar32 c
); 
 529  * Adds all of the elements in the specified set to this set if 
 530  * they're not already present.  This operation effectively 
 531  * modifies this set so that its value is the <i>union</i> of the two 
 532  * sets.  The behavior of this operation is unspecified if the specified 
 533  * collection is modified while the operation is in progress. 
 534  * A frozen set will not be modified. 
 536  * @param set the object to which to add the set 
 537  * @param additionalSet the source set whose elements are to be added to this set. 
 540 U_STABLE 
void U_EXPORT2
 
 541 uset_addAll(USet
* set
, const USet 
*additionalSet
); 
 544  * Adds the given range of characters to the given USet.  After this call, 
 545  * uset_containsRange(set, start, end) will return TRUE. 
 546  * A frozen set will not be modified. 
 547  * @param set the object to which to add the range 
 548  * @param start the first character of the range to add, inclusive 
 549  * @param end the last character of the range to add, inclusive 
 552 U_STABLE 
void U_EXPORT2
 
 553 uset_addRange(USet
* set
, UChar32 start
, UChar32 end
); 
 556  * Adds the given string to the given USet.  After this call, 
 557  * uset_containsString(set, str, strLen) will return TRUE. 
 558  * A frozen set will not be modified. 
 559  * @param set the object to which to add the string 
 560  * @param str the string to add 
 561  * @param strLen the length of the string or -1 if null terminated. 
 564 U_STABLE 
void U_EXPORT2
 
 565 uset_addString(USet
* set
, const UChar
* str
, int32_t strLen
); 
 568  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 
 569  * If this set already contains any particular character, it has no effect on that character. 
 570  * A frozen set will not be modified. 
 571  * @param set the object to which to add the character 
 572  * @param str the source string 
 573  * @param strLen the length of the string or -1 if null terminated. 
 576 U_STABLE 
void U_EXPORT2
 
 577 uset_addAllCodePoints(USet
* set
, const UChar 
*str
, int32_t strLen
); 
 580  * Removes the given character from the given USet.  After this call, 
 581  * uset_contains(set, c) will return FALSE. 
 582  * A frozen set will not be modified. 
 583  * @param set the object from which to remove the character 
 584  * @param c the character to remove 
 587 U_STABLE 
void U_EXPORT2
 
 588 uset_remove(USet
* set
, UChar32 c
); 
 591  * Removes the given range of characters from the given USet.  After this call, 
 592  * uset_containsRange(set, start, end) will return FALSE. 
 593  * A frozen set will not be modified. 
 594  * @param set the object from which to remove the range 
 595  * @param start the first character of the range to remove, inclusive 
 596  * @param end the last character of the range to remove, inclusive 
 599 U_STABLE 
void U_EXPORT2
 
 600 uset_removeRange(USet
* set
, UChar32 start
, UChar32 end
); 
 603  * Removes the given string from the given USet.  After this call, 
 604  * uset_containsString(set, str, strLen) will return FALSE. 
 605  * A frozen set will not be modified. 
 606  * @param set the object from which to remove the string 
 607  * @param str the string to remove 
 608  * @param strLen the length of the string or -1 if null terminated. 
 611 U_STABLE 
void U_EXPORT2
 
 612 uset_removeString(USet
* set
, const UChar
* str
, int32_t strLen
); 
 615  * Removes from this set all of its elements that are contained in the 
 616  * specified set.  This operation effectively modifies this 
 617  * set so that its value is the <i>asymmetric set difference</i> of the two sets. 
 619  * A frozen set will not be modified. 
 620  * @param set the object from which the elements are to be removed 
 621  * @param removeSet the object that defines which elements will be 
 622  * removed from this set 
 625 U_STABLE 
void U_EXPORT2
 
 626 uset_removeAll(USet
* set
, const USet
* removeSet
); 
 629  * Retain only the elements in this set that are contained in the 
 630  * specified range.  If <code>start > end</code> then an empty range is 
 631  * retained, leaving the set empty.  This is equivalent to 
 632  * a boolean logic AND, or a set INTERSECTION. 
 633  * A frozen set will not be modified. 
 635  * @param set the object for which to retain only the specified range 
 636  * @param start first character, inclusive, of range to be retained 
 638  * @param end last character, inclusive, of range to be retained 
 642 U_STABLE 
void U_EXPORT2
 
 643 uset_retain(USet
* set
, UChar32 start
, UChar32 end
); 
 646  * Retains only the elements in this set that are contained in the 
 647  * specified set.  In other words, removes from this set all of 
 648  * its elements that are not contained in the specified set.  This 
 649  * operation effectively modifies this set so that its value is 
 650  * the <i>intersection</i> of the two sets. 
 651  * A frozen set will not be modified. 
 653  * @param set the object on which to perform the retain 
 654  * @param retain set that defines which elements this set will retain 
 657 U_STABLE 
void U_EXPORT2
 
 658 uset_retainAll(USet
* set
, const USet
* retain
); 
 661  * Reallocate this object's internal structures to take up the least 
 662  * possible space, without changing this object's value. 
 663  * A frozen set will not be modified. 
 665  * @param set the object on which to perform the compact 
 668 U_STABLE 
void U_EXPORT2
 
 669 uset_compact(USet
* set
); 
 672  * Inverts this set.  This operation modifies this set so that 
 673  * its value is its complement.  This operation does not affect 
 674  * the multicharacter strings, if any. 
 675  * A frozen set will not be modified. 
 679 U_STABLE 
void U_EXPORT2
 
 680 uset_complement(USet
* set
); 
 683  * Complements in this set all elements contained in the specified 
 684  * set.  Any character in the other set will be removed if it is 
 685  * in this set, or will be added if it is not in this set. 
 686  * A frozen set will not be modified. 
 688  * @param set the set with which to complement 
 689  * @param complement set that defines which elements will be xor'ed 
 693 U_STABLE 
void U_EXPORT2
 
 694 uset_complementAll(USet
* set
, const USet
* complement
); 
 697  * Removes all of the elements from this set.  This set will be 
 698  * empty after this call returns. 
 699  * A frozen set will not be modified. 
 703 U_STABLE 
void U_EXPORT2
 
 704 uset_clear(USet
* set
); 
 707  * Close this set over the given attribute.  For the attribute 
 708  * USET_CASE, the result is to modify this set so that: 
 710  * 1. For each character or string 'a' in this set, all strings or 
 711  * characters 'b' such that foldCase(a) == foldCase(b) are added 
 714  * 2. For each string 'e' in the resulting set, if e != 
 715  * foldCase(e), 'e' will be removed. 
 717  * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 
 719  * (Here foldCase(x) refers to the operation u_strFoldCase, and a 
 720  * == b denotes that the contents are the same, not pointer 
 723  * A frozen set will not be modified. 
 727  * @param attributes bitmask for attributes to close over. 
 728  * Currently only the USET_CASE bit is supported.  Any undefined bits 
 732 U_STABLE 
void U_EXPORT2
 
 733 uset_closeOver(USet
* set
, int32_t attributes
); 
 736  * Remove all strings from this set. 
 741 U_STABLE 
void U_EXPORT2
 
 742 uset_removeAllStrings(USet
* set
); 
 745  * Returns TRUE if the given USet contains no characters and no strings. 
 748  * @return true if set is empty 
 751 U_STABLE UBool U_EXPORT2
 
 752 uset_isEmpty(const USet
* set
); 
 755  * Returns TRUE if the given USet contains the given character. 
 756  * This function works faster with a frozen set. 
 758  * @param c The codepoint to check for within the set 
 759  * @return true if set contains c 
 762 U_STABLE UBool U_EXPORT2
 
 763 uset_contains(const USet
* set
, UChar32 c
); 
 766  * Returns TRUE if the given USet contains all characters c 
 767  * where start <= c && c <= end. 
 769  * @param start the first character of the range to test, inclusive 
 770  * @param end the last character of the range to test, inclusive 
 771  * @return TRUE if set contains the range 
 774 U_STABLE UBool U_EXPORT2
 
 775 uset_containsRange(const USet
* set
, UChar32 start
, UChar32 end
); 
 778  * Returns TRUE if the given USet contains the given string. 
 780  * @param str the string 
 781  * @param strLen the length of the string or -1 if null terminated. 
 782  * @return true if set contains str 
 785 U_STABLE UBool U_EXPORT2
 
 786 uset_containsString(const USet
* set
, const UChar
* str
, int32_t strLen
); 
 789  * Returns the index of the given character within this set, where 
 790  * the set is ordered by ascending code point.  If the character 
 791  * is not in this set, return -1.  The inverse of this method is 
 792  * <code>charAt()</code>. 
 794  * @param c the character to obtain the index for 
 795  * @return an index from 0..size()-1, or -1 
 798 U_STABLE 
int32_t U_EXPORT2
 
 799 uset_indexOf(const USet
* set
, UChar32 c
); 
 802  * Returns the character at the given index within this set, where 
 803  * the set is ordered by ascending code point.  If the index is 
 804  * out of range, return (UChar32)-1.  The inverse of this method is 
 805  * <code>indexOf()</code>. 
 807  * @param charIndex an index from 0..size()-1 to obtain the char for 
 808  * @return the character at the given index, or (UChar32)-1. 
 811 U_STABLE UChar32 U_EXPORT2
 
 812 uset_charAt(const USet
* set
, int32_t charIndex
); 
 815  * Returns the number of characters and strings contained in the given USet. 
 818  * @return a non-negative integer counting the characters and strings 
 822 U_STABLE 
int32_t U_EXPORT2
 
 823 uset_size(const USet
* set
); 
 826  * Returns the number of items in this set.  An item is either a range 
 827  * of characters or a single multicharacter string. 
 829  * @return a non-negative integer counting the character ranges 
 830  * and/or strings contained in set 
 833 U_STABLE 
int32_t U_EXPORT2
 
 834 uset_getItemCount(const USet
* set
); 
 837  * Returns an item of this set.  An item is either a range of 
 838  * characters or a single multicharacter string. 
 840  * @param itemIndex a non-negative integer in the range 0.. 
 841  * uset_getItemCount(set)-1 
 842  * @param start pointer to variable to receive first character 
 843  * in range, inclusive 
 844  * @param end pointer to variable to receive last character in range, 
 846  * @param str buffer to receive the string, may be NULL 
 847  * @param strCapacity capacity of str, or 0 if str is NULL 
 848  * @param ec error code 
 849  * @return the length of the string (>= 2), or 0 if the item is a 
 850  * range, in which case it is the range *start..*end, or -1 if 
 851  * itemIndex is out of range 
 854 U_STABLE 
int32_t U_EXPORT2
 
 855 uset_getItem(const USet
* set
, int32_t itemIndex
, 
 856              UChar32
* start
, UChar32
* end
, 
 857              UChar
* str
, int32_t strCapacity
, 
 861  * Returns true if set1 contains all the characters and strings 
 862  * of set2. It answers the question, 'Is set1 a superset of set2?' 
 863  * @param set1 set to be checked for containment 
 864  * @param set2 set to be checked for containment 
 865  * @return true if the test condition is met 
 868 U_STABLE UBool U_EXPORT2
 
 869 uset_containsAll(const USet
* set1
, const USet
* set2
); 
 872  * Returns true if this set contains all the characters 
 873  * of the given string. This does not check containment of grapheme 
 874  * clusters, like uset_containsString. 
 875  * @param set set of characters to be checked for containment 
 876  * @param str string containing codepoints to be checked for containment 
 877  * @param strLen the length of the string or -1 if null terminated. 
 878  * @return true if the test condition is met 
 881 U_STABLE UBool U_EXPORT2
 
 882 uset_containsAllCodePoints(const USet
* set
, const UChar 
*str
, int32_t strLen
); 
 885  * Returns true if set1 contains none of the characters and strings 
 886  * of set2. It answers the question, 'Is set1 a disjoint set of set2?' 
 887  * @param set1 set to be checked for containment 
 888  * @param set2 set to be checked for containment 
 889  * @return true if the test condition is met 
 892 U_STABLE UBool U_EXPORT2
 
 893 uset_containsNone(const USet
* set1
, const USet
* set2
); 
 896  * Returns true if set1 contains some of the characters and strings 
 897  * of set2. It answers the question, 'Do set1 and set2 have an intersection?' 
 898  * @param set1 set to be checked for containment 
 899  * @param set2 set to be checked for containment 
 900  * @return true if the test condition is met 
 903 U_STABLE UBool U_EXPORT2
 
 904 uset_containsSome(const USet
* set1
, const USet
* set2
); 
 907  * Returns the length of the initial substring of the input string which 
 908  * consists only of characters and strings that are contained in this set 
 909  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 
 910  * or only of characters and strings that are not contained 
 911  * in this set (USET_SPAN_NOT_CONTAINED). 
 912  * See USetSpanCondition for details. 
 913  * Similar to the strspn() C library function. 
 914  * Unpaired surrogates are treated according to contains() of their surrogate code points. 
 915  * This function works faster with a frozen set and with a non-negative string length argument. 
 917  * @param s start of the string 
 918  * @param length the length of the string; can be -1 for NUL-terminated 
 919  * @param spanCondition specifies the containment condition 
 920  * @return the length of the initial substring according to the spanCondition; 
 921  *         0 if the start of the string does not fit the spanCondition 
 923  * @see USetSpanCondition 
 /* Forward span over the UChar string s, similar to strspn(): the result is
  * the length of the leading substring that satisfies spanCondition, or 0
  * when the start of the string does not.  length may be -1 for a
  * NUL-terminated string. */
 925 U_STABLE 
int32_t U_EXPORT2
 
 926 uset_span(const USet 
*set
, const UChar 
*s
, int32_t length
, USetSpanCondition spanCondition
); 
 929  * Returns the start of the trailing substring of the input string which 
 930  * consists only of characters and strings that are contained in this set 
 931  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 
 932  * or only of characters and strings that are not contained 
 933  * in this set (USET_SPAN_NOT_CONTAINED). 
 934  * See USetSpanCondition for details. 
 935  * Unpaired surrogates are treated according to contains() of their surrogate code points. 
 936  * This function works faster with a frozen set and with a non-negative string length argument. 
 938  * @param s start of the string 
 939  * @param length the length of the string; can be -1 for NUL-terminated 
 940  * @param spanCondition specifies the containment condition 
 941  * @return the start of the trailing substring according to the spanCondition; 
 942  *         the string length if the end of the string does not fit the spanCondition 
 944  * @see USetSpanCondition 
 /* Backward span over the UChar string s: the result is the start of the
  * trailing substring that satisfies spanCondition, or the string length
  * when the end of the string does not.  length may be -1 for a
  * NUL-terminated string. */
 946 U_STABLE 
int32_t U_EXPORT2
 
 947 uset_spanBack(const USet 
*set
, const UChar 
*s
, int32_t length
, USetSpanCondition spanCondition
); 
 950  * Returns the length of the initial substring of the input string which 
 951  * consists only of characters and strings that are contained in this set 
 952  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 
 953  * or only of characters and strings that are not contained 
 954  * in this set (USET_SPAN_NOT_CONTAINED). 
 955  * See USetSpanCondition for details. 
 956  * Similar to the strspn() C library function. 
 957  * Malformed byte sequences are treated according to contains(0xfffd). 
 958  * This function works faster with a frozen set and with a non-negative string length argument. 
 960  * @param s start of the string (UTF-8) 
 961  * @param length the length of the string; can be -1 for NUL-terminated 
 962  * @param spanCondition specifies the containment condition 
 963  * @return the length of the initial substring according to the spanCondition; 
 964  *         0 if the start of the string does not fit the spanCondition 
 966  * @see USetSpanCondition 
 /* Forward span over the UTF-8 (char *) string s: the result is the length
  * of the leading substring that satisfies spanCondition, or 0 when the
  * start of the string does not.  Malformed byte sequences are judged by
  * contains(0xfffd).  length may be -1 for a NUL-terminated string. */
 968 U_STABLE 
int32_t U_EXPORT2
 
 969 uset_spanUTF8(const USet 
*set
, const char *s
, int32_t length
, USetSpanCondition spanCondition
); 
 972  * Returns the start of the trailing substring of the input string which 
 973  * consists only of characters and strings that are contained in this set 
 974  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 
 975  * or only of characters and strings that are not contained 
 976  * in this set (USET_SPAN_NOT_CONTAINED). 
 977  * See USetSpanCondition for details. 
 978  * Malformed byte sequences are treated according to contains(0xfffd). 
 979  * This function works faster with a frozen set and with a non-negative string length argument. 
 981  * @param s start of the string (UTF-8) 
 982  * @param length the length of the string; can be -1 for NUL-terminated 
 983  * @param spanCondition specifies the containment condition 
 984  * @return the start of the trailing substring according to the spanCondition; 
 985  *         the string length if the end of the string does not fit the spanCondition 
 987  * @see USetSpanCondition 
 /* Backward span over the UTF-8 (char *) string s: the result is the start
  * of the trailing substring that satisfies spanCondition, or the string
  * length when the end of the string does not.  Malformed byte sequences
  * are judged by contains(0xfffd).  length may be -1 for a NUL-terminated
  * string. */
 989 U_STABLE 
int32_t U_EXPORT2
 
 990 uset_spanBackUTF8(const USet 
*set
, const char *s
, int32_t length
, USetSpanCondition spanCondition
); 
 993  * Returns true if set1 contains all of the characters and strings 
 994  * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' 
 995  * @param set1 set to be checked for containment 
 996  * @param set2 set to be checked for containment 
 997  * @return true if the test condition is met 
/* Equality test: true when each set contains all of the characters and
 * strings of the other.  Neither set is modified (both are const). */
1000 U_STABLE UBool U_EXPORT2
 
1001 uset_equals(const USet
* set1
, const USet
* set2
); 
1003 /********************************************************************* 
1004  * Serialized set API 
1005  *********************************************************************/ 
1008  * Serializes this set into an array of 16-bit integers.  Serialization 
1009  * (currently) only records the characters in the set; multicharacter 
1010  * strings are ignored. 
1013  * The array has the following format (each line is one 16-bit integer): 
1015  *  length     = (n+2*m) | (m!=0?0x8000:0) 
1016  *  bmpLength  = n; present if m!=0 
1029  * The array starts with a header.  After the header are n bmp 
1030  * code points, then m supplementary code points.  Either n or m 
1031  * or both may be zero.  n+2*m is always <= 0x7FFF. 
1033  * If there are no supplementary characters (if m==0) then the 
1034  * header is one 16-bit integer, 'length', with value n. 
1036  * If there are supplementary characters (if m!=0) then the header 
1037  * is two 16-bit integers.  The first, 'length', has value 
1038  * (n+2*m)|0x8000.  The second, 'bmpLength', has value n. 
1040  * After the header the code points are stored in ascending order. 
1041  * Supplementary code points are stored as most significant 16 
1042  * bits followed by least significant 16 bits. 
1044  * @param set the set 
1045  * @param dest pointer to buffer of destCapacity 16-bit integers. 
1046  * May be NULL only if destCapacity is zero. 
1047  * @param destCapacity size of dest, or zero.  Must not be negative. 
1048  * @param pErrorCode pointer to the error code.  Will be set to 
1049  * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF.  Will be set to 
1050  * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 
1051  * @return the total length of the serialized format, including 
1052  * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 
1053  * than U_BUFFER_OVERFLOW_ERROR. 
/* Writes the serialized form of set into dest (destCapacity 16-bit units;
 * dest may be NULL only when destCapacity is zero).  The result is the
 * total serialized length including the header, or 0 on any error other
 * than U_BUFFER_OVERFLOW_ERROR; failures are reported through
 * *pErrorCode. */
1056 U_STABLE 
int32_t U_EXPORT2
 
1057 uset_serialize(const USet
* set
, uint16_t* dest
, int32_t destCapacity
, UErrorCode
* pErrorCode
); 
1060  * Given a serialized array, fill in the given serialized set object. 
1061  * @param fillSet pointer to result 
1062  * @param src pointer to start of array 
1063  * @param srcLength length of array 
1064  * @return true if the given array is valid, otherwise false 
/* Fills *fillSet from the serialized array src of srcLength 16-bit units.
 * The UBool result reports whether the array was valid. */
1067 U_STABLE UBool U_EXPORT2
 
1068 uset_getSerializedSet(USerializedSet
* fillSet
, const uint16_t* src
, int32_t srcLength
); 
1071  * Set the USerializedSet to contain the given character (and nothing else). 
1073  * @param fillSet pointer to result 
1074  * @param c The codepoint to set 
/* Makes *fillSet contain the code point c.  Nothing is returned; the
 * result is delivered through fillSet. */
1077 U_STABLE 
void U_EXPORT2
 
1078 uset_setSerializedToOne(USerializedSet
* fillSet
, UChar32 c
); 
1081  * Returns TRUE if the given USerializedSet contains the given character. 
1083  * @param set the serialized set 
1084  * @param c The codepoint to check for within the set 
1085  * @return true if set contains c 
/* Membership test of the code point c against a serialized set; the set
 * is only read (const). */
1088 U_STABLE UBool U_EXPORT2
 
1089 uset_serializedContains(const USerializedSet
* set
, UChar32 c
); 
1092  * Returns the number of disjoint ranges of characters contained in 
1093  * the given serialized set.  Ignores any strings contained in the set. 
1095  * @param set the serialized set 
1096  * @return a non-negative integer counting the character ranges 
/* Counts the disjoint character ranges held by a serialized set; the
 * result is a non-negative integer. */
1100 U_STABLE 
int32_t U_EXPORT2
 
1101 uset_getSerializedRangeCount(const USerializedSet
* set
); 
1104  * Returns a range of characters contained in the given serialized set. 
1106  * @param set the serialized set 
1107  * @param rangeIndex a non-negative integer in the range 0.. 
1108  * uset_getSerializedRangeCount(set)-1 
1109  * @param pStart pointer to variable to receive first character 
1110  * in range, inclusive 
1111  * @param pEnd pointer to variable to receive last character in range, inclusive 
1113  * @return true if rangeIndex is valid, otherwise false 
/* Fetches the range at rangeIndex (0 .. uset_getSerializedRangeCount(set)-1)
 * from a serialized set, storing its first and last characters through
 * pStart and pEnd.  Returns false when rangeIndex is not valid. */
1116 U_STABLE UBool U_EXPORT2
 
1117 uset_getSerializedRange(const USerializedSet
* set
, int32_t rangeIndex
, 
1118                         UChar32
* pStart
, UChar32
* pEnd
);