2 **********************************************************************
3 * Copyright (C) 1999-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/parsepos.h"
14 #include "unicode/symtable.h"
26 // Define UChar constants using hex for EBCDIC compatibility
27 // Used #define to reduce private static exports and memory access time.
28 #define SET_OPEN ((UChar)0x005B) /*[*/
29 #define SET_CLOSE ((UChar)0x005D) /*]*/
30 #define HYPHEN ((UChar)0x002D) /*-*/
31 #define COMPLEMENT ((UChar)0x005E) /*^*/
32 #define COLON ((UChar)0x003A) /*:*/
33 #define BACKSLASH ((UChar)0x005C) /*\*/
34 #define INTERSECTION ((UChar)0x0026) /*&*/
35 #define UPPER_U ((UChar)0x0055) /*U*/
36 #define LOWER_U ((UChar)0x0075) /*u*/
37 #define OPEN_BRACE ((UChar)123) /*{*/
38 #define CLOSE_BRACE ((UChar)125) /*}*/
39 #define UPPER_P ((UChar)0x0050) /*P*/
40 #define LOWER_P ((UChar)0x0070) /*p*/
41 #define UPPER_N ((UChar)78) /*N*/
42 #define EQUALS ((UChar)0x003D) /*=*/
44 // HIGH_VALUE > all valid values. 110000 for codepoints
45 #define UNICODESET_HIGH 0x0110000
47 // LOW <= all valid values. ZERO for codepoints
48 #define UNICODESET_LOW 0x000000
50 // initial storage. Must be >= 0
51 #define START_EXTRA 16
53 // extra amount for growth. Must be >= 0
54 #define GROW_EXTRA START_EXTRA
58 SymbolTable::~SymbolTable() {}
60 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet
)
63 * Modify the given UChar32 variable so that it is in range, by
64 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
65 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
66 * It modifies its argument in-place and also returns it.
68 static inline UChar32
pinCodePoint(UChar32
& c
) {
69 if (c
< UNICODESET_LOW
) {
71 } else if (c
> (UNICODESET_HIGH
-1)) {
72 c
= (UNICODESET_HIGH
-1);
77 //----------------------------------------------------------------
79 //----------------------------------------------------------------
81 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
82 // To enable the debugging, define the symbol DEBUG_MEM in the line
83 // below. This will result in text being sent to stdout that looks
85 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
86 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
87 // Each line lists a construction (ct) or destruction (dt) event, the
88 // object address, the number of outstanding objects after the event,
89 // and the pattern of the object in question.
95 static int32_t _dbgCount
= 0;
97 static inline void _dbgct(UnicodeSet
* set
) {
99 set
->toPattern(str
, TRUE
);
101 str
.extract(0, 39, buf
, "");
102 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set
, ++_dbgCount
, buf
);
105 static inline void _dbgdt(UnicodeSet
* set
) {
107 set
->toPattern(str
, TRUE
);
109 str
.extract(0, 39, buf
, "");
110 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set
, --_dbgCount
, buf
);
120 //----------------------------------------------------------------
121 // UnicodeString in UVector support
122 //----------------------------------------------------------------
124 static void U_CALLCONV
cloneUnicodeString(UHashTok
*dst
, UHashTok
*src
) {
125 dst
->pointer
= new UnicodeString(*(UnicodeString
*)src
->pointer
);
128 static int8_t U_CALLCONV
compareUnicodeString(UHashTok t1
, UHashTok t2
) {
129 const UnicodeString
&a
= *(const UnicodeString
*)t1
.pointer
;
130 const UnicodeString
&b
= *(const UnicodeString
*)t2
.pointer
;
134 //----------------------------------------------------------------
136 //----------------------------------------------------------------
139 * Constructs an empty set.
141 UnicodeSet::UnicodeSet() :
142 len(1), capacity(1 + START_EXTRA
), bufferCapacity(0),
143 list(0), buffer(0), strings(0)
145 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
147 list
[0] = UNICODESET_HIGH
;
154 * Constructs a set containing the given range. If <code>end >
155 * start</code> then an empty set is created.
157 * @param start first character, inclusive, of range
158 * @param end last character, inclusive, of range
160 UnicodeSet::UnicodeSet(UChar32 start
, UChar32 end
) :
161 len(1), capacity(1 + START_EXTRA
), bufferCapacity(0),
162 list(0), buffer(0), strings(0)
164 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
166 list
[0] = UNICODESET_HIGH
;
169 complement(start
, end
);
174 * Constructs a set that is identical to the given UnicodeSet.
176 UnicodeSet::UnicodeSet(const UnicodeSet
& o
) :
178 len(0), capacity(o
.len
+ GROW_EXTRA
), bufferCapacity(0),
179 list(0), buffer(0), strings(0)
181 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
192 UnicodeSet::~UnicodeSet() {
193 _dbgdt(this); // first!
202 * Assigns this object to be a copy of another.
204 UnicodeSet
& UnicodeSet::operator=(const UnicodeSet
& o
) {
205 ensureCapacity(o
.len
);
207 uprv_memcpy(list
, o
.list
, len
*sizeof(UChar32
));
208 UErrorCode ec
= U_ZERO_ERROR
;
209 strings
->assign(*o
.strings
, cloneUnicodeString
, ec
);
215 * Compares the specified object with this set for equality. Returns
216 * <tt>true</tt> if the two sets
217 * have the same size, and every member of the specified set is
218 * contained in this set (or equivalently, every member of this set is
219 * contained in the specified set).
221 * @param o set to be compared for equality with this set.
222 * @return <tt>true</tt> if the specified set is equal to this set.
224 UBool
UnicodeSet::operator==(const UnicodeSet
& o
) const {
225 if (len
!= o
.len
) return FALSE
;
226 for (int32_t i
= 0; i
< len
; ++i
) {
227 if (list
[i
] != o
.list
[i
]) return FALSE
;
229 if (*strings
!= *o
.strings
) return FALSE
;
234 * Returns a copy of this object. All UnicodeMatcher objects have
235 * to support cloning in order to allow classes using
236 * UnicodeMatchers, such as Transliterator, to implement cloning.
238 UnicodeFunctor
* UnicodeSet::clone() const {
239 return new UnicodeSet(*this);
243 * Returns the hash code value for this set.
245 * @return the hash code value for this set.
246 * @see Object#hashCode()
248 int32_t UnicodeSet::hashCode(void) const {
249 int32_t result
= len
;
250 for (int32_t i
= 0; i
< len
; ++i
) {
257 //----------------------------------------------------------------
259 //----------------------------------------------------------------
262 * Make this object represent the range <code>start - end</code>.
263 * If <code>end > start</code> then this object is set to an
266 * @param start first character in the set, inclusive
267 * @rparam end last character in the set, inclusive
269 UnicodeSet
& UnicodeSet::set(UChar32 start
, UChar32 end
) {
271 complement(start
, end
);
276 * Returns the number of elements in this set (its cardinality),
277 * Note than the elements of a set may include both individual
278 * codepoints and strings.
280 * @return the number of elements in this set (its cardinality).
282 int32_t UnicodeSet::size(void) const {
284 int32_t count
= getRangeCount();
285 for (int32_t i
= 0; i
< count
; ++i
) {
286 n
+= getRangeEnd(i
) - getRangeStart(i
) + 1;
288 return n
+ strings
->size();
292 * Returns <tt>true</tt> if this set contains no elements.
294 * @return <tt>true</tt> if this set contains no elements.
296 UBool
UnicodeSet::isEmpty(void) const {
297 return len
== 1 && strings
->size() == 0;
301 * Returns true if this set contains the given character.
302 * @param c character to be checked for containment
303 * @return true if the test condition is met
305 UBool
UnicodeSet::contains(UChar32 c
) const {
306 // Set i to the index of the start item greater than ch
307 // We know we will terminate without length test!
308 // LATER: for large sets, add binary search
311 // if (c < list[++i]) break;
313 if (c
>= UNICODESET_HIGH
) { // Don't need to check LOW bound
316 int32_t i
= findCodePoint(c
);
317 return ((i
& 1) != 0); // return true if odd
321 * Returns the smallest value i such that c < list[i]. Caller
322 * must ensure that c is a legal value or this method will enter
323 * an infinite loop. This method performs a binary search.
324 * @param c a character in the range MIN_VALUE..MAX_VALUE
326 * @return the smallest integer i in the range 0..len-1,
327 * inclusive, such that c < list[i]
329 int32_t UnicodeSet::findCodePoint(UChar32 c
) const {
332 set list[] c=0 1 3 4 7 8
333 === ============== ===========
334 [] [110000] 0 0 0 0 0 0
335 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
336 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
337 [:Any:] [0, 110000] 1 1 1 1 1 1
340 // Return the smallest i such that c < list[i]. Assume
341 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
344 // High runner test. c is often after the last range, so an
345 // initial check for this condition pays off.
346 if (len
>= 2 && c
>= list
[len
-2])
349 int32_t hi
= len
- 1;
350 // invariant: c >= list[lo]
351 // invariant: c < list[hi]
353 int32_t i
= (lo
+ hi
) >> 1;
356 } else if (c
< list
[i
]) {
366 * Returns true if this set contains every character
367 * of the given range.
368 * @param start first character, inclusive, of the range
369 * @param end last character, inclusive, of the range
370 * @return true if the test condition is met
372 UBool
UnicodeSet::contains(UChar32 start
, UChar32 end
) const {
375 // if (start < list[++i]) break;
377 int32_t i
= findCodePoint(start
);
378 return ((i
& 1) != 0 && end
< list
[i
]);
382 * Returns <tt>true</tt> if this set contains the given
383 * multicharacter string.
384 * @param s string to be checked for containment
385 * @return <tt>true</tt> if this set contains the specified string
387 UBool
UnicodeSet::contains(const UnicodeString
& s
) const {
388 if (s
.length() == 0) return FALSE
;
389 int32_t cp
= getSingleCP(s
);
391 return strings
->contains((void*) &s
);
393 return contains((UChar32
) cp
);
398 * Returns true if this set contains all the characters and strings
400 * @param c set to be checked for containment
401 * @return true if the test condition is met
403 UBool
UnicodeSet::containsAll(const UnicodeSet
& c
) const {
404 // The specified set is a subset if all of its pairs are contained in
405 // this set. It's possible to code this more efficiently in terms of
406 // direct manipulation of the inversion lists if the need arises.
407 int32_t n
= c
.getRangeCount();
408 for (int i
=0; i
<n
; ++i
) {
409 if (!contains(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
413 if (!strings
->containsAll(*c
.strings
)) return FALSE
;
418 * Returns true if this set contains all the characters
419 * of the given string.
420 * @param s string containing characters to be checked for containment
421 * @return true if the test condition is met
423 UBool
UnicodeSet::containsAll(const UnicodeString
& s
) const {
425 for (int32_t i
= 0; i
< s
.length(); i
+= UTF_CHAR_LENGTH(cp
)) {
427 if (!contains(cp
)) return FALSE
;
433 * Returns true if this set contains none of the characters
434 * of the given range.
435 * @param start first character, inclusive, of the range
436 * @param end last character, inclusive, of the range
437 * @return true if the test condition is met
439 UBool
UnicodeSet::containsNone(UChar32 start
, UChar32 end
) const {
442 // if (start < list[++i]) break;
444 int32_t i
= findCodePoint(start
);
445 return ((i
& 1) == 0 && end
< list
[i
]);
449 * Returns true if this set contains none of the characters and strings
451 * @param c set to be checked for containment
452 * @return true if the test condition is met
454 UBool
UnicodeSet::containsNone(const UnicodeSet
& c
) const {
455 // The specified set is a subset if all of its pairs are contained in
456 // this set. It's possible to code this more efficiently in terms of
457 // direct manipulation of the inversion lists if the need arises.
458 int32_t n
= c
.getRangeCount();
459 for (int32_t i
=0; i
<n
; ++i
) {
460 if (!containsNone(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
464 if (!strings
->containsNone(*c
.strings
)) return FALSE
;
469 * Returns true if this set contains none of the characters
470 * of the given string.
471 * @param s string containing characters to be checked for containment
472 * @return true if the test condition is met
474 UBool
UnicodeSet::containsNone(const UnicodeString
& s
) const {
476 for (int32_t i
= 0; i
< s
.length(); i
+= UTF_CHAR_LENGTH(cp
)) {
478 if (contains(cp
)) return FALSE
;
484 * Returns <tt>true</tt> if this set contains any character whose low byte
485 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
488 UBool
UnicodeSet::matchesIndexValue(uint8_t v
) const {
489 /* The index value v, in the range [0,255], is contained in this set if
490 * it is contained in any pair of this set. Pairs either have the high
491 * bytes equal, or unequal. If the high bytes are equal, then we have
492 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
493 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
494 * Then v is contained if xx <= v || v <= yy. (This is identical to the
495 * time zone month containment logic.)
498 for (i
=0; i
<getRangeCount(); ++i
) {
499 UChar32 low
= getRangeStart(i
);
500 UChar32 high
= getRangeEnd(i
);
501 if ((low
& ~0xFF) == (high
& ~0xFF)) {
502 if ((low
& 0xFF) <= v
&& v
<= (high
& 0xFF)) {
505 } else if ((low
& 0xFF) <= v
|| v
<= (high
& 0xFF)) {
509 if (strings
->size() != 0) {
510 for (i
=0; i
<strings
->size(); ++i
) {
511 const UnicodeString
& s
= *(const UnicodeString
*)strings
->elementAt(i
);
512 //if (s.length() == 0) {
513 // // Empty strings match everything
516 // assert(s.length() != 0); // We enforce this elsewhere
517 UChar32 c
= s
.char32At(0);
518 if ((c
& 0xFF) == v
) {
527 * Implementation of UnicodeMatcher::matches(). Always matches the
528 * longest possible multichar string.
530 UMatchDegree
UnicodeSet::matches(const Replaceable
& text
,
534 if (offset
== limit
) {
535 // Strings, if any, have length != 0, so we don't worry
536 // about them here. If we ever allow zero-length strings
537 // we much check for them here.
538 if (contains(U_ETHER
)) {
539 return incremental
? U_PARTIAL_MATCH
: U_MATCH
;
544 if (strings
->size() != 0) { // try strings first
546 // might separate forward and backward loops later
547 // for now they are combined
549 // TODO Improve efficiency of this, at least in the forward
550 // direction, if not in both. In the forward direction we
551 // can assume the strings are sorted.
554 UBool forward
= offset
< limit
;
556 // firstChar is the leftmost char to match in the
557 // forward direction or the rightmost char to match in
558 // the reverse direction.
559 UChar firstChar
= text
.charAt(offset
);
561 // If there are multiple strings that can match we
562 // return the longest match.
563 int32_t highWaterLength
= 0;
565 for (i
=0; i
<strings
->size(); ++i
) {
566 const UnicodeString
& trial
= *(const UnicodeString
*)strings
->elementAt(i
);
568 //if (trial.length() == 0) {
569 // return U_MATCH; // null-string always matches
571 // assert(trial.length() != 0); // We ensure this elsewhere
573 UChar c
= trial
.charAt(forward
? 0 : trial
.length() - 1);
575 // Strings are sorted, so we can optimize in the
576 // forward direction.
577 if (forward
&& c
> firstChar
) break;
578 if (c
!= firstChar
) continue;
580 int32_t matchLen
= matchRest(text
, offset
, limit
, trial
);
583 int32_t maxLen
= forward
? limit
-offset
: offset
-limit
;
584 if (matchLen
== maxLen
) {
585 // We have successfully matched but only up to limit.
586 return U_PARTIAL_MATCH
;
590 if (matchLen
== trial
.length()) {
591 // We have successfully matched the whole string.
592 if (matchLen
> highWaterLength
) {
593 highWaterLength
= matchLen
;
595 // In the forward direction we know strings
596 // are sorted so we can bail early.
597 if (forward
&& matchLen
< highWaterLength
) {
604 // We've checked all strings without a partial match.
605 // If we have full matches, return the longest one.
606 if (highWaterLength
!= 0) {
607 offset
+= forward
? highWaterLength
: -highWaterLength
;
611 return UnicodeFilter::matches(text
, offset
, limit
, incremental
);
616 * Returns the longest match for s in text at the given position.
617 * If limit > start then match forward from start+1 to limit
618 * matching all characters except s.charAt(0). If limit < start,
619 * go backward starting from start-1 matching all characters
620 * except s.charAt(s.length()-1). This method assumes that the
621 * first character, text.charAt(start), matches s, so it does not
623 * @param text the text to match
624 * @param start the first character to match. In the forward
625 * direction, text.charAt(start) is matched against s.charAt(0).
626 * In the reverse direction, it is matched against
627 * s.charAt(s.length()-1).
628 * @param limit the limit offset for matching, either last+1 in
629 * the forward direction, or last-1 in the reverse direction,
630 * where last is the index of the last character to match.
631 * @return If part of s matches up to the limit, return |limit -
632 * start|. If all of s matches before reaching the limit, return
633 * s.length(). If there is a mismatch between s and text, return
636 int32_t UnicodeSet::matchRest(const Replaceable
& text
,
637 int32_t start
, int32_t limit
,
638 const UnicodeString
& s
) {
641 int32_t slen
= s
.length();
643 maxLen
= limit
- start
;
644 if (maxLen
> slen
) maxLen
= slen
;
645 for (i
= 1; i
< maxLen
; ++i
) {
646 if (text
.charAt(start
+ i
) != s
.charAt(i
)) return 0;
649 maxLen
= start
- limit
;
650 if (maxLen
> slen
) maxLen
= slen
;
651 --slen
; // <=> slen = s.length() - 1;
652 for (i
= 1; i
< maxLen
; ++i
) {
653 if (text
.charAt(start
- i
) != s
.charAt(slen
- i
)) return 0;
660 * Implement of UnicodeMatcher
662 void UnicodeSet::addMatchSetTo(UnicodeSet
& toUnionTo
) const {
663 toUnionTo
.addAll(*this);
667 * Returns the index of the given character within this set, where
668 * the set is ordered by ascending code point. If the character
669 * is not in this set, return -1. The inverse of this method is
670 * <code>charAt()</code>.
671 * @return an index from 0..size()-1, or -1
673 int32_t UnicodeSet::indexOf(UChar32 c
) const {
674 if (c
< MIN_VALUE
|| c
> MAX_VALUE
) {
680 UChar32 start
= list
[i
++];
684 UChar32 limit
= list
[i
++];
686 return n
+ c
- start
;
693 * Returns the character at the given index within this set, where
694 * the set is ordered by ascending code point. If the index is
695 * out of range, return (UChar32)-1. The inverse of this method is
696 * <code>indexOf()</code>.
697 * @param index an index from 0..size()-1
698 * @return the character at the given index, or (UChar32)-1.
700 UChar32
UnicodeSet::charAt(int32_t index
) const {
702 // len2 is the largest even integer <= len, that is, it is len
703 // for even values and len-1 for odd values. With odd values
704 // the last entry is UNICODESET_HIGH.
705 int32_t len2
= len
& ~1;
706 for (int32_t i
=0; i
< len2
;) {
707 UChar32 start
= list
[i
++];
708 int32_t count
= list
[i
++] - start
;
710 return (UChar32
)(start
+ index
);
719 * Adds the specified range to this set if it is not already
720 * present. If this set already contains the specified range,
721 * the call leaves this set unchanged. If <code>end > start</code>
722 * then an empty range is added, leaving the set unchanged.
724 * @param start first character, inclusive, of range to be added
726 * @param end last character, inclusive, of range to be added
729 UnicodeSet
& UnicodeSet::add(UChar32 start
, UChar32 end
) {
730 if (pinCodePoint(start
) < pinCodePoint(end
)) {
731 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
733 } else if (start
== end
) {
739 // #define DEBUG_US_ADD
743 void dump(UChar32 c
) {
745 printf("%c", (char)c
);
750 void dump(const UChar32
* list
, int32_t len
) {
752 for (int32_t i
=0; i
<len
; ++i
) {
753 if (i
!= 0) printf(", ");
761 * Adds the specified character to this set if it is not already
762 * present. If this set already contains the specified character,
763 * the call leaves this set unchanged.
765 UnicodeSet
& UnicodeSet::add(UChar32 c
) {
766 // find smallest i such that c < list[i]
767 // if odd, then it is IN the set
768 // if even, then it is OUT of the set
769 int32_t i
= findCodePoint(pinCodePoint(c
));
772 if ((i
& 1) != 0) return *this;
775 // assert(list[len-1] == HIGH);
778 // [start_0, limit_0, start_1, limit_1, HIGH]
780 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
784 // i == 0 means c is before the first range
789 printf(" found at %d", i
);
795 if (c
== list
[i
]-1) {
796 // c is before start of next range
798 // if we touched the HIGH mark, then add a new one
799 if (c
== (UNICODESET_HIGH
- 1)) {
800 ensureCapacity(len
+1);
801 list
[len
++] = UNICODESET_HIGH
;
803 if (i
> 0 && c
== list
[i
-1]) {
804 // collapse adjacent ranges
806 // [..., start_k-1, c, c, limit_k, ..., HIGH]
810 //for (int32_t k=i-1; k<len-2; ++k) {
811 // list[k] = list[k+2];
813 UChar32
* dst
= list
+ i
- 1;
814 UChar32
* src
= dst
+ 2;
815 UChar32
* srclimit
= list
+ len
;
816 while (src
< srclimit
) *(dst
++) = *(src
++);
822 else if (i
> 0 && c
== list
[i
-1]) {
823 // c is after end of prior range
825 // no need to check for collapse here
829 // At this point we know the new char is not adjacent to
830 // any existing ranges, and it is not 10FFFF.
833 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
837 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
841 ensureCapacity(len
+2);
843 //for (int32_t k=len-1; k>=i; --k) {
844 // list[k+2] = list[k];
846 UChar32
* src
= list
+ len
;
847 UChar32
* dst
= src
+ 2;
848 UChar32
* srclimit
= list
+ i
;
849 while (src
> srclimit
) *(--dst
) = *(--src
);
860 for (i
=1; i
<len
; ++i
) {
861 if (list
[i
] <= list
[i
-1]) {
863 printf("ERROR: list has been corrupted\n");
874 * Adds the specified multicharacter to this set if it is not already
875 * present. If this set already contains the multicharacter,
876 * the call leaves this set unchanged.
877 * Thus "ch" => {"ch"}
878 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
879 * @param s the source string
880 * @return the modified set, for chaining
882 UnicodeSet
& UnicodeSet::add(const UnicodeString
& s
) {
883 if (s
.length() == 0) return *this;
884 int32_t cp
= getSingleCP(s
);
886 if (!strings
->contains((void*) &s
)) {
891 add((UChar32
)cp
, (UChar32
)cp
);
897 * Adds the given string, in order, to 'strings'. The given string
898 * must have been checked by the caller to not be empty and to not
899 * already be in 'strings'.
901 void UnicodeSet::_add(const UnicodeString
& s
) {
902 UnicodeString
* t
= new UnicodeString(s
);
903 UErrorCode ec
= U_ZERO_ERROR
;
904 strings
->sortedInsert(t
, compareUnicodeString
, ec
);
908 * @return a code point IF the string consists of a single one.
909 * otherwise returns -1.
910 * @param string to test
912 int32_t UnicodeSet::getSingleCP(const UnicodeString
& s
) {
913 //if (s.length() < 1) {
914 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
916 if (s
.length() > 2) return -1;
917 if (s
.length() == 1) return s
.charAt(0);
919 // at this point, len = 2
920 UChar32 cp
= s
.char32At(0);
921 if (cp
> 0xFFFF) { // is surrogate pair
928 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
929 * If this set already any particular character, it has no effect on that character.
930 * @param the source string
931 * @return the modified set, for chaining
933 UnicodeSet
& UnicodeSet::addAll(const UnicodeString
& s
) {
935 for (int32_t i
= 0; i
< s
.length(); i
+= UTF_CHAR_LENGTH(cp
)) {
943 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
944 * If this set already any particular character, it has no effect on that character.
945 * @param the source string
946 * @return the modified set, for chaining
948 UnicodeSet
& UnicodeSet::retainAll(const UnicodeString
& s
) {
956 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
957 * If this set already any particular character, it has no effect on that character.
958 * @param the source string
959 * @return the modified set, for chaining
961 UnicodeSet
& UnicodeSet::complementAll(const UnicodeString
& s
) {
969 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
970 * If this set already any particular character, it has no effect on that character.
971 * @param the source string
972 * @return the modified set, for chaining
974 UnicodeSet
& UnicodeSet::removeAll(const UnicodeString
& s
) {
982 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
983 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
984 * @param the source string
985 * @return a newly created set containing the given string
987 UnicodeSet
* U_EXPORT2
UnicodeSet::createFrom(const UnicodeString
& s
) {
988 UnicodeSet
*set
= new UnicodeSet();
995 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
996 * @param the source string
997 * @return a newly created set containing the given characters
999 UnicodeSet
* U_EXPORT2
UnicodeSet::createFromAll(const UnicodeString
& s
) {
1000 UnicodeSet
*set
= new UnicodeSet();
1006 * Retain only the elements in this set that are contained in the
1007 * specified range. If <code>end > start</code> then an empty range is
1008 * retained, leaving the set empty.
1010 * @param start first character, inclusive, of range to be retained
1012 * @param end last character, inclusive, of range to be retained
1015 UnicodeSet
& UnicodeSet::retain(UChar32 start
, UChar32 end
) {
1016 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1017 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1018 retain(range
, 2, 0);
1025 UnicodeSet
& UnicodeSet::retain(UChar32 c
) {
1026 return retain(c
, c
);
1030 * Removes the specified range from this set if it is present.
1031 * The set will not contain the specified range once the call
1032 * returns. If <code>end > start</code> then an empty range is
1033 * removed, leaving the set unchanged.
1035 * @param start first character, inclusive, of range to be removed
1037 * @param end last character, inclusive, of range to be removed
1040 UnicodeSet
& UnicodeSet::remove(UChar32 start
, UChar32 end
) {
1041 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1042 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1043 retain(range
, 2, 2);
1049 * Removes the specified character from this set if it is present.
1050 * The set will not contain the specified range once the call
1053 UnicodeSet
& UnicodeSet::remove(UChar32 c
) {
1054 return remove(c
, c
);
1058 * Removes the specified string from this set if it is present.
1059 * The set will not contain the specified character once the call
1061 * @param the source string
1062 * @return the modified set, for chaining
1064 UnicodeSet
& UnicodeSet::remove(const UnicodeString
& s
) {
1065 if (s
.length() == 0) return *this;
1066 int32_t cp
= getSingleCP(s
);
1068 strings
->removeElement((void*) &s
);
1071 remove((UChar32
)cp
, (UChar32
)cp
);
1077 * Complements the specified range in this set. Any character in
1078 * the range will be removed if it is in this set, or will be
1079 * added if it is not in this set. If <code>end > start</code>
1080 * then an empty range is xor'ed, leaving the set unchanged.
1082 * @param start first character, inclusive, of range to be removed
1084 * @param end last character, inclusive, of range to be removed
1087 UnicodeSet
& UnicodeSet::complement(UChar32 start
, UChar32 end
) {
1088 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1089 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1090 exclusiveOr(range
, 2, 0);
1096 UnicodeSet
& UnicodeSet::complement(UChar32 c
) {
1097 return complement(c
, c
);
1101 * This is equivalent to
1102 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1104 UnicodeSet
& UnicodeSet::complement(void) {
1105 if (list
[0] == UNICODESET_LOW
) {
1106 ensureBufferCapacity(len
-1);
1107 uprv_memcpy(buffer
, list
+ 1, (len
-1)*sizeof(UChar32
));
1110 ensureBufferCapacity(len
+1);
1111 uprv_memcpy(buffer
+ 1, list
, len
*sizeof(UChar32
));
1112 buffer
[0] = UNICODESET_LOW
;
1121 * Complement the specified string in this set.
1122 * The set will not contain the specified string once the call
1124 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1125 * @param s the string to complement
1126 * @return this object, for chaining
1128 UnicodeSet
& UnicodeSet::complement(const UnicodeString
& s
) {
1129 if (s
.length() == 0) return *this;
1130 int32_t cp
= getSingleCP(s
);
1132 if (strings
->contains((void*) &s
)) {
1133 strings
->removeElement((void*) &s
);
1139 complement((UChar32
)cp
, (UChar32
)cp
);
1145 * Adds all of the elements in the specified set to this set if
1146 * they're not already present. This operation effectively
1147 * modifies this set so that its value is the <i>union</i> of the two
1148 * sets. The behavior of this operation is unspecified if the specified
1149 * collection is modified while the operation is in progress.
1151 * @param c set whose elements are to be added to this set.
1152 * @see #add(char, char)
1154 UnicodeSet
& UnicodeSet::addAll(const UnicodeSet
& c
) {
1155 add(c
.list
, c
.len
, 0);
1157 // Add strings in order
1158 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1159 const UnicodeString
* s
= (const UnicodeString
*)c
.strings
->elementAt(i
);
1160 if (!strings
->contains((void*) s
)) {
1168 * Retains only the elements in this set that are contained in the
1169 * specified set. In other words, removes from this set all of
1170 * its elements that are not contained in the specified set. This
1171 * operation effectively modifies this set so that its value is
1172 * the <i>intersection</i> of the two sets.
1174 * @param c set that defines which elements this set will retain.
1176 UnicodeSet
& UnicodeSet::retainAll(const UnicodeSet
& c
) {
1177 retain(c
.list
, c
.len
, 0);
1178 strings
->retainAll(*c
.strings
);
1183 * Removes from this set all of its elements that are contained in the
1184 * specified set. This operation effectively modifies this
1185 * set so that its value is the <i>asymmetric set difference</i> of
1188 * @param c set that defines which elements will be removed from
1191 UnicodeSet
& UnicodeSet::removeAll(const UnicodeSet
& c
) {
1192 retain(c
.list
, c
.len
, 2);
1193 strings
->removeAll(*c
.strings
);
1198 * Complements in this set all elements contained in the specified
1199 * set. Any character in the other set will be removed if it is
1200 * in this set, or will be added if it is not in this set.
1202 * @param c set that defines which elements will be xor'ed from
1205 UnicodeSet
& UnicodeSet::complementAll(const UnicodeSet
& c
) {
1206 exclusiveOr(c
.list
, c
.len
, 0);
1208 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1209 void* e
= c
.strings
->elementAt(i
);
1210 if (!strings
->removeElement(e
)) {
1211 _add(*(const UnicodeString
*)e
);
1218 * Removes all of the elements from this set. This set will be
1219 * empty after this call returns.
1221 UnicodeSet
& UnicodeSet::clear(void) {
1222 list
[0] = UNICODESET_HIGH
;
1225 strings
->removeAllElements();
1230 * Iteration method that returns the number of ranges contained in
1232 * @see #getRangeStart
1235 int32_t UnicodeSet::getRangeCount() const {
1240 * Iteration method that returns the first character in the
1241 * specified range of this set.
1242 * @see #getRangeCount
1245 UChar32
UnicodeSet::getRangeStart(int32_t index
) const {
1246 return list
[index
*2];
1250 * Iteration method that returns the last character in the
1251 * specified range of this set.
1252 * @see #getRangeStart
1255 UChar32
UnicodeSet::getRangeEnd(int32_t index
) const {
1256 return list
[index
*2 + 1] - 1;
1259 int32_t UnicodeSet::getStringCount() const {
1260 return strings
->size();
1263 const UnicodeString
* UnicodeSet::getString(int32_t index
) const {
1264 return (const UnicodeString
*) strings
->elementAt(index
);
1268 * Reallocate this objects internal structures to take up the least
1269 * possible space, without changing this object's value.
1271 UnicodeSet
& UnicodeSet::compact() {
1272 if (len
!= capacity
) {
1274 UChar32
* temp
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
1275 uprv_memcpy(temp
, list
, len
*sizeof(UChar32
));
1284 int32_t UnicodeSet::serialize(uint16_t *dest
, int32_t destCapacity
, UErrorCode
& ec
) const {
1285 int32_t bmpLength
, length
, destLength
;
1287 if (U_FAILURE(ec
)) {
1291 if (destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1292 ec
=U_ILLEGAL_ARGUMENT_ERROR
;
1296 /* count necessary 16-bit units */
1297 length
=this->len
-1; // Subtract 1 to ignore final UNICODESET_HIGH
1298 // assert(length>=0);
1301 if (destCapacity
>0) {
1304 ec
=U_BUFFER_OVERFLOW_ERROR
;
1310 if (this->list
[length
-1]<=0xffff) {
1313 } else if (this->list
[0]>=0x10000) {
1314 /* all supplementary */
1318 /* some BMP, some supplementary */
1319 for (bmpLength
=0; bmpLength
<length
&& this->list
[bmpLength
]<=0xffff; ++bmpLength
) {}
1320 length
=bmpLength
+2*(length
-bmpLength
);
1323 /* length: number of 16-bit array units */
1324 if (length
>0x7fff) {
1325 /* there are only 15 bits for the length in the first serialized word */
1326 ec
=U_INDEX_OUTOFBOUNDS_ERROR
;
1331 * total serialized length:
1332 * number of 16-bit array units (length) +
1333 * 1 length unit (always) +
1334 * 1 bmpLength unit (if there are supplementary values)
1336 destLength
=length
+((length
>bmpLength
)?2:1);
1337 if (destLength
<=destCapacity
) {
1341 *dest
=(uint16_t)length
;
1342 if (length
>bmpLength
) {
1344 *++dest
=(uint16_t)bmpLength
;
1348 /* write the BMP part of the array */
1350 for (i
=0; i
<bmpLength
; ++i
) {
1351 *dest
++=(uint16_t)*p
++;
1354 /* write the supplementary part of the array */
1355 for (; i
<length
; i
+=2) {
1356 *dest
++=(uint16_t)(*p
>>16);
1357 *dest
++=(uint16_t)*p
++;
1360 ec
=U_BUFFER_OVERFLOW_ERROR
;
1365 //----------------------------------------------------------------
1366 // Implementation: Utility methods
1367 //----------------------------------------------------------------
1370 * Allocate our strings vector and return TRUE if successful.
1372 UBool
UnicodeSet::allocateStrings() {
1373 UErrorCode ec
= U_ZERO_ERROR
;
1374 strings
= new UVector(uhash_deleteUnicodeString
,
1375 uhash_compareUnicodeString
, ec
);
1376 if (U_FAILURE(ec
)) {
1384 void UnicodeSet::ensureCapacity(int32_t newLen
) {
1385 if (newLen
<= capacity
)
1387 capacity
= newLen
+ GROW_EXTRA
;
1388 UChar32
* temp
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
1389 uprv_memcpy(temp
, list
, len
*sizeof(UChar32
));
1394 void UnicodeSet::ensureBufferCapacity(int32_t newLen
) {
1395 if (buffer
!= NULL
&& newLen
<= bufferCapacity
)
1400 bufferCapacity
= newLen
+ GROW_EXTRA
;
1401 buffer
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * bufferCapacity
);
1405 * Swap list and buffer.
1407 void UnicodeSet::swapBuffers(void) {
1408 // swap list and buffer
1409 UChar32
* temp
= list
;
1413 int32_t c
= capacity
;
1414 capacity
= bufferCapacity
;
1418 //----------------------------------------------------------------
1419 // Implementation: Fundamental operators
1420 //----------------------------------------------------------------
1422 static inline UChar32
max(UChar32 a
, UChar32 b
) {
1423 return (a
> b
) ? a
: b
;
1426 // polarity = 0, 3 is normal: x xor y
1427 // polarity = 1, 2: x xor ~y == x === y
1429 void UnicodeSet::exclusiveOr(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1430 ensureBufferCapacity(len
+ otherLen
);
1431 int32_t i
= 0, j
= 0, k
= 0;
1432 UChar32 a
= list
[i
++];
1434 if (polarity
== 1 || polarity
== 2) {
1436 if (other
[j
] == UNICODESET_LOW
) { // skip base if already LOW
1443 // simplest of all the routines
1444 // sort the values, discarding identicals!
1452 } else if (a
!= UNICODESET_HIGH
) { // at this point, a == b
1453 // discard both values!
1457 buffer
[k
++] = UNICODESET_HIGH
;
1466 // polarity = 0 is normal: x union y
1467 // polarity = 2: x union ~y
1468 // polarity = 1: ~x union y
1469 // polarity = 3: ~x union ~y
1471 void UnicodeSet::add(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1472 ensureBufferCapacity(len
+ otherLen
);
1473 int32_t i
= 0, j
= 0, k
= 0;
1474 UChar32 a
= list
[i
++];
1475 UChar32 b
= other
[j
++];
1476 // change from xor is that we have to check overlapping pairs
1477 // polarity bit 1 means a is second, bit 2 means b is.
1480 case 0: // both first; take lower if unequal
1481 if (a
< b
) { // take a
1482 // Back up over overlapping ranges in buffer[]
1483 if (k
> 0 && a
<= buffer
[k
-1]) {
1484 // Pick latter end value in buffer[] vs. list[]
1485 a
= max(list
[i
], buffer
[--k
]);
1491 i
++; // Common if/else code factored out
1493 } else if (b
< a
) { // take b
1494 if (k
> 0 && b
<= buffer
[k
-1]) {
1495 b
= max(other
[j
], buffer
[--k
]);
1502 } else { // a == b, take a, drop b
1503 if (a
== UNICODESET_HIGH
) goto loop_end
;
1504 // This is symmetrical; it doesn't matter if
1505 // we backtrack with a or b. - liu
1506 if (k
> 0 && a
<= buffer
[k
-1]) {
1507 a
= max(list
[i
], buffer
[--k
]);
1519 case 3: // both second; take higher if unequal, and drop other
1520 if (b
<= a
) { // take a
1521 if (a
== UNICODESET_HIGH
) goto loop_end
;
1524 if (b
== UNICODESET_HIGH
) goto loop_end
;
1528 polarity
^= 1; // factored common code
1532 case 1: // a second, b first; if b < a, overlap
1533 if (a
< b
) { // no overlap, take a
1534 buffer
[k
++] = a
; a
= list
[i
++]; polarity
^= 1;
1535 } else if (b
< a
) { // OVERLAP, drop b
1538 } else { // a == b, drop both!
1539 if (a
== UNICODESET_HIGH
) goto loop_end
;
1546 case 2: // a first, b second; if a < b, overlap
1547 if (b
< a
) { // no overlap, take b
1551 } else if (a
< b
) { // OVERLAP, drop a
1554 } else { // a == b, drop both!
1555 if (a
== UNICODESET_HIGH
) goto loop_end
;
1565 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1571 // polarity = 0 is normal: x intersect y
1572 // polarity = 2: x intersect ~y == set-minus
1573 // polarity = 1: ~x intersect y
1574 // polarity = 3: ~x intersect ~y
1576 void UnicodeSet::retain(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1577 ensureBufferCapacity(len
+ otherLen
);
1578 int32_t i
= 0, j
= 0, k
= 0;
1579 UChar32 a
= list
[i
++];
1580 UChar32 b
= other
[j
++];
1581 // change from xor is that we have to check overlapping pairs
1582 // polarity bit 1 means a is second, bit 2 means b is.
1585 case 0: // both first; drop the smaller
1586 if (a
< b
) { // drop a
1589 } else if (b
< a
) { // drop b
1592 } else { // a == b, take one, drop other
1593 if (a
== UNICODESET_HIGH
) goto loop_end
;
1601 case 3: // both second; take lower if unequal
1602 if (a
< b
) { // take a
1606 } else if (b
< a
) { // take b
1610 } else { // a == b, take one, drop other
1611 if (a
== UNICODESET_HIGH
) goto loop_end
;
1619 case 1: // a second, b first;
1620 if (a
< b
) { // NO OVERLAP, drop a
1623 } else if (b
< a
) { // OVERLAP, take b
1627 } else { // a == b, drop both!
1628 if (a
== UNICODESET_HIGH
) goto loop_end
;
1635 case 2: // a first, b second; if a < b, overlap
1636 if (b
< a
) { // no overlap, drop b
1639 } else if (a
< b
) { // OVERLAP, take a
1643 } else { // a == b, drop both!
1644 if (a
== UNICODESET_HIGH
) goto loop_end
;
1654 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1661 * Append the <code>toPattern()</code> representation of a
1662 * string to the given <code>StringBuffer</code>.
1664 void UnicodeSet::_appendToPat(UnicodeString
& buf
, const UnicodeString
& s
, UBool
1665 escapeUnprintable
) {
1667 for (int32_t i
= 0; i
< s
.length(); i
+= UTF_CHAR_LENGTH(cp
)) {
1668 _appendToPat(buf
, cp
= s
.char32At(i
), escapeUnprintable
);
1673 * Append the <code>toPattern()</code> representation of a
1674 * character to the given <code>StringBuffer</code>.
1676 void UnicodeSet::_appendToPat(UnicodeString
& buf
, UChar32 c
, UBool
1677 escapeUnprintable
) {
1678 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1679 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1681 if (ICU_Utility::escapeUnprintable(buf
, c
)) {
1685 // Okay to let ':' pass through
1696 case SymbolTable::SYMBOL_REF
:
1697 buf
.append(BACKSLASH
);
1700 // Escape whitespace
1701 if (uprv_isRuleWhiteSpace(c
)) {
1702 buf
.append(BACKSLASH
);
1710 * Append a string representation of this set to result. This will be
1711 * a cleaned version of the string passed to applyPattern(), if there
1712 * is one. Otherwise it will be generated.
1714 UnicodeString
& UnicodeSet::_toPattern(UnicodeString
& result
,
1715 UBool escapeUnprintable
) const {
1716 if (pat
.length() > 0) {
1718 int32_t backslashCount
= 0;
1719 for (i
=0; i
<pat
.length(); ) {
1720 UChar32 c
= pat
.char32At(i
);
1721 i
+= UTF_CHAR_LENGTH(c
);
1722 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1723 // If the unprintable character is preceded by an odd
1724 // number of backslashes, then it has been escaped.
1725 // Before unescaping it, we delete the final
1727 if ((backslashCount
% 2) == 1) {
1728 result
.truncate(result
.length() - 1);
1730 ICU_Utility::escapeUnprintable(result
, c
);
1734 if (c
== BACKSLASH
) {
1744 return _generatePattern(result
, escapeUnprintable
);
1748 * Returns a string representation of this set. If the result of
1749 * calling this function is passed to a UnicodeSet constructor, it
1750 * will produce another set that is equal to this one.
1752 UnicodeString
& UnicodeSet::toPattern(UnicodeString
& result
,
1753 UBool escapeUnprintable
) const {
1755 return _toPattern(result
, escapeUnprintable
);
1759 * Generate and append a string representation of this set to result.
1760 * This does not use this.pat, the cleaned up copy of the string
1761 * passed to applyPattern().
1763 UnicodeString
& UnicodeSet::_generatePattern(UnicodeString
& result
,
1764 UBool escapeUnprintable
) const {
1765 result
.append(SET_OPEN
);
1767 // // Check against the predefined categories. We implicitly build
1768 // // up ALL category sets the first time toPattern() is called.
1769 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
1770 // if (*this == getCategorySet(cat)) {
1771 // result.append(COLON);
1772 // result.append(CATEGORY_NAMES, cat*2, 2);
1773 // return result.append(CATEGORY_CLOSE);
1777 int32_t count
= getRangeCount();
1779 // If the set contains at least 2 intervals and includes both
1780 // MIN_VALUE and MAX_VALUE, then the inverse representation will
1781 // be more economical.
1783 getRangeStart(0) == MIN_VALUE
&&
1784 getRangeEnd(count
-1) == MAX_VALUE
) {
1787 result
.append(COMPLEMENT
);
1789 for (int32_t i
= 1; i
< count
; ++i
) {
1790 UChar32 start
= getRangeEnd(i
-1)+1;
1791 UChar32 end
= getRangeStart(i
)-1;
1792 _appendToPat(result
, start
, escapeUnprintable
);
1794 if ((start
+1) != end
) {
1795 result
.append(HYPHEN
);
1797 _appendToPat(result
, end
, escapeUnprintable
);
1802 // Default; emit the ranges as pairs
1804 for (int32_t i
= 0; i
< count
; ++i
) {
1805 UChar32 start
= getRangeStart(i
);
1806 UChar32 end
= getRangeEnd(i
);
1807 _appendToPat(result
, start
, escapeUnprintable
);
1809 if ((start
+1) != end
) {
1810 result
.append(HYPHEN
);
1812 _appendToPat(result
, end
, escapeUnprintable
);
1817 for (int32_t i
= 0; i
<strings
->size(); ++i
) {
1818 result
.append(OPEN_BRACE
);
1819 _appendToPat(result
,
1820 *(const UnicodeString
*) strings
->elementAt(i
),
1822 result
.append(CLOSE_BRACE
);
1824 return result
.append(SET_CLOSE
);