2 **********************************************************************
3 * Copyright (C) 1999-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/parsepos.h"
13 #include "unicode/symtable.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utf8.h"
16 #include "unicode/utf16.h"
20 #include "patternprops.h"
28 #include "unisetspan.h"
30 // Define UChar constants using hex for EBCDIC compatibility
31 // Used #define to reduce private static exports and memory access time.
32 #define SET_OPEN ((UChar)0x005B) /*[*/
33 #define SET_CLOSE ((UChar)0x005D) /*]*/
34 #define HYPHEN ((UChar)0x002D) /*-*/
35 #define COMPLEMENT ((UChar)0x005E) /*^*/
36 #define COLON ((UChar)0x003A) /*:*/
37 #define BACKSLASH ((UChar)0x005C) /*\*/
38 #define INTERSECTION ((UChar)0x0026) /*&*/
39 #define UPPER_U ((UChar)0x0055) /*U*/
40 #define LOWER_U ((UChar)0x0075) /*u*/
41 #define OPEN_BRACE ((UChar)123) /*{*/
42 #define CLOSE_BRACE ((UChar)125) /*}*/
43 #define UPPER_P ((UChar)0x0050) /*P*/
44 #define LOWER_P ((UChar)0x0070) /*p*/
45 #define UPPER_N ((UChar)78) /*N*/
46 #define EQUALS ((UChar)0x003D) /*=*/
48 // HIGH_VALUE > all valid values. 110000 for codepoints
49 #define UNICODESET_HIGH 0x0110000
51 // LOW <= all valid values. ZERO for codepoints
52 #define UNICODESET_LOW 0x000000
54 // initial storage. Must be >= 0
55 #define START_EXTRA 16
57 // extra amount for growth. Must be >= 0
58 #define GROW_EXTRA START_EXTRA
62 SymbolTable::~SymbolTable() {}
64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet
)
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
72 static inline UChar32
pinCodePoint(UChar32
& c
) {
73 if (c
< UNICODESET_LOW
) {
75 } else if (c
> (UNICODESET_HIGH
-1)) {
76 c
= (UNICODESET_HIGH
-1);
81 //----------------------------------------------------------------
83 //----------------------------------------------------------------
85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
86 // To enable the debugging, define the symbol DEBUG_MEM in the line
87 // below. This will result in text being sent to stdout that looks
89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91 // Each line lists a construction (ct) or destruction (dt) event, the
92 // object address, the number of outstanding objects after the event,
93 // and the pattern of the object in question.
99 static int32_t _dbgCount
= 0;
101 static inline void _dbgct(UnicodeSet
* set
) {
103 set
->toPattern(str
, TRUE
);
105 str
.extract(0, 39, buf
, "");
106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set
, ++_dbgCount
, buf
);
109 static inline void _dbgdt(UnicodeSet
* set
) {
111 set
->toPattern(str
, TRUE
);
113 str
.extract(0, 39, buf
, "");
114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set
, --_dbgCount
, buf
);
124 //----------------------------------------------------------------
125 // UnicodeString in UVector support
126 //----------------------------------------------------------------
128 static void U_CALLCONV
cloneUnicodeString(UElement
*dst
, UElement
*src
) {
129 dst
->pointer
= new UnicodeString(*(UnicodeString
*)src
->pointer
);
132 static int8_t U_CALLCONV
compareUnicodeString(UElement t1
, UElement t2
) {
133 const UnicodeString
&a
= *(const UnicodeString
*)t1
.pointer
;
134 const UnicodeString
&b
= *(const UnicodeString
*)t2
.pointer
;
138 //----------------------------------------------------------------
140 //----------------------------------------------------------------
143 * Constructs an empty set.
145 UnicodeSet::UnicodeSet() :
146 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
147 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
150 UErrorCode status
= U_ZERO_ERROR
;
151 allocateStrings(status
);
152 if (U_FAILURE(status
)) {
155 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
157 list
[0] = UNICODESET_HIGH
;
158 } else { // If memory allocation failed, set to bogus state.
166 * Constructs a set containing the given range. If <code>end >
167 * start</code> then an empty set is created.
169 * @param start first character, inclusive, of range
170 * @param end last character, inclusive, of range
172 UnicodeSet::UnicodeSet(UChar32 start
, UChar32 end
) :
173 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
174 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
177 UErrorCode status
= U_ZERO_ERROR
;
178 allocateStrings(status
);
179 if (U_FAILURE(status
)) {
182 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
184 list
[0] = UNICODESET_HIGH
;
185 complement(start
, end
);
186 } else { // If memory allocation failed, set to bogus state.
194 * Constructs a set that is identical to the given UnicodeSet.
196 UnicodeSet::UnicodeSet(const UnicodeSet
& o
) :
198 len(0), capacity(o
.isFrozen() ? o
.len
: o
.len
+ GROW_EXTRA
), list(0),
200 buffer(0), bufferCapacity(0),
201 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
204 UErrorCode status
= U_ZERO_ERROR
;
205 allocateStrings(status
);
206 if (U_FAILURE(status
)) {
209 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
212 } else { // If memory allocation failed, set to bogus state.
219 // Copy-construct as thawed.
220 UnicodeSet::UnicodeSet(const UnicodeSet
& o
, UBool
/* asThawed */) :
222 len(0), capacity(o
.len
+ GROW_EXTRA
), list(0),
224 buffer(0), bufferCapacity(0),
225 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
228 UErrorCode status
= U_ZERO_ERROR
;
229 allocateStrings(status
);
230 if (U_FAILURE(status
)) {
233 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
235 // *this = o except for bmpSet and stringSpan
237 uprv_memcpy(list
, o
.list
, len
*sizeof(UChar32
));
238 if (strings
!= NULL
&& o
.strings
!= NULL
) {
239 strings
->assign(*o
.strings
, cloneUnicodeString
, status
);
240 } else { // Invalid strings.
245 setPattern(UnicodeString(o
.pat
, o
.patLen
));
247 } else { // If memory allocation failed, set to bogus state.
257 UnicodeSet::~UnicodeSet() {
258 _dbgdt(this); // first!
270 * Assigns this object to be a copy of another.
272 UnicodeSet
& UnicodeSet::operator=(const UnicodeSet
& o
) {
283 UErrorCode ec
= U_ZERO_ERROR
;
284 ensureCapacity(o
.len
, ec
);
286 return *this; // There is no way to report this error :-(
289 uprv_memcpy(list
, o
.list
, len
*sizeof(UChar32
));
290 if (o
.bmpSet
== NULL
) {
293 bmpSet
= new BMPSet(*o
.bmpSet
, list
, len
);
294 if (bmpSet
== NULL
) { // Check for memory allocation error.
299 if (strings
!= NULL
&& o
.strings
!= NULL
) {
300 strings
->assign(*o
.strings
, cloneUnicodeString
, ec
);
301 } else { // Invalid strings.
305 if (o
.stringSpan
== NULL
) {
308 stringSpan
= new UnicodeSetStringSpan(*o
.stringSpan
, *strings
);
309 if (stringSpan
== NULL
) { // Check for memory allocation error.
316 setPattern(UnicodeString(o
.pat
, o
.patLen
));
322 * Returns a copy of this object. All UnicodeMatcher objects have
323 * to support cloning in order to allow classes using
324 * UnicodeMatchers, such as Transliterator, to implement cloning.
326 UnicodeFunctor
* UnicodeSet::clone() const {
327 return new UnicodeSet(*this);
330 UnicodeFunctor
*UnicodeSet::cloneAsThawed() const {
331 return new UnicodeSet(*this, TRUE
);
335 * Compares the specified object with this set for equality. Returns
336 * <tt>true</tt> if the two sets
337 * have the same size, and every member of the specified set is
338 * contained in this set (or equivalently, every member of this set is
339 * contained in the specified set).
341 * @param o set to be compared for equality with this set.
342 * @return <tt>true</tt> if the specified set is equal to this set.
344 UBool
UnicodeSet::operator==(const UnicodeSet
& o
) const {
345 if (len
!= o
.len
) return FALSE
;
346 for (int32_t i
= 0; i
< len
; ++i
) {
347 if (list
[i
] != o
.list
[i
]) return FALSE
;
349 if (*strings
!= *o
.strings
) return FALSE
;
354 * Returns the hash code value for this set.
356 * @return the hash code value for this set.
357 * @see Object#hashCode()
359 int32_t UnicodeSet::hashCode(void) const {
360 int32_t result
= len
;
361 for (int32_t i
= 0; i
< len
; ++i
) {
368 //----------------------------------------------------------------
370 //----------------------------------------------------------------
373 * Returns the number of elements in this set (its cardinality),
374 * Note than the elements of a set may include both individual
375 * codepoints and strings.
377 * @return the number of elements in this set (its cardinality).
379 int32_t UnicodeSet::size(void) const {
381 int32_t count
= getRangeCount();
382 for (int32_t i
= 0; i
< count
; ++i
) {
383 n
+= getRangeEnd(i
) - getRangeStart(i
) + 1;
385 return n
+ strings
->size();
389 * Returns <tt>true</tt> if this set contains no elements.
391 * @return <tt>true</tt> if this set contains no elements.
393 UBool
UnicodeSet::isEmpty(void) const {
394 return len
== 1 && strings
->size() == 0;
398 * Returns true if this set contains the given character.
399 * @param c character to be checked for containment
400 * @return true if the test condition is met
402 UBool
UnicodeSet::contains(UChar32 c
) const {
403 // Set i to the index of the start item greater than ch
404 // We know we will terminate without length test!
405 // LATER: for large sets, add binary search
408 // if (c < list[++i]) break;
410 if (bmpSet
!= NULL
) {
411 return bmpSet
->contains(c
);
413 if (stringSpan
!= NULL
) {
414 return stringSpan
->contains(c
);
416 if (c
>= UNICODESET_HIGH
) { // Don't need to check LOW bound
419 int32_t i
= findCodePoint(c
);
420 return (UBool
)(i
& 1); // return true if odd
424 * Returns the smallest value i such that c < list[i]. Caller
425 * must ensure that c is a legal value or this method will enter
426 * an infinite loop. This method performs a binary search.
427 * @param c a character in the range MIN_VALUE..MAX_VALUE
429 * @return the smallest integer i in the range 0..len-1,
430 * inclusive, such that c < list[i]
432 int32_t UnicodeSet::findCodePoint(UChar32 c
) const {
435 set list[] c=0 1 3 4 7 8
436 === ============== ===========
437 [] [110000] 0 0 0 0 0 0
438 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
439 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
440 [:Any:] [0, 110000] 1 1 1 1 1 1
443 // Return the smallest i such that c < list[i]. Assume
444 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
447 // High runner test. c is often after the last range, so an
448 // initial check for this condition pays off.
450 int32_t hi
= len
- 1;
451 if (lo
>= hi
|| c
>= list
[hi
-1])
453 // invariant: c >= list[lo]
454 // invariant: c < list[hi]
456 int32_t i
= (lo
+ hi
) >> 1;
459 } else if (c
< list
[i
]) {
469 * Returns true if this set contains every character
470 * of the given range.
471 * @param start first character, inclusive, of the range
472 * @param end last character, inclusive, of the range
473 * @return true if the test condition is met
475 UBool
UnicodeSet::contains(UChar32 start
, UChar32 end
) const {
478 // if (start < list[++i]) break;
480 int32_t i
= findCodePoint(start
);
481 return ((i
& 1) != 0 && end
< list
[i
]);
485 * Returns <tt>true</tt> if this set contains the given
486 * multicharacter string.
487 * @param s string to be checked for containment
488 * @return <tt>true</tt> if this set contains the specified string
490 UBool
UnicodeSet::contains(const UnicodeString
& s
) const {
491 if (s
.length() == 0) return FALSE
;
492 int32_t cp
= getSingleCP(s
);
494 return strings
->contains((void*) &s
);
496 return contains((UChar32
) cp
);
501 * Returns true if this set contains all the characters and strings
503 * @param c set to be checked for containment
504 * @return true if the test condition is met
506 UBool
UnicodeSet::containsAll(const UnicodeSet
& c
) const {
507 // The specified set is a subset if all of its pairs are contained in
508 // this set. It's possible to code this more efficiently in terms of
509 // direct manipulation of the inversion lists if the need arises.
510 int32_t n
= c
.getRangeCount();
511 for (int i
=0; i
<n
; ++i
) {
512 if (!contains(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
516 if (!strings
->containsAll(*c
.strings
)) return FALSE
;
521 * Returns true if this set contains all the characters
522 * of the given string.
523 * @param s string containing characters to be checked for containment
524 * @return true if the test condition is met
526 UBool
UnicodeSet::containsAll(const UnicodeString
& s
) const {
527 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_CONTAINED
) ==
532 * Returns true if this set contains none of the characters
533 * of the given range.
534 * @param start first character, inclusive, of the range
535 * @param end last character, inclusive, of the range
536 * @return true if the test condition is met
538 UBool
UnicodeSet::containsNone(UChar32 start
, UChar32 end
) const {
541 // if (start < list[++i]) break;
543 int32_t i
= findCodePoint(start
);
544 return ((i
& 1) == 0 && end
< list
[i
]);
548 * Returns true if this set contains none of the characters and strings
550 * @param c set to be checked for containment
551 * @return true if the test condition is met
553 UBool
UnicodeSet::containsNone(const UnicodeSet
& c
) const {
554 // The specified set is a subset if all of its pairs are contained in
555 // this set. It's possible to code this more efficiently in terms of
556 // direct manipulation of the inversion lists if the need arises.
557 int32_t n
= c
.getRangeCount();
558 for (int32_t i
=0; i
<n
; ++i
) {
559 if (!containsNone(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
563 if (!strings
->containsNone(*c
.strings
)) return FALSE
;
568 * Returns true if this set contains none of the characters
569 * of the given string.
570 * @param s string containing characters to be checked for containment
571 * @return true if the test condition is met
573 UBool
UnicodeSet::containsNone(const UnicodeString
& s
) const {
574 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_NOT_CONTAINED
) ==
579 * Returns <tt>true</tt> if this set contains any character whose low byte
580 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
583 UBool
UnicodeSet::matchesIndexValue(uint8_t v
) const {
584 /* The index value v, in the range [0,255], is contained in this set if
585 * it is contained in any pair of this set. Pairs either have the high
586 * bytes equal, or unequal. If the high bytes are equal, then we have
587 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
588 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
589 * Then v is contained if xx <= v || v <= yy. (This is identical to the
590 * time zone month containment logic.)
593 int32_t rangeCount
=getRangeCount();
594 for (i
=0; i
<rangeCount
; ++i
) {
595 UChar32 low
= getRangeStart(i
);
596 UChar32 high
= getRangeEnd(i
);
597 if ((low
& ~0xFF) == (high
& ~0xFF)) {
598 if ((low
& 0xFF) <= v
&& v
<= (high
& 0xFF)) {
601 } else if ((low
& 0xFF) <= v
|| v
<= (high
& 0xFF)) {
605 if (strings
->size() != 0) {
606 for (i
=0; i
<strings
->size(); ++i
) {
607 const UnicodeString
& s
= *(const UnicodeString
*)strings
->elementAt(i
);
608 //if (s.length() == 0) {
609 // // Empty strings match everything
612 // assert(s.length() != 0); // We enforce this elsewhere
613 UChar32 c
= s
.char32At(0);
614 if ((c
& 0xFF) == v
) {
623 * Implementation of UnicodeMatcher::matches(). Always matches the
624 * longest possible multichar string.
626 UMatchDegree
UnicodeSet::matches(const Replaceable
& text
,
630 if (offset
== limit
) {
631 // Strings, if any, have length != 0, so we don't worry
632 // about them here. If we ever allow zero-length strings
633 // we much check for them here.
634 if (contains(U_ETHER
)) {
635 return incremental
? U_PARTIAL_MATCH
: U_MATCH
;
640 if (strings
->size() != 0) { // try strings first
642 // might separate forward and backward loops later
643 // for now they are combined
645 // TODO Improve efficiency of this, at least in the forward
646 // direction, if not in both. In the forward direction we
647 // can assume the strings are sorted.
650 UBool forward
= offset
< limit
;
652 // firstChar is the leftmost char to match in the
653 // forward direction or the rightmost char to match in
654 // the reverse direction.
655 UChar firstChar
= text
.charAt(offset
);
657 // If there are multiple strings that can match we
658 // return the longest match.
659 int32_t highWaterLength
= 0;
661 for (i
=0; i
<strings
->size(); ++i
) {
662 const UnicodeString
& trial
= *(const UnicodeString
*)strings
->elementAt(i
);
664 //if (trial.length() == 0) {
665 // return U_MATCH; // null-string always matches
667 // assert(trial.length() != 0); // We ensure this elsewhere
669 UChar c
= trial
.charAt(forward
? 0 : trial
.length() - 1);
671 // Strings are sorted, so we can optimize in the
672 // forward direction.
673 if (forward
&& c
> firstChar
) break;
674 if (c
!= firstChar
) continue;
676 int32_t matchLen
= matchRest(text
, offset
, limit
, trial
);
679 int32_t maxLen
= forward
? limit
-offset
: offset
-limit
;
680 if (matchLen
== maxLen
) {
681 // We have successfully matched but only up to limit.
682 return U_PARTIAL_MATCH
;
686 if (matchLen
== trial
.length()) {
687 // We have successfully matched the whole string.
688 if (matchLen
> highWaterLength
) {
689 highWaterLength
= matchLen
;
691 // In the forward direction we know strings
692 // are sorted so we can bail early.
693 if (forward
&& matchLen
< highWaterLength
) {
700 // We've checked all strings without a partial match.
701 // If we have full matches, return the longest one.
702 if (highWaterLength
!= 0) {
703 offset
+= forward
? highWaterLength
: -highWaterLength
;
707 return UnicodeFilter::matches(text
, offset
, limit
, incremental
);
712 * Returns the longest match for s in text at the given position.
713 * If limit > start then match forward from start+1 to limit
714 * matching all characters except s.charAt(0). If limit < start,
715 * go backward starting from start-1 matching all characters
716 * except s.charAt(s.length()-1). This method assumes that the
717 * first character, text.charAt(start), matches s, so it does not
719 * @param text the text to match
720 * @param start the first character to match. In the forward
721 * direction, text.charAt(start) is matched against s.charAt(0).
722 * In the reverse direction, it is matched against
723 * s.charAt(s.length()-1).
724 * @param limit the limit offset for matching, either last+1 in
725 * the forward direction, or last-1 in the reverse direction,
726 * where last is the index of the last character to match.
727 * @return If part of s matches up to the limit, return |limit -
728 * start|. If all of s matches before reaching the limit, return
729 * s.length(). If there is a mismatch between s and text, return
732 int32_t UnicodeSet::matchRest(const Replaceable
& text
,
733 int32_t start
, int32_t limit
,
734 const UnicodeString
& s
) {
737 int32_t slen
= s
.length();
739 maxLen
= limit
- start
;
740 if (maxLen
> slen
) maxLen
= slen
;
741 for (i
= 1; i
< maxLen
; ++i
) {
742 if (text
.charAt(start
+ i
) != s
.charAt(i
)) return 0;
745 maxLen
= start
- limit
;
746 if (maxLen
> slen
) maxLen
= slen
;
747 --slen
; // <=> slen = s.length() - 1;
748 for (i
= 1; i
< maxLen
; ++i
) {
749 if (text
.charAt(start
- i
) != s
.charAt(slen
- i
)) return 0;
756 * Implement of UnicodeMatcher
758 void UnicodeSet::addMatchSetTo(UnicodeSet
& toUnionTo
) const {
759 toUnionTo
.addAll(*this);
763 * Returns the index of the given character within this set, where
764 * the set is ordered by ascending code point. If the character
765 * is not in this set, return -1. The inverse of this method is
766 * <code>charAt()</code>.
767 * @return an index from 0..size()-1, or -1
769 int32_t UnicodeSet::indexOf(UChar32 c
) const {
770 if (c
< MIN_VALUE
|| c
> MAX_VALUE
) {
776 UChar32 start
= list
[i
++];
780 UChar32 limit
= list
[i
++];
782 return n
+ c
- start
;
789 * Returns the character at the given index within this set, where
790 * the set is ordered by ascending code point. If the index is
791 * out of range, return (UChar32)-1. The inverse of this method is
792 * <code>indexOf()</code>.
793 * @param index an index from 0..size()-1
794 * @return the character at the given index, or (UChar32)-1.
796 UChar32
UnicodeSet::charAt(int32_t index
) const {
798 // len2 is the largest even integer <= len, that is, it is len
799 // for even values and len-1 for odd values. With odd values
800 // the last entry is UNICODESET_HIGH.
801 int32_t len2
= len
& ~1;
802 for (int32_t i
=0; i
< len2
;) {
803 UChar32 start
= list
[i
++];
804 int32_t count
= list
[i
++] - start
;
806 return (UChar32
)(start
+ index
);
815 * Make this object represent the range <code>start - end</code>.
816 * If <code>end > start</code> then this object is set to an
819 * @param start first character in the set, inclusive
820 * @rparam end last character in the set, inclusive
822 UnicodeSet
& UnicodeSet::set(UChar32 start
, UChar32 end
) {
824 complement(start
, end
);
829 * Adds the specified range to this set if it is not already
830 * present. If this set already contains the specified range,
831 * the call leaves this set unchanged. If <code>end > start</code>
832 * then an empty range is added, leaving the set unchanged.
834 * @param start first character, inclusive, of range to be added
836 * @param end last character, inclusive, of range to be added
839 UnicodeSet
& UnicodeSet::add(UChar32 start
, UChar32 end
) {
840 if (pinCodePoint(start
) < pinCodePoint(end
)) {
841 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
843 } else if (start
== end
) {
849 // #define DEBUG_US_ADD
853 void dump(UChar32 c
) {
855 printf("%c", (char)c
);
860 void dump(const UChar32
* list
, int32_t len
) {
862 for (int32_t i
=0; i
<len
; ++i
) {
863 if (i
!= 0) printf(", ");
871 * Adds the specified character to this set if it is not already
872 * present. If this set already contains the specified character,
873 * the call leaves this set unchanged.
875 UnicodeSet
& UnicodeSet::add(UChar32 c
) {
876 // find smallest i such that c < list[i]
877 // if odd, then it is IN the set
878 // if even, then it is OUT of the set
879 int32_t i
= findCodePoint(pinCodePoint(c
));
882 if ((i
& 1) != 0 || isFrozen() || isBogus()) return *this;
885 // assert(list[len-1] == HIGH);
888 // [start_0, limit_0, start_1, limit_1, HIGH]
890 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
894 // i == 0 means c is before the first range
899 printf(" found at %d", i
);
905 if (c
== list
[i
]-1) {
906 // c is before start of next range
908 // if we touched the HIGH mark, then add a new one
909 if (c
== (UNICODESET_HIGH
- 1)) {
910 UErrorCode status
= U_ZERO_ERROR
;
911 ensureCapacity(len
+1, status
);
912 if (U_FAILURE(status
)) {
913 return *this; // There is no way to report this error :-(
915 list
[len
++] = UNICODESET_HIGH
;
917 if (i
> 0 && c
== list
[i
-1]) {
918 // collapse adjacent ranges
920 // [..., start_k-1, c, c, limit_k, ..., HIGH]
924 //for (int32_t k=i-1; k<len-2; ++k) {
925 // list[k] = list[k+2];
927 UChar32
* dst
= list
+ i
- 1;
928 UChar32
* src
= dst
+ 2;
929 UChar32
* srclimit
= list
+ len
;
930 while (src
< srclimit
) *(dst
++) = *(src
++);
936 else if (i
> 0 && c
== list
[i
-1]) {
937 // c is after end of prior range
939 // no need to check for collapse here
943 // At this point we know the new char is not adjacent to
944 // any existing ranges, and it is not 10FFFF.
947 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
951 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
955 UErrorCode status
= U_ZERO_ERROR
;
956 ensureCapacity(len
+2, status
);
957 if (U_FAILURE(status
)) {
958 return *this; // There is no way to report this error :-(
961 //for (int32_t k=len-1; k>=i; --k) {
962 // list[k+2] = list[k];
964 UChar32
* src
= list
+ len
;
965 UChar32
* dst
= src
+ 2;
966 UChar32
* srclimit
= list
+ i
;
967 while (src
> srclimit
) *(--dst
) = *(--src
);
978 for (i
=1; i
<len
; ++i
) {
979 if (list
[i
] <= list
[i
-1]) {
981 printf("ERROR: list has been corrupted\n");
992 * Adds the specified multicharacter to this set if it is not already
993 * present. If this set already contains the multicharacter,
994 * the call leaves this set unchanged.
995 * Thus "ch" => {"ch"}
996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
997 * @param s the source string
998 * @return the modified set, for chaining
1000 UnicodeSet
& UnicodeSet::add(const UnicodeString
& s
) {
1001 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1002 int32_t cp
= getSingleCP(s
);
1004 if (!strings
->contains((void*) &s
)) {
1015 * Adds the given string, in order, to 'strings'. The given string
1016 * must have been checked by the caller to not be empty and to not
1017 * already be in 'strings'.
1019 void UnicodeSet::_add(const UnicodeString
& s
) {
1020 if (isFrozen() || isBogus()) {
1023 UnicodeString
* t
= new UnicodeString(s
);
1024 if (t
== NULL
) { // Check for memory allocation error.
1028 UErrorCode ec
= U_ZERO_ERROR
;
1029 strings
->sortedInsert(t
, compareUnicodeString
, ec
);
1030 if (U_FAILURE(ec
)) {
1037 * @return a code point IF the string consists of a single one.
1038 * otherwise returns -1.
1039 * @param string to test
1041 int32_t UnicodeSet::getSingleCP(const UnicodeString
& s
) {
1042 //if (s.length() < 1) {
1043 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1045 if (s
.length() > 2) return -1;
1046 if (s
.length() == 1) return s
.charAt(0);
1048 // at this point, len = 2
1049 UChar32 cp
= s
.char32At(0);
1050 if (cp
> 0xFFFF) { // is surrogate pair
1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1058 * If this set already any particular character, it has no effect on that character.
1059 * @param the source string
1060 * @return the modified set, for chaining
1062 UnicodeSet
& UnicodeSet::addAll(const UnicodeString
& s
) {
1064 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1073 * If this set already any particular character, it has no effect on that character.
1074 * @param the source string
1075 * @return the modified set, for chaining
1077 UnicodeSet
& UnicodeSet::retainAll(const UnicodeString
& s
) {
1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1086 * If this set already any particular character, it has no effect on that character.
1087 * @param the source string
1088 * @return the modified set, for chaining
1090 UnicodeSet
& UnicodeSet::complementAll(const UnicodeString
& s
) {
1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1099 * If this set already any particular character, it has no effect on that character.
1100 * @param the source string
1101 * @return the modified set, for chaining
1103 UnicodeSet
& UnicodeSet::removeAll(const UnicodeString
& s
) {
1110 UnicodeSet
& UnicodeSet::removeAllStrings() {
1111 strings
->removeAllElements();
1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1119 * @param the source string
1120 * @return a newly created set containing the given string
1122 UnicodeSet
* U_EXPORT2
UnicodeSet::createFrom(const UnicodeString
& s
) {
1123 UnicodeSet
*set
= new UnicodeSet();
1124 if (set
!= NULL
) { // Check for memory allocation error.
1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1133 * @param the source string
1134 * @return a newly created set containing the given characters
1136 UnicodeSet
* U_EXPORT2
UnicodeSet::createFromAll(const UnicodeString
& s
) {
1137 UnicodeSet
*set
= new UnicodeSet();
1138 if (set
!= NULL
) { // Check for memory allocation error.
1145 * Retain only the elements in this set that are contained in the
1146 * specified range. If <code>end > start</code> then an empty range is
1147 * retained, leaving the set empty.
1149 * @param start first character, inclusive, of range to be retained
1151 * @param end last character, inclusive, of range to be retained
1154 UnicodeSet
& UnicodeSet::retain(UChar32 start
, UChar32 end
) {
1155 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1156 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1157 retain(range
, 2, 0);
1164 UnicodeSet
& UnicodeSet::retain(UChar32 c
) {
1165 return retain(c
, c
);
1169 * Removes the specified range from this set if it is present.
1170 * The set will not contain the specified range once the call
1171 * returns. If <code>end > start</code> then an empty range is
1172 * removed, leaving the set unchanged.
1174 * @param start first character, inclusive, of range to be removed
1176 * @param end last character, inclusive, of range to be removed
1179 UnicodeSet
& UnicodeSet::remove(UChar32 start
, UChar32 end
) {
1180 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1181 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1182 retain(range
, 2, 2);
1188 * Removes the specified character from this set if it is present.
1189 * The set will not contain the specified range once the call
1192 UnicodeSet
& UnicodeSet::remove(UChar32 c
) {
1193 return remove(c
, c
);
1197 * Removes the specified string from this set if it is present.
1198 * The set will not contain the specified character once the call
1200 * @param the source string
1201 * @return the modified set, for chaining
1203 UnicodeSet
& UnicodeSet::remove(const UnicodeString
& s
) {
1204 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1205 int32_t cp
= getSingleCP(s
);
1207 strings
->removeElement((void*) &s
);
1210 remove((UChar32
)cp
, (UChar32
)cp
);
1216 * Complements the specified range in this set. Any character in
1217 * the range will be removed if it is in this set, or will be
1218 * added if it is not in this set. If <code>end > start</code>
1219 * then an empty range is xor'ed, leaving the set unchanged.
1221 * @param start first character, inclusive, of range to be removed
1223 * @param end last character, inclusive, of range to be removed
1226 UnicodeSet
& UnicodeSet::complement(UChar32 start
, UChar32 end
) {
1227 if (isFrozen() || isBogus()) {
1230 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1231 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1232 exclusiveOr(range
, 2, 0);
1238 UnicodeSet
& UnicodeSet::complement(UChar32 c
) {
1239 return complement(c
, c
);
1243 * This is equivalent to
1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1246 UnicodeSet
& UnicodeSet::complement(void) {
1247 if (isFrozen() || isBogus()) {
1250 UErrorCode status
= U_ZERO_ERROR
;
1251 if (list
[0] == UNICODESET_LOW
) {
1252 ensureBufferCapacity(len
-1, status
);
1253 if (U_FAILURE(status
)) {
1256 uprv_memcpy(buffer
, list
+ 1, (len
-1)*sizeof(UChar32
));
1259 ensureBufferCapacity(len
+1, status
);
1260 if (U_FAILURE(status
)) {
1263 uprv_memcpy(buffer
+ 1, list
, len
*sizeof(UChar32
));
1264 buffer
[0] = UNICODESET_LOW
;
1273 * Complement the specified string in this set.
1274 * The set will not contain the specified string once the call
1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1277 * @param s the string to complement
1278 * @return this object, for chaining
1280 UnicodeSet
& UnicodeSet::complement(const UnicodeString
& s
) {
1281 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1282 int32_t cp
= getSingleCP(s
);
1284 if (strings
->contains((void*) &s
)) {
1285 strings
->removeElement((void*) &s
);
1291 complement((UChar32
)cp
, (UChar32
)cp
);
1297 * Adds all of the elements in the specified set to this set if
1298 * they're not already present. This operation effectively
1299 * modifies this set so that its value is the <i>union</i> of the two
1300 * sets. The behavior of this operation is unspecified if the specified
1301 * collection is modified while the operation is in progress.
1303 * @param c set whose elements are to be added to this set.
1304 * @see #add(char, char)
1306 UnicodeSet
& UnicodeSet::addAll(const UnicodeSet
& c
) {
1307 if ( c
.len
>0 && c
.list
!=NULL
) {
1308 add(c
.list
, c
.len
, 0);
1311 // Add strings in order
1312 if ( c
.strings
!=NULL
) {
1313 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1314 const UnicodeString
* s
= (const UnicodeString
*)c
.strings
->elementAt(i
);
1315 if (!strings
->contains((void*) s
)) {
1324 * Retains only the elements in this set that are contained in the
1325 * specified set. In other words, removes from this set all of
1326 * its elements that are not contained in the specified set. This
1327 * operation effectively modifies this set so that its value is
1328 * the <i>intersection</i> of the two sets.
1330 * @param c set that defines which elements this set will retain.
1332 UnicodeSet
& UnicodeSet::retainAll(const UnicodeSet
& c
) {
1333 if (isFrozen() || isBogus()) {
1336 retain(c
.list
, c
.len
, 0);
1337 strings
->retainAll(*c
.strings
);
1342 * Removes from this set all of its elements that are contained in the
1343 * specified set. This operation effectively modifies this
1344 * set so that its value is the <i>asymmetric set difference</i> of
1347 * @param c set that defines which elements will be removed from
1350 UnicodeSet
& UnicodeSet::removeAll(const UnicodeSet
& c
) {
1351 if (isFrozen() || isBogus()) {
1354 retain(c
.list
, c
.len
, 2);
1355 strings
->removeAll(*c
.strings
);
1360 * Complements in this set all elements contained in the specified
1361 * set. Any character in the other set will be removed if it is
1362 * in this set, or will be added if it is not in this set.
1364 * @param c set that defines which elements will be xor'ed from
1367 UnicodeSet
& UnicodeSet::complementAll(const UnicodeSet
& c
) {
1368 if (isFrozen() || isBogus()) {
1371 exclusiveOr(c
.list
, c
.len
, 0);
1373 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1374 void* e
= c
.strings
->elementAt(i
);
1375 if (!strings
->removeElement(e
)) {
1376 _add(*(const UnicodeString
*)e
);
1383 * Removes all of the elements from this set. This set will be
1384 * empty after this call returns.
1386 UnicodeSet
& UnicodeSet::clear(void) {
1391 list
[0] = UNICODESET_HIGH
;
1395 if (strings
!= NULL
) {
1396 strings
->removeAllElements();
1398 if (list
!= NULL
&& strings
!= NULL
) {
1406 * Iteration method that returns the number of ranges contained in
1408 * @see #getRangeStart
1411 int32_t UnicodeSet::getRangeCount() const {
1416 * Iteration method that returns the first character in the
1417 * specified range of this set.
1418 * @see #getRangeCount
1421 UChar32
UnicodeSet::getRangeStart(int32_t index
) const {
1422 return list
[index
*2];
1426 * Iteration method that returns the last character in the
1427 * specified range of this set.
1428 * @see #getRangeStart
1431 UChar32
UnicodeSet::getRangeEnd(int32_t index
) const {
1432 return list
[index
*2 + 1] - 1;
1435 int32_t UnicodeSet::getStringCount() const {
1436 return strings
->size();
1439 const UnicodeString
* UnicodeSet::getString(int32_t index
) const {
1440 return (const UnicodeString
*) strings
->elementAt(index
);
1444 * Reallocate this objects internal structures to take up the least
1445 * possible space, without changing this object's value.
1447 UnicodeSet
& UnicodeSet::compact() {
1448 if (isFrozen() || isBogus()) {
1451 // Delete buffer first to defragment memory less.
1452 if (buffer
!= NULL
) {
1456 if (len
< capacity
) {
1457 // Make the capacity equal to len or 1.
1458 // We don't want to realloc of 0 size.
1459 int32_t newCapacity
= len
+ (len
== 0);
1460 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * newCapacity
);
1463 capacity
= newCapacity
;
1465 // else what the heck happened?! We allocated less memory!
1466 // Oh well. We'll keep our original array.
1471 #ifdef DEBUG_SERIALIZE
1476 * Deserialize constructor.
1478 UnicodeSet::UnicodeSet(const uint16_t data
[], int32_t dataLen
, ESerialization serialization
, UErrorCode
&ec
)
1479 : len(1), capacity(1+START_EXTRA
), list(0), bmpSet(0), buffer(0),
1480 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
1488 if( (serialization
!= kSerialized
)
1491 ec
= U_ILLEGAL_ARGUMENT_ERROR
;
1496 allocateStrings(ec
);
1497 if (U_FAILURE(ec
)) {
1503 int32_t headerSize
= ((data
[0]&0x8000)) ?2:1;
1504 int32_t bmpLength
= (headerSize
==1)?data
[0]:data
[1];
1506 len
= (((data
[0]&0x7FFF)-bmpLength
)/2)+bmpLength
;
1507 #ifdef DEBUG_SERIALIZE
1508 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen
,headerSize
,bmpLength
,len
, data
[0],data
[1],data
[2],data
[3]);
1511 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
1512 if(!list
|| U_FAILURE(ec
)) {
1518 for(i
= 0; i
< bmpLength
;i
++) {
1519 list
[i
] = data
[i
+headerSize
];
1520 #ifdef DEBUG_SERIALIZE
1521 printf("<<16@%d[%d] %X\n", i
+headerSize
, i
, list
[i
]);
1525 for(i
=bmpLength
;i
<len
;i
++) {
1526 list
[i
] = ((UChar32
)data
[headerSize
+bmpLength
+(i
-bmpLength
)*2+0] << 16) +
1527 ((UChar32
)data
[headerSize
+bmpLength
+(i
-bmpLength
)*2+1]);
1528 #ifdef DEBUG_SERIALIZE
1529 printf("<<32@%d+[%d] %lX\n", headerSize
+bmpLength
+i
, i
, list
[i
]);
1533 list
[len
++]=UNICODESET_HIGH
;
1537 int32_t UnicodeSet::serialize(uint16_t *dest
, int32_t destCapacity
, UErrorCode
& ec
) const {
1538 int32_t bmpLength
, length
, destLength
;
1540 if (U_FAILURE(ec
)) {
1544 if (destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1545 ec
=U_ILLEGAL_ARGUMENT_ERROR
;
1549 /* count necessary 16-bit units */
1550 length
=this->len
-1; // Subtract 1 to ignore final UNICODESET_HIGH
1551 // assert(length>=0);
1554 if (destCapacity
>0) {
1557 ec
=U_BUFFER_OVERFLOW_ERROR
;
1563 if (this->list
[length
-1]<=0xffff) {
1566 } else if (this->list
[0]>=0x10000) {
1567 /* all supplementary */
1571 /* some BMP, some supplementary */
1572 for (bmpLength
=0; bmpLength
<length
&& this->list
[bmpLength
]<=0xffff; ++bmpLength
) {}
1573 length
=bmpLength
+2*(length
-bmpLength
);
1575 #ifdef DEBUG_SERIALIZE
1576 printf(">> bmpLength%d length%d len%d\n", bmpLength
, length
, len
);
1578 /* length: number of 16-bit array units */
1579 if (length
>0x7fff) {
1580 /* there are only 15 bits for the length in the first serialized word */
1581 ec
=U_INDEX_OUTOFBOUNDS_ERROR
;
1586 * total serialized length:
1587 * number of 16-bit array units (length) +
1588 * 1 length unit (always) +
1589 * 1 bmpLength unit (if there are supplementary values)
1591 destLength
=length
+((length
>bmpLength
)?2:1);
1592 if (destLength
<=destCapacity
) {
1596 #ifdef DEBUG_SERIALIZE
1597 printf("writeHdr\n");
1599 *dest
=(uint16_t)length
;
1600 if (length
>bmpLength
) {
1602 *++dest
=(uint16_t)bmpLength
;
1606 /* write the BMP part of the array */
1608 for (i
=0; i
<bmpLength
; ++i
) {
1609 #ifdef DEBUG_SERIALIZE
1610 printf("writebmp: %x\n", (int)*p
);
1612 *dest
++=(uint16_t)*p
++;
1615 /* write the supplementary part of the array */
1616 for (; i
<length
; i
+=2) {
1617 #ifdef DEBUG_SERIALIZE
1618 printf("write32: %x\n", (int)*p
);
1620 *dest
++=(uint16_t)(*p
>>16);
1621 *dest
++=(uint16_t)*p
++;
1624 ec
=U_BUFFER_OVERFLOW_ERROR
;
1629 //----------------------------------------------------------------
1630 // Implementation: Utility methods
1631 //----------------------------------------------------------------
1634 * Allocate our strings vector and return TRUE if successful.
1636 UBool
UnicodeSet::allocateStrings(UErrorCode
&status
) {
1637 if (U_FAILURE(status
)) {
1640 strings
= new UVector(uprv_deleteUObject
,
1641 uhash_compareUnicodeString
, 1, status
);
1642 if (strings
== NULL
) { // Check for memory allocation error.
1643 status
= U_MEMORY_ALLOCATION_ERROR
;
1646 if (U_FAILURE(status
)) {
1654 void UnicodeSet::ensureCapacity(int32_t newLen
, UErrorCode
& ec
) {
1655 if (newLen
<= capacity
)
1657 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1659 ec
= U_MEMORY_ALLOCATION_ERROR
;
1664 capacity
= newLen
+ GROW_EXTRA
;
1665 // else we keep the original contents on the memory failure.
1668 void UnicodeSet::ensureBufferCapacity(int32_t newLen
, UErrorCode
& ec
) {
1669 if (buffer
!= NULL
&& newLen
<= bufferCapacity
)
1671 UChar32
* temp
= (UChar32
*) uprv_realloc(buffer
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1673 ec
= U_MEMORY_ALLOCATION_ERROR
;
1678 bufferCapacity
= newLen
+ GROW_EXTRA
;
1679 // else we keep the original contents on the memory failure.
1683 * Swap list and buffer.
1685 void UnicodeSet::swapBuffers(void) {
1686 // swap list and buffer
1687 UChar32
* temp
= list
;
1691 int32_t c
= capacity
;
1692 capacity
= bufferCapacity
;
1696 void UnicodeSet::setToBogus() {
1697 clear(); // Remove everything in the set.
1701 //----------------------------------------------------------------
1702 // Implementation: Fundamental operators
1703 //----------------------------------------------------------------
1705 static inline UChar32
max(UChar32 a
, UChar32 b
) {
1706 return (a
> b
) ? a
: b
;
1709 // polarity = 0, 3 is normal: x xor y
1710 // polarity = 1, 2: x xor ~y == x === y
1712 void UnicodeSet::exclusiveOr(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1713 if (isFrozen() || isBogus()) {
1716 UErrorCode status
= U_ZERO_ERROR
;
1717 ensureBufferCapacity(len
+ otherLen
, status
);
1718 if (U_FAILURE(status
)) {
1722 int32_t i
= 0, j
= 0, k
= 0;
1723 UChar32 a
= list
[i
++];
1725 if (polarity
== 1 || polarity
== 2) {
1727 if (other
[j
] == UNICODESET_LOW
) { // skip base if already LOW
1734 // simplest of all the routines
1735 // sort the values, discarding identicals!
1743 } else if (a
!= UNICODESET_HIGH
) { // at this point, a == b
1744 // discard both values!
1748 buffer
[k
++] = UNICODESET_HIGH
;
1757 // polarity = 0 is normal: x union y
1758 // polarity = 2: x union ~y
1759 // polarity = 1: ~x union y
1760 // polarity = 3: ~x union ~y
1762 void UnicodeSet::add(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1763 if (isFrozen() || isBogus() || other
==NULL
) {
1766 UErrorCode status
= U_ZERO_ERROR
;
1767 ensureBufferCapacity(len
+ otherLen
, status
);
1768 if (U_FAILURE(status
)) {
1772 int32_t i
= 0, j
= 0, k
= 0;
1773 UChar32 a
= list
[i
++];
1774 UChar32 b
= other
[j
++];
1775 // change from xor is that we have to check overlapping pairs
1776 // polarity bit 1 means a is second, bit 2 means b is.
1779 case 0: // both first; take lower if unequal
1780 if (a
< b
) { // take a
1781 // Back up over overlapping ranges in buffer[]
1782 if (k
> 0 && a
<= buffer
[k
-1]) {
1783 // Pick latter end value in buffer[] vs. list[]
1784 a
= max(list
[i
], buffer
[--k
]);
1790 i
++; // Common if/else code factored out
1792 } else if (b
< a
) { // take b
1793 if (k
> 0 && b
<= buffer
[k
-1]) {
1794 b
= max(other
[j
], buffer
[--k
]);
1801 } else { // a == b, take a, drop b
1802 if (a
== UNICODESET_HIGH
) goto loop_end
;
1803 // This is symmetrical; it doesn't matter if
1804 // we backtrack with a or b. - liu
1805 if (k
> 0 && a
<= buffer
[k
-1]) {
1806 a
= max(list
[i
], buffer
[--k
]);
1818 case 3: // both second; take higher if unequal, and drop other
1819 if (b
<= a
) { // take a
1820 if (a
== UNICODESET_HIGH
) goto loop_end
;
1823 if (b
== UNICODESET_HIGH
) goto loop_end
;
1827 polarity
^= 1; // factored common code
1831 case 1: // a second, b first; if b < a, overlap
1832 if (a
< b
) { // no overlap, take a
1833 buffer
[k
++] = a
; a
= list
[i
++]; polarity
^= 1;
1834 } else if (b
< a
) { // OVERLAP, drop b
1837 } else { // a == b, drop both!
1838 if (a
== UNICODESET_HIGH
) goto loop_end
;
1845 case 2: // a first, b second; if a < b, overlap
1846 if (b
< a
) { // no overlap, take b
1850 } else if (a
< b
) { // OVERLAP, drop a
1853 } else { // a == b, drop both!
1854 if (a
== UNICODESET_HIGH
) goto loop_end
;
1864 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1870 // polarity = 0 is normal: x intersect y
1871 // polarity = 2: x intersect ~y == set-minus
1872 // polarity = 1: ~x intersect y
1873 // polarity = 3: ~x intersect ~y
1875 void UnicodeSet::retain(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1876 if (isFrozen() || isBogus()) {
1879 UErrorCode status
= U_ZERO_ERROR
;
1880 ensureBufferCapacity(len
+ otherLen
, status
);
1881 if (U_FAILURE(status
)) {
1885 int32_t i
= 0, j
= 0, k
= 0;
1886 UChar32 a
= list
[i
++];
1887 UChar32 b
= other
[j
++];
1888 // change from xor is that we have to check overlapping pairs
1889 // polarity bit 1 means a is second, bit 2 means b is.
1892 case 0: // both first; drop the smaller
1893 if (a
< b
) { // drop a
1896 } else if (b
< a
) { // drop b
1899 } else { // a == b, take one, drop other
1900 if (a
== UNICODESET_HIGH
) goto loop_end
;
1908 case 3: // both second; take lower if unequal
1909 if (a
< b
) { // take a
1913 } else if (b
< a
) { // take b
1917 } else { // a == b, take one, drop other
1918 if (a
== UNICODESET_HIGH
) goto loop_end
;
1926 case 1: // a second, b first;
1927 if (a
< b
) { // NO OVERLAP, drop a
1930 } else if (b
< a
) { // OVERLAP, take b
1934 } else { // a == b, drop both!
1935 if (a
== UNICODESET_HIGH
) goto loop_end
;
1942 case 2: // a first, b second; if a < b, overlap
1943 if (b
< a
) { // no overlap, drop b
1946 } else if (a
< b
) { // OVERLAP, take a
1950 } else { // a == b, drop both!
1951 if (a
== UNICODESET_HIGH
) goto loop_end
;
1961 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1968 * Append the <code>toPattern()</code> representation of a
1969 * string to the given <code>StringBuffer</code>.
1971 void UnicodeSet::_appendToPat(UnicodeString
& buf
, const UnicodeString
& s
, UBool
1972 escapeUnprintable
) {
1974 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1975 _appendToPat(buf
, cp
= s
.char32At(i
), escapeUnprintable
);
1980 * Append the <code>toPattern()</code> representation of a
1981 * character to the given <code>StringBuffer</code>.
1983 void UnicodeSet::_appendToPat(UnicodeString
& buf
, UChar32 c
, UBool
1984 escapeUnprintable
) {
1985 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1986 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1988 if (ICU_Utility::escapeUnprintable(buf
, c
)) {
1992 // Okay to let ':' pass through
2003 case SymbolTable::SYMBOL_REF
:
2004 buf
.append(BACKSLASH
);
2007 // Escape whitespace
2008 if (PatternProps::isWhiteSpace(c
)) {
2009 buf
.append(BACKSLASH
);
2017 * Append a string representation of this set to result. This will be
2018 * a cleaned version of the string passed to applyPattern(), if there
2019 * is one. Otherwise it will be generated.
2021 UnicodeString
& UnicodeSet::_toPattern(UnicodeString
& result
,
2022 UBool escapeUnprintable
) const
2026 int32_t backslashCount
= 0;
2027 for (i
=0; i
<patLen
; ) {
2029 U16_NEXT(pat
, i
, patLen
, c
);
2030 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
2031 // If the unprintable character is preceded by an odd
2032 // number of backslashes, then it has been escaped.
2033 // Before unescaping it, we delete the final
2035 if ((backslashCount
% 2) == 1) {
2036 result
.truncate(result
.length() - 1);
2038 ICU_Utility::escapeUnprintable(result
, c
);
2042 if (c
== BACKSLASH
) {
2052 return _generatePattern(result
, escapeUnprintable
);
2056 * Returns a string representation of this set. If the result of
2057 * calling this function is passed to a UnicodeSet constructor, it
2058 * will produce another set that is equal to this one.
2060 UnicodeString
& UnicodeSet::toPattern(UnicodeString
& result
,
2061 UBool escapeUnprintable
) const
2064 return _toPattern(result
, escapeUnprintable
);
2068 * Generate and append a string representation of this set to result.
2069 * This does not use this.pat, the cleaned up copy of the string
2070 * passed to applyPattern().
2072 UnicodeString
& UnicodeSet::_generatePattern(UnicodeString
& result
,
2073 UBool escapeUnprintable
) const
2075 result
.append(SET_OPEN
);
2077 // // Check against the predefined categories. We implicitly build
2078 // // up ALL category sets the first time toPattern() is called.
2079 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2080 // if (*this == getCategorySet(cat)) {
2081 // result.append(COLON);
2082 // result.append(CATEGORY_NAMES, cat*2, 2);
2083 // return result.append(CATEGORY_CLOSE);
2087 int32_t count
= getRangeCount();
2089 // If the set contains at least 2 intervals and includes both
2090 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2091 // be more economical.
2093 getRangeStart(0) == MIN_VALUE
&&
2094 getRangeEnd(count
-1) == MAX_VALUE
) {
2097 result
.append(COMPLEMENT
);
2099 for (int32_t i
= 1; i
< count
; ++i
) {
2100 UChar32 start
= getRangeEnd(i
-1)+1;
2101 UChar32 end
= getRangeStart(i
)-1;
2102 _appendToPat(result
, start
, escapeUnprintable
);
2104 if ((start
+1) != end
) {
2105 result
.append(HYPHEN
);
2107 _appendToPat(result
, end
, escapeUnprintable
);
2112 // Default; emit the ranges as pairs
2114 for (int32_t i
= 0; i
< count
; ++i
) {
2115 UChar32 start
= getRangeStart(i
);
2116 UChar32 end
= getRangeEnd(i
);
2117 _appendToPat(result
, start
, escapeUnprintable
);
2119 if ((start
+1) != end
) {
2120 result
.append(HYPHEN
);
2122 _appendToPat(result
, end
, escapeUnprintable
);
2127 for (int32_t i
= 0; i
<strings
->size(); ++i
) {
2128 result
.append(OPEN_BRACE
);
2129 _appendToPat(result
,
2130 *(const UnicodeString
*) strings
->elementAt(i
),
2132 result
.append(CLOSE_BRACE
);
2134 return result
.append(SET_CLOSE
);
2138 * Release existing cached pattern
2140 void UnicodeSet::releasePattern() {
2149 * Set the new pattern to cache.
2151 void UnicodeSet::setPattern(const UnicodeString
& newPat
) {
2153 int32_t newPatLen
= newPat
.length();
2154 pat
= (UChar
*)uprv_malloc((newPatLen
+ 1) * sizeof(UChar
));
2157 newPat
.extractBetween(0, patLen
, pat
);
2160 // else we don't care if malloc failed. This was just a nice cache.
2161 // We can regenerate an equivalent pattern later when requested.
2164 UnicodeFunctor
*UnicodeSet::freeze() {
2165 if(!isFrozen() && !isBogus()) {
2166 // Do most of what compact() does before freezing because
2167 // compact() will not work when the set is frozen.
2168 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2170 // Delete buffer first to defragment memory less.
2171 if (buffer
!= NULL
) {
2175 if (capacity
> (len
+ GROW_EXTRA
)) {
2176 // Make the capacity equal to len or 1.
2177 // We don't want to realloc of 0 size.
2178 capacity
= len
+ (len
== 0);
2179 list
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * capacity
);
2180 if (list
== NULL
) { // Check for memory allocation error.
2186 // Optimize contains() and span() and similar functions.
2187 if (!strings
->isEmpty()) {
2188 stringSpan
= new UnicodeSetStringSpan(*this, *strings
, UnicodeSetStringSpan::ALL
);
2189 if (stringSpan
!= NULL
&& !stringSpan
->needsStringSpanUTF16()) {
2190 // All strings are irrelevant for span() etc. because
2191 // all of each string's code points are contained in this set.
2192 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2193 // many relevant strings as UTF-16.
2194 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2199 if (stringSpan
== NULL
) {
2200 // No span-relevant strings: Optimize for code point spans.
2201 bmpSet
=new BMPSet(list
, len
);
2202 if (bmpSet
== NULL
) { // Check for memory allocation error.
2210 int32_t UnicodeSet::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2211 if(length
>0 && bmpSet
!=NULL
) {
2212 return (int32_t)(bmpSet
->span(s
, s
+length
, spanCondition
)-s
);
2220 if(stringSpan
!=NULL
) {
2221 return stringSpan
->span(s
, length
, spanCondition
);
2222 } else if(!strings
->isEmpty()) {
2223 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2224 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED
:
2225 UnicodeSetStringSpan::FWD_UTF16_CONTAINED
;
2226 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2227 if(strSpan
.needsStringSpanUTF16()) {
2228 return strSpan
.span(s
, length
, spanCondition
);
2232 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2233 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2237 int32_t start
=0, prev
=0;
2239 U16_NEXT(s
, start
, length
, c
);
2240 if(spanCondition
!=contains(c
)) {
2243 } while((prev
=start
)<length
);
2247 int32_t UnicodeSet::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2248 if(length
>0 && bmpSet
!=NULL
) {
2249 return (int32_t)(bmpSet
->spanBack(s
, s
+length
, spanCondition
)-s
);
2257 if(stringSpan
!=NULL
) {
2258 return stringSpan
->spanBack(s
, length
, spanCondition
);
2259 } else if(!strings
->isEmpty()) {
2260 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2261 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED
:
2262 UnicodeSetStringSpan::BACK_UTF16_CONTAINED
;
2263 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2264 if(strSpan
.needsStringSpanUTF16()) {
2265 return strSpan
.spanBack(s
, length
, spanCondition
);
2269 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2270 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2274 int32_t prev
=length
;
2276 U16_PREV(s
, 0, length
, c
);
2277 if(spanCondition
!=contains(c
)) {
2280 } while((prev
=length
)>0);
2284 int32_t UnicodeSet::spanUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2285 if(length
>0 && bmpSet
!=NULL
) {
2286 const uint8_t *s0
=(const uint8_t *)s
;
2287 return (int32_t)(bmpSet
->spanUTF8(s0
, length
, spanCondition
)-s0
);
2290 length
=(int32_t)uprv_strlen(s
);
2295 if(stringSpan
!=NULL
) {
2296 return stringSpan
->spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2297 } else if(!strings
->isEmpty()) {
2298 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2299 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED
:
2300 UnicodeSetStringSpan::FWD_UTF8_CONTAINED
;
2301 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2302 if(strSpan
.needsStringSpanUTF8()) {
2303 return strSpan
.spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2307 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2308 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2312 int32_t start
=0, prev
=0;
2314 U8_NEXT_OR_FFFD(s
, start
, length
, c
);
2315 if(spanCondition
!=contains(c
)) {
2318 } while((prev
=start
)<length
);
2322 int32_t UnicodeSet::spanBackUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2323 if(length
>0 && bmpSet
!=NULL
) {
2324 const uint8_t *s0
=(const uint8_t *)s
;
2325 return bmpSet
->spanBackUTF8(s0
, length
, spanCondition
);
2328 length
=(int32_t)uprv_strlen(s
);
2333 if(stringSpan
!=NULL
) {
2334 return stringSpan
->spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2335 } else if(!strings
->isEmpty()) {
2336 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2337 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED
:
2338 UnicodeSetStringSpan::BACK_UTF8_CONTAINED
;
2339 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2340 if(strSpan
.needsStringSpanUTF8()) {
2341 return strSpan
.spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2345 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2346 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2350 int32_t prev
=length
;
2352 U8_PREV_OR_FFFD(s
, 0, length
, c
);
2353 if(spanCondition
!=contains(c
)) {
2356 } while((prev
=length
)>0);