2 **********************************************************************
3 * Copyright (C) 1999-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/parsepos.h"
13 #include "unicode/symtable.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utf8.h"
16 #include "unicode/utf16.h"
20 #include "patternprops.h"
28 #include "unisetspan.h"
30 // Define UChar constants using hex for EBCDIC compatibility
31 // Used #define to reduce private static exports and memory access time.
32 #define SET_OPEN ((UChar)0x005B) /*[*/
33 #define SET_CLOSE ((UChar)0x005D) /*]*/
34 #define HYPHEN ((UChar)0x002D) /*-*/
35 #define COMPLEMENT ((UChar)0x005E) /*^*/
36 #define COLON ((UChar)0x003A) /*:*/
37 #define BACKSLASH ((UChar)0x005C) /*\*/
38 #define INTERSECTION ((UChar)0x0026) /*&*/
39 #define UPPER_U ((UChar)0x0055) /*U*/
40 #define LOWER_U ((UChar)0x0075) /*u*/
41 #define OPEN_BRACE ((UChar)123) /*{*/
42 #define CLOSE_BRACE ((UChar)125) /*}*/
43 #define UPPER_P ((UChar)0x0050) /*P*/
44 #define LOWER_P ((UChar)0x0070) /*p*/
45 #define UPPER_N ((UChar)78) /*N*/
46 #define EQUALS ((UChar)0x003D) /*=*/
48 // HIGH_VALUE > all valid values. 110000 for codepoints
49 #define UNICODESET_HIGH 0x0110000
51 // LOW <= all valid values. ZERO for codepoints
52 #define UNICODESET_LOW 0x000000
54 // initial storage. Must be >= 0
55 #define START_EXTRA 16
57 // extra amount for growth. Must be >= 0
58 #define GROW_EXTRA START_EXTRA
62 SymbolTable::~SymbolTable() {}
64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet
)
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
72 static inline UChar32
pinCodePoint(UChar32
& c
) {
73 if (c
< UNICODESET_LOW
) {
75 } else if (c
> (UNICODESET_HIGH
-1)) {
76 c
= (UNICODESET_HIGH
-1);
81 //----------------------------------------------------------------
83 //----------------------------------------------------------------
85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
86 // To enable the debugging, define the symbol DEBUG_MEM in the line
87 // below. This will result in text being sent to stdout that looks
89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91 // Each line lists a construction (ct) or destruction (dt) event, the
92 // object address, the number of outstanding objects after the event,
93 // and the pattern of the object in question.
99 static int32_t _dbgCount
= 0;
101 static inline void _dbgct(UnicodeSet
* set
) {
103 set
->toPattern(str
, TRUE
);
105 str
.extract(0, 39, buf
, "");
106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set
, ++_dbgCount
, buf
);
109 static inline void _dbgdt(UnicodeSet
* set
) {
111 set
->toPattern(str
, TRUE
);
113 str
.extract(0, 39, buf
, "");
114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set
, --_dbgCount
, buf
);
124 //----------------------------------------------------------------
125 // UnicodeString in UVector support
126 //----------------------------------------------------------------
128 static void U_CALLCONV
cloneUnicodeString(UElement
*dst
, UElement
*src
) {
129 dst
->pointer
= new UnicodeString(*(UnicodeString
*)src
->pointer
);
132 static int8_t U_CALLCONV
compareUnicodeString(UElement t1
, UElement t2
) {
133 const UnicodeString
&a
= *(const UnicodeString
*)t1
.pointer
;
134 const UnicodeString
&b
= *(const UnicodeString
*)t2
.pointer
;
138 //----------------------------------------------------------------
140 //----------------------------------------------------------------
143 * Constructs an empty set.
145 UnicodeSet::UnicodeSet() :
146 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
147 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
150 UErrorCode status
= U_ZERO_ERROR
;
151 allocateStrings(status
);
152 if (U_FAILURE(status
)) {
155 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
157 list
[0] = UNICODESET_HIGH
;
158 } else { // If memory allocation failed, set to bogus state.
166 * Constructs a set containing the given range. If <code>end >
167 * start</code> then an empty set is created.
169 * @param start first character, inclusive, of range
170 * @param end last character, inclusive, of range
172 UnicodeSet::UnicodeSet(UChar32 start
, UChar32 end
) :
173 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
174 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
177 UErrorCode status
= U_ZERO_ERROR
;
178 allocateStrings(status
);
179 if (U_FAILURE(status
)) {
182 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
184 list
[0] = UNICODESET_HIGH
;
185 complement(start
, end
);
186 } else { // If memory allocation failed, set to bogus state.
194 * Constructs a set that is identical to the given UnicodeSet.
196 UnicodeSet::UnicodeSet(const UnicodeSet
& o
) :
198 len(0), capacity(o
.isFrozen() ? o
.len
: o
.len
+ GROW_EXTRA
), list(0),
200 buffer(0), bufferCapacity(0),
201 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
204 UErrorCode status
= U_ZERO_ERROR
;
205 allocateStrings(status
);
206 if (U_FAILURE(status
)) {
209 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
212 } else { // If memory allocation failed, set to bogus state.
219 // Copy-construct as thawed.
220 UnicodeSet::UnicodeSet(const UnicodeSet
& o
, UBool
/* asThawed */) :
222 len(0), capacity(o
.len
+ GROW_EXTRA
), list(0),
224 buffer(0), bufferCapacity(0),
225 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
228 UErrorCode status
= U_ZERO_ERROR
;
229 allocateStrings(status
);
230 if (U_FAILURE(status
)) {
233 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
235 // *this = o except for bmpSet and stringSpan
237 uprv_memcpy(list
, o
.list
, len
*sizeof(UChar32
));
238 if (strings
!= NULL
&& o
.strings
!= NULL
) {
239 strings
->assign(*o
.strings
, cloneUnicodeString
, status
);
240 } else { // Invalid strings.
245 setPattern(UnicodeString(o
.pat
, o
.patLen
));
247 } else { // If memory allocation failed, set to bogus state.
257 UnicodeSet::~UnicodeSet() {
258 _dbgdt(this); // first!
270 * Assigns this object to be a copy of another.
272 UnicodeSet
& UnicodeSet::operator=(const UnicodeSet
& o
) {
283 UErrorCode ec
= U_ZERO_ERROR
;
284 ensureCapacity(o
.len
, ec
);
286 return *this; // There is no way to report this error :-(
289 uprv_memcpy(list
, o
.list
, len
*sizeof(UChar32
));
290 if (o
.bmpSet
== NULL
) {
293 bmpSet
= new BMPSet(*o
.bmpSet
, list
, len
);
294 if (bmpSet
== NULL
) { // Check for memory allocation error.
299 if (strings
!= NULL
&& o
.strings
!= NULL
) {
300 strings
->assign(*o
.strings
, cloneUnicodeString
, ec
);
301 } else { // Invalid strings.
305 if (o
.stringSpan
== NULL
) {
308 stringSpan
= new UnicodeSetStringSpan(*o
.stringSpan
, *strings
);
309 if (stringSpan
== NULL
) { // Check for memory allocation error.
316 setPattern(UnicodeString(o
.pat
, o
.patLen
));
322 * Returns a copy of this object. All UnicodeMatcher objects have
323 * to support cloning in order to allow classes using
324 * UnicodeMatchers, such as Transliterator, to implement cloning.
326 UnicodeFunctor
* UnicodeSet::clone() const {
327 return new UnicodeSet(*this);
330 UnicodeFunctor
*UnicodeSet::cloneAsThawed() const {
331 return new UnicodeSet(*this, TRUE
);
335 * Compares the specified object with this set for equality. Returns
336 * <tt>true</tt> if the two sets
337 * have the same size, and every member of the specified set is
338 * contained in this set (or equivalently, every member of this set is
339 * contained in the specified set).
341 * @param o set to be compared for equality with this set.
342 * @return <tt>true</tt> if the specified set is equal to this set.
344 UBool
UnicodeSet::operator==(const UnicodeSet
& o
) const {
345 if (len
!= o
.len
) return FALSE
;
346 for (int32_t i
= 0; i
< len
; ++i
) {
347 if (list
[i
] != o
.list
[i
]) return FALSE
;
349 if (*strings
!= *o
.strings
) return FALSE
;
354 * Returns the hash code value for this set.
356 * @return the hash code value for this set.
357 * @see Object#hashCode()
359 int32_t UnicodeSet::hashCode(void) const {
360 int32_t result
= len
;
361 for (int32_t i
= 0; i
< len
; ++i
) {
368 //----------------------------------------------------------------
370 //----------------------------------------------------------------
373 * Returns the number of elements in this set (its cardinality),
374 * Note than the elements of a set may include both individual
375 * codepoints and strings.
377 * @return the number of elements in this set (its cardinality).
379 int32_t UnicodeSet::size(void) const {
381 int32_t count
= getRangeCount();
382 for (int32_t i
= 0; i
< count
; ++i
) {
383 n
+= getRangeEnd(i
) - getRangeStart(i
) + 1;
385 return n
+ strings
->size();
389 * Returns <tt>true</tt> if this set contains no elements.
391 * @return <tt>true</tt> if this set contains no elements.
393 UBool
UnicodeSet::isEmpty(void) const {
394 return len
== 1 && strings
->size() == 0;
398 * Returns true if this set contains the given character.
399 * @param c character to be checked for containment
400 * @return true if the test condition is met
402 UBool
UnicodeSet::contains(UChar32 c
) const {
403 // Set i to the index of the start item greater than ch
404 // We know we will terminate without length test!
405 // LATER: for large sets, add binary search
408 // if (c < list[++i]) break;
410 if (bmpSet
!= NULL
) {
411 return bmpSet
->contains(c
);
413 if (stringSpan
!= NULL
) {
414 return stringSpan
->contains(c
);
416 if (c
>= UNICODESET_HIGH
) { // Don't need to check LOW bound
419 int32_t i
= findCodePoint(c
);
420 return (UBool
)(i
& 1); // return true if odd
424 * Returns the smallest value i such that c < list[i]. Caller
425 * must ensure that c is a legal value or this method will enter
426 * an infinite loop. This method performs a binary search.
427 * @param c a character in the range MIN_VALUE..MAX_VALUE
429 * @return the smallest integer i in the range 0..len-1,
430 * inclusive, such that c < list[i]
432 int32_t UnicodeSet::findCodePoint(UChar32 c
) const {
435 set list[] c=0 1 3 4 7 8
436 === ============== ===========
437 [] [110000] 0 0 0 0 0 0
438 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
439 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
440 [:Any:] [0, 110000] 1 1 1 1 1 1
443 // Return the smallest i such that c < list[i]. Assume
444 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
447 // High runner test. c is often after the last range, so an
448 // initial check for this condition pays off.
450 int32_t hi
= len
- 1;
451 if (lo
>= hi
|| c
>= list
[hi
-1])
453 // invariant: c >= list[lo]
454 // invariant: c < list[hi]
456 int32_t i
= (lo
+ hi
) >> 1;
459 } else if (c
< list
[i
]) {
469 * Returns true if this set contains every character
470 * of the given range.
471 * @param start first character, inclusive, of the range
472 * @param end last character, inclusive, of the range
473 * @return true if the test condition is met
475 UBool
UnicodeSet::contains(UChar32 start
, UChar32 end
) const {
478 // if (start < list[++i]) break;
480 int32_t i
= findCodePoint(start
);
481 return ((i
& 1) != 0 && end
< list
[i
]);
485 * Returns <tt>true</tt> if this set contains the given
486 * multicharacter string.
487 * @param s string to be checked for containment
488 * @return <tt>true</tt> if this set contains the specified string
490 UBool
UnicodeSet::contains(const UnicodeString
& s
) const {
491 if (s
.length() == 0) return FALSE
;
492 int32_t cp
= getSingleCP(s
);
494 return strings
->contains((void*) &s
);
496 return contains((UChar32
) cp
);
501 * Returns true if this set contains all the characters and strings
503 * @param c set to be checked for containment
504 * @return true if the test condition is met
506 UBool
UnicodeSet::containsAll(const UnicodeSet
& c
) const {
507 // The specified set is a subset if all of its pairs are contained in
508 // this set. It's possible to code this more efficiently in terms of
509 // direct manipulation of the inversion lists if the need arises.
510 int32_t n
= c
.getRangeCount();
511 for (int i
=0; i
<n
; ++i
) {
512 if (!contains(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
516 if (!strings
->containsAll(*c
.strings
)) return FALSE
;
521 * Returns true if this set contains all the characters
522 * of the given string.
523 * @param s string containing characters to be checked for containment
524 * @return true if the test condition is met
526 UBool
UnicodeSet::containsAll(const UnicodeString
& s
) const {
527 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_CONTAINED
) ==
532 * Returns true if this set contains none of the characters
533 * of the given range.
534 * @param start first character, inclusive, of the range
535 * @param end last character, inclusive, of the range
536 * @return true if the test condition is met
538 UBool
UnicodeSet::containsNone(UChar32 start
, UChar32 end
) const {
541 // if (start < list[++i]) break;
543 int32_t i
= findCodePoint(start
);
544 return ((i
& 1) == 0 && end
< list
[i
]);
548 * Returns true if this set contains none of the characters and strings
550 * @param c set to be checked for containment
551 * @return true if the test condition is met
553 UBool
UnicodeSet::containsNone(const UnicodeSet
& c
) const {
554 // The specified set is a subset if all of its pairs are contained in
555 // this set. It's possible to code this more efficiently in terms of
556 // direct manipulation of the inversion lists if the need arises.
557 int32_t n
= c
.getRangeCount();
558 for (int32_t i
=0; i
<n
; ++i
) {
559 if (!containsNone(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
563 if (!strings
->containsNone(*c
.strings
)) return FALSE
;
568 * Returns true if this set contains none of the characters
569 * of the given string.
570 * @param s string containing characters to be checked for containment
571 * @return true if the test condition is met
573 UBool
UnicodeSet::containsNone(const UnicodeString
& s
) const {
574 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_NOT_CONTAINED
) ==
579 * Returns <tt>true</tt> if this set contains any character whose low byte
580 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
583 UBool
UnicodeSet::matchesIndexValue(uint8_t v
) const {
584 /* The index value v, in the range [0,255], is contained in this set if
585 * it is contained in any pair of this set. Pairs either have the high
586 * bytes equal, or unequal. If the high bytes are equal, then we have
587 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
588 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
589 * Then v is contained if xx <= v || v <= yy. (This is identical to the
590 * time zone month containment logic.)
593 int32_t rangeCount
=getRangeCount();
594 for (i
=0; i
<rangeCount
; ++i
) {
595 UChar32 low
= getRangeStart(i
);
596 UChar32 high
= getRangeEnd(i
);
597 if ((low
& ~0xFF) == (high
& ~0xFF)) {
598 if ((low
& 0xFF) <= v
&& v
<= (high
& 0xFF)) {
601 } else if ((low
& 0xFF) <= v
|| v
<= (high
& 0xFF)) {
605 if (strings
->size() != 0) {
606 for (i
=0; i
<strings
->size(); ++i
) {
607 const UnicodeString
& s
= *(const UnicodeString
*)strings
->elementAt(i
);
608 //if (s.length() == 0) {
609 // // Empty strings match everything
612 // assert(s.length() != 0); // We enforce this elsewhere
613 UChar32 c
= s
.char32At(0);
614 if ((c
& 0xFF) == v
) {
623 * Implementation of UnicodeMatcher::matches(). Always matches the
624 * longest possible multichar string.
626 UMatchDegree
UnicodeSet::matches(const Replaceable
& text
,
630 if (offset
== limit
) {
631 // Strings, if any, have length != 0, so we don't worry
632 // about them here. If we ever allow zero-length strings
633 // we much check for them here.
634 if (contains(U_ETHER
)) {
635 return incremental
? U_PARTIAL_MATCH
: U_MATCH
;
640 if (strings
->size() != 0) { // try strings first
642 // might separate forward and backward loops later
643 // for now they are combined
645 // TODO Improve efficiency of this, at least in the forward
646 // direction, if not in both. In the forward direction we
647 // can assume the strings are sorted.
650 UBool forward
= offset
< limit
;
652 // firstChar is the leftmost char to match in the
653 // forward direction or the rightmost char to match in
654 // the reverse direction.
655 UChar firstChar
= text
.charAt(offset
);
657 // If there are multiple strings that can match we
658 // return the longest match.
659 int32_t highWaterLength
= 0;
661 for (i
=0; i
<strings
->size(); ++i
) {
662 const UnicodeString
& trial
= *(const UnicodeString
*)strings
->elementAt(i
);
664 //if (trial.length() == 0) {
665 // return U_MATCH; // null-string always matches
667 // assert(trial.length() != 0); // We ensure this elsewhere
669 UChar c
= trial
.charAt(forward
? 0 : trial
.length() - 1);
671 // Strings are sorted, so we can optimize in the
672 // forward direction.
673 if (forward
&& c
> firstChar
) break;
674 if (c
!= firstChar
) continue;
676 int32_t matchLen
= matchRest(text
, offset
, limit
, trial
);
679 int32_t maxLen
= forward
? limit
-offset
: offset
-limit
;
680 if (matchLen
== maxLen
) {
681 // We have successfully matched but only up to limit.
682 return U_PARTIAL_MATCH
;
686 if (matchLen
== trial
.length()) {
687 // We have successfully matched the whole string.
688 if (matchLen
> highWaterLength
) {
689 highWaterLength
= matchLen
;
691 // In the forward direction we know strings
692 // are sorted so we can bail early.
693 if (forward
&& matchLen
< highWaterLength
) {
700 // We've checked all strings without a partial match.
701 // If we have full matches, return the longest one.
702 if (highWaterLength
!= 0) {
703 offset
+= forward
? highWaterLength
: -highWaterLength
;
707 return UnicodeFilter::matches(text
, offset
, limit
, incremental
);
712 * Returns the longest match for s in text at the given position.
713 * If limit > start then match forward from start+1 to limit
714 * matching all characters except s.charAt(0). If limit < start,
715 * go backward starting from start-1 matching all characters
716 * except s.charAt(s.length()-1). This method assumes that the
717 * first character, text.charAt(start), matches s, so it does not
719 * @param text the text to match
720 * @param start the first character to match. In the forward
721 * direction, text.charAt(start) is matched against s.charAt(0).
722 * In the reverse direction, it is matched against
723 * s.charAt(s.length()-1).
724 * @param limit the limit offset for matching, either last+1 in
725 * the forward direction, or last-1 in the reverse direction,
726 * where last is the index of the last character to match.
727 * @return If part of s matches up to the limit, return |limit -
728 * start|. If all of s matches before reaching the limit, return
729 * s.length(). If there is a mismatch between s and text, return
732 int32_t UnicodeSet::matchRest(const Replaceable
& text
,
733 int32_t start
, int32_t limit
,
734 const UnicodeString
& s
) {
737 int32_t slen
= s
.length();
739 maxLen
= limit
- start
;
740 if (maxLen
> slen
) maxLen
= slen
;
741 for (i
= 1; i
< maxLen
; ++i
) {
742 if (text
.charAt(start
+ i
) != s
.charAt(i
)) return 0;
745 maxLen
= start
- limit
;
746 if (maxLen
> slen
) maxLen
= slen
;
747 --slen
; // <=> slen = s.length() - 1;
748 for (i
= 1; i
< maxLen
; ++i
) {
749 if (text
.charAt(start
- i
) != s
.charAt(slen
- i
)) return 0;
756 * Implement of UnicodeMatcher
758 void UnicodeSet::addMatchSetTo(UnicodeSet
& toUnionTo
) const {
759 toUnionTo
.addAll(*this);
763 * Returns the index of the given character within this set, where
764 * the set is ordered by ascending code point. If the character
765 * is not in this set, return -1. The inverse of this method is
766 * <code>charAt()</code>.
767 * @return an index from 0..size()-1, or -1
769 int32_t UnicodeSet::indexOf(UChar32 c
) const {
770 if (c
< MIN_VALUE
|| c
> MAX_VALUE
) {
776 UChar32 start
= list
[i
++];
780 UChar32 limit
= list
[i
++];
782 return n
+ c
- start
;
789 * Returns the character at the given index within this set, where
790 * the set is ordered by ascending code point. If the index is
791 * out of range, return (UChar32)-1. The inverse of this method is
792 * <code>indexOf()</code>.
793 * @param index an index from 0..size()-1
794 * @return the character at the given index, or (UChar32)-1.
796 UChar32
UnicodeSet::charAt(int32_t index
) const {
798 // len2 is the largest even integer <= len, that is, it is len
799 // for even values and len-1 for odd values. With odd values
800 // the last entry is UNICODESET_HIGH.
801 int32_t len2
= len
& ~1;
802 for (int32_t i
=0; i
< len2
;) {
803 UChar32 start
= list
[i
++];
804 int32_t count
= list
[i
++] - start
;
806 return (UChar32
)(start
+ index
);
815 * Make this object represent the range <code>start - end</code>.
816 * If <code>end > start</code> then this object is set to an
819 * @param start first character in the set, inclusive
820 * @rparam end last character in the set, inclusive
822 UnicodeSet
& UnicodeSet::set(UChar32 start
, UChar32 end
) {
824 complement(start
, end
);
829 * Adds the specified range to this set if it is not already
830 * present. If this set already contains the specified range,
831 * the call leaves this set unchanged. If <code>end > start</code>
832 * then an empty range is added, leaving the set unchanged.
834 * @param start first character, inclusive, of range to be added
836 * @param end last character, inclusive, of range to be added
839 UnicodeSet
& UnicodeSet::add(UChar32 start
, UChar32 end
) {
840 if (pinCodePoint(start
) < pinCodePoint(end
)) {
841 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
843 } else if (start
== end
) {
849 // #define DEBUG_US_ADD
853 void dump(UChar32 c
) {
855 printf("%c", (char)c
);
860 void dump(const UChar32
* list
, int32_t len
) {
862 for (int32_t i
=0; i
<len
; ++i
) {
863 if (i
!= 0) printf(", ");
871 * Adds the specified character to this set if it is not already
872 * present. If this set already contains the specified character,
873 * the call leaves this set unchanged.
875 UnicodeSet
& UnicodeSet::add(UChar32 c
) {
876 // find smallest i such that c < list[i]
877 // if odd, then it is IN the set
878 // if even, then it is OUT of the set
879 int32_t i
= findCodePoint(pinCodePoint(c
));
882 if ((i
& 1) != 0 || isFrozen() || isBogus()) return *this;
885 // assert(list[len-1] == HIGH);
888 // [start_0, limit_0, start_1, limit_1, HIGH]
890 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
894 // i == 0 means c is before the first range
899 printf(" found at %d", i
);
905 if (c
== list
[i
]-1) {
906 // c is before start of next range
908 // if we touched the HIGH mark, then add a new one
909 if (c
== (UNICODESET_HIGH
- 1)) {
910 UErrorCode status
= U_ZERO_ERROR
;
911 ensureCapacity(len
+1, status
);
912 if (U_FAILURE(status
)) {
913 return *this; // There is no way to report this error :-(
915 list
[len
++] = UNICODESET_HIGH
;
917 if (i
> 0 && c
== list
[i
-1]) {
918 // collapse adjacent ranges
920 // [..., start_k-1, c, c, limit_k, ..., HIGH]
924 //for (int32_t k=i-1; k<len-2; ++k) {
925 // list[k] = list[k+2];
927 UChar32
* dst
= list
+ i
- 1;
928 UChar32
* src
= dst
+ 2;
929 UChar32
* srclimit
= list
+ len
;
930 while (src
< srclimit
) *(dst
++) = *(src
++);
936 else if (i
> 0 && c
== list
[i
-1]) {
937 // c is after end of prior range
939 // no need to check for collapse here
943 // At this point we know the new char is not adjacent to
944 // any existing ranges, and it is not 10FFFF.
947 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
951 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
955 UErrorCode status
= U_ZERO_ERROR
;
956 ensureCapacity(len
+2, status
);
957 if (U_FAILURE(status
)) {
958 return *this; // There is no way to report this error :-(
961 //for (int32_t k=len-1; k>=i; --k) {
962 // list[k+2] = list[k];
964 UChar32
* src
= list
+ len
;
965 UChar32
* dst
= src
+ 2;
966 UChar32
* srclimit
= list
+ i
;
967 while (src
> srclimit
) *(--dst
) = *(--src
);
978 for (i
=1; i
<len
; ++i
) {
979 if (list
[i
] <= list
[i
-1]) {
981 printf("ERROR: list has been corrupted\n");
992 * Adds the specified multicharacter to this set if it is not already
993 * present. If this set already contains the multicharacter,
994 * the call leaves this set unchanged.
995 * Thus "ch" => {"ch"}
996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
997 * @param s the source string
998 * @return the modified set, for chaining
1000 UnicodeSet
& UnicodeSet::add(const UnicodeString
& s
) {
1001 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1002 int32_t cp
= getSingleCP(s
);
1004 if (!strings
->contains((void*) &s
)) {
1015 * Adds the given string, in order, to 'strings'. The given string
1016 * must have been checked by the caller to not be empty and to not
1017 * already be in 'strings'.
1019 void UnicodeSet::_add(const UnicodeString
& s
) {
1020 if (isFrozen() || isBogus()) {
1023 UnicodeString
* t
= new UnicodeString(s
);
1024 if (t
== NULL
) { // Check for memory allocation error.
1028 UErrorCode ec
= U_ZERO_ERROR
;
1029 strings
->sortedInsert(t
, compareUnicodeString
, ec
);
1030 if (U_FAILURE(ec
)) {
1037 * @return a code point IF the string consists of a single one.
1038 * otherwise returns -1.
1039 * @param string to test
1041 int32_t UnicodeSet::getSingleCP(const UnicodeString
& s
) {
1042 //if (s.length() < 1) {
1043 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1045 if (s
.length() > 2) return -1;
1046 if (s
.length() == 1) return s
.charAt(0);
1048 // at this point, len = 2
1049 UChar32 cp
= s
.char32At(0);
1050 if (cp
> 0xFFFF) { // is surrogate pair
1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1058 * If this set already any particular character, it has no effect on that character.
1059 * @param the source string
1060 * @return the modified set, for chaining
1062 UnicodeSet
& UnicodeSet::addAll(const UnicodeString
& s
) {
1064 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1073 * If this set already any particular character, it has no effect on that character.
1074 * @param the source string
1075 * @return the modified set, for chaining
1077 UnicodeSet
& UnicodeSet::retainAll(const UnicodeString
& s
) {
1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1086 * If this set already any particular character, it has no effect on that character.
1087 * @param the source string
1088 * @return the modified set, for chaining
1090 UnicodeSet
& UnicodeSet::complementAll(const UnicodeString
& s
) {
1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1099 * If this set already any particular character, it has no effect on that character.
1100 * @param the source string
1101 * @return the modified set, for chaining
1103 UnicodeSet
& UnicodeSet::removeAll(const UnicodeString
& s
) {
1110 UnicodeSet
& UnicodeSet::removeAllStrings() {
1111 strings
->removeAllElements();
1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1119 * @param the source string
1120 * @return a newly created set containing the given string
1122 UnicodeSet
* U_EXPORT2
UnicodeSet::createFrom(const UnicodeString
& s
) {
1123 UnicodeSet
*set
= new UnicodeSet();
1124 if (set
!= NULL
) { // Check for memory allocation error.
1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1133 * @param the source string
1134 * @return a newly created set containing the given characters
1136 UnicodeSet
* U_EXPORT2
UnicodeSet::createFromAll(const UnicodeString
& s
) {
1137 UnicodeSet
*set
= new UnicodeSet();
1138 if (set
!= NULL
) { // Check for memory allocation error.
1145 * Retain only the elements in this set that are contained in the
1146 * specified range. If <code>end > start</code> then an empty range is
1147 * retained, leaving the set empty.
1149 * @param start first character, inclusive, of range to be retained
1151 * @param end last character, inclusive, of range to be retained
1154 UnicodeSet
& UnicodeSet::retain(UChar32 start
, UChar32 end
) {
1155 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1156 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1157 retain(range
, 2, 0);
1164 UnicodeSet
& UnicodeSet::retain(UChar32 c
) {
1165 return retain(c
, c
);
1169 * Removes the specified range from this set if it is present.
1170 * The set will not contain the specified range once the call
1171 * returns. If <code>end > start</code> then an empty range is
1172 * removed, leaving the set unchanged.
1174 * @param start first character, inclusive, of range to be removed
1176 * @param end last character, inclusive, of range to be removed
1179 UnicodeSet
& UnicodeSet::remove(UChar32 start
, UChar32 end
) {
1180 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1181 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1182 retain(range
, 2, 2);
1188 * Removes the specified character from this set if it is present.
1189 * The set will not contain the specified range once the call
1192 UnicodeSet
& UnicodeSet::remove(UChar32 c
) {
1193 return remove(c
, c
);
1197 * Removes the specified string from this set if it is present.
1198 * The set will not contain the specified character once the call
1200 * @param the source string
1201 * @return the modified set, for chaining
1203 UnicodeSet
& UnicodeSet::remove(const UnicodeString
& s
) {
1204 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1205 int32_t cp
= getSingleCP(s
);
1207 strings
->removeElement((void*) &s
);
1210 remove((UChar32
)cp
, (UChar32
)cp
);
1216 * Complements the specified range in this set. Any character in
1217 * the range will be removed if it is in this set, or will be
1218 * added if it is not in this set. If <code>end > start</code>
1219 * then an empty range is xor'ed, leaving the set unchanged.
1221 * @param start first character, inclusive, of range to be removed
1223 * @param end last character, inclusive, of range to be removed
1226 UnicodeSet
& UnicodeSet::complement(UChar32 start
, UChar32 end
) {
1227 if (isFrozen() || isBogus()) {
1230 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1231 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1232 exclusiveOr(range
, 2, 0);
1238 UnicodeSet
& UnicodeSet::complement(UChar32 c
) {
1239 return complement(c
, c
);
1243 * This is equivalent to
1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1246 UnicodeSet
& UnicodeSet::complement(void) {
1247 if (isFrozen() || isBogus()) {
1250 UErrorCode status
= U_ZERO_ERROR
;
1251 if (list
[0] == UNICODESET_LOW
) {
1252 ensureBufferCapacity(len
-1, status
);
1253 if (U_FAILURE(status
)) {
1256 uprv_memcpy(buffer
, list
+ 1, (len
-1)*sizeof(UChar32
));
1259 ensureBufferCapacity(len
+1, status
);
1260 if (U_FAILURE(status
)) {
1263 uprv_memcpy(buffer
+ 1, list
, len
*sizeof(UChar32
));
1264 buffer
[0] = UNICODESET_LOW
;
1273 * Complement the specified string in this set.
1274 * The set will not contain the specified string once the call
1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1277 * @param s the string to complement
1278 * @return this object, for chaining
1280 UnicodeSet
& UnicodeSet::complement(const UnicodeString
& s
) {
1281 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1282 int32_t cp
= getSingleCP(s
);
1284 if (strings
->contains((void*) &s
)) {
1285 strings
->removeElement((void*) &s
);
1291 complement((UChar32
)cp
, (UChar32
)cp
);
1297 * Adds all of the elements in the specified set to this set if
1298 * they're not already present. This operation effectively
1299 * modifies this set so that its value is the <i>union</i> of the two
1300 * sets. The behavior of this operation is unspecified if the specified
1301 * collection is modified while the operation is in progress.
1303 * @param c set whose elements are to be added to this set.
1304 * @see #add(char, char)
1306 UnicodeSet
& UnicodeSet::addAll(const UnicodeSet
& c
) {
1307 if ( c
.len
>0 && c
.list
!=NULL
) {
1308 add(c
.list
, c
.len
, 0);
1311 // Add strings in order
1312 if ( c
.strings
!=NULL
) {
1313 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1314 const UnicodeString
* s
= (const UnicodeString
*)c
.strings
->elementAt(i
);
1315 if (!strings
->contains((void*) s
)) {
1324 * Retains only the elements in this set that are contained in the
1325 * specified set. In other words, removes from this set all of
1326 * its elements that are not contained in the specified set. This
1327 * operation effectively modifies this set so that its value is
1328 * the <i>intersection</i> of the two sets.
1330 * @param c set that defines which elements this set will retain.
1332 UnicodeSet
& UnicodeSet::retainAll(const UnicodeSet
& c
) {
1333 if (isFrozen() || isBogus()) {
1336 retain(c
.list
, c
.len
, 0);
1337 strings
->retainAll(*c
.strings
);
1342 * Removes from this set all of its elements that are contained in the
1343 * specified set. This operation effectively modifies this
1344 * set so that its value is the <i>asymmetric set difference</i> of
1347 * @param c set that defines which elements will be removed from
1350 UnicodeSet
& UnicodeSet::removeAll(const UnicodeSet
& c
) {
1351 if (isFrozen() || isBogus()) {
1354 retain(c
.list
, c
.len
, 2);
1355 strings
->removeAll(*c
.strings
);
1360 * Complements in this set all elements contained in the specified
1361 * set. Any character in the other set will be removed if it is
1362 * in this set, or will be added if it is not in this set.
1364 * @param c set that defines which elements will be xor'ed from
1367 UnicodeSet
& UnicodeSet::complementAll(const UnicodeSet
& c
) {
1368 if (isFrozen() || isBogus()) {
1371 exclusiveOr(c
.list
, c
.len
, 0);
1373 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1374 void* e
= c
.strings
->elementAt(i
);
1375 if (!strings
->removeElement(e
)) {
1376 _add(*(const UnicodeString
*)e
);
1383 * Removes all of the elements from this set. This set will be
1384 * empty after this call returns.
1386 UnicodeSet
& UnicodeSet::clear(void) {
1391 list
[0] = UNICODESET_HIGH
;
1395 if (strings
!= NULL
) {
1396 strings
->removeAllElements();
1398 if (list
!= NULL
&& strings
!= NULL
) {
1406 * Iteration method that returns the number of ranges contained in
1408 * @see #getRangeStart
1411 int32_t UnicodeSet::getRangeCount() const {
1416 * Iteration method that returns the first character in the
1417 * specified range of this set.
1418 * @see #getRangeCount
1421 UChar32
UnicodeSet::getRangeStart(int32_t index
) const {
1422 return list
[index
*2];
1426 * Iteration method that returns the last character in the
1427 * specified range of this set.
1428 * @see #getRangeStart
1431 UChar32
UnicodeSet::getRangeEnd(int32_t index
) const {
1432 return list
[index
*2 + 1] - 1;
1435 int32_t UnicodeSet::getStringCount() const {
1436 return strings
->size();
1439 const UnicodeString
* UnicodeSet::getString(int32_t index
) const {
1440 return (const UnicodeString
*) strings
->elementAt(index
);
1444 * Reallocate this objects internal structures to take up the least
1445 * possible space, without changing this object's value.
1447 UnicodeSet
& UnicodeSet::compact() {
1448 if (isFrozen() || isBogus()) {
1451 // Delete buffer first to defragment memory less.
1452 if (buffer
!= NULL
) {
1456 if (len
< capacity
) {
1457 // Make the capacity equal to len or 1.
1458 // We don't want to realloc of 0 size.
1459 int32_t newCapacity
= len
+ (len
== 0);
1460 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * newCapacity
);
1463 capacity
= newCapacity
;
1465 // else what the heck happened?! We allocated less memory!
1466 // Oh well. We'll keep our original array.
1471 int32_t UnicodeSet::serialize(uint16_t *dest
, int32_t destCapacity
, UErrorCode
& ec
) const {
1472 int32_t bmpLength
, length
, destLength
;
1474 if (U_FAILURE(ec
)) {
1478 if (destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1479 ec
=U_ILLEGAL_ARGUMENT_ERROR
;
1483 /* count necessary 16-bit units */
1484 length
=this->len
-1; // Subtract 1 to ignore final UNICODESET_HIGH
1485 // assert(length>=0);
1488 if (destCapacity
>0) {
1491 ec
=U_BUFFER_OVERFLOW_ERROR
;
1497 if (this->list
[length
-1]<=0xffff) {
1500 } else if (this->list
[0]>=0x10000) {
1501 /* all supplementary */
1505 /* some BMP, some supplementary */
1506 for (bmpLength
=0; bmpLength
<length
&& this->list
[bmpLength
]<=0xffff; ++bmpLength
) {}
1507 length
=bmpLength
+2*(length
-bmpLength
);
1510 /* length: number of 16-bit array units */
1511 if (length
>0x7fff) {
1512 /* there are only 15 bits for the length in the first serialized word */
1513 ec
=U_INDEX_OUTOFBOUNDS_ERROR
;
1518 * total serialized length:
1519 * number of 16-bit array units (length) +
1520 * 1 length unit (always) +
1521 * 1 bmpLength unit (if there are supplementary values)
1523 destLength
=length
+((length
>bmpLength
)?2:1);
1524 if (destLength
<=destCapacity
) {
1528 *dest
=(uint16_t)length
;
1529 if (length
>bmpLength
) {
1531 *++dest
=(uint16_t)bmpLength
;
1535 /* write the BMP part of the array */
1537 for (i
=0; i
<bmpLength
; ++i
) {
1538 *dest
++=(uint16_t)*p
++;
1541 /* write the supplementary part of the array */
1542 for (; i
<length
; i
+=2) {
1543 *dest
++=(uint16_t)(*p
>>16);
1544 *dest
++=(uint16_t)*p
++;
1547 ec
=U_BUFFER_OVERFLOW_ERROR
;
1552 //----------------------------------------------------------------
1553 // Implementation: Utility methods
1554 //----------------------------------------------------------------
1557 * Allocate our strings vector and return TRUE if successful.
1559 UBool
UnicodeSet::allocateStrings(UErrorCode
&status
) {
1560 if (U_FAILURE(status
)) {
1563 strings
= new UVector(uprv_deleteUObject
,
1564 uhash_compareUnicodeString
, 1, status
);
1565 if (strings
== NULL
) { // Check for memory allocation error.
1566 status
= U_MEMORY_ALLOCATION_ERROR
;
1569 if (U_FAILURE(status
)) {
1577 void UnicodeSet::ensureCapacity(int32_t newLen
, UErrorCode
& ec
) {
1578 if (newLen
<= capacity
)
1580 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1582 ec
= U_MEMORY_ALLOCATION_ERROR
;
1587 capacity
= newLen
+ GROW_EXTRA
;
1588 // else we keep the original contents on the memory failure.
1591 void UnicodeSet::ensureBufferCapacity(int32_t newLen
, UErrorCode
& ec
) {
1592 if (buffer
!= NULL
&& newLen
<= bufferCapacity
)
1594 UChar32
* temp
= (UChar32
*) uprv_realloc(buffer
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1596 ec
= U_MEMORY_ALLOCATION_ERROR
;
1601 bufferCapacity
= newLen
+ GROW_EXTRA
;
1602 // else we keep the original contents on the memory failure.
1606 * Swap list and buffer.
1608 void UnicodeSet::swapBuffers(void) {
1609 // swap list and buffer
1610 UChar32
* temp
= list
;
1614 int32_t c
= capacity
;
1615 capacity
= bufferCapacity
;
1619 void UnicodeSet::setToBogus() {
1620 clear(); // Remove everything in the set.
1624 //----------------------------------------------------------------
1625 // Implementation: Fundamental operators
1626 //----------------------------------------------------------------
1628 static inline UChar32
max(UChar32 a
, UChar32 b
) {
1629 return (a
> b
) ? a
: b
;
1632 // polarity = 0, 3 is normal: x xor y
1633 // polarity = 1, 2: x xor ~y == x === y
1635 void UnicodeSet::exclusiveOr(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1636 if (isFrozen() || isBogus()) {
1639 UErrorCode status
= U_ZERO_ERROR
;
1640 ensureBufferCapacity(len
+ otherLen
, status
);
1641 if (U_FAILURE(status
)) {
1645 int32_t i
= 0, j
= 0, k
= 0;
1646 UChar32 a
= list
[i
++];
1648 if (polarity
== 1 || polarity
== 2) {
1650 if (other
[j
] == UNICODESET_LOW
) { // skip base if already LOW
1657 // simplest of all the routines
1658 // sort the values, discarding identicals!
1666 } else if (a
!= UNICODESET_HIGH
) { // at this point, a == b
1667 // discard both values!
1671 buffer
[k
++] = UNICODESET_HIGH
;
1680 // polarity = 0 is normal: x union y
1681 // polarity = 2: x union ~y
1682 // polarity = 1: ~x union y
1683 // polarity = 3: ~x union ~y
1685 void UnicodeSet::add(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1686 if (isFrozen() || isBogus() || other
==NULL
) {
1689 UErrorCode status
= U_ZERO_ERROR
;
1690 ensureBufferCapacity(len
+ otherLen
, status
);
1691 if (U_FAILURE(status
)) {
1695 int32_t i
= 0, j
= 0, k
= 0;
1696 UChar32 a
= list
[i
++];
1697 UChar32 b
= other
[j
++];
1698 // change from xor is that we have to check overlapping pairs
1699 // polarity bit 1 means a is second, bit 2 means b is.
1702 case 0: // both first; take lower if unequal
1703 if (a
< b
) { // take a
1704 // Back up over overlapping ranges in buffer[]
1705 if (k
> 0 && a
<= buffer
[k
-1]) {
1706 // Pick latter end value in buffer[] vs. list[]
1707 a
= max(list
[i
], buffer
[--k
]);
1713 i
++; // Common if/else code factored out
1715 } else if (b
< a
) { // take b
1716 if (k
> 0 && b
<= buffer
[k
-1]) {
1717 b
= max(other
[j
], buffer
[--k
]);
1724 } else { // a == b, take a, drop b
1725 if (a
== UNICODESET_HIGH
) goto loop_end
;
1726 // This is symmetrical; it doesn't matter if
1727 // we backtrack with a or b. - liu
1728 if (k
> 0 && a
<= buffer
[k
-1]) {
1729 a
= max(list
[i
], buffer
[--k
]);
1741 case 3: // both second; take higher if unequal, and drop other
1742 if (b
<= a
) { // take a
1743 if (a
== UNICODESET_HIGH
) goto loop_end
;
1746 if (b
== UNICODESET_HIGH
) goto loop_end
;
1750 polarity
^= 1; // factored common code
1754 case 1: // a second, b first; if b < a, overlap
1755 if (a
< b
) { // no overlap, take a
1756 buffer
[k
++] = a
; a
= list
[i
++]; polarity
^= 1;
1757 } else if (b
< a
) { // OVERLAP, drop b
1760 } else { // a == b, drop both!
1761 if (a
== UNICODESET_HIGH
) goto loop_end
;
1768 case 2: // a first, b second; if a < b, overlap
1769 if (b
< a
) { // no overlap, take b
1773 } else if (a
< b
) { // OVERLAP, drop a
1776 } else { // a == b, drop both!
1777 if (a
== UNICODESET_HIGH
) goto loop_end
;
1787 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1793 // polarity = 0 is normal: x intersect y
1794 // polarity = 2: x intersect ~y == set-minus
1795 // polarity = 1: ~x intersect y
1796 // polarity = 3: ~x intersect ~y
1798 void UnicodeSet::retain(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1799 if (isFrozen() || isBogus()) {
1802 UErrorCode status
= U_ZERO_ERROR
;
1803 ensureBufferCapacity(len
+ otherLen
, status
);
1804 if (U_FAILURE(status
)) {
1808 int32_t i
= 0, j
= 0, k
= 0;
1809 UChar32 a
= list
[i
++];
1810 UChar32 b
= other
[j
++];
1811 // change from xor is that we have to check overlapping pairs
1812 // polarity bit 1 means a is second, bit 2 means b is.
1815 case 0: // both first; drop the smaller
1816 if (a
< b
) { // drop a
1819 } else if (b
< a
) { // drop b
1822 } else { // a == b, take one, drop other
1823 if (a
== UNICODESET_HIGH
) goto loop_end
;
1831 case 3: // both second; take lower if unequal
1832 if (a
< b
) { // take a
1836 } else if (b
< a
) { // take b
1840 } else { // a == b, take one, drop other
1841 if (a
== UNICODESET_HIGH
) goto loop_end
;
1849 case 1: // a second, b first;
1850 if (a
< b
) { // NO OVERLAP, drop a
1853 } else if (b
< a
) { // OVERLAP, take b
1857 } else { // a == b, drop both!
1858 if (a
== UNICODESET_HIGH
) goto loop_end
;
1865 case 2: // a first, b second; if a < b, overlap
1866 if (b
< a
) { // no overlap, drop b
1869 } else if (a
< b
) { // OVERLAP, take a
1873 } else { // a == b, drop both!
1874 if (a
== UNICODESET_HIGH
) goto loop_end
;
1884 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1891 * Append the <code>toPattern()</code> representation of a
1892 * string to the given <code>StringBuffer</code>.
1894 void UnicodeSet::_appendToPat(UnicodeString
& buf
, const UnicodeString
& s
, UBool
1895 escapeUnprintable
) {
1897 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1898 _appendToPat(buf
, cp
= s
.char32At(i
), escapeUnprintable
);
1903 * Append the <code>toPattern()</code> representation of a
1904 * character to the given <code>StringBuffer</code>.
1906 void UnicodeSet::_appendToPat(UnicodeString
& buf
, UChar32 c
, UBool
1907 escapeUnprintable
) {
1908 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1909 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1911 if (ICU_Utility::escapeUnprintable(buf
, c
)) {
1915 // Okay to let ':' pass through
1926 case SymbolTable::SYMBOL_REF
:
1927 buf
.append(BACKSLASH
);
1930 // Escape whitespace
1931 if (PatternProps::isWhiteSpace(c
)) {
1932 buf
.append(BACKSLASH
);
1940 * Append a string representation of this set to result. This will be
1941 * a cleaned version of the string passed to applyPattern(), if there
1942 * is one. Otherwise it will be generated.
1944 UnicodeString
& UnicodeSet::_toPattern(UnicodeString
& result
,
1945 UBool escapeUnprintable
) const
1949 int32_t backslashCount
= 0;
1950 for (i
=0; i
<patLen
; ) {
1952 U16_NEXT(pat
, i
, patLen
, c
);
1953 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1954 // If the unprintable character is preceded by an odd
1955 // number of backslashes, then it has been escaped.
1956 // Before unescaping it, we delete the final
1958 if ((backslashCount
% 2) == 1) {
1959 result
.truncate(result
.length() - 1);
1961 ICU_Utility::escapeUnprintable(result
, c
);
1965 if (c
== BACKSLASH
) {
1975 return _generatePattern(result
, escapeUnprintable
);
1979 * Returns a string representation of this set. If the result of
1980 * calling this function is passed to a UnicodeSet constructor, it
1981 * will produce another set that is equal to this one.
1983 UnicodeString
& UnicodeSet::toPattern(UnicodeString
& result
,
1984 UBool escapeUnprintable
) const
1987 return _toPattern(result
, escapeUnprintable
);
1991 * Generate and append a string representation of this set to result.
1992 * This does not use this.pat, the cleaned up copy of the string
1993 * passed to applyPattern().
1995 UnicodeString
& UnicodeSet::_generatePattern(UnicodeString
& result
,
1996 UBool escapeUnprintable
) const
1998 result
.append(SET_OPEN
);
2000 // // Check against the predefined categories. We implicitly build
2001 // // up ALL category sets the first time toPattern() is called.
2002 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2003 // if (*this == getCategorySet(cat)) {
2004 // result.append(COLON);
2005 // result.append(CATEGORY_NAMES, cat*2, 2);
2006 // return result.append(CATEGORY_CLOSE);
2010 int32_t count
= getRangeCount();
2012 // If the set contains at least 2 intervals and includes both
2013 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2014 // be more economical.
2016 getRangeStart(0) == MIN_VALUE
&&
2017 getRangeEnd(count
-1) == MAX_VALUE
) {
2020 result
.append(COMPLEMENT
);
2022 for (int32_t i
= 1; i
< count
; ++i
) {
2023 UChar32 start
= getRangeEnd(i
-1)+1;
2024 UChar32 end
= getRangeStart(i
)-1;
2025 _appendToPat(result
, start
, escapeUnprintable
);
2027 if ((start
+1) != end
) {
2028 result
.append(HYPHEN
);
2030 _appendToPat(result
, end
, escapeUnprintable
);
2035 // Default; emit the ranges as pairs
2037 for (int32_t i
= 0; i
< count
; ++i
) {
2038 UChar32 start
= getRangeStart(i
);
2039 UChar32 end
= getRangeEnd(i
);
2040 _appendToPat(result
, start
, escapeUnprintable
);
2042 if ((start
+1) != end
) {
2043 result
.append(HYPHEN
);
2045 _appendToPat(result
, end
, escapeUnprintable
);
2050 for (int32_t i
= 0; i
<strings
->size(); ++i
) {
2051 result
.append(OPEN_BRACE
);
2052 _appendToPat(result
,
2053 *(const UnicodeString
*) strings
->elementAt(i
),
2055 result
.append(CLOSE_BRACE
);
2057 return result
.append(SET_CLOSE
);
2061 * Release existing cached pattern
2063 void UnicodeSet::releasePattern() {
2072 * Set the new pattern to cache.
2074 void UnicodeSet::setPattern(const UnicodeString
& newPat
) {
2076 int32_t newPatLen
= newPat
.length();
2077 pat
= (UChar
*)uprv_malloc((newPatLen
+ 1) * sizeof(UChar
));
2080 newPat
.extractBetween(0, patLen
, pat
);
2083 // else we don't care if malloc failed. This was just a nice cache.
2084 // We can regenerate an equivalent pattern later when requested.
2087 UnicodeFunctor
*UnicodeSet::freeze() {
2088 if(!isFrozen() && !isBogus()) {
2089 // Do most of what compact() does before freezing because
2090 // compact() will not work when the set is frozen.
2091 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2093 // Delete buffer first to defragment memory less.
2094 if (buffer
!= NULL
) {
2098 if (capacity
> (len
+ GROW_EXTRA
)) {
2099 // Make the capacity equal to len or 1.
2100 // We don't want to realloc of 0 size.
2101 capacity
= len
+ (len
== 0);
2102 list
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * capacity
);
2103 if (list
== NULL
) { // Check for memory allocation error.
2109 // Optimize contains() and span() and similar functions.
2110 if (!strings
->isEmpty()) {
2111 stringSpan
= new UnicodeSetStringSpan(*this, *strings
, UnicodeSetStringSpan::ALL
);
2112 if (stringSpan
!= NULL
&& !stringSpan
->needsStringSpanUTF16()) {
2113 // All strings are irrelevant for span() etc. because
2114 // all of each string's code points are contained in this set.
2115 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2116 // many relevant strings as UTF-16.
2117 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2122 if (stringSpan
== NULL
) {
2123 // No span-relevant strings: Optimize for code point spans.
2124 bmpSet
=new BMPSet(list
, len
);
2125 if (bmpSet
== NULL
) { // Check for memory allocation error.
2133 int32_t UnicodeSet::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2134 if(length
>0 && bmpSet
!=NULL
) {
2135 return (int32_t)(bmpSet
->span(s
, s
+length
, spanCondition
)-s
);
2143 if(stringSpan
!=NULL
) {
2144 return stringSpan
->span(s
, length
, spanCondition
);
2145 } else if(!strings
->isEmpty()) {
2146 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2147 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED
:
2148 UnicodeSetStringSpan::FWD_UTF16_CONTAINED
;
2149 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2150 if(strSpan
.needsStringSpanUTF16()) {
2151 return strSpan
.span(s
, length
, spanCondition
);
2155 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2156 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2160 int32_t start
=0, prev
=0;
2162 U16_NEXT(s
, start
, length
, c
);
2163 if(spanCondition
!=contains(c
)) {
2166 } while((prev
=start
)<length
);
2170 int32_t UnicodeSet::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2171 if(length
>0 && bmpSet
!=NULL
) {
2172 return (int32_t)(bmpSet
->spanBack(s
, s
+length
, spanCondition
)-s
);
2180 if(stringSpan
!=NULL
) {
2181 return stringSpan
->spanBack(s
, length
, spanCondition
);
2182 } else if(!strings
->isEmpty()) {
2183 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2184 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED
:
2185 UnicodeSetStringSpan::BACK_UTF16_CONTAINED
;
2186 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2187 if(strSpan
.needsStringSpanUTF16()) {
2188 return strSpan
.spanBack(s
, length
, spanCondition
);
2192 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2193 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2197 int32_t prev
=length
;
2199 U16_PREV(s
, 0, length
, c
);
2200 if(spanCondition
!=contains(c
)) {
2203 } while((prev
=length
)>0);
2207 int32_t UnicodeSet::spanUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2208 if(length
>0 && bmpSet
!=NULL
) {
2209 const uint8_t *s0
=(const uint8_t *)s
;
2210 return (int32_t)(bmpSet
->spanUTF8(s0
, length
, spanCondition
)-s0
);
2213 length
=(int32_t)uprv_strlen(s
);
2218 if(stringSpan
!=NULL
) {
2219 return stringSpan
->spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2220 } else if(!strings
->isEmpty()) {
2221 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2222 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED
:
2223 UnicodeSetStringSpan::FWD_UTF8_CONTAINED
;
2224 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2225 if(strSpan
.needsStringSpanUTF8()) {
2226 return strSpan
.spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2230 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2231 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2235 int32_t start
=0, prev
=0;
2237 U8_NEXT(s
, start
, length
, c
);
2241 if(spanCondition
!=contains(c
)) {
2244 } while((prev
=start
)<length
);
2248 int32_t UnicodeSet::spanBackUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2249 if(length
>0 && bmpSet
!=NULL
) {
2250 const uint8_t *s0
=(const uint8_t *)s
;
2251 return bmpSet
->spanBackUTF8(s0
, length
, spanCondition
);
2254 length
=(int32_t)uprv_strlen(s
);
2259 if(stringSpan
!=NULL
) {
2260 return stringSpan
->spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2261 } else if(!strings
->isEmpty()) {
2262 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2263 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED
:
2264 UnicodeSetStringSpan::BACK_UTF8_CONTAINED
;
2265 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2266 if(strSpan
.needsStringSpanUTF8()) {
2267 return strSpan
.spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2271 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2272 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2276 int32_t prev
=length
;
2278 U8_PREV(s
, 0, length
, c
);
2282 if(spanCondition
!=contains(c
)) {
2285 } while((prev
=length
)>0);