2 **********************************************************************
3 * Copyright (C) 1999-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/parsepos.h"
14 #include "unicode/symtable.h"
26 #include "unisetspan.h"
28 // Define UChar constants using hex for EBCDIC compatibility
29 // Used #define to reduce private static exports and memory access time.
30 #define SET_OPEN ((UChar)0x005B) /*[*/
31 #define SET_CLOSE ((UChar)0x005D) /*]*/
32 #define HYPHEN ((UChar)0x002D) /*-*/
33 #define COMPLEMENT ((UChar)0x005E) /*^*/
34 #define COLON ((UChar)0x003A) /*:*/
35 #define BACKSLASH ((UChar)0x005C) /*\*/
36 #define INTERSECTION ((UChar)0x0026) /*&*/
37 #define UPPER_U ((UChar)0x0055) /*U*/
38 #define LOWER_U ((UChar)0x0075) /*u*/
39 #define OPEN_BRACE ((UChar)123) /*{*/
40 #define CLOSE_BRACE ((UChar)125) /*}*/
41 #define UPPER_P ((UChar)0x0050) /*P*/
42 #define LOWER_P ((UChar)0x0070) /*p*/
43 #define UPPER_N ((UChar)78) /*N*/
44 #define EQUALS ((UChar)0x003D) /*=*/
46 // HIGH_VALUE > all valid values. 110000 for codepoints
// The constructors in this file store UNICODESET_HIGH as the final entry of
// the inversion list (list[0] = UNICODESET_HIGH for an empty set).
47 #define UNICODESET_HIGH 0x0110000
49 // LOW <= all valid values. ZERO for codepoints
// pinCodePoint() clamps its argument to UNICODESET_LOW..UNICODESET_HIGH-1.
50 #define UNICODESET_LOW 0x000000
52 // initial storage. Must be >= 0
// Added to the single terminator slot when a new set is constructed
// (capacity is initialized to 1 + START_EXTRA).
53 #define START_EXTRA 16
55 // extra amount for growth. Must be >= 0
// Added to the source length when the copy constructors size the new list
// (capacity is initialized to o.len + GROW_EXTRA for a non-frozen source).
56 #define GROW_EXTRA START_EXTRA
// Out-of-line definition of the SymbolTable destructor; its body is empty.
60 SymbolTable::~SymbolTable() {}
// Invokes UOBJECT_DEFINE_RTTI_IMPLEMENTATION for UnicodeSet. The macro is defined
// outside this file; presumably it supplies the class's run-time type
// identification members -- TODO confirm against the macro's definition.
62 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet
)
65 * Modify the given UChar32 variable so that it is in range, by
66 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
67 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
68 * It modifies its argument in-place and also returns it.
70 static inline UChar32
pinCodePoint(UChar32
& c
) {
71 if (c
< UNICODESET_LOW
) {
73 } else if (c
> (UNICODESET_HIGH
-1)) {
74 c
= (UNICODESET_HIGH
-1);
79 //----------------------------------------------------------------
81 //----------------------------------------------------------------
83 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
84 // To enable the debugging, define the symbol DEBUG_MEM in the line
85 // below. This will result in text being sent to stdout that looks
87 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
88 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
89 // Each line lists a construction (ct) or destruction (dt) event, the
90 // object address, the number of outstanding objects after the event,
91 // and the pattern of the object in question.
97 static int32_t _dbgCount
= 0;
99 static inline void _dbgct(UnicodeSet
* set
) {
101 set
->toPattern(str
, TRUE
);
103 str
.extract(0, 39, buf
, "");
104 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set
, ++_dbgCount
, buf
);
107 static inline void _dbgdt(UnicodeSet
* set
) {
109 set
->toPattern(str
, TRUE
);
111 str
.extract(0, 39, buf
, "");
112 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set
, --_dbgCount
, buf
);
122 //----------------------------------------------------------------
123 // UnicodeString in UVector support
124 //----------------------------------------------------------------
126 static void U_CALLCONV
cloneUnicodeString(UHashTok
*dst
, UHashTok
*src
) {
127 dst
->pointer
= new UnicodeString(*(UnicodeString
*)src
->pointer
);
130 static int8_t U_CALLCONV
compareUnicodeString(UHashTok t1
, UHashTok t2
) {
131 const UnicodeString
&a
= *(const UnicodeString
*)t1
.pointer
;
132 const UnicodeString
&b
= *(const UnicodeString
*)t2
.pointer
;
136 //----------------------------------------------------------------
138 //----------------------------------------------------------------
141 * Constructs an empty set.
143 UnicodeSet::UnicodeSet() :
144 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
145 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
148 UErrorCode status
= U_ZERO_ERROR
;
149 allocateStrings(status
);
150 if (U_FAILURE(status
)) {
153 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
155 list
[0] = UNICODESET_HIGH
;
156 } else { // If memory allocation failed, set to bogus state.
164 * Constructs a set containing the given range. If <code>end &lt;
165 * start</code> then an empty set is created.
167 * @param start first character, inclusive, of range
168 * @param end last character, inclusive, of range
170 UnicodeSet::UnicodeSet(UChar32 start
, UChar32 end
) :
171 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
172 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
175 UErrorCode status
= U_ZERO_ERROR
;
176 allocateStrings(status
);
177 if (U_FAILURE(status
)) {
180 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
182 list
[0] = UNICODESET_HIGH
;
183 complement(start
, end
);
184 } else { // If memory allocation failed, set to bogus state.
192 * Constructs a set that is identical to the given UnicodeSet.
194 UnicodeSet::UnicodeSet(const UnicodeSet
& o
) :
196 len(0), capacity(o
.isFrozen() ? o
.len
: o
.len
+ GROW_EXTRA
), list(0),
198 buffer(0), bufferCapacity(0),
199 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
202 UErrorCode status
= U_ZERO_ERROR
;
203 allocateStrings(status
);
204 if (U_FAILURE(status
)) {
207 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
210 } else { // If memory allocation failed, set to bogus state.
217 // Copy-construct as thawed.
218 UnicodeSet::UnicodeSet(const UnicodeSet
& o
, UBool
/* asThawed */) :
220 len(0), capacity(o
.len
+ GROW_EXTRA
), list(0),
222 buffer(0), bufferCapacity(0),
223 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
226 UErrorCode status
= U_ZERO_ERROR
;
227 allocateStrings(status
);
228 if (U_FAILURE(status
)) {
231 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
233 // *this = o except for bmpSet and stringSpan
235 uprv_memcpy(list
, o
.list
, len
*sizeof(UChar32
));
236 if (strings
!= NULL
&& o
.strings
!= NULL
) {
237 strings
->assign(*o
.strings
, cloneUnicodeString
, status
);
238 } else { // Invalid strings.
243 setPattern(UnicodeString(o
.pat
, o
.patLen
));
245 } else { // If memory allocation failed, set to bogus state.
255 UnicodeSet::~UnicodeSet() {
256 _dbgdt(this); // first!
268 * Assigns this object to be a copy of another.
270 UnicodeSet
& UnicodeSet::operator=(const UnicodeSet
& o
) {
281 UErrorCode ec
= U_ZERO_ERROR
;
282 ensureCapacity(o
.len
, ec
);
284 return *this; // There is no way to report this error :-(
287 uprv_memcpy(list
, o
.list
, len
*sizeof(UChar32
));
288 if (o
.bmpSet
== NULL
) {
291 bmpSet
= new BMPSet(*o
.bmpSet
, list
, len
);
292 if (bmpSet
== NULL
) { // Check for memory allocation error.
297 if (strings
!= NULL
&& o
.strings
!= NULL
) {
298 strings
->assign(*o
.strings
, cloneUnicodeString
, ec
);
299 } else { // Invalid strings.
303 if (o
.stringSpan
== NULL
) {
306 stringSpan
= new UnicodeSetStringSpan(*o
.stringSpan
, *strings
);
307 if (stringSpan
== NULL
) { // Check for memory allocation error.
314 setPattern(UnicodeString(o
.pat
, o
.patLen
));
320 * Returns a copy of this object. All UnicodeMatcher objects have
321 * to support cloning in order to allow classes using
322 * UnicodeMatchers, such as Transliterator, to implement cloning.
324 UnicodeFunctor
* UnicodeSet::clone() const {
325 return new UnicodeSet(*this);
328 UnicodeFunctor
*UnicodeSet::cloneAsThawed() const {
329 return new UnicodeSet(*this, TRUE
);
333 * Compares the specified object with this set for equality. Returns
334 * <tt>true</tt> if the two sets
335 * have the same size, and every member of the specified set is
336 * contained in this set (or equivalently, every member of this set is
337 * contained in the specified set).
339 * @param o set to be compared for equality with this set.
340 * @return <tt>true</tt> if the specified set is equal to this set.
342 UBool
UnicodeSet::operator==(const UnicodeSet
& o
) const {
343 if (len
!= o
.len
) return FALSE
;
344 for (int32_t i
= 0; i
< len
; ++i
) {
345 if (list
[i
] != o
.list
[i
]) return FALSE
;
347 if (*strings
!= *o
.strings
) return FALSE
;
352 * Returns the hash code value for this set.
354 * @return the hash code value for this set.
355 * @see Object#hashCode()
357 int32_t UnicodeSet::hashCode(void) const {
358 int32_t result
= len
;
359 for (int32_t i
= 0; i
< len
; ++i
) {
366 //----------------------------------------------------------------
368 //----------------------------------------------------------------
371 * Returns the number of elements in this set (its cardinality),
372 * Note that the elements of a set may include both individual
373 * codepoints and strings.
375 * @return the number of elements in this set (its cardinality).
377 int32_t UnicodeSet::size(void) const {
379 int32_t count
= getRangeCount();
380 for (int32_t i
= 0; i
< count
; ++i
) {
381 n
+= getRangeEnd(i
) - getRangeStart(i
) + 1;
383 return n
+ strings
->size();
387 * Returns <tt>true</tt> if this set contains no elements.
389 * @return <tt>true</tt> if this set contains no elements.
391 UBool
UnicodeSet::isEmpty(void) const {
392 return len
== 1 && strings
->size() == 0;
396 * Returns true if this set contains the given character.
397 * @param c character to be checked for containment
398 * @return true if the test condition is met
400 UBool
UnicodeSet::contains(UChar32 c
) const {
401 // Set i to the index of the start item greater than ch
402 // We know we will terminate without length test!
403 // LATER: for large sets, add binary search
406 // if (c < list[++i]) break;
408 if (bmpSet
!= NULL
) {
409 return bmpSet
->contains(c
);
411 if (stringSpan
!= NULL
) {
412 return stringSpan
->contains(c
);
414 if (c
>= UNICODESET_HIGH
) { // Don't need to check LOW bound
417 int32_t i
= findCodePoint(c
);
418 return (UBool
)(i
& 1); // return true if odd
422 * Returns the smallest value i such that c < list[i]. Caller
423 * must ensure that c is a legal value or this method will enter
424 * an infinite loop. This method performs a binary search.
425 * @param c a character in the range MIN_VALUE..MAX_VALUE
427 * @return the smallest integer i in the range 0..len-1,
428 * inclusive, such that c < list[i]
430 int32_t UnicodeSet::findCodePoint(UChar32 c
) const {
433 set list[] c=0 1 3 4 7 8
434 === ============== ===========
435 [] [110000] 0 0 0 0 0 0
436 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
437 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
438 [:Any:] [0, 110000] 1 1 1 1 1 1
441 // Return the smallest i such that c < list[i]. Assume
442 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
445 // High runner test. c is often after the last range, so an
446 // initial check for this condition pays off.
448 int32_t hi
= len
- 1;
449 if (lo
>= hi
|| c
>= list
[hi
-1])
451 // invariant: c >= list[lo]
452 // invariant: c < list[hi]
454 int32_t i
= (lo
+ hi
) >> 1;
457 } else if (c
< list
[i
]) {
467 * Returns true if this set contains every character
468 * of the given range.
469 * @param start first character, inclusive, of the range
470 * @param end last character, inclusive, of the range
471 * @return true if the test condition is met
473 UBool
UnicodeSet::contains(UChar32 start
, UChar32 end
) const {
476 // if (start < list[++i]) break;
478 int32_t i
= findCodePoint(start
);
479 return ((i
& 1) != 0 && end
< list
[i
]);
483 * Returns <tt>true</tt> if this set contains the given
484 * multicharacter string.
485 * @param s string to be checked for containment
486 * @return <tt>true</tt> if this set contains the specified string
488 UBool
UnicodeSet::contains(const UnicodeString
& s
) const {
489 if (s
.length() == 0) return FALSE
;
490 int32_t cp
= getSingleCP(s
);
492 return strings
->contains((void*) &s
);
494 return contains((UChar32
) cp
);
499 * Returns true if this set contains all the characters and strings
501 * @param c set to be checked for containment
502 * @return true if the test condition is met
504 UBool
UnicodeSet::containsAll(const UnicodeSet
& c
) const {
505 // The specified set is a subset if all of its pairs are contained in
506 // this set. It's possible to code this more efficiently in terms of
507 // direct manipulation of the inversion lists if the need arises.
508 int32_t n
= c
.getRangeCount();
509 for (int i
=0; i
<n
; ++i
) {
510 if (!contains(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
514 if (!strings
->containsAll(*c
.strings
)) return FALSE
;
519 * Returns true if this set contains all the characters
520 * of the given string.
521 * @param s string containing characters to be checked for containment
522 * @return true if the test condition is met
524 UBool
UnicodeSet::containsAll(const UnicodeString
& s
) const {
525 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_CONTAINED
) ==
530 * Returns true if this set contains none of the characters
531 * of the given range.
532 * @param start first character, inclusive, of the range
533 * @param end last character, inclusive, of the range
534 * @return true if the test condition is met
536 UBool
UnicodeSet::containsNone(UChar32 start
, UChar32 end
) const {
539 // if (start < list[++i]) break;
541 int32_t i
= findCodePoint(start
);
542 return ((i
& 1) == 0 && end
< list
[i
]);
546 * Returns true if this set contains none of the characters and strings
548 * @param c set to be checked for containment
549 * @return true if the test condition is met
551 UBool
UnicodeSet::containsNone(const UnicodeSet
& c
) const {
552 // The specified set is disjoint from this set if none of its pairs
553 // intersects this set. It's possible to code this more efficiently in terms of
554 // direct manipulation of the inversion lists if the need arises.
555 int32_t n
= c
.getRangeCount();
556 for (int32_t i
=0; i
<n
; ++i
) {
557 if (!containsNone(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
561 if (!strings
->containsNone(*c
.strings
)) return FALSE
;
566 * Returns true if this set contains none of the characters
567 * of the given string.
568 * @param s string containing characters to be checked for containment
569 * @return true if the test condition is met
571 UBool
UnicodeSet::containsNone(const UnicodeString
& s
) const {
572 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_NOT_CONTAINED
) ==
577 * Returns <tt>true</tt> if this set contains any character whose low byte
578 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
581 UBool
UnicodeSet::matchesIndexValue(uint8_t v
) const {
582 /* The index value v, in the range [0,255], is contained in this set if
583 * it is contained in any pair of this set. Pairs either have the high
584 * bytes equal, or unequal. If the high bytes are equal, then we have
585 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
586 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
587 * Then v is contained if xx <= v || v <= yy. (This is identical to the
588 * time zone month containment logic.)
591 int32_t rangeCount
=getRangeCount();
592 for (i
=0; i
<rangeCount
; ++i
) {
593 UChar32 low
= getRangeStart(i
);
594 UChar32 high
= getRangeEnd(i
);
595 if ((low
& ~0xFF) == (high
& ~0xFF)) {
596 if ((low
& 0xFF) <= v
&& v
<= (high
& 0xFF)) {
599 } else if ((low
& 0xFF) <= v
|| v
<= (high
& 0xFF)) {
603 if (strings
->size() != 0) {
604 for (i
=0; i
<strings
->size(); ++i
) {
605 const UnicodeString
& s
= *(const UnicodeString
*)strings
->elementAt(i
);
606 //if (s.length() == 0) {
607 // // Empty strings match everything
610 // assert(s.length() != 0); // We enforce this elsewhere
611 UChar32 c
= s
.char32At(0);
612 if ((c
& 0xFF) == v
) {
621 * Implementation of UnicodeMatcher::matches(). Always matches the
622 * longest possible multichar string.
624 UMatchDegree
UnicodeSet::matches(const Replaceable
& text
,
628 if (offset
== limit
) {
629 // Strings, if any, have length != 0, so we don't worry
630 // about them here. If we ever allow zero-length strings
631 // we must check for them here.
632 if (contains(U_ETHER
)) {
633 return incremental
? U_PARTIAL_MATCH
: U_MATCH
;
638 if (strings
->size() != 0) { // try strings first
640 // might separate forward and backward loops later
641 // for now they are combined
643 // TODO Improve efficiency of this, at least in the forward
644 // direction, if not in both. In the forward direction we
645 // can assume the strings are sorted.
648 UBool forward
= offset
< limit
;
650 // firstChar is the leftmost char to match in the
651 // forward direction or the rightmost char to match in
652 // the reverse direction.
653 UChar firstChar
= text
.charAt(offset
);
655 // If there are multiple strings that can match we
656 // return the longest match.
657 int32_t highWaterLength
= 0;
659 for (i
=0; i
<strings
->size(); ++i
) {
660 const UnicodeString
& trial
= *(const UnicodeString
*)strings
->elementAt(i
);
662 //if (trial.length() == 0) {
663 // return U_MATCH; // null-string always matches
665 // assert(trial.length() != 0); // We ensure this elsewhere
667 UChar c
= trial
.charAt(forward
? 0 : trial
.length() - 1);
669 // Strings are sorted, so we can optimize in the
670 // forward direction.
671 if (forward
&& c
> firstChar
) break;
672 if (c
!= firstChar
) continue;
674 int32_t matchLen
= matchRest(text
, offset
, limit
, trial
);
677 int32_t maxLen
= forward
? limit
-offset
: offset
-limit
;
678 if (matchLen
== maxLen
) {
679 // We have successfully matched but only up to limit.
680 return U_PARTIAL_MATCH
;
684 if (matchLen
== trial
.length()) {
685 // We have successfully matched the whole string.
686 if (matchLen
> highWaterLength
) {
687 highWaterLength
= matchLen
;
689 // In the forward direction we know strings
690 // are sorted so we can bail early.
691 if (forward
&& matchLen
< highWaterLength
) {
698 // We've checked all strings without a partial match.
699 // If we have full matches, return the longest one.
700 if (highWaterLength
!= 0) {
701 offset
+= forward
? highWaterLength
: -highWaterLength
;
705 return UnicodeFilter::matches(text
, offset
, limit
, incremental
);
710 * Returns the longest match for s in text at the given position.
711 * If limit > start then match forward from start+1 to limit
712 * matching all characters except s.charAt(0). If limit < start,
713 * go backward starting from start-1 matching all characters
714 * except s.charAt(s.length()-1). This method assumes that the
715 * first character, text.charAt(start), matches s, so it does not
717 * @param text the text to match
718 * @param start the first character to match. In the forward
719 * direction, text.charAt(start) is matched against s.charAt(0).
720 * In the reverse direction, it is matched against
721 * s.charAt(s.length()-1).
722 * @param limit the limit offset for matching, either last+1 in
723 * the forward direction, or last-1 in the reverse direction,
724 * where last is the index of the last character to match.
725 * @return If part of s matches up to the limit, return |limit -
726 * start|. If all of s matches before reaching the limit, return
727 * s.length(). If there is a mismatch between s and text, return
730 int32_t UnicodeSet::matchRest(const Replaceable
& text
,
731 int32_t start
, int32_t limit
,
732 const UnicodeString
& s
) {
735 int32_t slen
= s
.length();
737 maxLen
= limit
- start
;
738 if (maxLen
> slen
) maxLen
= slen
;
739 for (i
= 1; i
< maxLen
; ++i
) {
740 if (text
.charAt(start
+ i
) != s
.charAt(i
)) return 0;
743 maxLen
= start
- limit
;
744 if (maxLen
> slen
) maxLen
= slen
;
745 --slen
; // <=> slen = s.length() - 1;
746 for (i
= 1; i
< maxLen
; ++i
) {
747 if (text
.charAt(start
- i
) != s
.charAt(slen
- i
)) return 0;
754 * Implementation of UnicodeMatcher
756 void UnicodeSet::addMatchSetTo(UnicodeSet
& toUnionTo
) const {
757 toUnionTo
.addAll(*this);
761 * Returns the index of the given character within this set, where
762 * the set is ordered by ascending code point. If the character
763 * is not in this set, return -1. The inverse of this method is
764 * <code>charAt()</code>.
765 * @return an index from 0..size()-1, or -1
767 int32_t UnicodeSet::indexOf(UChar32 c
) const {
768 if (c
< MIN_VALUE
|| c
> MAX_VALUE
) {
774 UChar32 start
= list
[i
++];
778 UChar32 limit
= list
[i
++];
780 return n
+ c
- start
;
787 * Returns the character at the given index within this set, where
788 * the set is ordered by ascending code point. If the index is
789 * out of range, return (UChar32)-1. The inverse of this method is
790 * <code>indexOf()</code>.
791 * @param index an index from 0..size()-1
792 * @return the character at the given index, or (UChar32)-1.
794 UChar32
UnicodeSet::charAt(int32_t index
) const {
796 // len2 is the largest even integer <= len, that is, it is len
797 // for even values and len-1 for odd values. With odd values
798 // the last entry is UNICODESET_HIGH.
799 int32_t len2
= len
& ~1;
800 for (int32_t i
=0; i
< len2
;) {
801 UChar32 start
= list
[i
++];
802 int32_t count
= list
[i
++] - start
;
804 return (UChar32
)(start
+ index
);
813 * Make this object represent the range <code>start - end</code>.
814 * If <code>end &lt; start</code> then this object is set to an
817 * @param start first character in the set, inclusive
818 * @param end last character in the set, inclusive
820 UnicodeSet
& UnicodeSet::set(UChar32 start
, UChar32 end
) {
822 complement(start
, end
);
827 * Adds the specified range to this set if it is not already
828 * present. If this set already contains the specified range,
829 * the call leaves this set unchanged. If <code>end &lt; start</code>
830 * then an empty range is added, leaving the set unchanged.
832 * @param start first character, inclusive, of range to be added
834 * @param end last character, inclusive, of range to be added
837 UnicodeSet
& UnicodeSet::add(UChar32 start
, UChar32 end
) {
838 if (pinCodePoint(start
) < pinCodePoint(end
)) {
839 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
841 } else if (start
== end
) {
847 // #define DEBUG_US_ADD
851 void dump(UChar32 c
) {
853 printf("%c", (char)c
);
858 void dump(const UChar32
* list
, int32_t len
) {
860 for (int32_t i
=0; i
<len
; ++i
) {
861 if (i
!= 0) printf(", ");
869 * Adds the specified character to this set if it is not already
870 * present. If this set already contains the specified character,
871 * the call leaves this set unchanged.
873 UnicodeSet
& UnicodeSet::add(UChar32 c
) {
874 // find smallest i such that c < list[i]
875 // if odd, then it is IN the set
876 // if even, then it is OUT of the set
877 int32_t i
= findCodePoint(pinCodePoint(c
));
880 if ((i
& 1) != 0 || isFrozen() || isBogus()) return *this;
883 // assert(list[len-1] == HIGH);
886 // [start_0, limit_0, start_1, limit_1, HIGH]
888 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
892 // i == 0 means c is before the first range
897 printf(" found at %d", i
);
903 if (c
== list
[i
]-1) {
904 // c is before start of next range
906 // if we touched the HIGH mark, then add a new one
907 if (c
== (UNICODESET_HIGH
- 1)) {
908 UErrorCode status
= U_ZERO_ERROR
;
909 ensureCapacity(len
+1, status
);
910 if (U_FAILURE(status
)) {
911 return *this; // There is no way to report this error :-(
913 list
[len
++] = UNICODESET_HIGH
;
915 if (i
> 0 && c
== list
[i
-1]) {
916 // collapse adjacent ranges
918 // [..., start_k-1, c, c, limit_k, ..., HIGH]
922 //for (int32_t k=i-1; k<len-2; ++k) {
923 // list[k] = list[k+2];
925 UChar32
* dst
= list
+ i
- 1;
926 UChar32
* src
= dst
+ 2;
927 UChar32
* srclimit
= list
+ len
;
928 while (src
< srclimit
) *(dst
++) = *(src
++);
934 else if (i
> 0 && c
== list
[i
-1]) {
935 // c is after end of prior range
937 // no need to check for collapse here
941 // At this point we know the new char is not adjacent to
942 // any existing ranges, and it is not 10FFFF.
945 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
949 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
953 UErrorCode status
= U_ZERO_ERROR
;
954 ensureCapacity(len
+2, status
);
955 if (U_FAILURE(status
)) {
956 return *this; // There is no way to report this error :-(
959 //for (int32_t k=len-1; k>=i; --k) {
960 // list[k+2] = list[k];
962 UChar32
* src
= list
+ len
;
963 UChar32
* dst
= src
+ 2;
964 UChar32
* srclimit
= list
+ i
;
965 while (src
> srclimit
) *(--dst
) = *(--src
);
976 for (i
=1; i
<len
; ++i
) {
977 if (list
[i
] <= list
[i
-1]) {
979 printf("ERROR: list has been corrupted\n");
990 * Adds the specified multicharacter string to this set if it is not already
991 * present. If this set already contains the multicharacter string,
992 * the call leaves this set unchanged.
993 * Thus "ch" => {"ch"}
994 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
995 * @param s the source string
996 * @return the modified set, for chaining
998 UnicodeSet
& UnicodeSet::add(const UnicodeString
& s
) {
999 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1000 int32_t cp
= getSingleCP(s
);
1002 if (!strings
->contains((void*) &s
)) {
1013 * Adds the given string, in order, to 'strings'. The given string
1014 * must have been checked by the caller to not be empty and to not
1015 * already be in 'strings'.
1017 void UnicodeSet::_add(const UnicodeString
& s
) {
1018 if (isFrozen() || isBogus()) {
1021 UnicodeString
* t
= new UnicodeString(s
);
1022 if (t
== NULL
) { // Check for memory allocation error.
1026 UErrorCode ec
= U_ZERO_ERROR
;
1027 strings
->sortedInsert(t
, compareUnicodeString
, ec
);
1028 if (U_FAILURE(ec
)) {
1035 * @return a code point IF the string consists of a single one.
1036 * otherwise returns -1.
1037 * @param s string to test
1039 int32_t UnicodeSet::getSingleCP(const UnicodeString
& s
) {
1040 //if (s.length() < 1) {
1041 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1043 if (s
.length() > 2) return -1;
1044 if (s
.length() == 1) return s
.charAt(0);
1046 // at this point, len = 2
1047 UChar32 cp
= s
.char32At(0);
1048 if (cp
> 0xFFFF) { // is surrogate pair
1055 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1056 * If this set already contains any particular character, it has no effect on that character.
1057 * @param s the source string
1058 * @return the modified set, for chaining
1060 UnicodeSet
& UnicodeSet::addAll(const UnicodeString
& s
) {
1062 for (int32_t i
= 0; i
< s
.length(); i
+= UTF_CHAR_LENGTH(cp
)) {
1070 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1071 * If this set already contains any particular character, it has no effect on that character.
1072 * @param s the source string
1073 * @return the modified set, for chaining
1075 UnicodeSet
& UnicodeSet::retainAll(const UnicodeString
& s
) {
1083 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1084 * If this set already contains any particular character, it has no effect on that character.
1085 * @param s the source string
1086 * @return the modified set, for chaining
1088 UnicodeSet
& UnicodeSet::complementAll(const UnicodeString
& s
) {
1096 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1097 * If this set already contains any particular character, it has no effect on that character.
1098 * @param s the source string
1099 * @return the modified set, for chaining
1101 UnicodeSet
& UnicodeSet::removeAll(const UnicodeString
& s
) {
1108 UnicodeSet
& UnicodeSet::removeAllStrings() {
1109 strings
->removeAllElements();
1115 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1116 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1117 * @param s the source string
1118 * @return a newly created set containing the given string
1120 UnicodeSet
* U_EXPORT2
UnicodeSet::createFrom(const UnicodeString
& s
) {
1121 UnicodeSet
*set
= new UnicodeSet();
1122 if (set
!= NULL
) { // Check for memory allocation error.
1130 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1131 * @param s the source string
1132 * @return a newly created set containing the given characters
1134 UnicodeSet
* U_EXPORT2
UnicodeSet::createFromAll(const UnicodeString
& s
) {
1135 UnicodeSet
*set
= new UnicodeSet();
1136 if (set
!= NULL
) { // Check for memory allocation error.
1143 * Retain only the elements in this set that are contained in the
1144 * specified range. If <code>end &lt; start</code> then an empty range is
1145 * retained, leaving the set empty.
1147 * @param start first character, inclusive, of range to be retained
1149 * @param end last character, inclusive, of range to be retained
1152 UnicodeSet
& UnicodeSet::retain(UChar32 start
, UChar32 end
) {
1153 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1154 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1155 retain(range
, 2, 0);
1162 UnicodeSet
& UnicodeSet::retain(UChar32 c
) {
1163 return retain(c
, c
);
1167 * Removes the specified range from this set if it is present.
1168 * The set will not contain the specified range once the call
1169 * returns. If <code>end &lt; start</code> then an empty range is
1170 * removed, leaving the set unchanged.
1172 * @param start first character, inclusive, of range to be removed
1174 * @param end last character, inclusive, of range to be removed
1177 UnicodeSet
& UnicodeSet::remove(UChar32 start
, UChar32 end
) {
1178 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1179 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1180 retain(range
, 2, 2);
1186 * Removes the specified character from this set if it is present.
1187 * The set will not contain the specified character once the call
1190 UnicodeSet
& UnicodeSet::remove(UChar32 c
) {
1191 return remove(c
, c
);
1195 * Removes the specified string from this set if it is present.
1196 * The set will not contain the specified string once the call
1198 * @param s the source string
1199 * @return the modified set, for chaining
1201 UnicodeSet
& UnicodeSet::remove(const UnicodeString
& s
) {
1202 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1203 int32_t cp
= getSingleCP(s
);
1205 strings
->removeElement((void*) &s
);
1208 remove((UChar32
)cp
, (UChar32
)cp
);
1214 * Complements the specified range in this set. Any character in
1215 * the range will be removed if it is in this set, or will be
1216 * added if it is not in this set. If <code>end &lt; start</code>
1217 * then an empty range is xor'ed, leaving the set unchanged.
1219 * @param start first character, inclusive, of range to be complemented
1221 * @param end last character, inclusive, of range to be complemented
1224 UnicodeSet
& UnicodeSet::complement(UChar32 start
, UChar32 end
) {
1225 if (isFrozen() || isBogus()) {
1228 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1229 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1230 exclusiveOr(range
, 2, 0);
1236 UnicodeSet
& UnicodeSet::complement(UChar32 c
) {
1237 return complement(c
, c
);
1241 * This is equivalent to
1242 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1244 UnicodeSet
& UnicodeSet::complement(void) {
1245 if (isFrozen() || isBogus()) {
1248 UErrorCode status
= U_ZERO_ERROR
;
1249 if (list
[0] == UNICODESET_LOW
) {
1250 ensureBufferCapacity(len
-1, status
);
1251 if (U_FAILURE(status
)) {
1254 uprv_memcpy(buffer
, list
+ 1, (len
-1)*sizeof(UChar32
));
1257 ensureBufferCapacity(len
+1, status
);
1258 if (U_FAILURE(status
)) {
1261 uprv_memcpy(buffer
+ 1, list
, len
*sizeof(UChar32
));
1262 buffer
[0] = UNICODESET_LOW
;
1271 * Complement the specified string in this set.
1272 * The set will not contain the specified string once the call
1274 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1275 * @param s the string to complement
1276 * @return this object, for chaining
1278 UnicodeSet
& UnicodeSet::complement(const UnicodeString
& s
) {
1279 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1280 int32_t cp
= getSingleCP(s
);
1282 if (strings
->contains((void*) &s
)) {
1283 strings
->removeElement((void*) &s
);
1289 complement((UChar32
)cp
, (UChar32
)cp
);
1295 * Adds all of the elements in the specified set to this set if
1296 * they're not already present. This operation effectively
1297 * modifies this set so that its value is the <i>union</i> of the two
1298 * sets. The behavior of this operation is unspecified if the specified
1299 * collection is modified while the operation is in progress.
1301 * @param c set whose elements are to be added to this set.
1302 * @see #add(char, char)
1304 UnicodeSet
& UnicodeSet::addAll(const UnicodeSet
& c
) {
1305 if ( c
.len
>0 && c
.list
!=NULL
) {
1306 add(c
.list
, c
.len
, 0);
1309 // Add strings in order
1310 if ( c
.strings
!=NULL
) {
1311 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1312 const UnicodeString
* s
= (const UnicodeString
*)c
.strings
->elementAt(i
);
1313 if (!strings
->contains((void*) s
)) {
1322 * Retains only the elements in this set that are contained in the
1323 * specified set. In other words, removes from this set all of
1324 * its elements that are not contained in the specified set. This
1325 * operation effectively modifies this set so that its value is
1326 * the <i>intersection</i> of the two sets.
1328 * @param c set that defines which elements this set will retain.
1330 UnicodeSet
& UnicodeSet::retainAll(const UnicodeSet
& c
) {
1331 if (isFrozen() || isBogus()) {
1334 retain(c
.list
, c
.len
, 0);
1335 strings
->retainAll(*c
.strings
);
1340 * Removes from this set all of its elements that are contained in the
1341 * specified set. This operation effectively modifies this
1342 * set so that its value is the <i>asymmetric set difference</i> of
1345 * @param c set that defines which elements will be removed from
1348 UnicodeSet
& UnicodeSet::removeAll(const UnicodeSet
& c
) {
1349 if (isFrozen() || isBogus()) {
1352 retain(c
.list
, c
.len
, 2);
1353 strings
->removeAll(*c
.strings
);
1358 * Complements in this set all elements contained in the specified
1359 * set. Any character in the other set will be removed if it is
1360 * in this set, or will be added if it is not in this set.
1362 * @param c set that defines which elements will be xor'ed from
1365 UnicodeSet
& UnicodeSet::complementAll(const UnicodeSet
& c
) {
1366 if (isFrozen() || isBogus()) {
1369 exclusiveOr(c
.list
, c
.len
, 0);
1371 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1372 void* e
= c
.strings
->elementAt(i
);
1373 if (!strings
->removeElement(e
)) {
1374 _add(*(const UnicodeString
*)e
);
1381 * Removes all of the elements from this set. This set will be
1382 * empty after this call returns.
1384 UnicodeSet
& UnicodeSet::clear(void) {
1389 list
[0] = UNICODESET_HIGH
;
1393 if (strings
!= NULL
) {
1394 strings
->removeAllElements();
1396 if (list
!= NULL
&& strings
!= NULL
) {
1404 * Iteration method that returns the number of ranges contained in
1406 * @see #getRangeStart
// Iteration helper documented above as returning the number of ranges in
// this set; valid indexes for getRangeStart()/getRangeEnd() are derived
// from it.
// NOTE(review): the function body is not part of this excerpt - confirm
// how the count is derived from len before relying on details.
1409 int32_t UnicodeSet::getRangeCount() const {
1414 * Iteration method that returns the first character in the
1415 * specified range of this set.
1416 * @see #getRangeCount
1419 UChar32
UnicodeSet::getRangeStart(int32_t index
) const {
1420 return list
[index
*2];
1424 * Iteration method that returns the last character in the
1425 * specified range of this set.
1426 * @see #getRangeStart
1429 UChar32
UnicodeSet::getRangeEnd(int32_t index
) const {
1430 return list
[index
*2 + 1] - 1;
1433 int32_t UnicodeSet::getStringCount() const {
1434 return strings
->size();
1437 const UnicodeString
* UnicodeSet::getString(int32_t index
) const {
1438 return (const UnicodeString
*) strings
->elementAt(index
);
1442 * Reallocate this object's internal structures to take up the least
1443 * possible space, without changing this object's value.
1445 UnicodeSet
& UnicodeSet::compact() {
1446 if (isFrozen() || isBogus()) {
1449 // Delete buffer first to defragment memory less.
1450 if (buffer
!= NULL
) {
1454 if (len
< capacity
) {
1455 // Make the capacity equal to len or 1.
1456 // We don't want to realloc of 0 size.
1457 int32_t newCapacity
= len
+ (len
== 0);
1458 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * newCapacity
);
1461 capacity
= newCapacity
;
1463 // else what the heck happened?! We allocated less memory!
1464 // Oh well. We'll keep our original array.
1469 int32_t UnicodeSet::serialize(uint16_t *dest
, int32_t destCapacity
, UErrorCode
& ec
) const {
1470 int32_t bmpLength
, length
, destLength
;
1472 if (U_FAILURE(ec
)) {
1476 if (destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1477 ec
=U_ILLEGAL_ARGUMENT_ERROR
;
1481 /* count necessary 16-bit units */
1482 length
=this->len
-1; // Subtract 1 to ignore final UNICODESET_HIGH
1483 // assert(length>=0);
1486 if (destCapacity
>0) {
1489 ec
=U_BUFFER_OVERFLOW_ERROR
;
1495 if (this->list
[length
-1]<=0xffff) {
1498 } else if (this->list
[0]>=0x10000) {
1499 /* all supplementary */
1503 /* some BMP, some supplementary */
1504 for (bmpLength
=0; bmpLength
<length
&& this->list
[bmpLength
]<=0xffff; ++bmpLength
) {}
1505 length
=bmpLength
+2*(length
-bmpLength
);
1508 /* length: number of 16-bit array units */
1509 if (length
>0x7fff) {
1510 /* there are only 15 bits for the length in the first serialized word */
1511 ec
=U_INDEX_OUTOFBOUNDS_ERROR
;
1516 * total serialized length:
1517 * number of 16-bit array units (length) +
1518 * 1 length unit (always) +
1519 * 1 bmpLength unit (if there are supplementary values)
1521 destLength
=length
+((length
>bmpLength
)?2:1);
1522 if (destLength
<=destCapacity
) {
1526 *dest
=(uint16_t)length
;
1527 if (length
>bmpLength
) {
1529 *++dest
=(uint16_t)bmpLength
;
1533 /* write the BMP part of the array */
1535 for (i
=0; i
<bmpLength
; ++i
) {
1536 *dest
++=(uint16_t)*p
++;
1539 /* write the supplementary part of the array */
1540 for (; i
<length
; i
+=2) {
1541 *dest
++=(uint16_t)(*p
>>16);
1542 *dest
++=(uint16_t)*p
++;
1545 ec
=U_BUFFER_OVERFLOW_ERROR
;
1550 //----------------------------------------------------------------
1551 // Implementation: Utility methods
1552 //----------------------------------------------------------------
1555 * Allocate our strings vector and return TRUE if successful.
1557 UBool
UnicodeSet::allocateStrings(UErrorCode
&status
) {
1558 if (U_FAILURE(status
)) {
1561 strings
= new UVector(uhash_deleteUnicodeString
,
1562 uhash_compareUnicodeString
, 1, status
);
1563 if (strings
== NULL
) { // Check for memory allocation error.
1564 status
= U_MEMORY_ALLOCATION_ERROR
;
1567 if (U_FAILURE(status
)) {
1575 void UnicodeSet::ensureCapacity(int32_t newLen
, UErrorCode
& ec
) {
1576 if (newLen
<= capacity
)
1578 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1580 ec
= U_MEMORY_ALLOCATION_ERROR
;
1585 capacity
= newLen
+ GROW_EXTRA
;
1586 // else we keep the original contents on the memory failure.
1589 void UnicodeSet::ensureBufferCapacity(int32_t newLen
, UErrorCode
& ec
) {
1590 if (buffer
!= NULL
&& newLen
<= bufferCapacity
)
1592 UChar32
* temp
= (UChar32
*) uprv_realloc(buffer
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1594 ec
= U_MEMORY_ALLOCATION_ERROR
;
1599 bufferCapacity
= newLen
+ GROW_EXTRA
;
1600 // else we keep the original contents on the memory failure.
1604 * Swap list and buffer.
1606 void UnicodeSet::swapBuffers(void) {
1607 // swap list and buffer
1608 UChar32
* temp
= list
;
1612 int32_t c
= capacity
;
1613 capacity
= bufferCapacity
;
1617 void UnicodeSet::setToBogus() {
1618 clear(); // Remove everything in the set.
1622 //----------------------------------------------------------------
1623 // Implementation: Fundamental operators
1624 //----------------------------------------------------------------
1626 static inline UChar32
max(UChar32 a
, UChar32 b
) {
1627 return (a
> b
) ? a
: b
;
1630 // polarity = 0, 3 is normal: x xor y
1631 // polarity = 1, 2: x xor ~y == x === y
1633 void UnicodeSet::exclusiveOr(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1634 if (isFrozen() || isBogus()) {
1637 UErrorCode status
= U_ZERO_ERROR
;
1638 ensureBufferCapacity(len
+ otherLen
, status
);
1639 if (U_FAILURE(status
)) {
1643 int32_t i
= 0, j
= 0, k
= 0;
1644 UChar32 a
= list
[i
++];
1646 if (polarity
== 1 || polarity
== 2) {
1648 if (other
[j
] == UNICODESET_LOW
) { // skip base if already LOW
1655 // simplest of all the routines
1656 // sort the values, discarding identicals!
1664 } else if (a
!= UNICODESET_HIGH
) { // at this point, a == b
1665 // discard both values!
1669 buffer
[k
++] = UNICODESET_HIGH
;
1678 // polarity = 0 is normal: x union y
1679 // polarity = 2: x union ~y
1680 // polarity = 1: ~x union y
1681 // polarity = 3: ~x union ~y
// Merges the list `other` (otherLen entries) into this set's list, writing
// the result into buffer.  `polarity` selects the union variant listed in
// the comment block above and is also the running state of the merge: the
// visible branches toggle it with ^= 1 after consuming an entry of list.
// NOTE(review): several lines of this function (early returns, the switch
// header, parts of each case, the tail after the terminator) are missing
// from this excerpt; the comments below describe only what is visible.
1683 void UnicodeSet::add(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
// Guard: frozen or bogus sets and a NULL `other` are filtered out here
// (presumably by returning early - the body is not shown).
1684 if (isFrozen() || isBogus() || other
==NULL
) {
1687 UErrorCode status
= U_ZERO_ERROR
;
// The merged result can never be longer than both inputs together.
1688 ensureBufferCapacity(len
+ otherLen
, status
);
1689 if (U_FAILURE(status
)) {
// i indexes list, j indexes other, k is the write position in buffer;
// a and b hold the current entry of each input.
1693 int32_t i
= 0, j
= 0, k
= 0;
1694 UChar32 a
= list
[i
++];
1695 UChar32 b
= other
[j
++];
1696 // change from xor is that we have to check overlapping pairs
1697 // polarity bit 1 means a is second, bit 2 means b is.
1700 case 0: // both first; take lower if unequal
1701 if (a
< b
) { // take a
1702 // Back up over overlapping ranges in buffer[]
1703 if (k
> 0 && a
<= buffer
[k
-1]) {
1704 // Pick latter end value in buffer[] vs. list[]
1705 a
= max(list
[i
], buffer
[--k
]);
1711 i
++; // Common if/else code factored out
1713 } else if (b
< a
) { // take b
// Same back-up step as above, applied to the other list.
1714 if (k
> 0 && b
<= buffer
[k
-1]) {
1715 b
= max(other
[j
], buffer
[--k
]);
1722 } else { // a == b, take a, drop b
// UNICODESET_HIGH in both inputs means both are exhausted.
1723 if (a
== UNICODESET_HIGH
) goto loop_end
;
1724 // This is symmetrical; it doesn't matter if
1725 // we backtrack with a or b. - liu
1726 if (k
> 0 && a
<= buffer
[k
-1]) {
1727 a
= max(list
[i
], buffer
[--k
]);
1739 case 3: // both second; take higher if unequal, and drop other
1740 if (b
<= a
) { // take a
1741 if (a
== UNICODESET_HIGH
) goto loop_end
;
1744 if (b
== UNICODESET_HIGH
) goto loop_end
;
1748 polarity
^= 1; // factored common code
1752 case 1: // a second, b first; if b < a, overlap
1753 if (a
< b
) { // no overlap, take a
1754 buffer
[k
++] = a
; a
= list
[i
++]; polarity
^= 1;
1755 } else if (b
< a
) { // OVERLAP, drop b
1758 } else { // a == b, drop both!
1759 if (a
== UNICODESET_HIGH
) goto loop_end
;
1766 case 2: // a first, b second; if a < b, overlap
1767 if (b
< a
) { // no overlap, take b
1771 } else if (a
< b
) { // OVERLAP, drop a
1774 } else { // a == b, drop both!
1775 if (a
== UNICODESET_HIGH
) goto loop_end
;
// Every merged list ends with the UNICODESET_HIGH sentinel.
1785 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1791 // polarity = 0 is normal: x intersect y
1792 // polarity = 2: x intersect ~y == set-minus
1793 // polarity = 1: ~x intersect y
1794 // polarity = 3: ~x intersect ~y
// Intersects this set's list with the list `other` (otherLen entries),
// writing the result into buffer.  `polarity` selects the intersection
// variant listed in the comment block above (2 is set-minus) and acts as
// the state of the merge loop.
// NOTE(review): several lines of this function (early returns, the switch
// header, the statements inside each branch, the tail after the
// terminator) are missing from this excerpt; the comments below describe
// only what is visible.
1796 void UnicodeSet::retain(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
// Guard for frozen or bogus sets.  Unlike add(), `other` is not checked
// against NULL here.
1797 if (isFrozen() || isBogus()) {
1800 UErrorCode status
= U_ZERO_ERROR
;
// Same upper bound on the result length as the other merge routines use.
1801 ensureBufferCapacity(len
+ otherLen
, status
);
1802 if (U_FAILURE(status
)) {
// i indexes list, j indexes other, k is the write position in buffer;
// a and b hold the current entry of each input.
1806 int32_t i
= 0, j
= 0, k
= 0;
1807 UChar32 a
= list
[i
++];
1808 UChar32 b
= other
[j
++];
1809 // change from xor is that we have to check overlapping pairs
1810 // polarity bit 1 means a is second, bit 2 means b is.
1813 case 0: // both first; drop the smaller
1814 if (a
< b
) { // drop a
1817 } else if (b
< a
) { // drop b
1820 } else { // a == b, take one, drop other
// UNICODESET_HIGH in both inputs means both are exhausted.
1821 if (a
== UNICODESET_HIGH
) goto loop_end
;
1829 case 3: // both second; take lower if unequal
1830 if (a
< b
) { // take a
1834 } else if (b
< a
) { // take b
1838 } else { // a == b, take one, drop other
1839 if (a
== UNICODESET_HIGH
) goto loop_end
;
1847 case 1: // a second, b first;
1848 if (a
< b
) { // NO OVERLAP, drop a
1851 } else if (b
< a
) { // OVERLAP, take b
1855 } else { // a == b, drop both!
1856 if (a
== UNICODESET_HIGH
) goto loop_end
;
1863 case 2: // a first, b second; if a < b, overlap
1864 if (b
< a
) { // no overlap, drop b
1867 } else if (a
< b
) { // OVERLAP, take a
1871 } else { // a == b, drop both!
1872 if (a
== UNICODESET_HIGH
) goto loop_end
;
// Every merged list ends with the UNICODESET_HIGH sentinel.
1882 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1889 * Append the <code>toPattern()</code> representation of a
1890 * string to the given <code>UnicodeString</code>.
1892 void UnicodeSet::_appendToPat(UnicodeString
& buf
, const UnicodeString
& s
, UBool
1893 escapeUnprintable
) {
1895 for (int32_t i
= 0; i
< s
.length(); i
+= UTF_CHAR_LENGTH(cp
)) {
1896 _appendToPat(buf
, cp
= s
.char32At(i
), escapeUnprintable
);
1901 * Append the <code>toPattern()</code> representation of a
1902 * character to the given <code>UnicodeString</code>.
// Appends the pattern form of the single code point c to buf.
// Unprintable code points are written as hex escapes when
// escapeUnprintable is set; characters matched by the switch below, and
// rule white space, get a BACKSLASH prefix.
// NOTE(review): most of the case labels and the final append are missing
// from this excerpt - only SymbolTable::SYMBOL_REF is visible.
1904 void UnicodeSet::_appendToPat(UnicodeString
& buf
, UChar32 c
, UBool
1905 escapeUnprintable
) {
1906 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1907 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
// escapeUnprintable() reports through its return value whether it wrote
// an escape for c.
1909 if (ICU_Utility::escapeUnprintable(buf
, c
)) {
1913 // Okay to let ':' pass through
1924 case SymbolTable::SYMBOL_REF
:
// Characters with pattern-syntax meaning are escaped with a backslash.
1925 buf
.append(BACKSLASH
);
1928 // Escape whitespace
1929 if (uprv_isRuleWhiteSpace(c
)) {
1930 buf
.append(BACKSLASH
);
1938 * Append a string representation of this set to result. This will be
1939 * a cleaned version of the string passed to applyPattern(), if there
1940 * is one. Otherwise it will be generated.
1942 UnicodeString
& UnicodeSet::_toPattern(UnicodeString
& result
,
1943 UBool escapeUnprintable
) const
1947 int32_t backslashCount
= 0;
1948 for (i
=0; i
<patLen
; ) {
1950 U16_NEXT(pat
, i
, patLen
, c
);
1951 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1952 // If the unprintable character is preceded by an odd
1953 // number of backslashes, then it has been escaped.
1954 // Before unescaping it, we delete the final
1956 if ((backslashCount
% 2) == 1) {
1957 result
.truncate(result
.length() - 1);
1959 ICU_Utility::escapeUnprintable(result
, c
);
1963 if (c
== BACKSLASH
) {
1973 return _generatePattern(result
, escapeUnprintable
);
1977 * Returns a string representation of this set. If the result of
1978 * calling this function is passed to a UnicodeSet constructor, it
1979 * will produce another set that is equal to this one.
1981 UnicodeString
& UnicodeSet::toPattern(UnicodeString
& result
,
1982 UBool escapeUnprintable
) const
1985 return _toPattern(result
, escapeUnprintable
);
1989 * Generate and append a string representation of this set to result.
1990 * This does not use this.pat, the cleaned up copy of the string
1991 * passed to applyPattern().
1993 UnicodeString
& UnicodeSet::_generatePattern(UnicodeString
& result
,
1994 UBool escapeUnprintable
) const
1996 result
.append(SET_OPEN
);
1998 // // Check against the predefined categories. We implicitly build
1999 // // up ALL category sets the first time toPattern() is called.
2000 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2001 // if (*this == getCategorySet(cat)) {
2002 // result.append(COLON);
2003 // result.append(CATEGORY_NAMES, cat*2, 2);
2004 // return result.append(CATEGORY_CLOSE);
2008 int32_t count
= getRangeCount();
2010 // If the set contains at least 2 intervals and includes both
2011 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2012 // be more economical.
2014 getRangeStart(0) == MIN_VALUE
&&
2015 getRangeEnd(count
-1) == MAX_VALUE
) {
2018 result
.append(COMPLEMENT
);
2020 for (int32_t i
= 1; i
< count
; ++i
) {
2021 UChar32 start
= getRangeEnd(i
-1)+1;
2022 UChar32 end
= getRangeStart(i
)-1;
2023 _appendToPat(result
, start
, escapeUnprintable
);
2025 if ((start
+1) != end
) {
2026 result
.append(HYPHEN
);
2028 _appendToPat(result
, end
, escapeUnprintable
);
2033 // Default; emit the ranges as pairs
2035 for (int32_t i
= 0; i
< count
; ++i
) {
2036 UChar32 start
= getRangeStart(i
);
2037 UChar32 end
= getRangeEnd(i
);
2038 _appendToPat(result
, start
, escapeUnprintable
);
2040 if ((start
+1) != end
) {
2041 result
.append(HYPHEN
);
2043 _appendToPat(result
, end
, escapeUnprintable
);
2048 for (int32_t i
= 0; i
<strings
->size(); ++i
) {
2049 result
.append(OPEN_BRACE
);
2050 _appendToPat(result
,
2051 *(const UnicodeString
*) strings
->elementAt(i
),
2053 result
.append(CLOSE_BRACE
);
2055 return result
.append(SET_CLOSE
);
2059 * Release existing cached pattern
// Discards the cached pattern string.  Callers in this file invoke it
// after every modification of the set so that a stale pattern is never
// returned by toPattern().
// NOTE(review): the function body is not part of this excerpt - confirm
// that it frees pat and resets patLen.
2061 void UnicodeSet::releasePattern() {
2070 * Set the new pattern to cache.
2072 void UnicodeSet::setPattern(const UnicodeString
& newPat
) {
2074 int32_t newPatLen
= newPat
.length();
2075 pat
= (UChar
*)uprv_malloc((newPatLen
+ 1) * sizeof(UChar
));
2078 newPat
.extractBetween(0, patLen
, pat
);
2081 // else we don't care if malloc failed. This was just a nice cache.
2082 // We can regenerate an equivalent pattern later when requested.
2085 UnicodeFunctor
*UnicodeSet::freeze() {
2086 if(!isFrozen() && !isBogus()) {
2087 // Do most of what compact() does before freezing because
2088 // compact() will not work when the set is frozen.
2089 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2091 // Delete buffer first to defragment memory less.
2092 if (buffer
!= NULL
) {
2096 if (capacity
> (len
+ GROW_EXTRA
)) {
2097 // Make the capacity equal to len or 1.
2098 // We don't want to realloc of 0 size.
2099 capacity
= len
+ (len
== 0);
2100 list
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * capacity
);
2101 if (list
== NULL
) { // Check for memory allocation error.
2107 // Optimize contains() and span() and similar functions.
2108 if (!strings
->isEmpty()) {
2109 stringSpan
= new UnicodeSetStringSpan(*this, *strings
, UnicodeSetStringSpan::ALL
);
2110 if (stringSpan
!= NULL
&& !stringSpan
->needsStringSpanUTF16()) {
2111 // All strings are irrelevant for span() etc. because
2112 // all of each string's code points are contained in this set.
2113 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2114 // many relevant strings as UTF-16.
2115 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2120 if (stringSpan
== NULL
) {
2121 // No span-relevant strings: Optimize for code point spans.
2122 bmpSet
=new BMPSet(list
, len
);
2123 if (bmpSet
== NULL
) { // Check for memory allocation error.
2131 int32_t UnicodeSet::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2132 if(length
>0 && bmpSet
!=NULL
) {
2133 return (int32_t)(bmpSet
->span(s
, s
+length
, spanCondition
)-s
);
2141 if(stringSpan
!=NULL
) {
2142 return stringSpan
->span(s
, length
, spanCondition
);
2143 } else if(!strings
->isEmpty()) {
2144 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2145 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED
:
2146 UnicodeSetStringSpan::FWD_UTF16_CONTAINED
;
2147 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2148 if(strSpan
.needsStringSpanUTF16()) {
2149 return strSpan
.span(s
, length
, spanCondition
);
2153 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2154 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2158 int32_t start
=0, prev
=0;
2160 U16_NEXT(s
, start
, length
, c
);
2161 if(spanCondition
!=contains(c
)) {
2164 } while((prev
=start
)<length
);
2168 int32_t UnicodeSet::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2169 if(length
>0 && bmpSet
!=NULL
) {
2170 return (int32_t)(bmpSet
->spanBack(s
, s
+length
, spanCondition
)-s
);
2178 if(stringSpan
!=NULL
) {
2179 return stringSpan
->spanBack(s
, length
, spanCondition
);
2180 } else if(!strings
->isEmpty()) {
2181 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2182 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED
:
2183 UnicodeSetStringSpan::BACK_UTF16_CONTAINED
;
2184 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2185 if(strSpan
.needsStringSpanUTF16()) {
2186 return strSpan
.spanBack(s
, length
, spanCondition
);
2190 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2191 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2195 int32_t prev
=length
;
2197 U16_PREV(s
, 0, length
, c
);
2198 if(spanCondition
!=contains(c
)) {
2201 } while((prev
=length
)>0);
2205 int32_t UnicodeSet::spanUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2206 if(length
>0 && bmpSet
!=NULL
) {
2207 const uint8_t *s0
=(const uint8_t *)s
;
2208 return (int32_t)(bmpSet
->spanUTF8(s0
, length
, spanCondition
)-s0
);
2211 length
=uprv_strlen(s
);
2216 if(stringSpan
!=NULL
) {
2217 return stringSpan
->spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2218 } else if(!strings
->isEmpty()) {
2219 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2220 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED
:
2221 UnicodeSetStringSpan::FWD_UTF8_CONTAINED
;
2222 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2223 if(strSpan
.needsStringSpanUTF8()) {
2224 return strSpan
.spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2228 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2229 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2233 int32_t start
=0, prev
=0;
2235 U8_NEXT(s
, start
, length
, c
);
2239 if(spanCondition
!=contains(c
)) {
2242 } while((prev
=start
)<length
);
2246 int32_t UnicodeSet::spanBackUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2247 if(length
>0 && bmpSet
!=NULL
) {
2248 const uint8_t *s0
=(const uint8_t *)s
;
2249 return bmpSet
->spanBackUTF8(s0
, length
, spanCondition
);
2252 length
=uprv_strlen(s
);
2257 if(stringSpan
!=NULL
) {
2258 return stringSpan
->spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2259 } else if(!strings
->isEmpty()) {
2260 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2261 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED
:
2262 UnicodeSetStringSpan::BACK_UTF8_CONTAINED
;
2263 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2264 if(strSpan
.needsStringSpanUTF8()) {
2265 return strSpan
.spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2269 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2270 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2274 int32_t prev
=length
;
2276 U8_PREV(s
, 0, length
, c
);
2280 if(spanCondition
!=contains(c
)) {
2283 } while((prev
=length
)>0);