1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 **********************************************************************
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/utf8.h"
18 #include "unicode/utf16.h"
22 #include "patternprops.h"
30 #include "unisetspan.h"
32 // Define UChar constants using hex for EBCDIC compatibility
33 // Used #define to reduce private static exports and memory access time.
34 #define SET_OPEN ((UChar)0x005B) /*[*/
35 #define SET_CLOSE ((UChar)0x005D) /*]*/
36 #define HYPHEN ((UChar)0x002D) /*-*/
37 #define COMPLEMENT ((UChar)0x005E) /*^*/
38 #define COLON ((UChar)0x003A) /*:*/
39 #define BACKSLASH ((UChar)0x005C) /*\*/
40 #define INTERSECTION ((UChar)0x0026) /*&*/
41 #define UPPER_U ((UChar)0x0055) /*U*/
42 #define LOWER_U ((UChar)0x0075) /*u*/
43 #define OPEN_BRACE ((UChar)123) /*{*/
44 #define CLOSE_BRACE ((UChar)125) /*}*/
45 #define UPPER_P ((UChar)0x0050) /*P*/
46 #define LOWER_P ((UChar)0x0070) /*p*/
47 #define UPPER_N ((UChar)78) /*N*/
48 #define EQUALS ((UChar)0x003D) /*=*/
50 // HIGH_VALUE > all valid values. 110000 for codepoints
51 #define UNICODESET_HIGH 0x0110000
53 // LOW <= all valid values. ZERO for codepoints
54 #define UNICODESET_LOW 0x000000
56 // initial storage. Must be >= 0
57 #define START_EXTRA 16
59 // extra amount for growth. Must be >= 0
60 #define GROW_EXTRA START_EXTRA
64 SymbolTable::~SymbolTable() {}
66 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet
)
69 * Modify the given UChar32 variable so that it is in range, by
70 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
71 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
72 * It modifies its argument in-place and also returns it.
74 static inline UChar32
pinCodePoint(UChar32
& c
) {
75 if (c
< UNICODESET_LOW
) {
77 } else if (c
> (UNICODESET_HIGH
-1)) {
78 c
= (UNICODESET_HIGH
-1);
83 //----------------------------------------------------------------
85 //----------------------------------------------------------------
87 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
88 // To enable the debugging, define the symbol DEBUG_MEM in the line
89 // below. This will result in text being sent to stdout that looks
91 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
92 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
93 // Each line lists a construction (ct) or destruction (dt) event, the
94 // object address, the number of outstanding objects after the event,
95 // and the pattern of the object in question.
101 static int32_t _dbgCount
= 0;
103 static inline void _dbgct(UnicodeSet
* set
) {
105 set
->toPattern(str
, TRUE
);
107 str
.extract(0, 39, buf
, "");
108 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set
, ++_dbgCount
, buf
);
111 static inline void _dbgdt(UnicodeSet
* set
) {
113 set
->toPattern(str
, TRUE
);
115 str
.extract(0, 39, buf
, "");
116 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set
, --_dbgCount
, buf
);
126 //----------------------------------------------------------------
127 // UnicodeString in UVector support
128 //----------------------------------------------------------------
130 static void U_CALLCONV
cloneUnicodeString(UElement
*dst
, UElement
*src
) {
131 dst
->pointer
= new UnicodeString(*(UnicodeString
*)src
->pointer
);
134 static int8_t U_CALLCONV
compareUnicodeString(UElement t1
, UElement t2
) {
135 const UnicodeString
&a
= *(const UnicodeString
*)t1
.pointer
;
136 const UnicodeString
&b
= *(const UnicodeString
*)t2
.pointer
;
140 //----------------------------------------------------------------
142 //----------------------------------------------------------------
145 * Constructs an empty set.
147 UnicodeSet::UnicodeSet() :
148 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
149 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
152 UErrorCode status
= U_ZERO_ERROR
;
153 allocateStrings(status
);
154 if (U_FAILURE(status
)) {
157 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
159 list
[0] = UNICODESET_HIGH
;
160 } else { // If memory allocation failed, set to bogus state.
168 * Constructs a set containing the given range. If <code>end >
169 * start</code> then an empty set is created.
171 * @param start first character, inclusive, of range
172 * @param end last character, inclusive, of range
174 UnicodeSet::UnicodeSet(UChar32 start
, UChar32 end
) :
175 len(1), capacity(1 + START_EXTRA
), list(0), bmpSet(0), buffer(0),
176 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
179 UErrorCode status
= U_ZERO_ERROR
;
180 allocateStrings(status
);
181 if (U_FAILURE(status
)) {
184 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
186 list
[0] = UNICODESET_HIGH
;
187 complement(start
, end
);
188 } else { // If memory allocation failed, set to bogus state.
196 * Constructs a set that is identical to the given UnicodeSet.
198 UnicodeSet::UnicodeSet(const UnicodeSet
& o
) :
200 len(0), capacity(o
.isFrozen() ? o
.len
: o
.len
+ GROW_EXTRA
), list(0),
202 buffer(0), bufferCapacity(0),
203 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
206 UErrorCode status
= U_ZERO_ERROR
;
207 allocateStrings(status
);
208 if (U_FAILURE(status
)) {
211 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
214 } else { // If memory allocation failed, set to bogus state.
221 // Copy-construct as thawed.
222 UnicodeSet::UnicodeSet(const UnicodeSet
& o
, UBool
/* asThawed */) :
224 len(0), capacity(o
.len
+ GROW_EXTRA
), list(0),
226 buffer(0), bufferCapacity(0),
227 patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
230 UErrorCode status
= U_ZERO_ERROR
;
231 allocateStrings(status
);
232 if (U_FAILURE(status
)) {
235 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
237 // *this = o except for bmpSet and stringSpan
239 uprv_memcpy(list
, o
.list
, (size_t)len
*sizeof(UChar32
));
240 if (strings
!= NULL
&& o
.strings
!= NULL
) {
241 strings
->assign(*o
.strings
, cloneUnicodeString
, status
);
242 } else { // Invalid strings.
247 setPattern(UnicodeString(o
.pat
, o
.patLen
));
249 } else { // If memory allocation failed, set to bogus state.
259 UnicodeSet::~UnicodeSet() {
260 _dbgdt(this); // first!
272 * Assigns this object to be a copy of another.
274 UnicodeSet
& UnicodeSet::operator=(const UnicodeSet
& o
) {
285 UErrorCode ec
= U_ZERO_ERROR
;
286 ensureCapacity(o
.len
, ec
);
288 return *this; // There is no way to report this error :-(
291 uprv_memcpy(list
, o
.list
, (size_t)len
*sizeof(UChar32
));
292 if (o
.bmpSet
== NULL
) {
295 bmpSet
= new BMPSet(*o
.bmpSet
, list
, len
);
296 if (bmpSet
== NULL
) { // Check for memory allocation error.
301 if (strings
!= NULL
&& o
.strings
!= NULL
) {
302 strings
->assign(*o
.strings
, cloneUnicodeString
, ec
);
303 } else { // Invalid strings.
307 if (o
.stringSpan
== NULL
) {
310 stringSpan
= new UnicodeSetStringSpan(*o
.stringSpan
, *strings
);
311 if (stringSpan
== NULL
) { // Check for memory allocation error.
318 setPattern(UnicodeString(o
.pat
, o
.patLen
));
324 * Returns a copy of this object. All UnicodeMatcher objects have
325 * to support cloning in order to allow classes using
326 * UnicodeMatchers, such as Transliterator, to implement cloning.
328 UnicodeFunctor
* UnicodeSet::clone() const {
329 return new UnicodeSet(*this);
332 UnicodeFunctor
*UnicodeSet::cloneAsThawed() const {
333 return new UnicodeSet(*this, TRUE
);
337 * Compares the specified object with this set for equality. Returns
338 * <tt>true</tt> if the two sets
339 * have the same size, and every member of the specified set is
340 * contained in this set (or equivalently, every member of this set is
341 * contained in the specified set).
343 * @param o set to be compared for equality with this set.
344 * @return <tt>true</tt> if the specified set is equal to this set.
346 UBool
UnicodeSet::operator==(const UnicodeSet
& o
) const {
347 if (len
!= o
.len
) return FALSE
;
348 for (int32_t i
= 0; i
< len
; ++i
) {
349 if (list
[i
] != o
.list
[i
]) return FALSE
;
351 if (*strings
!= *o
.strings
) return FALSE
;
356 * Returns the hash code value for this set.
358 * @return the hash code value for this set.
359 * @see Object#hashCode()
361 int32_t UnicodeSet::hashCode(void) const {
362 int32_t result
= len
;
363 for (int32_t i
= 0; i
< len
; ++i
) {
370 //----------------------------------------------------------------
372 //----------------------------------------------------------------
375 * Returns the number of elements in this set (its cardinality),
376 * Note than the elements of a set may include both individual
377 * codepoints and strings.
379 * @return the number of elements in this set (its cardinality).
381 int32_t UnicodeSet::size(void) const {
383 int32_t count
= getRangeCount();
384 for (int32_t i
= 0; i
< count
; ++i
) {
385 n
+= getRangeEnd(i
) - getRangeStart(i
) + 1;
387 return n
+ strings
->size();
391 * Returns <tt>true</tt> if this set contains no elements.
393 * @return <tt>true</tt> if this set contains no elements.
395 UBool
UnicodeSet::isEmpty(void) const {
396 return len
== 1 && strings
->size() == 0;
400 * Returns true if this set contains the given character.
401 * @param c character to be checked for containment
402 * @return true if the test condition is met
404 UBool
UnicodeSet::contains(UChar32 c
) const {
405 // Set i to the index of the start item greater than ch
406 // We know we will terminate without length test!
407 // LATER: for large sets, add binary search
410 // if (c < list[++i]) break;
412 if (bmpSet
!= NULL
) {
413 return bmpSet
->contains(c
);
415 if (stringSpan
!= NULL
) {
416 return stringSpan
->contains(c
);
418 if (c
>= UNICODESET_HIGH
) { // Don't need to check LOW bound
421 int32_t i
= findCodePoint(c
);
422 return (UBool
)(i
& 1); // return true if odd
426 * Returns the smallest value i such that c < list[i]. Caller
427 * must ensure that c is a legal value or this method will enter
428 * an infinite loop. This method performs a binary search.
429 * @param c a character in the range MIN_VALUE..MAX_VALUE
431 * @return the smallest integer i in the range 0..len-1,
432 * inclusive, such that c < list[i]
434 int32_t UnicodeSet::findCodePoint(UChar32 c
) const {
437 set list[] c=0 1 3 4 7 8
438 === ============== ===========
439 [] [110000] 0 0 0 0 0 0
440 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
441 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
442 [:Any:] [0, 110000] 1 1 1 1 1 1
445 // Return the smallest i such that c < list[i]. Assume
446 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
449 // High runner test. c is often after the last range, so an
450 // initial check for this condition pays off.
452 int32_t hi
= len
- 1;
453 if (lo
>= hi
|| c
>= list
[hi
-1])
455 // invariant: c >= list[lo]
456 // invariant: c < list[hi]
458 int32_t i
= (lo
+ hi
) >> 1;
461 } else if (c
< list
[i
]) {
471 * Returns true if this set contains every character
472 * of the given range.
473 * @param start first character, inclusive, of the range
474 * @param end last character, inclusive, of the range
475 * @return true if the test condition is met
477 UBool
UnicodeSet::contains(UChar32 start
, UChar32 end
) const {
480 // if (start < list[++i]) break;
482 int32_t i
= findCodePoint(start
);
483 return ((i
& 1) != 0 && end
< list
[i
]);
487 * Returns <tt>true</tt> if this set contains the given
488 * multicharacter string.
489 * @param s string to be checked for containment
490 * @return <tt>true</tt> if this set contains the specified string
492 UBool
UnicodeSet::contains(const UnicodeString
& s
) const {
493 if (s
.length() == 0) return FALSE
;
494 int32_t cp
= getSingleCP(s
);
496 return strings
->contains((void*) &s
);
498 return contains((UChar32
) cp
);
503 * Returns true if this set contains all the characters and strings
505 * @param c set to be checked for containment
506 * @return true if the test condition is met
508 UBool
UnicodeSet::containsAll(const UnicodeSet
& c
) const {
509 // The specified set is a subset if all of its pairs are contained in
510 // this set. It's possible to code this more efficiently in terms of
511 // direct manipulation of the inversion lists if the need arises.
512 int32_t n
= c
.getRangeCount();
513 for (int i
=0; i
<n
; ++i
) {
514 if (!contains(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
518 if (!strings
->containsAll(*c
.strings
)) return FALSE
;
523 * Returns true if this set contains all the characters
524 * of the given string.
525 * @param s string containing characters to be checked for containment
526 * @return true if the test condition is met
528 UBool
UnicodeSet::containsAll(const UnicodeString
& s
) const {
529 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_CONTAINED
) ==
534 * Returns true if this set contains none of the characters
535 * of the given range.
536 * @param start first character, inclusive, of the range
537 * @param end last character, inclusive, of the range
538 * @return true if the test condition is met
540 UBool
UnicodeSet::containsNone(UChar32 start
, UChar32 end
) const {
543 // if (start < list[++i]) break;
545 int32_t i
= findCodePoint(start
);
546 return ((i
& 1) == 0 && end
< list
[i
]);
550 * Returns true if this set contains none of the characters and strings
552 * @param c set to be checked for containment
553 * @return true if the test condition is met
555 UBool
UnicodeSet::containsNone(const UnicodeSet
& c
) const {
556 // The specified set is a subset if all of its pairs are contained in
557 // this set. It's possible to code this more efficiently in terms of
558 // direct manipulation of the inversion lists if the need arises.
559 int32_t n
= c
.getRangeCount();
560 for (int32_t i
=0; i
<n
; ++i
) {
561 if (!containsNone(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
565 if (!strings
->containsNone(*c
.strings
)) return FALSE
;
570 * Returns true if this set contains none of the characters
571 * of the given string.
572 * @param s string containing characters to be checked for containment
573 * @return true if the test condition is met
575 UBool
UnicodeSet::containsNone(const UnicodeString
& s
) const {
576 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_NOT_CONTAINED
) ==
581 * Returns <tt>true</tt> if this set contains any character whose low byte
582 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
585 UBool
UnicodeSet::matchesIndexValue(uint8_t v
) const {
586 /* The index value v, in the range [0,255], is contained in this set if
587 * it is contained in any pair of this set. Pairs either have the high
588 * bytes equal, or unequal. If the high bytes are equal, then we have
589 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
590 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
591 * Then v is contained if xx <= v || v <= yy. (This is identical to the
592 * time zone month containment logic.)
595 int32_t rangeCount
=getRangeCount();
596 for (i
=0; i
<rangeCount
; ++i
) {
597 UChar32 low
= getRangeStart(i
);
598 UChar32 high
= getRangeEnd(i
);
599 if ((low
& ~0xFF) == (high
& ~0xFF)) {
600 if ((low
& 0xFF) <= v
&& v
<= (high
& 0xFF)) {
603 } else if ((low
& 0xFF) <= v
|| v
<= (high
& 0xFF)) {
607 if (strings
->size() != 0) {
608 for (i
=0; i
<strings
->size(); ++i
) {
609 const UnicodeString
& s
= *(const UnicodeString
*)strings
->elementAt(i
);
610 //if (s.length() == 0) {
611 // // Empty strings match everything
614 // assert(s.length() != 0); // We enforce this elsewhere
615 UChar32 c
= s
.char32At(0);
616 if ((c
& 0xFF) == v
) {
625 * Implementation of UnicodeMatcher::matches(). Always matches the
626 * longest possible multichar string.
628 UMatchDegree
UnicodeSet::matches(const Replaceable
& text
,
632 if (offset
== limit
) {
633 // Strings, if any, have length != 0, so we don't worry
634 // about them here. If we ever allow zero-length strings
635 // we much check for them here.
636 if (contains(U_ETHER
)) {
637 return incremental
? U_PARTIAL_MATCH
: U_MATCH
;
642 if (strings
->size() != 0) { // try strings first
644 // might separate forward and backward loops later
645 // for now they are combined
647 // TODO Improve efficiency of this, at least in the forward
648 // direction, if not in both. In the forward direction we
649 // can assume the strings are sorted.
652 UBool forward
= offset
< limit
;
654 // firstChar is the leftmost char to match in the
655 // forward direction or the rightmost char to match in
656 // the reverse direction.
657 UChar firstChar
= text
.charAt(offset
);
659 // If there are multiple strings that can match we
660 // return the longest match.
661 int32_t highWaterLength
= 0;
663 for (i
=0; i
<strings
->size(); ++i
) {
664 const UnicodeString
& trial
= *(const UnicodeString
*)strings
->elementAt(i
);
666 //if (trial.length() == 0) {
667 // return U_MATCH; // null-string always matches
669 // assert(trial.length() != 0); // We ensure this elsewhere
671 UChar c
= trial
.charAt(forward
? 0 : trial
.length() - 1);
673 // Strings are sorted, so we can optimize in the
674 // forward direction.
675 if (forward
&& c
> firstChar
) break;
676 if (c
!= firstChar
) continue;
678 int32_t matchLen
= matchRest(text
, offset
, limit
, trial
);
681 int32_t maxLen
= forward
? limit
-offset
: offset
-limit
;
682 if (matchLen
== maxLen
) {
683 // We have successfully matched but only up to limit.
684 return U_PARTIAL_MATCH
;
688 if (matchLen
== trial
.length()) {
689 // We have successfully matched the whole string.
690 if (matchLen
> highWaterLength
) {
691 highWaterLength
= matchLen
;
693 // In the forward direction we know strings
694 // are sorted so we can bail early.
695 if (forward
&& matchLen
< highWaterLength
) {
702 // We've checked all strings without a partial match.
703 // If we have full matches, return the longest one.
704 if (highWaterLength
!= 0) {
705 offset
+= forward
? highWaterLength
: -highWaterLength
;
709 return UnicodeFilter::matches(text
, offset
, limit
, incremental
);
714 * Returns the longest match for s in text at the given position.
715 * If limit > start then match forward from start+1 to limit
716 * matching all characters except s.charAt(0). If limit < start,
717 * go backward starting from start-1 matching all characters
718 * except s.charAt(s.length()-1). This method assumes that the
719 * first character, text.charAt(start), matches s, so it does not
721 * @param text the text to match
722 * @param start the first character to match. In the forward
723 * direction, text.charAt(start) is matched against s.charAt(0).
724 * In the reverse direction, it is matched against
725 * s.charAt(s.length()-1).
726 * @param limit the limit offset for matching, either last+1 in
727 * the forward direction, or last-1 in the reverse direction,
728 * where last is the index of the last character to match.
729 * @return If part of s matches up to the limit, return |limit -
730 * start|. If all of s matches before reaching the limit, return
731 * s.length(). If there is a mismatch between s and text, return
734 int32_t UnicodeSet::matchRest(const Replaceable
& text
,
735 int32_t start
, int32_t limit
,
736 const UnicodeString
& s
) {
739 int32_t slen
= s
.length();
741 maxLen
= limit
- start
;
742 if (maxLen
> slen
) maxLen
= slen
;
743 for (i
= 1; i
< maxLen
; ++i
) {
744 if (text
.charAt(start
+ i
) != s
.charAt(i
)) return 0;
747 maxLen
= start
- limit
;
748 if (maxLen
> slen
) maxLen
= slen
;
749 --slen
; // <=> slen = s.length() - 1;
750 for (i
= 1; i
< maxLen
; ++i
) {
751 if (text
.charAt(start
- i
) != s
.charAt(slen
- i
)) return 0;
758 * Implement of UnicodeMatcher
760 void UnicodeSet::addMatchSetTo(UnicodeSet
& toUnionTo
) const {
761 toUnionTo
.addAll(*this);
765 * Returns the index of the given character within this set, where
766 * the set is ordered by ascending code point. If the character
767 * is not in this set, return -1. The inverse of this method is
768 * <code>charAt()</code>.
769 * @return an index from 0..size()-1, or -1
771 int32_t UnicodeSet::indexOf(UChar32 c
) const {
772 if (c
< MIN_VALUE
|| c
> MAX_VALUE
) {
778 UChar32 start
= list
[i
++];
782 UChar32 limit
= list
[i
++];
784 return n
+ c
- start
;
791 * Returns the character at the given index within this set, where
792 * the set is ordered by ascending code point. If the index is
793 * out of range, return (UChar32)-1. The inverse of this method is
794 * <code>indexOf()</code>.
795 * @param index an index from 0..size()-1
796 * @return the character at the given index, or (UChar32)-1.
798 UChar32
UnicodeSet::charAt(int32_t index
) const {
800 // len2 is the largest even integer <= len, that is, it is len
801 // for even values and len-1 for odd values. With odd values
802 // the last entry is UNICODESET_HIGH.
803 int32_t len2
= len
& ~1;
804 for (int32_t i
=0; i
< len2
;) {
805 UChar32 start
= list
[i
++];
806 int32_t count
= list
[i
++] - start
;
808 return (UChar32
)(start
+ index
);
817 * Make this object represent the range <code>start - end</code>.
818 * If <code>end > start</code> then this object is set to an
821 * @param start first character in the set, inclusive
822 * @rparam end last character in the set, inclusive
824 UnicodeSet
& UnicodeSet::set(UChar32 start
, UChar32 end
) {
826 complement(start
, end
);
831 * Adds the specified range to this set if it is not already
832 * present. If this set already contains the specified range,
833 * the call leaves this set unchanged. If <code>end > start</code>
834 * then an empty range is added, leaving the set unchanged.
836 * @param start first character, inclusive, of range to be added
838 * @param end last character, inclusive, of range to be added
841 UnicodeSet
& UnicodeSet::add(UChar32 start
, UChar32 end
) {
842 if (pinCodePoint(start
) < pinCodePoint(end
)) {
843 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
845 } else if (start
== end
) {
851 // #define DEBUG_US_ADD
855 void dump(UChar32 c
) {
857 printf("%c", (char)c
);
862 void dump(const UChar32
* list
, int32_t len
) {
864 for (int32_t i
=0; i
<len
; ++i
) {
865 if (i
!= 0) printf(", ");
873 * Adds the specified character to this set if it is not already
874 * present. If this set already contains the specified character,
875 * the call leaves this set unchanged.
877 UnicodeSet
& UnicodeSet::add(UChar32 c
) {
878 // find smallest i such that c < list[i]
879 // if odd, then it is IN the set
880 // if even, then it is OUT of the set
881 int32_t i
= findCodePoint(pinCodePoint(c
));
884 if ((i
& 1) != 0 || isFrozen() || isBogus()) return *this;
887 // assert(list[len-1] == HIGH);
890 // [start_0, limit_0, start_1, limit_1, HIGH]
892 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
896 // i == 0 means c is before the first range
901 printf(" found at %d", i
);
907 if (c
== list
[i
]-1) {
908 // c is before start of next range
910 // if we touched the HIGH mark, then add a new one
911 if (c
== (UNICODESET_HIGH
- 1)) {
912 UErrorCode status
= U_ZERO_ERROR
;
913 ensureCapacity(len
+1, status
);
914 if (U_FAILURE(status
)) {
915 return *this; // There is no way to report this error :-(
917 list
[len
++] = UNICODESET_HIGH
;
919 if (i
> 0 && c
== list
[i
-1]) {
920 // collapse adjacent ranges
922 // [..., start_k-1, c, c, limit_k, ..., HIGH]
926 //for (int32_t k=i-1; k<len-2; ++k) {
927 // list[k] = list[k+2];
929 UChar32
* dst
= list
+ i
- 1;
930 UChar32
* src
= dst
+ 2;
931 UChar32
* srclimit
= list
+ len
;
932 while (src
< srclimit
) *(dst
++) = *(src
++);
938 else if (i
> 0 && c
== list
[i
-1]) {
939 // c is after end of prior range
941 // no need to check for collapse here
945 // At this point we know the new char is not adjacent to
946 // any existing ranges, and it is not 10FFFF.
949 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
953 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
957 UErrorCode status
= U_ZERO_ERROR
;
958 ensureCapacity(len
+2, status
);
959 if (U_FAILURE(status
)) {
960 return *this; // There is no way to report this error :-(
963 //for (int32_t k=len-1; k>=i; --k) {
964 // list[k+2] = list[k];
966 UChar32
* src
= list
+ len
;
967 UChar32
* dst
= src
+ 2;
968 UChar32
* srclimit
= list
+ i
;
969 while (src
> srclimit
) *(--dst
) = *(--src
);
980 for (i
=1; i
<len
; ++i
) {
981 if (list
[i
] <= list
[i
-1]) {
983 printf("ERROR: list has been corrupted\n");
994 * Adds the specified multicharacter to this set if it is not already
995 * present. If this set already contains the multicharacter,
996 * the call leaves this set unchanged.
997 * Thus "ch" => {"ch"}
998 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
999 * @param s the source string
1000 * @return the modified set, for chaining
1002 UnicodeSet
& UnicodeSet::add(const UnicodeString
& s
) {
1003 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1004 int32_t cp
= getSingleCP(s
);
1006 if (!strings
->contains((void*) &s
)) {
1017 * Adds the given string, in order, to 'strings'. The given string
1018 * must have been checked by the caller to not be empty and to not
1019 * already be in 'strings'.
1021 void UnicodeSet::_add(const UnicodeString
& s
) {
1022 if (isFrozen() || isBogus()) {
1025 UnicodeString
* t
= new UnicodeString(s
);
1026 if (t
== NULL
) { // Check for memory allocation error.
1030 UErrorCode ec
= U_ZERO_ERROR
;
1031 strings
->sortedInsert(t
, compareUnicodeString
, ec
);
1032 if (U_FAILURE(ec
)) {
1039 * @return a code point IF the string consists of a single one.
1040 * otherwise returns -1.
1041 * @param string to test
1043 int32_t UnicodeSet::getSingleCP(const UnicodeString
& s
) {
1044 //if (s.length() < 1) {
1045 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1047 if (s
.length() > 2) return -1;
1048 if (s
.length() == 1) return s
.charAt(0);
1050 // at this point, len = 2
1051 UChar32 cp
= s
.char32At(0);
1052 if (cp
> 0xFFFF) { // is surrogate pair
1059 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1060 * If this set already any particular character, it has no effect on that character.
1061 * @param the source string
1062 * @return the modified set, for chaining
1064 UnicodeSet
& UnicodeSet::addAll(const UnicodeString
& s
) {
1066 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1074 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1075 * If this set already any particular character, it has no effect on that character.
1076 * @param the source string
1077 * @return the modified set, for chaining
1079 UnicodeSet
& UnicodeSet::retainAll(const UnicodeString
& s
) {
1087 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1088 * If this set already any particular character, it has no effect on that character.
1089 * @param the source string
1090 * @return the modified set, for chaining
1092 UnicodeSet
& UnicodeSet::complementAll(const UnicodeString
& s
) {
1100 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1101 * If this set already any particular character, it has no effect on that character.
1102 * @param the source string
1103 * @return the modified set, for chaining
1105 UnicodeSet
& UnicodeSet::removeAll(const UnicodeString
& s
) {
1112 UnicodeSet
& UnicodeSet::removeAllStrings() {
1113 strings
->removeAllElements();
1119 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1120 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1121 * @param the source string
1122 * @return a newly created set containing the given string
1124 UnicodeSet
* U_EXPORT2
UnicodeSet::createFrom(const UnicodeString
& s
) {
1125 UnicodeSet
*set
= new UnicodeSet();
1126 if (set
!= NULL
) { // Check for memory allocation error.
1134 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1135 * @param the source string
1136 * @return a newly created set containing the given characters
1138 UnicodeSet
* U_EXPORT2
UnicodeSet::createFromAll(const UnicodeString
& s
) {
1139 UnicodeSet
*set
= new UnicodeSet();
1140 if (set
!= NULL
) { // Check for memory allocation error.
1147 * Retain only the elements in this set that are contained in the
1148 * specified range. If <code>end > start</code> then an empty range is
1149 * retained, leaving the set empty.
1151 * @param start first character, inclusive, of range to be retained
1153 * @param end last character, inclusive, of range to be retained
1156 UnicodeSet
& UnicodeSet::retain(UChar32 start
, UChar32 end
) {
1157 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1158 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1159 retain(range
, 2, 0);
1166 UnicodeSet
& UnicodeSet::retain(UChar32 c
) {
1167 return retain(c
, c
);
1171 * Removes the specified range from this set if it is present.
1172 * The set will not contain the specified range once the call
1173 * returns. If <code>end > start</code> then an empty range is
1174 * removed, leaving the set unchanged.
1176 * @param start first character, inclusive, of range to be removed
1178 * @param end last character, inclusive, of range to be removed
1181 UnicodeSet
& UnicodeSet::remove(UChar32 start
, UChar32 end
) {
1182 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1183 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1184 retain(range
, 2, 2);
1190 * Removes the specified character from this set if it is present.
1191 * The set will not contain the specified range once the call
1194 UnicodeSet
& UnicodeSet::remove(UChar32 c
) {
1195 return remove(c
, c
);
1199 * Removes the specified string from this set if it is present.
1200 * The set will not contain the specified character once the call
1202 * @param the source string
1203 * @return the modified set, for chaining
1205 UnicodeSet
& UnicodeSet::remove(const UnicodeString
& s
) {
1206 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1207 int32_t cp
= getSingleCP(s
);
1209 strings
->removeElement((void*) &s
);
1212 remove((UChar32
)cp
, (UChar32
)cp
);
1218 * Complements the specified range in this set. Any character in
1219 * the range will be removed if it is in this set, or will be
1220 * added if it is not in this set. If <code>end > start</code>
1221 * then an empty range is xor'ed, leaving the set unchanged.
1223 * @param start first character, inclusive, of range to be removed
1225 * @param end last character, inclusive, of range to be removed
1228 UnicodeSet
& UnicodeSet::complement(UChar32 start
, UChar32 end
) {
1229 if (isFrozen() || isBogus()) {
1232 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1233 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1234 exclusiveOr(range
, 2, 0);
1240 UnicodeSet
& UnicodeSet::complement(UChar32 c
) {
1241 return complement(c
, c
);
1245 * This is equivalent to
1246 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1248 UnicodeSet
& UnicodeSet::complement(void) {
1249 if (isFrozen() || isBogus()) {
1252 UErrorCode status
= U_ZERO_ERROR
;
1253 if (list
[0] == UNICODESET_LOW
) {
1254 ensureBufferCapacity(len
-1, status
);
1255 if (U_FAILURE(status
)) {
1258 uprv_memcpy(buffer
, list
+ 1, (size_t)(len
-1)*sizeof(UChar32
));
1261 ensureBufferCapacity(len
+1, status
);
1262 if (U_FAILURE(status
)) {
1265 uprv_memcpy(buffer
+ 1, list
, (size_t)len
*sizeof(UChar32
));
1266 buffer
[0] = UNICODESET_LOW
;
1275 * Complement the specified string in this set.
1276 * The set will not contain the specified string once the call
1278 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1279 * @param s the string to complement
1280 * @return this object, for chaining
1282 UnicodeSet
& UnicodeSet::complement(const UnicodeString
& s
) {
1283 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1284 int32_t cp
= getSingleCP(s
);
1286 if (strings
->contains((void*) &s
)) {
1287 strings
->removeElement((void*) &s
);
1293 complement((UChar32
)cp
, (UChar32
)cp
);
1299 * Adds all of the elements in the specified set to this set if
1300 * they're not already present. This operation effectively
1301 * modifies this set so that its value is the <i>union</i> of the two
1302 * sets. The behavior of this operation is unspecified if the specified
1303 * collection is modified while the operation is in progress.
1305 * @param c set whose elements are to be added to this set.
1306 * @see #add(char, char)
1308 UnicodeSet
& UnicodeSet::addAll(const UnicodeSet
& c
) {
1309 if ( c
.len
>0 && c
.list
!=NULL
) {
1310 add(c
.list
, c
.len
, 0);
1313 // Add strings in order
1314 if ( c
.strings
!=NULL
) {
1315 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1316 const UnicodeString
* s
= (const UnicodeString
*)c
.strings
->elementAt(i
);
1317 if (!strings
->contains((void*) s
)) {
1326 * Retains only the elements in this set that are contained in the
1327 * specified set. In other words, removes from this set all of
1328 * its elements that are not contained in the specified set. This
1329 * operation effectively modifies this set so that its value is
1330 * the <i>intersection</i> of the two sets.
1332 * @param c set that defines which elements this set will retain.
1334 UnicodeSet
& UnicodeSet::retainAll(const UnicodeSet
& c
) {
1335 if (isFrozen() || isBogus()) {
1338 retain(c
.list
, c
.len
, 0);
1339 strings
->retainAll(*c
.strings
);
1344 * Removes from this set all of its elements that are contained in the
1345 * specified set. This operation effectively modifies this
1346 * set so that its value is the <i>asymmetric set difference</i> of
1349 * @param c set that defines which elements will be removed from
1352 UnicodeSet
& UnicodeSet::removeAll(const UnicodeSet
& c
) {
1353 if (isFrozen() || isBogus()) {
1356 retain(c
.list
, c
.len
, 2);
1357 strings
->removeAll(*c
.strings
);
1362 * Complements in this set all elements contained in the specified
1363 * set. Any character in the other set will be removed if it is
1364 * in this set, or will be added if it is not in this set.
1366 * @param c set that defines which elements will be xor'ed from
1369 UnicodeSet
& UnicodeSet::complementAll(const UnicodeSet
& c
) {
1370 if (isFrozen() || isBogus()) {
1373 exclusiveOr(c
.list
, c
.len
, 0);
1375 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1376 void* e
= c
.strings
->elementAt(i
);
1377 if (!strings
->removeElement(e
)) {
1378 _add(*(const UnicodeString
*)e
);
1385 * Removes all of the elements from this set. This set will be
1386 * empty after this call returns.
1388 UnicodeSet
& UnicodeSet::clear(void) {
1393 list
[0] = UNICODESET_HIGH
;
1397 if (strings
!= NULL
) {
1398 strings
->removeAllElements();
1400 if (list
!= NULL
&& strings
!= NULL
) {
1408 * Iteration method that returns the number of ranges contained in
1410 * @see #getRangeStart
1413 int32_t UnicodeSet::getRangeCount() const {
1418 * Iteration method that returns the first character in the
1419 * specified range of this set.
1420 * @see #getRangeCount
1423 UChar32
UnicodeSet::getRangeStart(int32_t index
) const {
1424 return list
[index
*2];
1428 * Iteration method that returns the last character in the
1429 * specified range of this set.
1430 * @see #getRangeStart
1433 UChar32
UnicodeSet::getRangeEnd(int32_t index
) const {
1434 return list
[index
*2 + 1] - 1;
1437 int32_t UnicodeSet::getStringCount() const {
1438 return strings
->size();
1441 const UnicodeString
* UnicodeSet::getString(int32_t index
) const {
1442 return (const UnicodeString
*) strings
->elementAt(index
);
1446 * Reallocate this objects internal structures to take up the least
1447 * possible space, without changing this object's value.
1449 UnicodeSet
& UnicodeSet::compact() {
1450 if (isFrozen() || isBogus()) {
1453 // Delete buffer first to defragment memory less.
1454 if (buffer
!= NULL
) {
1458 if (len
< capacity
) {
1459 // Make the capacity equal to len or 1.
1460 // We don't want to realloc of 0 size.
1461 int32_t newCapacity
= len
+ (len
== 0);
1462 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * newCapacity
);
1465 capacity
= newCapacity
;
1467 // else what the heck happened?! We allocated less memory!
1468 // Oh well. We'll keep our original array.
1473 #ifdef DEBUG_SERIALIZE
1478 * Deserialize constructor.
1480 UnicodeSet::UnicodeSet(const uint16_t data
[], int32_t dataLen
, ESerialization serialization
, UErrorCode
&ec
)
1481 : len(1), capacity(1+START_EXTRA
), list(0), bmpSet(0), buffer(0),
1482 bufferCapacity(0), patLen(0), pat(NULL
), strings(NULL
), stringSpan(NULL
),
1490 if( (serialization
!= kSerialized
)
1493 ec
= U_ILLEGAL_ARGUMENT_ERROR
;
1498 allocateStrings(ec
);
1499 if (U_FAILURE(ec
)) {
1505 int32_t headerSize
= ((data
[0]&0x8000)) ?2:1;
1506 int32_t bmpLength
= (headerSize
==1)?data
[0]:data
[1];
1508 len
= (((data
[0]&0x7FFF)-bmpLength
)/2)+bmpLength
;
1509 #ifdef DEBUG_SERIALIZE
1510 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen
,headerSize
,bmpLength
,len
, data
[0],data
[1],data
[2],data
[3]);
1513 list
= (UChar32
*) uprv_malloc(sizeof(UChar32
) * capacity
);
1514 if(!list
|| U_FAILURE(ec
)) {
1520 for(i
= 0; i
< bmpLength
;i
++) {
1521 list
[i
] = data
[i
+headerSize
];
1522 #ifdef DEBUG_SERIALIZE
1523 printf("<<16@%d[%d] %X\n", i
+headerSize
, i
, list
[i
]);
1527 for(i
=bmpLength
;i
<len
;i
++) {
1528 list
[i
] = ((UChar32
)data
[headerSize
+bmpLength
+(i
-bmpLength
)*2+0] << 16) +
1529 ((UChar32
)data
[headerSize
+bmpLength
+(i
-bmpLength
)*2+1]);
1530 #ifdef DEBUG_SERIALIZE
1531 printf("<<32@%d+[%d] %lX\n", headerSize
+bmpLength
+i
, i
, list
[i
]);
1535 list
[len
++]=UNICODESET_HIGH
;
1539 int32_t UnicodeSet::serialize(uint16_t *dest
, int32_t destCapacity
, UErrorCode
& ec
) const {
1540 int32_t bmpLength
, length
, destLength
;
1542 if (U_FAILURE(ec
)) {
1546 if (destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1547 ec
=U_ILLEGAL_ARGUMENT_ERROR
;
1551 /* count necessary 16-bit units */
1552 length
=this->len
-1; // Subtract 1 to ignore final UNICODESET_HIGH
1553 // assert(length>=0);
1556 if (destCapacity
>0) {
1559 ec
=U_BUFFER_OVERFLOW_ERROR
;
1565 if (this->list
[length
-1]<=0xffff) {
1568 } else if (this->list
[0]>=0x10000) {
1569 /* all supplementary */
1573 /* some BMP, some supplementary */
1574 for (bmpLength
=0; bmpLength
<length
&& this->list
[bmpLength
]<=0xffff; ++bmpLength
) {}
1575 length
=bmpLength
+2*(length
-bmpLength
);
1577 #ifdef DEBUG_SERIALIZE
1578 printf(">> bmpLength%d length%d len%d\n", bmpLength
, length
, len
);
1580 /* length: number of 16-bit array units */
1581 if (length
>0x7fff) {
1582 /* there are only 15 bits for the length in the first serialized word */
1583 ec
=U_INDEX_OUTOFBOUNDS_ERROR
;
1588 * total serialized length:
1589 * number of 16-bit array units (length) +
1590 * 1 length unit (always) +
1591 * 1 bmpLength unit (if there are supplementary values)
1593 destLength
=length
+((length
>bmpLength
)?2:1);
1594 if (destLength
<=destCapacity
) {
1598 #ifdef DEBUG_SERIALIZE
1599 printf("writeHdr\n");
1601 *dest
=(uint16_t)length
;
1602 if (length
>bmpLength
) {
1604 *++dest
=(uint16_t)bmpLength
;
1608 /* write the BMP part of the array */
1610 for (i
=0; i
<bmpLength
; ++i
) {
1611 #ifdef DEBUG_SERIALIZE
1612 printf("writebmp: %x\n", (int)*p
);
1614 *dest
++=(uint16_t)*p
++;
1617 /* write the supplementary part of the array */
1618 for (; i
<length
; i
+=2) {
1619 #ifdef DEBUG_SERIALIZE
1620 printf("write32: %x\n", (int)*p
);
1622 *dest
++=(uint16_t)(*p
>>16);
1623 *dest
++=(uint16_t)*p
++;
1626 ec
=U_BUFFER_OVERFLOW_ERROR
;
1631 //----------------------------------------------------------------
1632 // Implementation: Utility methods
1633 //----------------------------------------------------------------
1636 * Allocate our strings vector and return TRUE if successful.
1638 UBool
UnicodeSet::allocateStrings(UErrorCode
&status
) {
1639 if (U_FAILURE(status
)) {
1642 strings
= new UVector(uprv_deleteUObject
,
1643 uhash_compareUnicodeString
, 1, status
);
1644 if (strings
== NULL
) { // Check for memory allocation error.
1645 status
= U_MEMORY_ALLOCATION_ERROR
;
1648 if (U_FAILURE(status
)) {
1656 void UnicodeSet::ensureCapacity(int32_t newLen
, UErrorCode
& ec
) {
1657 if (newLen
<= capacity
)
1659 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1661 ec
= U_MEMORY_ALLOCATION_ERROR
;
1666 capacity
= newLen
+ GROW_EXTRA
;
1667 // else we keep the original contents on the memory failure.
1670 void UnicodeSet::ensureBufferCapacity(int32_t newLen
, UErrorCode
& ec
) {
1671 if (buffer
!= NULL
&& newLen
<= bufferCapacity
)
1673 UChar32
* temp
= (UChar32
*) uprv_realloc(buffer
, sizeof(UChar32
) * (newLen
+ GROW_EXTRA
));
1675 ec
= U_MEMORY_ALLOCATION_ERROR
;
1680 bufferCapacity
= newLen
+ GROW_EXTRA
;
1681 // else we keep the original contents on the memory failure.
1685 * Swap list and buffer.
1687 void UnicodeSet::swapBuffers(void) {
1688 // swap list and buffer
1689 UChar32
* temp
= list
;
1693 int32_t c
= capacity
;
1694 capacity
= bufferCapacity
;
1698 void UnicodeSet::setToBogus() {
1699 clear(); // Remove everything in the set.
1703 //----------------------------------------------------------------
1704 // Implementation: Fundamental operators
1705 //----------------------------------------------------------------
1707 static inline UChar32
max(UChar32 a
, UChar32 b
) {
1708 return (a
> b
) ? a
: b
;
1711 // polarity = 0, 3 is normal: x xor y
1712 // polarity = 1, 2: x xor ~y == x === y
1714 void UnicodeSet::exclusiveOr(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1715 if (isFrozen() || isBogus()) {
1718 UErrorCode status
= U_ZERO_ERROR
;
1719 ensureBufferCapacity(len
+ otherLen
, status
);
1720 if (U_FAILURE(status
)) {
1724 int32_t i
= 0, j
= 0, k
= 0;
1725 UChar32 a
= list
[i
++];
1727 if (polarity
== 1 || polarity
== 2) {
1729 if (other
[j
] == UNICODESET_LOW
) { // skip base if already LOW
1736 // simplest of all the routines
1737 // sort the values, discarding identicals!
1745 } else if (a
!= UNICODESET_HIGH
) { // at this point, a == b
1746 // discard both values!
1750 buffer
[k
++] = UNICODESET_HIGH
;
1759 // polarity = 0 is normal: x union y
1760 // polarity = 2: x union ~y
1761 // polarity = 1: ~x union y
1762 // polarity = 3: ~x union ~y
1764 void UnicodeSet::add(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1765 if (isFrozen() || isBogus() || other
==NULL
) {
1768 UErrorCode status
= U_ZERO_ERROR
;
1769 ensureBufferCapacity(len
+ otherLen
, status
);
1770 if (U_FAILURE(status
)) {
1774 int32_t i
= 0, j
= 0, k
= 0;
1775 UChar32 a
= list
[i
++];
1776 UChar32 b
= other
[j
++];
1777 // change from xor is that we have to check overlapping pairs
1778 // polarity bit 1 means a is second, bit 2 means b is.
1781 case 0: // both first; take lower if unequal
1782 if (a
< b
) { // take a
1783 // Back up over overlapping ranges in buffer[]
1784 if (k
> 0 && a
<= buffer
[k
-1]) {
1785 // Pick latter end value in buffer[] vs. list[]
1786 a
= max(list
[i
], buffer
[--k
]);
1792 i
++; // Common if/else code factored out
1794 } else if (b
< a
) { // take b
1795 if (k
> 0 && b
<= buffer
[k
-1]) {
1796 b
= max(other
[j
], buffer
[--k
]);
1803 } else { // a == b, take a, drop b
1804 if (a
== UNICODESET_HIGH
) goto loop_end
;
1805 // This is symmetrical; it doesn't matter if
1806 // we backtrack with a or b. - liu
1807 if (k
> 0 && a
<= buffer
[k
-1]) {
1808 a
= max(list
[i
], buffer
[--k
]);
1820 case 3: // both second; take higher if unequal, and drop other
1821 if (b
<= a
) { // take a
1822 if (a
== UNICODESET_HIGH
) goto loop_end
;
1825 if (b
== UNICODESET_HIGH
) goto loop_end
;
1829 polarity
^= 1; // factored common code
1833 case 1: // a second, b first; if b < a, overlap
1834 if (a
< b
) { // no overlap, take a
1835 buffer
[k
++] = a
; a
= list
[i
++]; polarity
^= 1;
1836 } else if (b
< a
) { // OVERLAP, drop b
1839 } else { // a == b, drop both!
1840 if (a
== UNICODESET_HIGH
) goto loop_end
;
1847 case 2: // a first, b second; if a < b, overlap
1848 if (b
< a
) { // no overlap, take b
1852 } else if (a
< b
) { // OVERLAP, drop a
1855 } else { // a == b, drop both!
1856 if (a
== UNICODESET_HIGH
) goto loop_end
;
1866 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1872 // polarity = 0 is normal: x intersect y
1873 // polarity = 2: x intersect ~y == set-minus
1874 // polarity = 1: ~x intersect y
1875 // polarity = 3: ~x intersect ~y
1877 void UnicodeSet::retain(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1878 if (isFrozen() || isBogus()) {
1881 UErrorCode status
= U_ZERO_ERROR
;
1882 ensureBufferCapacity(len
+ otherLen
, status
);
1883 if (U_FAILURE(status
)) {
1887 int32_t i
= 0, j
= 0, k
= 0;
1888 UChar32 a
= list
[i
++];
1889 UChar32 b
= other
[j
++];
1890 // change from xor is that we have to check overlapping pairs
1891 // polarity bit 1 means a is second, bit 2 means b is.
1894 case 0: // both first; drop the smaller
1895 if (a
< b
) { // drop a
1898 } else if (b
< a
) { // drop b
1901 } else { // a == b, take one, drop other
1902 if (a
== UNICODESET_HIGH
) goto loop_end
;
1910 case 3: // both second; take lower if unequal
1911 if (a
< b
) { // take a
1915 } else if (b
< a
) { // take b
1919 } else { // a == b, take one, drop other
1920 if (a
== UNICODESET_HIGH
) goto loop_end
;
1928 case 1: // a second, b first;
1929 if (a
< b
) { // NO OVERLAP, drop a
1932 } else if (b
< a
) { // OVERLAP, take b
1936 } else { // a == b, drop both!
1937 if (a
== UNICODESET_HIGH
) goto loop_end
;
1944 case 2: // a first, b second; if a < b, overlap
1945 if (b
< a
) { // no overlap, drop b
1948 } else if (a
< b
) { // OVERLAP, take a
1952 } else { // a == b, drop both!
1953 if (a
== UNICODESET_HIGH
) goto loop_end
;
1963 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1970 * Append the <code>toPattern()</code> representation of a
1971 * string to the given <code>StringBuffer</code>.
1973 void UnicodeSet::_appendToPat(UnicodeString
& buf
, const UnicodeString
& s
, UBool
1974 escapeUnprintable
) {
1976 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1977 _appendToPat(buf
, cp
= s
.char32At(i
), escapeUnprintable
);
1982 * Append the <code>toPattern()</code> representation of a
1983 * character to the given <code>StringBuffer</code>.
1985 void UnicodeSet::_appendToPat(UnicodeString
& buf
, UChar32 c
, UBool
1986 escapeUnprintable
) {
1987 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1988 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1990 if (ICU_Utility::escapeUnprintable(buf
, c
)) {
1994 // Okay to let ':' pass through
2005 case SymbolTable::SYMBOL_REF
:
2006 buf
.append(BACKSLASH
);
2009 // Escape whitespace
2010 if (PatternProps::isWhiteSpace(c
)) {
2011 buf
.append(BACKSLASH
);
2019 * Append a string representation of this set to result. This will be
2020 * a cleaned version of the string passed to applyPattern(), if there
2021 * is one. Otherwise it will be generated.
2023 UnicodeString
& UnicodeSet::_toPattern(UnicodeString
& result
,
2024 UBool escapeUnprintable
) const
2028 int32_t backslashCount
= 0;
2029 for (i
=0; i
<patLen
; ) {
2031 U16_NEXT(pat
, i
, patLen
, c
);
2032 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
2033 // If the unprintable character is preceded by an odd
2034 // number of backslashes, then it has been escaped.
2035 // Before unescaping it, we delete the final
2037 if ((backslashCount
% 2) == 1) {
2038 result
.truncate(result
.length() - 1);
2040 ICU_Utility::escapeUnprintable(result
, c
);
2044 if (c
== BACKSLASH
) {
2054 return _generatePattern(result
, escapeUnprintable
);
2058 * Returns a string representation of this set. If the result of
2059 * calling this function is passed to a UnicodeSet constructor, it
2060 * will produce another set that is equal to this one.
2062 UnicodeString
& UnicodeSet::toPattern(UnicodeString
& result
,
2063 UBool escapeUnprintable
) const
2066 return _toPattern(result
, escapeUnprintable
);
2070 * Generate and append a string representation of this set to result.
2071 * This does not use this.pat, the cleaned up copy of the string
2072 * passed to applyPattern().
2074 UnicodeString
& UnicodeSet::_generatePattern(UnicodeString
& result
,
2075 UBool escapeUnprintable
) const
2077 result
.append(SET_OPEN
);
2079 // // Check against the predefined categories. We implicitly build
2080 // // up ALL category sets the first time toPattern() is called.
2081 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2082 // if (*this == getCategorySet(cat)) {
2083 // result.append(COLON);
2084 // result.append(CATEGORY_NAMES, cat*2, 2);
2085 // return result.append(CATEGORY_CLOSE);
2089 int32_t count
= getRangeCount();
2091 // If the set contains at least 2 intervals and includes both
2092 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2093 // be more economical.
2095 getRangeStart(0) == MIN_VALUE
&&
2096 getRangeEnd(count
-1) == MAX_VALUE
) {
2099 result
.append(COMPLEMENT
);
2101 for (int32_t i
= 1; i
< count
; ++i
) {
2102 UChar32 start
= getRangeEnd(i
-1)+1;
2103 UChar32 end
= getRangeStart(i
)-1;
2104 _appendToPat(result
, start
, escapeUnprintable
);
2106 if ((start
+1) != end
) {
2107 result
.append(HYPHEN
);
2109 _appendToPat(result
, end
, escapeUnprintable
);
2114 // Default; emit the ranges as pairs
2116 for (int32_t i
= 0; i
< count
; ++i
) {
2117 UChar32 start
= getRangeStart(i
);
2118 UChar32 end
= getRangeEnd(i
);
2119 _appendToPat(result
, start
, escapeUnprintable
);
2121 if ((start
+1) != end
) {
2122 result
.append(HYPHEN
);
2124 _appendToPat(result
, end
, escapeUnprintable
);
2129 for (int32_t i
= 0; i
<strings
->size(); ++i
) {
2130 result
.append(OPEN_BRACE
);
2131 _appendToPat(result
,
2132 *(const UnicodeString
*) strings
->elementAt(i
),
2134 result
.append(CLOSE_BRACE
);
2136 return result
.append(SET_CLOSE
);
2140 * Release existing cached pattern
2142 void UnicodeSet::releasePattern() {
2151 * Set the new pattern to cache.
2153 void UnicodeSet::setPattern(const UnicodeString
& newPat
) {
2155 int32_t newPatLen
= newPat
.length();
2156 pat
= (UChar
*)uprv_malloc((newPatLen
+ 1) * sizeof(UChar
));
2159 newPat
.extractBetween(0, patLen
, pat
);
2162 // else we don't care if malloc failed. This was just a nice cache.
2163 // We can regenerate an equivalent pattern later when requested.
2166 UnicodeFunctor
*UnicodeSet::freeze() {
2167 if(!isFrozen() && !isBogus()) {
2168 // Do most of what compact() does before freezing because
2169 // compact() will not work when the set is frozen.
2170 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2172 // Delete buffer first to defragment memory less.
2173 if (buffer
!= NULL
) {
2177 if (capacity
> (len
+ GROW_EXTRA
)) {
2178 // Make the capacity equal to len or 1.
2179 // We don't want to realloc of 0 size.
2180 capacity
= len
+ (len
== 0);
2181 list
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * capacity
);
2182 if (list
== NULL
) { // Check for memory allocation error.
2188 // Optimize contains() and span() and similar functions.
2189 if (!strings
->isEmpty()) {
2190 stringSpan
= new UnicodeSetStringSpan(*this, *strings
, UnicodeSetStringSpan::ALL
);
2191 if (stringSpan
!= NULL
&& !stringSpan
->needsStringSpanUTF16()) {
2192 // All strings are irrelevant for span() etc. because
2193 // all of each string's code points are contained in this set.
2194 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2195 // many relevant strings as UTF-16.
2196 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2201 if (stringSpan
== NULL
) {
2202 // No span-relevant strings: Optimize for code point spans.
2203 bmpSet
=new BMPSet(list
, len
);
2204 if (bmpSet
== NULL
) { // Check for memory allocation error.
2212 int32_t UnicodeSet::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2213 if(length
>0 && bmpSet
!=NULL
) {
2214 return (int32_t)(bmpSet
->span(s
, s
+length
, spanCondition
)-s
);
2222 if(stringSpan
!=NULL
) {
2223 return stringSpan
->span(s
, length
, spanCondition
);
2224 } else if(!strings
->isEmpty()) {
2225 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2226 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED
:
2227 UnicodeSetStringSpan::FWD_UTF16_CONTAINED
;
2228 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2229 if(strSpan
.needsStringSpanUTF16()) {
2230 return strSpan
.span(s
, length
, spanCondition
);
2234 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2235 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2239 int32_t start
=0, prev
=0;
2241 U16_NEXT(s
, start
, length
, c
);
2242 if(spanCondition
!=contains(c
)) {
2245 } while((prev
=start
)<length
);
2249 int32_t UnicodeSet::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2250 if(length
>0 && bmpSet
!=NULL
) {
2251 return (int32_t)(bmpSet
->spanBack(s
, s
+length
, spanCondition
)-s
);
2259 if(stringSpan
!=NULL
) {
2260 return stringSpan
->spanBack(s
, length
, spanCondition
);
2261 } else if(!strings
->isEmpty()) {
2262 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2263 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED
:
2264 UnicodeSetStringSpan::BACK_UTF16_CONTAINED
;
2265 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2266 if(strSpan
.needsStringSpanUTF16()) {
2267 return strSpan
.spanBack(s
, length
, spanCondition
);
2271 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2272 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2276 int32_t prev
=length
;
2278 U16_PREV(s
, 0, length
, c
);
2279 if(spanCondition
!=contains(c
)) {
2282 } while((prev
=length
)>0);
2286 int32_t UnicodeSet::spanUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2287 if(length
>0 && bmpSet
!=NULL
) {
2288 const uint8_t *s0
=(const uint8_t *)s
;
2289 return (int32_t)(bmpSet
->spanUTF8(s0
, length
, spanCondition
)-s0
);
2292 length
=(int32_t)uprv_strlen(s
);
2297 if(stringSpan
!=NULL
) {
2298 return stringSpan
->spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2299 } else if(!strings
->isEmpty()) {
2300 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2301 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED
:
2302 UnicodeSetStringSpan::FWD_UTF8_CONTAINED
;
2303 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2304 if(strSpan
.needsStringSpanUTF8()) {
2305 return strSpan
.spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2309 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2310 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2314 int32_t start
=0, prev
=0;
2316 U8_NEXT_OR_FFFD(s
, start
, length
, c
);
2317 if(spanCondition
!=contains(c
)) {
2320 } while((prev
=start
)<length
);
2324 int32_t UnicodeSet::spanBackUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2325 if(length
>0 && bmpSet
!=NULL
) {
2326 const uint8_t *s0
=(const uint8_t *)s
;
2327 return bmpSet
->spanBackUTF8(s0
, length
, spanCondition
);
2330 length
=(int32_t)uprv_strlen(s
);
2335 if(stringSpan
!=NULL
) {
2336 return stringSpan
->spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2337 } else if(!strings
->isEmpty()) {
2338 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2339 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED
:
2340 UnicodeSetStringSpan::BACK_UTF8_CONTAINED
;
2341 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2342 if(strSpan
.needsStringSpanUTF8()) {
2343 return strSpan
.spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2347 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2348 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2352 int32_t prev
=length
;
2354 U8_PREV_OR_FFFD(s
, 0, length
, c
);
2355 if(spanCondition
!=contains(c
)) {
2358 } while((prev
=length
)>0);