1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 **********************************************************************
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/ustring.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
23 #include "patternprops.h"
31 #include "unisetspan.h"
33 // Define UChar constants using hex for EBCDIC compatibility
34 // Used #define to reduce private static exports and memory access time.
35 #define SET_OPEN ((UChar)0x005B) /*[*/
36 #define SET_CLOSE ((UChar)0x005D) /*]*/
37 #define HYPHEN ((UChar)0x002D) /*-*/
38 #define COMPLEMENT ((UChar)0x005E) /*^*/
39 #define COLON ((UChar)0x003A) /*:*/
40 #define BACKSLASH ((UChar)0x005C) /*\*/
41 #define INTERSECTION ((UChar)0x0026) /*&*/
42 #define UPPER_U ((UChar)0x0055) /*U*/
43 #define LOWER_U ((UChar)0x0075) /*u*/
44 #define OPEN_BRACE ((UChar)123) /*{*/
45 #define CLOSE_BRACE ((UChar)125) /*}*/
46 #define UPPER_P ((UChar)0x0050) /*P*/
47 #define LOWER_P ((UChar)0x0070) /*p*/
48 #define UPPER_N ((UChar)78) /*N*/
49 #define EQUALS ((UChar)0x003D) /*=*/
51 // HIGH_VALUE > all valid values. 110000 for codepoints
52 #define UNICODESET_HIGH 0x0110000
54 // LOW <= all valid values. ZERO for codepoints
55 #define UNICODESET_LOW 0x000000
57 /** Max list [0, 1, 2, ..., max code point, HIGH] */
58 constexpr int32_t MAX_LENGTH
= UNICODESET_HIGH
+ 1;
62 SymbolTable::~SymbolTable() {}
64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet
)
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
72 static inline UChar32
pinCodePoint(UChar32
& c
) {
73 if (c
< UNICODESET_LOW
) {
75 } else if (c
> (UNICODESET_HIGH
-1)) {
76 c
= (UNICODESET_HIGH
-1);
81 //----------------------------------------------------------------
83 //----------------------------------------------------------------
85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
86 // To enable the debugging, define the symbol DEBUG_MEM in the line
87 // below. This will result in text being sent to stdout that looks
89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91 // Each line lists a construction (ct) or destruction (dt) event, the
92 // object address, the number of outstanding objects after the event,
93 // and the pattern of the object in question.
99 static int32_t _dbgCount
= 0;
101 static inline void _dbgct(UnicodeSet
* set
) {
103 set
->toPattern(str
, TRUE
);
105 str
.extract(0, 39, buf
, "");
106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set
, ++_dbgCount
, buf
);
109 static inline void _dbgdt(UnicodeSet
* set
) {
111 set
->toPattern(str
, TRUE
);
113 str
.extract(0, 39, buf
, "");
114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set
, --_dbgCount
, buf
);
124 //----------------------------------------------------------------
125 // UnicodeString in UVector support
126 //----------------------------------------------------------------
128 static void U_CALLCONV
cloneUnicodeString(UElement
*dst
, UElement
*src
) {
129 dst
->pointer
= new UnicodeString(*(UnicodeString
*)src
->pointer
);
132 static int8_t U_CALLCONV
compareUnicodeString(UElement t1
, UElement t2
) {
133 const UnicodeString
&a
= *(const UnicodeString
*)t1
.pointer
;
134 const UnicodeString
&b
= *(const UnicodeString
*)t2
.pointer
;
138 UBool
UnicodeSet::hasStrings() const {
139 return strings
!= nullptr && !strings
->isEmpty();
142 int32_t UnicodeSet::stringsSize() const {
143 return strings
== nullptr ? 0 : strings
->size();
146 UBool
UnicodeSet::stringsContains(const UnicodeString
&s
) const {
147 return strings
!= nullptr && strings
->contains((void*) &s
);
150 //----------------------------------------------------------------
152 //----------------------------------------------------------------
155 * Constructs an empty set.
157 UnicodeSet::UnicodeSet() {
158 list
[0] = UNICODESET_HIGH
;
163 * Constructs a set containing the given range. If <code>end >
164 * start</code> then an empty set is created.
166 * @param start first character, inclusive, of range
167 * @param end last character, inclusive, of range
169 UnicodeSet::UnicodeSet(UChar32 start
, UChar32 end
) {
170 list
[0] = UNICODESET_HIGH
;
176 * Constructs a set that is identical to the given UnicodeSet.
178 UnicodeSet::UnicodeSet(const UnicodeSet
& o
) : UnicodeFilter(o
) {
183 // Copy-construct as thawed.
184 UnicodeSet::UnicodeSet(const UnicodeSet
& o
, UBool
/* asThawed */) : UnicodeFilter(o
) {
185 if (ensureCapacity(o
.len
)) {
186 // *this = o except for bmpSet and stringSpan
188 uprv_memcpy(list
, o
.list
, (size_t)len
*sizeof(UChar32
));
189 if (o
.hasStrings()) {
190 UErrorCode status
= U_ZERO_ERROR
;
191 if (!allocateStrings(status
) ||
192 (strings
->assign(*o
.strings
, cloneUnicodeString
, status
), U_FAILURE(status
))) {
198 setPattern(o
.pat
, o
.patLen
);
207 UnicodeSet::~UnicodeSet() {
208 _dbgdt(this); // first!
209 if (list
!= stackList
) {
213 if (buffer
!= stackList
) {
222 * Assigns this object to be a copy of another.
224 UnicodeSet
& UnicodeSet::operator=(const UnicodeSet
& o
) {
225 return copyFrom(o
, FALSE
);
228 UnicodeSet
& UnicodeSet::copyFrom(const UnicodeSet
& o
, UBool asThawed
) {
239 if (!ensureCapacity(o
.len
)) {
240 // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
244 uprv_memcpy(list
, o
.list
, (size_t)len
*sizeof(UChar32
));
245 if (o
.bmpSet
!= nullptr && !asThawed
) {
246 bmpSet
= new BMPSet(*o
.bmpSet
, list
, len
);
247 if (bmpSet
== NULL
) { // Check for memory allocation error.
252 if (o
.hasStrings()) {
253 UErrorCode status
= U_ZERO_ERROR
;
254 if ((strings
== nullptr && !allocateStrings(status
)) ||
255 (strings
->assign(*o
.strings
, cloneUnicodeString
, status
), U_FAILURE(status
))) {
259 } else if (hasStrings()) {
260 strings
->removeAllElements();
262 if (o
.stringSpan
!= nullptr && !asThawed
) {
263 stringSpan
= new UnicodeSetStringSpan(*o
.stringSpan
, *strings
);
264 if (stringSpan
== NULL
) { // Check for memory allocation error.
271 setPattern(o
.pat
, o
.patLen
);
277 * Returns a copy of this object. All UnicodeMatcher objects have
278 * to support cloning in order to allow classes using
279 * UnicodeMatchers, such as Transliterator, to implement cloning.
281 UnicodeSet
* UnicodeSet::clone() const {
282 return new UnicodeSet(*this);
285 UnicodeSet
*UnicodeSet::cloneAsThawed() const {
286 return new UnicodeSet(*this, TRUE
);
290 * Compares the specified object with this set for equality. Returns
291 * <tt>true</tt> if the two sets
292 * have the same size, and every member of the specified set is
293 * contained in this set (or equivalently, every member of this set is
294 * contained in the specified set).
296 * @param o set to be compared for equality with this set.
297 * @return <tt>true</tt> if the specified set is equal to this set.
299 UBool
UnicodeSet::operator==(const UnicodeSet
& o
) const {
300 if (len
!= o
.len
) return FALSE
;
301 for (int32_t i
= 0; i
< len
; ++i
) {
302 if (list
[i
] != o
.list
[i
]) return FALSE
;
304 if (hasStrings() != o
.hasStrings()) { return FALSE
; }
305 if (hasStrings() && *strings
!= *o
.strings
) return FALSE
;
310 * Returns the hash code value for this set.
312 * @return the hash code value for this set.
313 * @see Object#hashCode()
315 int32_t UnicodeSet::hashCode(void) const {
316 uint32_t result
= static_cast<uint32_t>(len
);
317 for (int32_t i
= 0; i
< len
; ++i
) {
321 return static_cast<int32_t>(result
);
324 //----------------------------------------------------------------
326 //----------------------------------------------------------------
329 * Returns the number of elements in this set (its cardinality),
330 * Note than the elements of a set may include both individual
331 * codepoints and strings.
333 * @return the number of elements in this set (its cardinality).
335 int32_t UnicodeSet::size(void) const {
337 int32_t count
= getRangeCount();
338 for (int32_t i
= 0; i
< count
; ++i
) {
339 n
+= getRangeEnd(i
) - getRangeStart(i
) + 1;
341 return n
+ stringsSize();
345 * Returns <tt>true</tt> if this set contains no elements.
347 * @return <tt>true</tt> if this set contains no elements.
349 UBool
UnicodeSet::isEmpty(void) const {
350 return len
== 1 && !hasStrings();
354 * Returns true if this set contains the given character.
355 * @param c character to be checked for containment
356 * @return true if the test condition is met
358 UBool
UnicodeSet::contains(UChar32 c
) const {
359 // Set i to the index of the start item greater than ch
360 // We know we will terminate without length test!
361 // LATER: for large sets, add binary search
364 // if (c < list[++i]) break;
366 if (bmpSet
!= NULL
) {
367 return bmpSet
->contains(c
);
369 if (stringSpan
!= NULL
) {
370 return stringSpan
->contains(c
);
372 if (c
>= UNICODESET_HIGH
) { // Don't need to check LOW bound
375 int32_t i
= findCodePoint(c
);
376 return (UBool
)(i
& 1); // return true if odd
380 * Returns the smallest value i such that c < list[i]. Caller
381 * must ensure that c is a legal value or this method will enter
382 * an infinite loop. This method performs a binary search.
383 * @param c a character in the range MIN_VALUE..MAX_VALUE
385 * @return the smallest integer i in the range 0..len-1,
386 * inclusive, such that c < list[i]
388 int32_t UnicodeSet::findCodePoint(UChar32 c
) const {
391 set list[] c=0 1 3 4 7 8
392 === ============== ===========
393 [] [110000] 0 0 0 0 0 0
394 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
395 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
396 [:Any:] [0, 110000] 1 1 1 1 1 1
399 // Return the smallest i such that c < list[i]. Assume
400 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
403 // High runner test. c is often after the last range, so an
404 // initial check for this condition pays off.
406 int32_t hi
= len
- 1;
407 if (lo
>= hi
|| c
>= list
[hi
-1])
409 // invariant: c >= list[lo]
410 // invariant: c < list[hi]
412 int32_t i
= (lo
+ hi
) >> 1;
415 } else if (c
< list
[i
]) {
425 * Returns true if this set contains every character
426 * of the given range.
427 * @param start first character, inclusive, of the range
428 * @param end last character, inclusive, of the range
429 * @return true if the test condition is met
431 UBool
UnicodeSet::contains(UChar32 start
, UChar32 end
) const {
434 // if (start < list[++i]) break;
436 int32_t i
= findCodePoint(start
);
437 return ((i
& 1) != 0 && end
< list
[i
]);
441 * Returns <tt>true</tt> if this set contains the given
442 * multicharacter string.
443 * @param s string to be checked for containment
444 * @return <tt>true</tt> if this set contains the specified string
446 UBool
UnicodeSet::contains(const UnicodeString
& s
) const {
447 if (s
.length() == 0) return FALSE
;
448 int32_t cp
= getSingleCP(s
);
450 return stringsContains(s
);
452 return contains((UChar32
) cp
);
457 * Returns true if this set contains all the characters and strings
459 * @param c set to be checked for containment
460 * @return true if the test condition is met
462 UBool
UnicodeSet::containsAll(const UnicodeSet
& c
) const {
463 // The specified set is a subset if all of its pairs are contained in
464 // this set. It's possible to code this more efficiently in terms of
465 // direct manipulation of the inversion lists if the need arises.
466 int32_t n
= c
.getRangeCount();
467 for (int i
=0; i
<n
; ++i
) {
468 if (!contains(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
472 return !c
.hasStrings() || (strings
!= nullptr && strings
->containsAll(*c
.strings
));
476 * Returns true if this set contains all the characters
477 * of the given string.
478 * @param s string containing characters to be checked for containment
479 * @return true if the test condition is met
481 UBool
UnicodeSet::containsAll(const UnicodeString
& s
) const {
482 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_CONTAINED
) ==
487 * Returns true if this set contains none of the characters
488 * of the given range.
489 * @param start first character, inclusive, of the range
490 * @param end last character, inclusive, of the range
491 * @return true if the test condition is met
493 UBool
UnicodeSet::containsNone(UChar32 start
, UChar32 end
) const {
496 // if (start < list[++i]) break;
498 int32_t i
= findCodePoint(start
);
499 return ((i
& 1) == 0 && end
< list
[i
]);
503 * Returns true if this set contains none of the characters and strings
505 * @param c set to be checked for containment
506 * @return true if the test condition is met
508 UBool
UnicodeSet::containsNone(const UnicodeSet
& c
) const {
509 // The specified set is a subset if all of its pairs are contained in
510 // this set. It's possible to code this more efficiently in terms of
511 // direct manipulation of the inversion lists if the need arises.
512 int32_t n
= c
.getRangeCount();
513 for (int32_t i
=0; i
<n
; ++i
) {
514 if (!containsNone(c
.getRangeStart(i
), c
.getRangeEnd(i
))) {
518 return strings
== nullptr || !c
.hasStrings() || strings
->containsNone(*c
.strings
);
522 * Returns true if this set contains none of the characters
523 * of the given string.
524 * @param s string containing characters to be checked for containment
525 * @return true if the test condition is met
527 UBool
UnicodeSet::containsNone(const UnicodeString
& s
) const {
528 return (UBool
)(span(s
.getBuffer(), s
.length(), USET_SPAN_NOT_CONTAINED
) ==
533 * Returns <tt>true</tt> if this set contains any character whose low byte
534 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
537 UBool
UnicodeSet::matchesIndexValue(uint8_t v
) const {
538 /* The index value v, in the range [0,255], is contained in this set if
539 * it is contained in any pair of this set. Pairs either have the high
540 * bytes equal, or unequal. If the high bytes are equal, then we have
541 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
542 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
543 * Then v is contained if xx <= v || v <= yy. (This is identical to the
544 * time zone month containment logic.)
547 int32_t rangeCount
=getRangeCount();
548 for (i
=0; i
<rangeCount
; ++i
) {
549 UChar32 low
= getRangeStart(i
);
550 UChar32 high
= getRangeEnd(i
);
551 if ((low
& ~0xFF) == (high
& ~0xFF)) {
552 if ((low
& 0xFF) <= v
&& v
<= (high
& 0xFF)) {
555 } else if ((low
& 0xFF) <= v
|| v
<= (high
& 0xFF)) {
560 for (i
=0; i
<strings
->size(); ++i
) {
561 const UnicodeString
& s
= *(const UnicodeString
*)strings
->elementAt(i
);
562 //if (s.length() == 0) {
563 // // Empty strings match everything
566 // assert(s.length() != 0); // We enforce this elsewhere
567 UChar32 c
= s
.char32At(0);
568 if ((c
& 0xFF) == v
) {
577 * Implementation of UnicodeMatcher::matches(). Always matches the
578 * longest possible multichar string.
580 UMatchDegree
UnicodeSet::matches(const Replaceable
& text
,
584 if (offset
== limit
) {
585 // Strings, if any, have length != 0, so we don't worry
586 // about them here. If we ever allow zero-length strings
587 // we much check for them here.
588 if (contains(U_ETHER
)) {
589 return incremental
? U_PARTIAL_MATCH
: U_MATCH
;
594 if (hasStrings()) { // try strings first
596 // might separate forward and backward loops later
597 // for now they are combined
599 // TODO Improve efficiency of this, at least in the forward
600 // direction, if not in both. In the forward direction we
601 // can assume the strings are sorted.
604 UBool forward
= offset
< limit
;
606 // firstChar is the leftmost char to match in the
607 // forward direction or the rightmost char to match in
608 // the reverse direction.
609 UChar firstChar
= text
.charAt(offset
);
611 // If there are multiple strings that can match we
612 // return the longest match.
613 int32_t highWaterLength
= 0;
615 for (i
=0; i
<strings
->size(); ++i
) {
616 const UnicodeString
& trial
= *(const UnicodeString
*)strings
->elementAt(i
);
618 //if (trial.length() == 0) {
619 // return U_MATCH; // null-string always matches
621 // assert(trial.length() != 0); // We ensure this elsewhere
623 UChar c
= trial
.charAt(forward
? 0 : trial
.length() - 1);
625 // Strings are sorted, so we can optimize in the
626 // forward direction.
627 if (forward
&& c
> firstChar
) break;
628 if (c
!= firstChar
) continue;
630 int32_t matchLen
= matchRest(text
, offset
, limit
, trial
);
633 int32_t maxLen
= forward
? limit
-offset
: offset
-limit
;
634 if (matchLen
== maxLen
) {
635 // We have successfully matched but only up to limit.
636 return U_PARTIAL_MATCH
;
640 if (matchLen
== trial
.length()) {
641 // We have successfully matched the whole string.
642 if (matchLen
> highWaterLength
) {
643 highWaterLength
= matchLen
;
645 // In the forward direction we know strings
646 // are sorted so we can bail early.
647 if (forward
&& matchLen
< highWaterLength
) {
654 // We've checked all strings without a partial match.
655 // If we have full matches, return the longest one.
656 if (highWaterLength
!= 0) {
657 offset
+= forward
? highWaterLength
: -highWaterLength
;
661 return UnicodeFilter::matches(text
, offset
, limit
, incremental
);
666 * Returns the longest match for s in text at the given position.
667 * If limit > start then match forward from start+1 to limit
668 * matching all characters except s.charAt(0). If limit < start,
669 * go backward starting from start-1 matching all characters
670 * except s.charAt(s.length()-1). This method assumes that the
671 * first character, text.charAt(start), matches s, so it does not
673 * @param text the text to match
674 * @param start the first character to match. In the forward
675 * direction, text.charAt(start) is matched against s.charAt(0).
676 * In the reverse direction, it is matched against
677 * s.charAt(s.length()-1).
678 * @param limit the limit offset for matching, either last+1 in
679 * the forward direction, or last-1 in the reverse direction,
680 * where last is the index of the last character to match.
681 * @return If part of s matches up to the limit, return |limit -
682 * start|. If all of s matches before reaching the limit, return
683 * s.length(). If there is a mismatch between s and text, return
686 int32_t UnicodeSet::matchRest(const Replaceable
& text
,
687 int32_t start
, int32_t limit
,
688 const UnicodeString
& s
) {
691 int32_t slen
= s
.length();
693 maxLen
= limit
- start
;
694 if (maxLen
> slen
) maxLen
= slen
;
695 for (i
= 1; i
< maxLen
; ++i
) {
696 if (text
.charAt(start
+ i
) != s
.charAt(i
)) return 0;
699 maxLen
= start
- limit
;
700 if (maxLen
> slen
) maxLen
= slen
;
701 --slen
; // <=> slen = s.length() - 1;
702 for (i
= 1; i
< maxLen
; ++i
) {
703 if (text
.charAt(start
- i
) != s
.charAt(slen
- i
)) return 0;
710 * Implement of UnicodeMatcher
712 void UnicodeSet::addMatchSetTo(UnicodeSet
& toUnionTo
) const {
713 toUnionTo
.addAll(*this);
717 * Returns the index of the given character within this set, where
718 * the set is ordered by ascending code point. If the character
719 * is not in this set, return -1. The inverse of this method is
720 * <code>charAt()</code>.
721 * @return an index from 0..size()-1, or -1
723 int32_t UnicodeSet::indexOf(UChar32 c
) const {
724 if (c
< MIN_VALUE
|| c
> MAX_VALUE
) {
730 UChar32 start
= list
[i
++];
734 UChar32 limit
= list
[i
++];
736 return n
+ c
- start
;
743 * Returns the character at the given index within this set, where
744 * the set is ordered by ascending code point. If the index is
745 * out of range, return (UChar32)-1. The inverse of this method is
746 * <code>indexOf()</code>.
747 * @param index an index from 0..size()-1
748 * @return the character at the given index, or (UChar32)-1.
750 UChar32
UnicodeSet::charAt(int32_t index
) const {
752 // len2 is the largest even integer <= len, that is, it is len
753 // for even values and len-1 for odd values. With odd values
754 // the last entry is UNICODESET_HIGH.
755 int32_t len2
= len
& ~1;
756 for (int32_t i
=0; i
< len2
;) {
757 UChar32 start
= list
[i
++];
758 int32_t count
= list
[i
++] - start
;
760 return (UChar32
)(start
+ index
);
769 * Make this object represent the range <code>start - end</code>.
770 * If <code>end > start</code> then this object is set to an
773 * @param start first character in the set, inclusive
774 * @rparam end last character in the set, inclusive
776 UnicodeSet
& UnicodeSet::set(UChar32 start
, UChar32 end
) {
778 complement(start
, end
);
783 * Adds the specified range to this set if it is not already
784 * present. If this set already contains the specified range,
785 * the call leaves this set unchanged. If <code>end > start</code>
786 * then an empty range is added, leaving the set unchanged.
788 * @param start first character, inclusive, of range to be added
790 * @param end last character, inclusive, of range to be added
793 UnicodeSet
& UnicodeSet::add(UChar32 start
, UChar32 end
) {
794 if (pinCodePoint(start
) < pinCodePoint(end
)) {
795 UChar32 limit
= end
+ 1;
796 // Fast path for adding a new range after the last one.
797 // Odd list length: [..., lastStart, lastLimit, HIGH]
798 if ((len
& 1) != 0) {
799 // If the list is empty, set lastLimit low enough to not be adjacent to 0.
800 UChar32 lastLimit
= len
== 1 ? -2 : list
[len
- 2];
801 if (lastLimit
<= start
&& !isFrozen() && !isBogus()) {
802 if (lastLimit
== start
) {
803 // Extend the last range.
804 list
[len
- 2] = limit
;
805 if (limit
== UNICODESET_HIGH
) {
809 list
[len
- 1] = start
;
810 if (limit
< UNICODESET_HIGH
) {
811 if (ensureCapacity(len
+ 2)) {
813 list
[len
++] = UNICODESET_HIGH
;
815 } else { // limit == UNICODESET_HIGH
816 if (ensureCapacity(len
+ 1)) {
817 list
[len
++] = UNICODESET_HIGH
;
825 // This is slow. Could be much faster using findCodePoint(start)
826 // and modifying the list, dealing with adjacent & overlapping ranges.
827 UChar32 range
[3] = { start
, limit
, UNICODESET_HIGH
};
829 } else if (start
== end
) {
835 // #define DEBUG_US_ADD
839 void dump(UChar32 c
) {
841 printf("%c", (char)c
);
846 void dump(const UChar32
* list
, int32_t len
) {
848 for (int32_t i
=0; i
<len
; ++i
) {
849 if (i
!= 0) printf(", ");
857 * Adds the specified character to this set if it is not already
858 * present. If this set already contains the specified character,
859 * the call leaves this set unchanged.
861 UnicodeSet
& UnicodeSet::add(UChar32 c
) {
862 // find smallest i such that c < list[i]
863 // if odd, then it is IN the set
864 // if even, then it is OUT of the set
865 int32_t i
= findCodePoint(pinCodePoint(c
));
868 if ((i
& 1) != 0 || isFrozen() || isBogus()) return *this;
871 // assert(list[len-1] == HIGH);
874 // [start_0, limit_0, start_1, limit_1, HIGH]
876 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
880 // i == 0 means c is before the first range
885 printf(" found at %d", i
);
891 if (c
== list
[i
]-1) {
892 // c is before start of next range
894 // if we touched the HIGH mark, then add a new one
895 if (c
== (UNICODESET_HIGH
- 1)) {
896 if (!ensureCapacity(len
+1)) {
897 // ensureCapacity will mark the object as Bogus if OOM failure happens.
900 list
[len
++] = UNICODESET_HIGH
;
902 if (i
> 0 && c
== list
[i
-1]) {
903 // collapse adjacent ranges
905 // [..., start_k-1, c, c, limit_k, ..., HIGH]
909 //for (int32_t k=i-1; k<len-2; ++k) {
910 // list[k] = list[k+2];
912 UChar32
* dst
= list
+ i
- 1;
913 UChar32
* src
= dst
+ 2;
914 UChar32
* srclimit
= list
+ len
;
915 while (src
< srclimit
) *(dst
++) = *(src
++);
921 else if (i
> 0 && c
== list
[i
-1]) {
922 // c is after end of prior range
924 // no need to check for collapse here
928 // At this point we know the new char is not adjacent to
929 // any existing ranges, and it is not 10FFFF.
932 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
936 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
940 if (!ensureCapacity(len
+2)) {
941 // ensureCapacity will mark the object as Bogus if OOM failure happens.
945 UChar32
*p
= list
+ i
;
946 uprv_memmove(p
+ 2, p
, (len
- i
) * sizeof(*p
));
956 for (i
=1; i
<len
; ++i
) {
957 if (list
[i
] <= list
[i
-1]) {
959 printf("ERROR: list has been corrupted\n");
970 * Adds the specified multicharacter to this set if it is not already
971 * present. If this set already contains the multicharacter,
972 * the call leaves this set unchanged.
973 * Thus "ch" => {"ch"}
974 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
975 * @param s the source string
976 * @return the modified set, for chaining
978 UnicodeSet
& UnicodeSet::add(const UnicodeString
& s
) {
979 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
980 int32_t cp
= getSingleCP(s
);
982 if (!stringsContains(s
)) {
993 * Adds the given string, in order, to 'strings'. The given string
994 * must have been checked by the caller to not be empty and to not
995 * already be in 'strings'.
997 void UnicodeSet::_add(const UnicodeString
& s
) {
998 if (isFrozen() || isBogus()) {
1001 UErrorCode ec
= U_ZERO_ERROR
;
1002 if (strings
== nullptr && !allocateStrings(ec
)) {
1006 UnicodeString
* t
= new UnicodeString(s
);
1007 if (t
== NULL
) { // Check for memory allocation error.
1011 strings
->sortedInsert(t
, compareUnicodeString
, ec
);
1012 if (U_FAILURE(ec
)) {
1019 * @return a code point IF the string consists of a single one.
1020 * otherwise returns -1.
1021 * @param string to test
1023 int32_t UnicodeSet::getSingleCP(const UnicodeString
& s
) {
1024 //if (s.length() < 1) {
1025 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1027 if (s
.length() > 2) return -1;
1028 if (s
.length() == 1) return s
.charAt(0);
1030 // at this point, len = 2
1031 UChar32 cp
= s
.char32At(0);
1032 if (cp
> 0xFFFF) { // is surrogate pair
1039 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1040 * If this set already any particular character, it has no effect on that character.
1041 * @param the source string
1042 * @return the modified set, for chaining
1044 UnicodeSet
& UnicodeSet::addAll(const UnicodeString
& s
) {
1046 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1054 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1055 * If this set already any particular character, it has no effect on that character.
1056 * @param the source string
1057 * @return the modified set, for chaining
1059 UnicodeSet
& UnicodeSet::retainAll(const UnicodeString
& s
) {
1067 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1068 * If this set already any particular character, it has no effect on that character.
1069 * @param the source string
1070 * @return the modified set, for chaining
1072 UnicodeSet
& UnicodeSet::complementAll(const UnicodeString
& s
) {
1080 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1081 * If this set already any particular character, it has no effect on that character.
1082 * @param the source string
1083 * @return the modified set, for chaining
1085 UnicodeSet
& UnicodeSet::removeAll(const UnicodeString
& s
) {
1092 UnicodeSet
& UnicodeSet::removeAllStrings() {
1093 if (!isFrozen() && hasStrings()) {
1094 strings
->removeAllElements();
1102 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1103 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1104 * @param the source string
1105 * @return a newly created set containing the given string
1107 UnicodeSet
* U_EXPORT2
UnicodeSet::createFrom(const UnicodeString
& s
) {
1108 UnicodeSet
*set
= new UnicodeSet();
1109 if (set
!= NULL
) { // Check for memory allocation error.
1117 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1118 * @param the source string
1119 * @return a newly created set containing the given characters
1121 UnicodeSet
* U_EXPORT2
UnicodeSet::createFromAll(const UnicodeString
& s
) {
1122 UnicodeSet
*set
= new UnicodeSet();
1123 if (set
!= NULL
) { // Check for memory allocation error.
1130 * Retain only the elements in this set that are contained in the
1131 * specified range. If <code>end > start</code> then an empty range is
1132 * retained, leaving the set empty.
1134 * @param start first character, inclusive, of range to be retained
1136 * @param end last character, inclusive, of range to be retained
1139 UnicodeSet
& UnicodeSet::retain(UChar32 start
, UChar32 end
) {
1140 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1141 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1142 retain(range
, 2, 0);
1149 UnicodeSet
& UnicodeSet::retain(UChar32 c
) {
1150 return retain(c
, c
);
1154 * Removes the specified range from this set if it is present.
1155 * The set will not contain the specified range once the call
1156 * returns. If <code>end > start</code> then an empty range is
1157 * removed, leaving the set unchanged.
1159 * @param start first character, inclusive, of range to be removed
1161 * @param end last character, inclusive, of range to be removed
1164 UnicodeSet
& UnicodeSet::remove(UChar32 start
, UChar32 end
) {
1165 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1166 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1167 retain(range
, 2, 2);
1173 * Removes the specified character from this set if it is present.
1174 * The set will not contain the specified range once the call
1177 UnicodeSet
& UnicodeSet::remove(UChar32 c
) {
1178 return remove(c
, c
);
1182 * Removes the specified string from this set if it is present.
1183 * The set will not contain the specified character once the call
1185 * @param the source string
1186 * @return the modified set, for chaining
1188 UnicodeSet
& UnicodeSet::remove(const UnicodeString
& s
) {
1189 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1190 int32_t cp
= getSingleCP(s
);
1192 if (strings
!= nullptr && strings
->removeElement((void*) &s
)) {
1196 remove((UChar32
)cp
, (UChar32
)cp
);
1202 * Complements the specified range in this set. Any character in
1203 * the range will be removed if it is in this set, or will be
1204 * added if it is not in this set. If <code>end > start</code>
1205 * then an empty range is xor'ed, leaving the set unchanged.
1207 * @param start first character, inclusive, of range to be removed
1209 * @param end last character, inclusive, of range to be removed
1212 UnicodeSet
& UnicodeSet::complement(UChar32 start
, UChar32 end
) {
1213 if (isFrozen() || isBogus()) {
1216 if (pinCodePoint(start
) <= pinCodePoint(end
)) {
1217 UChar32 range
[3] = { start
, end
+1, UNICODESET_HIGH
};
1218 exclusiveOr(range
, 2, 0);
1224 UnicodeSet
& UnicodeSet::complement(UChar32 c
) {
1225 return complement(c
, c
);
1229 * This is equivalent to
1230 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1232 UnicodeSet
& UnicodeSet::complement(void) {
1233 if (isFrozen() || isBogus()) {
1236 if (list
[0] == UNICODESET_LOW
) {
1237 uprv_memmove(list
, list
+ 1, (size_t)(len
-1)*sizeof(UChar32
));
1240 if (!ensureCapacity(len
+1)) {
1243 uprv_memmove(list
+ 1, list
, (size_t)len
*sizeof(UChar32
));
1244 list
[0] = UNICODESET_LOW
;
1252 * Complement the specified string in this set.
1253 * The set will not contain the specified string once the call
1255 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1256 * @param s the string to complement
1257 * @return this object, for chaining
1259 UnicodeSet
& UnicodeSet::complement(const UnicodeString
& s
) {
1260 if (s
.length() == 0 || isFrozen() || isBogus()) return *this;
1261 int32_t cp
= getSingleCP(s
);
1263 if (stringsContains(s
)) {
1264 strings
->removeElement((void*) &s
);
1270 complement((UChar32
)cp
, (UChar32
)cp
);
1276 * Adds all of the elements in the specified set to this set if
1277 * they're not already present. This operation effectively
1278 * modifies this set so that its value is the <i>union</i> of the two
1279 * sets. The behavior of this operation is unspecified if the specified
1280 * collection is modified while the operation is in progress.
1282 * @param c set whose elements are to be added to this set.
1283 * @see #add(char, char)
1285 UnicodeSet
& UnicodeSet::addAll(const UnicodeSet
& c
) {
1286 if ( c
.len
>0 && c
.list
!=NULL
) {
1287 add(c
.list
, c
.len
, 0);
1290 // Add strings in order
1291 if ( c
.strings
!=NULL
) {
1292 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1293 const UnicodeString
* s
= (const UnicodeString
*)c
.strings
->elementAt(i
);
1294 if (!stringsContains(*s
)) {
1303 * Retains only the elements in this set that are contained in the
1304 * specified set. In other words, removes from this set all of
1305 * its elements that are not contained in the specified set. This
1306 * operation effectively modifies this set so that its value is
1307 * the <i>intersection</i> of the two sets.
1309 * @param c set that defines which elements this set will retain.
1311 UnicodeSet
& UnicodeSet::retainAll(const UnicodeSet
& c
) {
1312 if (isFrozen() || isBogus()) {
1315 retain(c
.list
, c
.len
, 0);
1317 if (!c
.hasStrings()) {
1318 strings
->removeAllElements();
1320 strings
->retainAll(*c
.strings
);
1327 * Removes from this set all of its elements that are contained in the
1328 * specified set. This operation effectively modifies this
1329 * set so that its value is the <i>asymmetric set difference</i> of
1332 * @param c set that defines which elements will be removed from
1335 UnicodeSet
& UnicodeSet::removeAll(const UnicodeSet
& c
) {
1336 if (isFrozen() || isBogus()) {
1339 retain(c
.list
, c
.len
, 2);
1340 if (hasStrings() && c
.hasStrings()) {
1341 strings
->removeAll(*c
.strings
);
1347 * Complements in this set all elements contained in the specified
1348 * set. Any character in the other set will be removed if it is
1349 * in this set, or will be added if it is not in this set.
1351 * @param c set that defines which elements will be xor'ed from
1354 UnicodeSet
& UnicodeSet::complementAll(const UnicodeSet
& c
) {
1355 if (isFrozen() || isBogus()) {
1358 exclusiveOr(c
.list
, c
.len
, 0);
1360 if (c
.strings
!= nullptr) {
1361 for (int32_t i
=0; i
<c
.strings
->size(); ++i
) {
1362 void* e
= c
.strings
->elementAt(i
);
1363 if (strings
== nullptr || !strings
->removeElement(e
)) {
1364 _add(*(const UnicodeString
*)e
);
1372 * Removes all of the elements from this set. This set will be
1373 * empty after this call returns.
1375 UnicodeSet
& UnicodeSet::clear(void) {
1379 list
[0] = UNICODESET_HIGH
;
1382 if (strings
!= NULL
) {
1383 strings
->removeAllElements();
1391 * Iteration method that returns the number of ranges contained in
1393 * @see #getRangeStart
1396 int32_t UnicodeSet::getRangeCount() const {
1401 * Iteration method that returns the first character in the
1402 * specified range of this set.
1403 * @see #getRangeCount
1406 UChar32
UnicodeSet::getRangeStart(int32_t index
) const {
1407 return list
[index
*2];
1411 * Iteration method that returns the last character in the
1412 * specified range of this set.
1413 * @see #getRangeStart
1416 UChar32
UnicodeSet::getRangeEnd(int32_t index
) const {
1417 return list
[index
*2 + 1] - 1;
1420 const UnicodeString
* UnicodeSet::getString(int32_t index
) const {
1421 return (const UnicodeString
*) strings
->elementAt(index
);
1425 * Reallocate this objects internal structures to take up the least
1426 * possible space, without changing this object's value.
1428 UnicodeSet
& UnicodeSet::compact() {
1429 if (isFrozen() || isBogus()) {
1432 // Delete buffer first to defragment memory less.
1433 if (buffer
!= stackList
) {
1438 if (list
== stackList
) {
1440 } else if (len
<= INITIAL_CAPACITY
) {
1441 uprv_memcpy(stackList
, list
, len
* sizeof(UChar32
));
1444 capacity
= INITIAL_CAPACITY
;
1445 } else if ((len
+ 7) < capacity
) {
1446 // If we have more than a little unused capacity, shrink it to len.
1447 UChar32
* temp
= (UChar32
*) uprv_realloc(list
, sizeof(UChar32
) * len
);
1452 // else what the heck happened?! We allocated less memory!
1453 // Oh well. We'll keep our original array.
1455 if (strings
!= nullptr && strings
->isEmpty()) {
1462 #ifdef DEBUG_SERIALIZE
1467 * Deserialize constructor.
1469 UnicodeSet::UnicodeSet(const uint16_t data
[], int32_t dataLen
, ESerialization serialization
,
1477 if( (serialization
!= kSerialized
)
1480 ec
= U_ILLEGAL_ARGUMENT_ERROR
;
1486 int32_t headerSize
= ((data
[0]&0x8000)) ?2:1;
1487 int32_t bmpLength
= (headerSize
==1)?data
[0]:data
[1];
1489 int32_t newLength
= (((data
[0]&0x7FFF)-bmpLength
)/2)+bmpLength
;
1490 #ifdef DEBUG_SERIALIZE
1491 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen
,headerSize
,bmpLength
,newLength
, data
[0],data
[1],data
[2],data
[3]);
1493 if(!ensureCapacity(newLength
+ 1)) { // +1 for HIGH
1498 for(i
= 0; i
< bmpLength
;i
++) {
1499 list
[i
] = data
[i
+headerSize
];
1500 #ifdef DEBUG_SERIALIZE
1501 printf("<<16@%d[%d] %X\n", i
+headerSize
, i
, list
[i
]);
1505 for(i
=bmpLength
;i
<newLength
;i
++) {
1506 list
[i
] = ((UChar32
)data
[headerSize
+bmpLength
+(i
-bmpLength
)*2+0] << 16) +
1507 ((UChar32
)data
[headerSize
+bmpLength
+(i
-bmpLength
)*2+1]);
1508 #ifdef DEBUG_SERIALIZE
1509 printf("<<32@%d+[%d] %lX\n", headerSize
+bmpLength
+i
, i
, list
[i
]);
1512 U_ASSERT(i
== newLength
);
1513 if (i
== 0 || list
[i
- 1] != UNICODESET_HIGH
) {
1514 list
[i
++] = UNICODESET_HIGH
;
1520 int32_t UnicodeSet::serialize(uint16_t *dest
, int32_t destCapacity
, UErrorCode
& ec
) const {
1521 int32_t bmpLength
, length
, destLength
;
1523 if (U_FAILURE(ec
)) {
1527 if (destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1528 ec
=U_ILLEGAL_ARGUMENT_ERROR
;
1532 /* count necessary 16-bit units */
1533 length
=this->len
-1; // Subtract 1 to ignore final UNICODESET_HIGH
1534 // assert(length>=0);
1537 if (destCapacity
>0) {
1540 ec
=U_BUFFER_OVERFLOW_ERROR
;
1546 if (this->list
[length
-1]<=0xffff) {
1549 } else if (this->list
[0]>=0x10000) {
1550 /* all supplementary */
1554 /* some BMP, some supplementary */
1555 for (bmpLength
=0; bmpLength
<length
&& this->list
[bmpLength
]<=0xffff; ++bmpLength
) {}
1556 length
=bmpLength
+2*(length
-bmpLength
);
1558 #ifdef DEBUG_SERIALIZE
1559 printf(">> bmpLength%d length%d len%d\n", bmpLength
, length
, len
);
1561 /* length: number of 16-bit array units */
1562 if (length
>0x7fff) {
1563 /* there are only 15 bits for the length in the first serialized word */
1564 ec
=U_INDEX_OUTOFBOUNDS_ERROR
;
1569 * total serialized length:
1570 * number of 16-bit array units (length) +
1571 * 1 length unit (always) +
1572 * 1 bmpLength unit (if there are supplementary values)
1574 destLength
=length
+((length
>bmpLength
)?2:1);
1575 if (destLength
<=destCapacity
) {
1579 #ifdef DEBUG_SERIALIZE
1580 printf("writeHdr\n");
1582 *dest
=(uint16_t)length
;
1583 if (length
>bmpLength
) {
1585 *++dest
=(uint16_t)bmpLength
;
1589 /* write the BMP part of the array */
1591 for (i
=0; i
<bmpLength
; ++i
) {
1592 #ifdef DEBUG_SERIALIZE
1593 printf("writebmp: %x\n", (int)*p
);
1595 *dest
++=(uint16_t)*p
++;
1598 /* write the supplementary part of the array */
1599 for (; i
<length
; i
+=2) {
1600 #ifdef DEBUG_SERIALIZE
1601 printf("write32: %x\n", (int)*p
);
1603 *dest
++=(uint16_t)(*p
>>16);
1604 *dest
++=(uint16_t)*p
++;
1607 ec
=U_BUFFER_OVERFLOW_ERROR
;
1612 //----------------------------------------------------------------
1613 // Implementation: Utility methods
1614 //----------------------------------------------------------------
1617 * Allocate our strings vector and return TRUE if successful.
1619 UBool
UnicodeSet::allocateStrings(UErrorCode
&status
) {
1620 if (U_FAILURE(status
)) {
1623 strings
= new UVector(uprv_deleteUObject
,
1624 uhash_compareUnicodeString
, 1, status
);
1625 if (strings
== NULL
) { // Check for memory allocation error.
1626 status
= U_MEMORY_ALLOCATION_ERROR
;
1629 if (U_FAILURE(status
)) {
1637 int32_t UnicodeSet::nextCapacity(int32_t minCapacity
) {
1638 // Grow exponentially to reduce the frequency of allocations.
1639 if (minCapacity
< INITIAL_CAPACITY
) {
1640 return minCapacity
+ INITIAL_CAPACITY
;
1641 } else if (minCapacity
<= 2500) {
1642 return 5 * minCapacity
;
1644 int32_t newCapacity
= 2 * minCapacity
;
1645 if (newCapacity
> MAX_LENGTH
) {
1646 newCapacity
= MAX_LENGTH
;
1652 bool UnicodeSet::ensureCapacity(int32_t newLen
) {
1653 if (newLen
> MAX_LENGTH
) {
1654 newLen
= MAX_LENGTH
;
1656 if (newLen
<= capacity
) {
1659 int32_t newCapacity
= nextCapacity(newLen
);
1660 UChar32
* temp
= (UChar32
*) uprv_malloc(newCapacity
* sizeof(UChar32
));
1662 setToBogus(); // set the object to bogus state if an OOM failure occurred.
1665 // Copy only the actual contents.
1666 uprv_memcpy(temp
, list
, len
* sizeof(UChar32
));
1667 if (list
!= stackList
) {
1671 capacity
= newCapacity
;
1675 bool UnicodeSet::ensureBufferCapacity(int32_t newLen
) {
1676 if (newLen
> MAX_LENGTH
) {
1677 newLen
= MAX_LENGTH
;
1679 if (newLen
<= bufferCapacity
) {
1682 int32_t newCapacity
= nextCapacity(newLen
);
1683 UChar32
* temp
= (UChar32
*) uprv_malloc(newCapacity
* sizeof(UChar32
));
1688 // The buffer has no contents to be copied.
1689 // It is always filled from scratch after this call.
1690 if (buffer
!= stackList
) {
1694 bufferCapacity
= newCapacity
;
1699 * Swap list and buffer.
1701 void UnicodeSet::swapBuffers(void) {
1702 // swap list and buffer
1703 UChar32
* temp
= list
;
1707 int32_t c
= capacity
;
1708 capacity
= bufferCapacity
;
1712 void UnicodeSet::setToBogus() {
1713 clear(); // Remove everything in the set.
1717 //----------------------------------------------------------------
1718 // Implementation: Fundamental operators
1719 //----------------------------------------------------------------
1721 static inline UChar32
max(UChar32 a
, UChar32 b
) {
1722 return (a
> b
) ? a
: b
;
1725 // polarity = 0, 3 is normal: x xor y
1726 // polarity = 1, 2: x xor ~y == x === y
1728 void UnicodeSet::exclusiveOr(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1729 if (isFrozen() || isBogus()) {
1732 if (!ensureBufferCapacity(len
+ otherLen
)) {
1736 int32_t i
= 0, j
= 0, k
= 0;
1737 UChar32 a
= list
[i
++];
1739 if (polarity
== 1 || polarity
== 2) {
1741 if (other
[j
] == UNICODESET_LOW
) { // skip base if already LOW
1748 // simplest of all the routines
1749 // sort the values, discarding identicals!
1757 } else if (a
!= UNICODESET_HIGH
) { // at this point, a == b
1758 // discard both values!
1762 buffer
[k
++] = UNICODESET_HIGH
;
1771 // polarity = 0 is normal: x union y
1772 // polarity = 2: x union ~y
1773 // polarity = 1: ~x union y
1774 // polarity = 3: ~x union ~y
1776 void UnicodeSet::add(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1777 if (isFrozen() || isBogus() || other
==NULL
) {
1780 if (!ensureBufferCapacity(len
+ otherLen
)) {
1784 int32_t i
= 0, j
= 0, k
= 0;
1785 UChar32 a
= list
[i
++];
1786 UChar32 b
= other
[j
++];
1787 // change from xor is that we have to check overlapping pairs
1788 // polarity bit 1 means a is second, bit 2 means b is.
1791 case 0: // both first; take lower if unequal
1792 if (a
< b
) { // take a
1793 // Back up over overlapping ranges in buffer[]
1794 if (k
> 0 && a
<= buffer
[k
-1]) {
1795 // Pick latter end value in buffer[] vs. list[]
1796 a
= max(list
[i
], buffer
[--k
]);
1802 i
++; // Common if/else code factored out
1804 } else if (b
< a
) { // take b
1805 if (k
> 0 && b
<= buffer
[k
-1]) {
1806 b
= max(other
[j
], buffer
[--k
]);
1813 } else { // a == b, take a, drop b
1814 if (a
== UNICODESET_HIGH
) goto loop_end
;
1815 // This is symmetrical; it doesn't matter if
1816 // we backtrack with a or b. - liu
1817 if (k
> 0 && a
<= buffer
[k
-1]) {
1818 a
= max(list
[i
], buffer
[--k
]);
1830 case 3: // both second; take higher if unequal, and drop other
1831 if (b
<= a
) { // take a
1832 if (a
== UNICODESET_HIGH
) goto loop_end
;
1835 if (b
== UNICODESET_HIGH
) goto loop_end
;
1839 polarity
^= 1; // factored common code
1843 case 1: // a second, b first; if b < a, overlap
1844 if (a
< b
) { // no overlap, take a
1845 buffer
[k
++] = a
; a
= list
[i
++]; polarity
^= 1;
1846 } else if (b
< a
) { // OVERLAP, drop b
1849 } else { // a == b, drop both!
1850 if (a
== UNICODESET_HIGH
) goto loop_end
;
1857 case 2: // a first, b second; if a < b, overlap
1858 if (b
< a
) { // no overlap, take b
1862 } else if (a
< b
) { // OVERLAP, drop a
1865 } else { // a == b, drop both!
1866 if (a
== UNICODESET_HIGH
) goto loop_end
;
1876 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1882 // polarity = 0 is normal: x intersect y
1883 // polarity = 2: x intersect ~y == set-minus
1884 // polarity = 1: ~x intersect y
1885 // polarity = 3: ~x intersect ~y
1887 void UnicodeSet::retain(const UChar32
* other
, int32_t otherLen
, int8_t polarity
) {
1888 if (isFrozen() || isBogus()) {
1891 if (!ensureBufferCapacity(len
+ otherLen
)) {
1895 int32_t i
= 0, j
= 0, k
= 0;
1896 UChar32 a
= list
[i
++];
1897 UChar32 b
= other
[j
++];
1898 // change from xor is that we have to check overlapping pairs
1899 // polarity bit 1 means a is second, bit 2 means b is.
1902 case 0: // both first; drop the smaller
1903 if (a
< b
) { // drop a
1906 } else if (b
< a
) { // drop b
1909 } else { // a == b, take one, drop other
1910 if (a
== UNICODESET_HIGH
) goto loop_end
;
1918 case 3: // both second; take lower if unequal
1919 if (a
< b
) { // take a
1923 } else if (b
< a
) { // take b
1927 } else { // a == b, take one, drop other
1928 if (a
== UNICODESET_HIGH
) goto loop_end
;
1936 case 1: // a second, b first;
1937 if (a
< b
) { // NO OVERLAP, drop a
1940 } else if (b
< a
) { // OVERLAP, take b
1944 } else { // a == b, drop both!
1945 if (a
== UNICODESET_HIGH
) goto loop_end
;
1952 case 2: // a first, b second; if a < b, overlap
1953 if (b
< a
) { // no overlap, drop b
1956 } else if (a
< b
) { // OVERLAP, take a
1960 } else { // a == b, drop both!
1961 if (a
== UNICODESET_HIGH
) goto loop_end
;
1971 buffer
[k
++] = UNICODESET_HIGH
; // terminate
1978 * Append the <code>toPattern()</code> representation of a
1979 * string to the given <code>StringBuffer</code>.
1981 void UnicodeSet::_appendToPat(UnicodeString
& buf
, const UnicodeString
& s
, UBool
1982 escapeUnprintable
) {
1984 for (int32_t i
= 0; i
< s
.length(); i
+= U16_LENGTH(cp
)) {
1985 _appendToPat(buf
, cp
= s
.char32At(i
), escapeUnprintable
);
1990 * Append the <code>toPattern()</code> representation of a
1991 * character to the given <code>StringBuffer</code>.
1993 void UnicodeSet::_appendToPat(UnicodeString
& buf
, UChar32 c
, UBool
1994 escapeUnprintable
) {
1995 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
1996 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1998 if (ICU_Utility::escapeUnprintable(buf
, c
)) {
2002 // Okay to let ':' pass through
2013 case SymbolTable::SYMBOL_REF
:
2014 buf
.append(BACKSLASH
);
2017 // Escape whitespace
2018 if (PatternProps::isWhiteSpace(c
)) {
2019 buf
.append(BACKSLASH
);
2027 * Append a string representation of this set to result. This will be
2028 * a cleaned version of the string passed to applyPattern(), if there
2029 * is one. Otherwise it will be generated.
2031 UnicodeString
& UnicodeSet::_toPattern(UnicodeString
& result
,
2032 UBool escapeUnprintable
) const
2036 int32_t backslashCount
= 0;
2037 for (i
=0; i
<patLen
; ) {
2039 U16_NEXT(pat
, i
, patLen
, c
);
2040 if (escapeUnprintable
&& ICU_Utility::isUnprintable(c
)) {
2041 // If the unprintable character is preceded by an odd
2042 // number of backslashes, then it has been escaped.
2043 // Before unescaping it, we delete the final
2045 if ((backslashCount
% 2) == 1) {
2046 result
.truncate(result
.length() - 1);
2048 ICU_Utility::escapeUnprintable(result
, c
);
2052 if (c
== BACKSLASH
) {
2062 return _generatePattern(result
, escapeUnprintable
);
2066 * Returns a string representation of this set. If the result of
2067 * calling this function is passed to a UnicodeSet constructor, it
2068 * will produce another set that is equal to this one.
2070 UnicodeString
& UnicodeSet::toPattern(UnicodeString
& result
,
2071 UBool escapeUnprintable
) const
2074 return _toPattern(result
, escapeUnprintable
);
2078 * Generate and append a string representation of this set to result.
2079 * This does not use this.pat, the cleaned up copy of the string
2080 * passed to applyPattern().
2082 UnicodeString
& UnicodeSet::_generatePattern(UnicodeString
& result
,
2083 UBool escapeUnprintable
) const
2085 result
.append(SET_OPEN
);
2087 // // Check against the predefined categories. We implicitly build
2088 // // up ALL category sets the first time toPattern() is called.
2089 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2090 // if (*this == getCategorySet(cat)) {
2091 // result.append(COLON);
2092 // result.append(CATEGORY_NAMES, cat*2, 2);
2093 // return result.append(CATEGORY_CLOSE);
2097 int32_t count
= getRangeCount();
2099 // If the set contains at least 2 intervals and includes both
2100 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2101 // be more economical.
2103 getRangeStart(0) == MIN_VALUE
&&
2104 getRangeEnd(count
-1) == MAX_VALUE
) {
2107 result
.append(COMPLEMENT
);
2109 for (int32_t i
= 1; i
< count
; ++i
) {
2110 UChar32 start
= getRangeEnd(i
-1)+1;
2111 UChar32 end
= getRangeStart(i
)-1;
2112 _appendToPat(result
, start
, escapeUnprintable
);
2114 if ((start
+1) != end
) {
2115 result
.append(HYPHEN
);
2117 _appendToPat(result
, end
, escapeUnprintable
);
2122 // Default; emit the ranges as pairs
2124 for (int32_t i
= 0; i
< count
; ++i
) {
2125 UChar32 start
= getRangeStart(i
);
2126 UChar32 end
= getRangeEnd(i
);
2127 _appendToPat(result
, start
, escapeUnprintable
);
2129 if ((start
+1) != end
) {
2130 result
.append(HYPHEN
);
2132 _appendToPat(result
, end
, escapeUnprintable
);
2137 if (strings
!= nullptr) {
2138 for (int32_t i
= 0; i
<strings
->size(); ++i
) {
2139 result
.append(OPEN_BRACE
);
2140 _appendToPat(result
,
2141 *(const UnicodeString
*) strings
->elementAt(i
),
2143 result
.append(CLOSE_BRACE
);
2146 return result
.append(SET_CLOSE
);
2150 * Release existing cached pattern
2152 void UnicodeSet::releasePattern() {
2161 * Set the new pattern to cache.
2163 void UnicodeSet::setPattern(const char16_t *newPat
, int32_t newPatLen
) {
2165 pat
= (UChar
*)uprv_malloc((newPatLen
+ 1) * sizeof(UChar
));
2168 u_memcpy(pat
, newPat
, patLen
);
2171 // else we don't care if malloc failed. This was just a nice cache.
2172 // We can regenerate an equivalent pattern later when requested.
2175 UnicodeSet
*UnicodeSet::freeze() {
2176 if(!isFrozen() && !isBogus()) {
2179 // Optimize contains() and span() and similar functions.
2181 stringSpan
= new UnicodeSetStringSpan(*this, *strings
, UnicodeSetStringSpan::ALL
);
2182 if (stringSpan
== nullptr) {
2185 } else if (!stringSpan
->needsStringSpanUTF16()) {
2186 // All strings are irrelevant for span() etc. because
2187 // all of each string's code points are contained in this set.
2188 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2189 // many relevant strings as UTF-16.
2190 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2195 if (stringSpan
== NULL
) {
2196 // No span-relevant strings: Optimize for code point spans.
2197 bmpSet
=new BMPSet(list
, len
);
2198 if (bmpSet
== NULL
) { // Check for memory allocation error.
2206 int32_t UnicodeSet::span(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2207 if(length
>0 && bmpSet
!=NULL
) {
2208 return (int32_t)(bmpSet
->span(s
, s
+length
, spanCondition
)-s
);
2216 if(stringSpan
!=NULL
) {
2217 return stringSpan
->span(s
, length
, spanCondition
);
2218 } else if(hasStrings()) {
2219 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2220 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED
:
2221 UnicodeSetStringSpan::FWD_UTF16_CONTAINED
;
2222 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2223 if(strSpan
.needsStringSpanUTF16()) {
2224 return strSpan
.span(s
, length
, spanCondition
);
2228 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2229 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2233 int32_t start
=0, prev
=0;
2235 U16_NEXT(s
, start
, length
, c
);
2236 if(spanCondition
!=contains(c
)) {
2239 } while((prev
=start
)<length
);
2243 int32_t UnicodeSet::spanBack(const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) const {
2244 if(length
>0 && bmpSet
!=NULL
) {
2245 return (int32_t)(bmpSet
->spanBack(s
, s
+length
, spanCondition
)-s
);
2253 if(stringSpan
!=NULL
) {
2254 return stringSpan
->spanBack(s
, length
, spanCondition
);
2255 } else if(hasStrings()) {
2256 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2257 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED
:
2258 UnicodeSetStringSpan::BACK_UTF16_CONTAINED
;
2259 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2260 if(strSpan
.needsStringSpanUTF16()) {
2261 return strSpan
.spanBack(s
, length
, spanCondition
);
2265 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2266 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2270 int32_t prev
=length
;
2272 U16_PREV(s
, 0, length
, c
);
2273 if(spanCondition
!=contains(c
)) {
2276 } while((prev
=length
)>0);
2280 int32_t UnicodeSet::spanUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2281 if(length
>0 && bmpSet
!=NULL
) {
2282 const uint8_t *s0
=(const uint8_t *)s
;
2283 return (int32_t)(bmpSet
->spanUTF8(s0
, length
, spanCondition
)-s0
);
2286 length
=(int32_t)uprv_strlen(s
);
2291 if(stringSpan
!=NULL
) {
2292 return stringSpan
->spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2293 } else if(hasStrings()) {
2294 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2295 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED
:
2296 UnicodeSetStringSpan::FWD_UTF8_CONTAINED
;
2297 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2298 if(strSpan
.needsStringSpanUTF8()) {
2299 return strSpan
.spanUTF8((const uint8_t *)s
, length
, spanCondition
);
2303 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2304 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2308 int32_t start
=0, prev
=0;
2310 U8_NEXT_OR_FFFD(s
, start
, length
, c
);
2311 if(spanCondition
!=contains(c
)) {
2314 } while((prev
=start
)<length
);
2318 int32_t UnicodeSet::spanBackUTF8(const char *s
, int32_t length
, USetSpanCondition spanCondition
) const {
2319 if(length
>0 && bmpSet
!=NULL
) {
2320 const uint8_t *s0
=(const uint8_t *)s
;
2321 return bmpSet
->spanBackUTF8(s0
, length
, spanCondition
);
2324 length
=(int32_t)uprv_strlen(s
);
2329 if(stringSpan
!=NULL
) {
2330 return stringSpan
->spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2331 } else if(hasStrings()) {
2332 uint32_t which
= spanCondition
==USET_SPAN_NOT_CONTAINED
?
2333 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED
:
2334 UnicodeSetStringSpan::BACK_UTF8_CONTAINED
;
2335 UnicodeSetStringSpan
strSpan(*this, *strings
, which
);
2336 if(strSpan
.needsStringSpanUTF8()) {
2337 return strSpan
.spanBackUTF8((const uint8_t *)s
, length
, spanCondition
);
2341 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
2342 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
2346 int32_t prev
=length
;
2348 U8_PREV_OR_FFFD(s
, 0, length
, c
);
2349 if(spanCondition
!=contains(c
)) {
2352 } while((prev
=length
)>0);