]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uniset.cpp
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / common / uniset.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/parsepos.h"
14 #include "unicode/symtable.h"
15 #include "ruleiter.h"
16 #include "cmemory.h"
17 #include "uhash.h"
18 #include "util.h"
19 #include "uvector.h"
20 #include "charstr.h"
21 #include "ustrfmt.h"
22 #include "mutex.h"
23 #include "uassert.h"
24 #include "hash.h"
25
26 // Define UChar constants using hex for EBCDIC compatibility
27 // Used #define to reduce private static exports and memory access time.
28 #define SET_OPEN ((UChar)0x005B) /*[*/
29 #define SET_CLOSE ((UChar)0x005D) /*]*/
30 #define HYPHEN ((UChar)0x002D) /*-*/
31 #define COMPLEMENT ((UChar)0x005E) /*^*/
32 #define COLON ((UChar)0x003A) /*:*/
33 #define BACKSLASH ((UChar)0x005C) /*\*/
34 #define INTERSECTION ((UChar)0x0026) /*&*/
35 #define UPPER_U ((UChar)0x0055) /*U*/
36 #define LOWER_U ((UChar)0x0075) /*u*/
37 #define OPEN_BRACE ((UChar)123) /*{*/
38 #define CLOSE_BRACE ((UChar)125) /*}*/
39 #define UPPER_P ((UChar)0x0050) /*P*/
40 #define LOWER_P ((UChar)0x0070) /*p*/
41 #define UPPER_N ((UChar)78) /*N*/
42 #define EQUALS ((UChar)0x003D) /*=*/
43
44 // HIGH_VALUE > all valid values. 110000 for codepoints
45 #define UNICODESET_HIGH 0x0110000
46
47 // LOW <= all valid values. ZERO for codepoints
48 #define UNICODESET_LOW 0x000000
49
50 // initial storage. Must be >= 0
51 #define START_EXTRA 16
52
53 // extra amount for growth. Must be >= 0
54 #define GROW_EXTRA START_EXTRA
55
56 U_NAMESPACE_BEGIN
57
58 SymbolTable::~SymbolTable() {}
59
60 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
61
62 /**
63 * Modify the given UChar32 variable so that it is in range, by
64 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
65 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
66 * It modifies its argument in-place and also returns it.
67 */
68 static inline UChar32 pinCodePoint(UChar32& c) {
69 if (c < UNICODESET_LOW) {
70 c = UNICODESET_LOW;
71 } else if (c > (UNICODESET_HIGH-1)) {
72 c = (UNICODESET_HIGH-1);
73 }
74 return c;
75 }
76
77 //----------------------------------------------------------------
78 // Debugging
79 //----------------------------------------------------------------
80
81 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
82 // To enable the debugging, define the symbol DEBUG_MEM in the line
83 // below. This will result in text being sent to stdout that looks
84 // like this:
85 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
86 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
87 // Each line lists a construction (ct) or destruction (dt) event, the
88 // object address, the number of outstanding objects after the event,
89 // and the pattern of the object in question.
90
91 // #define DEBUG_MEM
92
93 #ifdef DEBUG_MEM
94 #include <stdio.h>
95 static int32_t _dbgCount = 0;
96
97 static inline void _dbgct(UnicodeSet* set) {
98 UnicodeString str;
99 set->toPattern(str, TRUE);
100 char buf[40];
101 str.extract(0, 39, buf, "");
102 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
103 }
104
105 static inline void _dbgdt(UnicodeSet* set) {
106 UnicodeString str;
107 set->toPattern(str, TRUE);
108 char buf[40];
109 str.extract(0, 39, buf, "");
110 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
111 }
112
113 #else
114
115 #define _dbgct(set)
116 #define _dbgdt(set)
117
118 #endif
119
120 //----------------------------------------------------------------
121 // UnicodeString in UVector support
122 //----------------------------------------------------------------
123
124 static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
125 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
126 }
127
128 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
129 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
130 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
131 return a.compare(b);
132 }
133
134 //----------------------------------------------------------------
135 // Constructors &c
136 //----------------------------------------------------------------
137
138 /**
139 * Constructs an empty set.
140 */
141 UnicodeSet::UnicodeSet() :
142 len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
143 list(0), buffer(0), strings(0)
144 {
145 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
146 if(list!=NULL){
147 list[0] = UNICODESET_HIGH;
148 }
149 allocateStrings();
150 _dbgct(this);
151 }
152
153 /**
154 * Constructs a set containing the given range. If <code>end >
155 * start</code> then an empty set is created.
156 *
157 * @param start first character, inclusive, of range
158 * @param end last character, inclusive, of range
159 */
160 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
161 len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
162 list(0), buffer(0), strings(0)
163 {
164 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
165 if(list!=NULL){
166 list[0] = UNICODESET_HIGH;
167 }
168 allocateStrings();
169 complement(start, end);
170 _dbgct(this);
171 }
172
173 /**
174 * Constructs a set that is identical to the given UnicodeSet.
175 */
176 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
177 UnicodeFilter(o),
178 len(0), capacity(o.len + GROW_EXTRA), bufferCapacity(0),
179 list(0), buffer(0), strings(0)
180 {
181 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
182 if(list!=NULL){
183 allocateStrings();
184 *this = o;
185 }
186 _dbgct(this);
187 }
188
189 /**
190 * Destructs the set.
191 */
192 UnicodeSet::~UnicodeSet() {
193 _dbgdt(this); // first!
194 uprv_free(list);
195 if (buffer) {
196 uprv_free(buffer);
197 }
198 delete strings;
199 }
200
201 /**
202 * Assigns this object to be a copy of another.
203 */
204 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
205 ensureCapacity(o.len);
206 len = o.len;
207 uprv_memcpy(list, o.list, len*sizeof(UChar32));
208 UErrorCode ec = U_ZERO_ERROR;
209 strings->assign(*o.strings, cloneUnicodeString, ec);
210 pat = o.pat;
211 return *this;
212 }
213
214 /**
215 * Compares the specified object with this set for equality. Returns
216 * <tt>true</tt> if the two sets
217 * have the same size, and every member of the specified set is
218 * contained in this set (or equivalently, every member of this set is
219 * contained in the specified set).
220 *
221 * @param o set to be compared for equality with this set.
222 * @return <tt>true</tt> if the specified set is equal to this set.
223 */
224 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
225 if (len != o.len) return FALSE;
226 for (int32_t i = 0; i < len; ++i) {
227 if (list[i] != o.list[i]) return FALSE;
228 }
229 if (*strings != *o.strings) return FALSE;
230 return TRUE;
231 }
232
233 /**
234 * Returns a copy of this object. All UnicodeMatcher objects have
235 * to support cloning in order to allow classes using
236 * UnicodeMatchers, such as Transliterator, to implement cloning.
237 */
238 UnicodeFunctor* UnicodeSet::clone() const {
239 return new UnicodeSet(*this);
240 }
241
242 /**
243 * Returns the hash code value for this set.
244 *
245 * @return the hash code value for this set.
246 * @see Object#hashCode()
247 */
248 int32_t UnicodeSet::hashCode(void) const {
249 int32_t result = len;
250 for (int32_t i = 0; i < len; ++i) {
251 result *= 1000003;
252 result += list[i];
253 }
254 return result;
255 }
256
257 //----------------------------------------------------------------
258 // Public API
259 //----------------------------------------------------------------
260
261 /**
262 * Make this object represent the range <code>start - end</code>.
263 * If <code>end > start</code> then this object is set to an
264 * an empty range.
265 *
266 * @param start first character in the set, inclusive
267 * @rparam end last character in the set, inclusive
268 */
269 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
270 clear();
271 complement(start, end);
272 return *this;
273 }
274
275 /**
276 * Returns the number of elements in this set (its cardinality),
277 * Note than the elements of a set may include both individual
278 * codepoints and strings.
279 *
280 * @return the number of elements in this set (its cardinality).
281 */
282 int32_t UnicodeSet::size(void) const {
283 int32_t n = 0;
284 int32_t count = getRangeCount();
285 for (int32_t i = 0; i < count; ++i) {
286 n += getRangeEnd(i) - getRangeStart(i) + 1;
287 }
288 return n + strings->size();
289 }
290
291 /**
292 * Returns <tt>true</tt> if this set contains no elements.
293 *
294 * @return <tt>true</tt> if this set contains no elements.
295 */
296 UBool UnicodeSet::isEmpty(void) const {
297 return len == 1 && strings->size() == 0;
298 }
299
300 /**
301 * Returns true if this set contains the given character.
302 * @param c character to be checked for containment
303 * @return true if the test condition is met
304 */
305 UBool UnicodeSet::contains(UChar32 c) const {
306 // Set i to the index of the start item greater than ch
307 // We know we will terminate without length test!
308 // LATER: for large sets, add binary search
309 //int32_t i = -1;
310 //for (;;) {
311 // if (c < list[++i]) break;
312 //}
313 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
314 return FALSE;
315 }
316 int32_t i = findCodePoint(c);
317 return ((i & 1) != 0); // return true if odd
318 }
319
320 /**
321 * Returns the smallest value i such that c < list[i]. Caller
322 * must ensure that c is a legal value or this method will enter
323 * an infinite loop. This method performs a binary search.
324 * @param c a character in the range MIN_VALUE..MAX_VALUE
325 * inclusive
326 * @return the smallest integer i in the range 0..len-1,
327 * inclusive, such that c < list[i]
328 */
329 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
330 /* Examples:
331 findCodePoint(c)
332 set list[] c=0 1 3 4 7 8
333 === ============== ===========
334 [] [110000] 0 0 0 0 0 0
335 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
336 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
337 [:Any:] [0, 110000] 1 1 1 1 1 1
338 */
339
340 // Return the smallest i such that c < list[i]. Assume
341 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
342 if (c < list[0])
343 return 0;
344 // High runner test. c is often after the last range, so an
345 // initial check for this condition pays off.
346 if (len >= 2 && c >= list[len-2])
347 return len-1;
348 int32_t lo = 0;
349 int32_t hi = len - 1;
350 // invariant: c >= list[lo]
351 // invariant: c < list[hi]
352 for (;;) {
353 int32_t i = (lo + hi) >> 1;
354 if (i == lo) {
355 break; // Found!
356 } else if (c < list[i]) {
357 hi = i;
358 } else {
359 lo = i;
360 }
361 }
362 return hi;
363 }
364
365 /**
366 * Returns true if this set contains every character
367 * of the given range.
368 * @param start first character, inclusive, of the range
369 * @param end last character, inclusive, of the range
370 * @return true if the test condition is met
371 */
372 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
373 //int32_t i = -1;
374 //for (;;) {
375 // if (start < list[++i]) break;
376 //}
377 int32_t i = findCodePoint(start);
378 return ((i & 1) != 0 && end < list[i]);
379 }
380
381 /**
382 * Returns <tt>true</tt> if this set contains the given
383 * multicharacter string.
384 * @param s string to be checked for containment
385 * @return <tt>true</tt> if this set contains the specified string
386 */
387 UBool UnicodeSet::contains(const UnicodeString& s) const {
388 if (s.length() == 0) return FALSE;
389 int32_t cp = getSingleCP(s);
390 if (cp < 0) {
391 return strings->contains((void*) &s);
392 } else {
393 return contains((UChar32) cp);
394 }
395 }
396
397 /**
398 * Returns true if this set contains all the characters and strings
399 * of the given set.
400 * @param c set to be checked for containment
401 * @return true if the test condition is met
402 */
403 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
404 // The specified set is a subset if all of its pairs are contained in
405 // this set. It's possible to code this more efficiently in terms of
406 // direct manipulation of the inversion lists if the need arises.
407 int32_t n = c.getRangeCount();
408 for (int i=0; i<n; ++i) {
409 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
410 return FALSE;
411 }
412 }
413 if (!strings->containsAll(*c.strings)) return FALSE;
414 return TRUE;
415 }
416
417 /**
418 * Returns true if this set contains all the characters
419 * of the given string.
420 * @param s string containing characters to be checked for containment
421 * @return true if the test condition is met
422 */
423 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
424 UChar32 cp;
425 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
426 cp = s.char32At(i);
427 if (!contains(cp)) return FALSE;
428 }
429 return TRUE;
430 }
431
432 /**
433 * Returns true if this set contains none of the characters
434 * of the given range.
435 * @param start first character, inclusive, of the range
436 * @param end last character, inclusive, of the range
437 * @return true if the test condition is met
438 */
439 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
440 //int32_t i = -1;
441 //for (;;) {
442 // if (start < list[++i]) break;
443 //}
444 int32_t i = findCodePoint(start);
445 return ((i & 1) == 0 && end < list[i]);
446 }
447
448 /**
449 * Returns true if this set contains none of the characters and strings
450 * of the given set.
451 * @param c set to be checked for containment
452 * @return true if the test condition is met
453 */
454 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
455 // The specified set is a subset if all of its pairs are contained in
456 // this set. It's possible to code this more efficiently in terms of
457 // direct manipulation of the inversion lists if the need arises.
458 int32_t n = c.getRangeCount();
459 for (int32_t i=0; i<n; ++i) {
460 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
461 return FALSE;
462 }
463 }
464 if (!strings->containsNone(*c.strings)) return FALSE;
465 return TRUE;
466 }
467
468 /**
469 * Returns true if this set contains none of the characters
470 * of the given string.
471 * @param s string containing characters to be checked for containment
472 * @return true if the test condition is met
473 */
474 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
475 UChar32 cp;
476 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
477 cp = s.char32At(i);
478 if (contains(cp)) return FALSE;
479 }
480 return TRUE;
481 }
482
483 /**
484 * Returns <tt>true</tt> if this set contains any character whose low byte
485 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
486 * indexing.
487 */
488 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
489 /* The index value v, in the range [0,255], is contained in this set if
490 * it is contained in any pair of this set. Pairs either have the high
491 * bytes equal, or unequal. If the high bytes are equal, then we have
492 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
493 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
494 * Then v is contained if xx <= v || v <= yy. (This is identical to the
495 * time zone month containment logic.)
496 */
497 int32_t i;
498 for (i=0; i<getRangeCount(); ++i) {
499 UChar32 low = getRangeStart(i);
500 UChar32 high = getRangeEnd(i);
501 if ((low & ~0xFF) == (high & ~0xFF)) {
502 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
503 return TRUE;
504 }
505 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
506 return TRUE;
507 }
508 }
509 if (strings->size() != 0) {
510 for (i=0; i<strings->size(); ++i) {
511 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
512 //if (s.length() == 0) {
513 // // Empty strings match everything
514 // return TRUE;
515 //}
516 // assert(s.length() != 0); // We enforce this elsewhere
517 UChar32 c = s.char32At(0);
518 if ((c & 0xFF) == v) {
519 return TRUE;
520 }
521 }
522 }
523 return FALSE;
524 }
525
526 /**
527 * Implementation of UnicodeMatcher::matches(). Always matches the
528 * longest possible multichar string.
529 */
530 UMatchDegree UnicodeSet::matches(const Replaceable& text,
531 int32_t& offset,
532 int32_t limit,
533 UBool incremental) {
534 if (offset == limit) {
535 // Strings, if any, have length != 0, so we don't worry
536 // about them here. If we ever allow zero-length strings
537 // we much check for them here.
538 if (contains(U_ETHER)) {
539 return incremental ? U_PARTIAL_MATCH : U_MATCH;
540 } else {
541 return U_MISMATCH;
542 }
543 } else {
544 if (strings->size() != 0) { // try strings first
545
546 // might separate forward and backward loops later
547 // for now they are combined
548
549 // TODO Improve efficiency of this, at least in the forward
550 // direction, if not in both. In the forward direction we
551 // can assume the strings are sorted.
552
553 int32_t i;
554 UBool forward = offset < limit;
555
556 // firstChar is the leftmost char to match in the
557 // forward direction or the rightmost char to match in
558 // the reverse direction.
559 UChar firstChar = text.charAt(offset);
560
561 // If there are multiple strings that can match we
562 // return the longest match.
563 int32_t highWaterLength = 0;
564
565 for (i=0; i<strings->size(); ++i) {
566 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
567
568 //if (trial.length() == 0) {
569 // return U_MATCH; // null-string always matches
570 //}
571 // assert(trial.length() != 0); // We ensure this elsewhere
572
573 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
574
575 // Strings are sorted, so we can optimize in the
576 // forward direction.
577 if (forward && c > firstChar) break;
578 if (c != firstChar) continue;
579
580 int32_t matchLen = matchRest(text, offset, limit, trial);
581
582 if (incremental) {
583 int32_t maxLen = forward ? limit-offset : offset-limit;
584 if (matchLen == maxLen) {
585 // We have successfully matched but only up to limit.
586 return U_PARTIAL_MATCH;
587 }
588 }
589
590 if (matchLen == trial.length()) {
591 // We have successfully matched the whole string.
592 if (matchLen > highWaterLength) {
593 highWaterLength = matchLen;
594 }
595 // In the forward direction we know strings
596 // are sorted so we can bail early.
597 if (forward && matchLen < highWaterLength) {
598 break;
599 }
600 continue;
601 }
602 }
603
604 // We've checked all strings without a partial match.
605 // If we have full matches, return the longest one.
606 if (highWaterLength != 0) {
607 offset += forward ? highWaterLength : -highWaterLength;
608 return U_MATCH;
609 }
610 }
611 return UnicodeFilter::matches(text, offset, limit, incremental);
612 }
613 }
614
615 /**
616 * Returns the longest match for s in text at the given position.
617 * If limit > start then match forward from start+1 to limit
618 * matching all characters except s.charAt(0). If limit < start,
619 * go backward starting from start-1 matching all characters
620 * except s.charAt(s.length()-1). This method assumes that the
621 * first character, text.charAt(start), matches s, so it does not
622 * check it.
623 * @param text the text to match
624 * @param start the first character to match. In the forward
625 * direction, text.charAt(start) is matched against s.charAt(0).
626 * In the reverse direction, it is matched against
627 * s.charAt(s.length()-1).
628 * @param limit the limit offset for matching, either last+1 in
629 * the forward direction, or last-1 in the reverse direction,
630 * where last is the index of the last character to match.
631 * @return If part of s matches up to the limit, return |limit -
632 * start|. If all of s matches before reaching the limit, return
633 * s.length(). If there is a mismatch between s and text, return
634 * 0
635 */
636 int32_t UnicodeSet::matchRest(const Replaceable& text,
637 int32_t start, int32_t limit,
638 const UnicodeString& s) {
639 int32_t i;
640 int32_t maxLen;
641 int32_t slen = s.length();
642 if (start < limit) {
643 maxLen = limit - start;
644 if (maxLen > slen) maxLen = slen;
645 for (i = 1; i < maxLen; ++i) {
646 if (text.charAt(start + i) != s.charAt(i)) return 0;
647 }
648 } else {
649 maxLen = start - limit;
650 if (maxLen > slen) maxLen = slen;
651 --slen; // <=> slen = s.length() - 1;
652 for (i = 1; i < maxLen; ++i) {
653 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
654 }
655 }
656 return maxLen;
657 }
658
659 /**
660 * Implement of UnicodeMatcher
661 */
662 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
663 toUnionTo.addAll(*this);
664 }
665
666 /**
667 * Returns the index of the given character within this set, where
668 * the set is ordered by ascending code point. If the character
669 * is not in this set, return -1. The inverse of this method is
670 * <code>charAt()</code>.
671 * @return an index from 0..size()-1, or -1
672 */
673 int32_t UnicodeSet::indexOf(UChar32 c) const {
674 if (c < MIN_VALUE || c > MAX_VALUE) {
675 return -1;
676 }
677 int32_t i = 0;
678 int32_t n = 0;
679 for (;;) {
680 UChar32 start = list[i++];
681 if (c < start) {
682 return -1;
683 }
684 UChar32 limit = list[i++];
685 if (c < limit) {
686 return n + c - start;
687 }
688 n += limit - start;
689 }
690 }
691
692 /**
693 * Returns the character at the given index within this set, where
694 * the set is ordered by ascending code point. If the index is
695 * out of range, return (UChar32)-1. The inverse of this method is
696 * <code>indexOf()</code>.
697 * @param index an index from 0..size()-1
698 * @return the character at the given index, or (UChar32)-1.
699 */
700 UChar32 UnicodeSet::charAt(int32_t index) const {
701 if (index >= 0) {
702 // len2 is the largest even integer <= len, that is, it is len
703 // for even values and len-1 for odd values. With odd values
704 // the last entry is UNICODESET_HIGH.
705 int32_t len2 = len & ~1;
706 for (int32_t i=0; i < len2;) {
707 UChar32 start = list[i++];
708 int32_t count = list[i++] - start;
709 if (index < count) {
710 return (UChar32)(start + index);
711 }
712 index -= count;
713 }
714 }
715 return (UChar32)-1;
716 }
717
718 /**
719 * Adds the specified range to this set if it is not already
720 * present. If this set already contains the specified range,
721 * the call leaves this set unchanged. If <code>end > start</code>
722 * then an empty range is added, leaving the set unchanged.
723 *
724 * @param start first character, inclusive, of range to be added
725 * to this set.
726 * @param end last character, inclusive, of range to be added
727 * to this set.
728 */
729 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
730 if (pinCodePoint(start) < pinCodePoint(end)) {
731 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
732 add(range, 2, 0);
733 } else if (start == end) {
734 add(start);
735 }
736 return *this;
737 }
738
739 // #define DEBUG_US_ADD
740
741 #ifdef DEBUG_US_ADD
742 #include <stdio.h>
743 void dump(UChar32 c) {
744 if (c <= 0xFF) {
745 printf("%c", (char)c);
746 } else {
747 printf("U+%04X", c);
748 }
749 }
750 void dump(const UChar32* list, int32_t len) {
751 printf("[");
752 for (int32_t i=0; i<len; ++i) {
753 if (i != 0) printf(", ");
754 dump(list[i]);
755 }
756 printf("]");
757 }
758 #endif
759
760 /**
761 * Adds the specified character to this set if it is not already
762 * present. If this set already contains the specified character,
763 * the call leaves this set unchanged.
764 */
765 UnicodeSet& UnicodeSet::add(UChar32 c) {
766 // find smallest i such that c < list[i]
767 // if odd, then it is IN the set
768 // if even, then it is OUT of the set
769 int32_t i = findCodePoint(pinCodePoint(c));
770
771 // already in set?
772 if ((i & 1) != 0) return *this;
773
774 // HIGH is 0x110000
775 // assert(list[len-1] == HIGH);
776
777 // empty = [HIGH]
778 // [start_0, limit_0, start_1, limit_1, HIGH]
779
780 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
781 // ^
782 // list[i]
783
784 // i == 0 means c is before the first range
785
786 #ifdef DEBUG_US_ADD
787 printf("Add of ");
788 dump(c);
789 printf(" found at %d", i);
790 printf(": ");
791 dump(list, len);
792 printf(" => ");
793 #endif
794
795 if (c == list[i]-1) {
796 // c is before start of next range
797 list[i] = c;
798 // if we touched the HIGH mark, then add a new one
799 if (c == (UNICODESET_HIGH - 1)) {
800 ensureCapacity(len+1);
801 list[len++] = UNICODESET_HIGH;
802 }
803 if (i > 0 && c == list[i-1]) {
804 // collapse adjacent ranges
805
806 // [..., start_k-1, c, c, limit_k, ..., HIGH]
807 // ^
808 // list[i]
809
810 //for (int32_t k=i-1; k<len-2; ++k) {
811 // list[k] = list[k+2];
812 //}
813 UChar32* dst = list + i - 1;
814 UChar32* src = dst + 2;
815 UChar32* srclimit = list + len;
816 while (src < srclimit) *(dst++) = *(src++);
817
818 len -= 2;
819 }
820 }
821
822 else if (i > 0 && c == list[i-1]) {
823 // c is after end of prior range
824 list[i-1]++;
825 // no need to check for collapse here
826 }
827
828 else {
829 // At this point we know the new char is not adjacent to
830 // any existing ranges, and it is not 10FFFF.
831
832
833 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
834 // ^
835 // list[i]
836
837 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
838 // ^
839 // list[i]
840
841 ensureCapacity(len+2);
842
843 //for (int32_t k=len-1; k>=i; --k) {
844 // list[k+2] = list[k];
845 //}
846 UChar32* src = list + len;
847 UChar32* dst = src + 2;
848 UChar32* srclimit = list + i;
849 while (src > srclimit) *(--dst) = *(--src);
850
851 list[i] = c;
852 list[i+1] = c+1;
853 len += 2;
854 }
855
856 #ifdef DEBUG_US_ADD
857 dump(list, len);
858 printf("\n");
859
860 for (i=1; i<len; ++i) {
861 if (list[i] <= list[i-1]) {
862 // Corrupt array!
863 printf("ERROR: list has been corrupted\n");
864 exit(1);
865 }
866 }
867 #endif
868
869 pat.truncate(0);
870 return *this;
871 }
872
873 /**
874 * Adds the specified multicharacter to this set if it is not already
875 * present. If this set already contains the multicharacter,
876 * the call leaves this set unchanged.
877 * Thus "ch" => {"ch"}
878 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
879 * @param s the source string
880 * @return the modified set, for chaining
881 */
882 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
883 if (s.length() == 0) return *this;
884 int32_t cp = getSingleCP(s);
885 if (cp < 0) {
886 if (!strings->contains((void*) &s)) {
887 _add(s);
888 pat.truncate(0);
889 }
890 } else {
891 add((UChar32)cp, (UChar32)cp);
892 }
893 return *this;
894 }
895
896 /**
897 * Adds the given string, in order, to 'strings'. The given string
898 * must have been checked by the caller to not be empty and to not
899 * already be in 'strings'.
900 */
901 void UnicodeSet::_add(const UnicodeString& s) {
902 UnicodeString* t = new UnicodeString(s);
903 UErrorCode ec = U_ZERO_ERROR;
904 strings->sortedInsert(t, compareUnicodeString, ec);
905 }
906
907 /**
908 * @return a code point IF the string consists of a single one.
909 * otherwise returns -1.
910 * @param string to test
911 */
912 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
913 //if (s.length() < 1) {
914 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
915 //}
916 if (s.length() > 2) return -1;
917 if (s.length() == 1) return s.charAt(0);
918
919 // at this point, len = 2
920 UChar32 cp = s.char32At(0);
921 if (cp > 0xFFFF) { // is surrogate pair
922 return cp;
923 }
924 return -1;
925 }
926
927 /**
928 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
929 * If this set already any particular character, it has no effect on that character.
930 * @param the source string
931 * @return the modified set, for chaining
932 */
933 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
934 UChar32 cp;
935 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
936 cp = s.char32At(i);
937 add(cp, cp);
938 }
939 return *this;
940 }
941
942 /**
943 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
944 * If this set already any particular character, it has no effect on that character.
945 * @param the source string
946 * @return the modified set, for chaining
947 */
948 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
949 UnicodeSet set;
950 set.addAll(s);
951 retainAll(set);
952 return *this;
953 }
954
955 /**
956 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
957 * If this set already any particular character, it has no effect on that character.
958 * @param the source string
959 * @return the modified set, for chaining
960 */
961 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
962 UnicodeSet set;
963 set.addAll(s);
964 complementAll(set);
965 return *this;
966 }
967
968 /**
969 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
970 * If this set already any particular character, it has no effect on that character.
971 * @param the source string
972 * @return the modified set, for chaining
973 */
974 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
975 UnicodeSet set;
976 set.addAll(s);
977 removeAll(set);
978 return *this;
979 }
980
981 /**
982 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
983 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
984 * @param the source string
985 * @return a newly created set containing the given string
986 */
987 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
988 UnicodeSet *set = new UnicodeSet();
989 set->add(s);
990 return set;
991 }
992
993
994 /**
995 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
996 * @param the source string
997 * @return a newly created set containing the given characters
998 */
999 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1000 UnicodeSet *set = new UnicodeSet();
1001 set->addAll(s);
1002 return set;
1003 }
1004
1005 /**
1006 * Retain only the elements in this set that are contained in the
1007 * specified range. If <code>end > start</code> then an empty range is
1008 * retained, leaving the set empty.
1009 *
1010 * @param start first character, inclusive, of range to be retained
1011 * to this set.
1012 * @param end last character, inclusive, of range to be retained
1013 * to this set.
1014 */
1015 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1016 if (pinCodePoint(start) <= pinCodePoint(end)) {
1017 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1018 retain(range, 2, 0);
1019 } else {
1020 clear();
1021 }
1022 return *this;
1023 }
1024
1025 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1026 return retain(c, c);
1027 }
1028
1029 /**
1030 * Removes the specified range from this set if it is present.
1031 * The set will not contain the specified range once the call
1032 * returns. If <code>end > start</code> then an empty range is
1033 * removed, leaving the set unchanged.
1034 *
1035 * @param start first character, inclusive, of range to be removed
1036 * from this set.
1037 * @param end last character, inclusive, of range to be removed
1038 * from this set.
1039 */
1040 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1041 if (pinCodePoint(start) <= pinCodePoint(end)) {
1042 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1043 retain(range, 2, 2);
1044 }
1045 return *this;
1046 }
1047
1048 /**
1049 * Removes the specified character from this set if it is present.
1050 * The set will not contain the specified range once the call
1051 * returns.
1052 */
1053 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1054 return remove(c, c);
1055 }
1056
1057 /**
1058 * Removes the specified string from this set if it is present.
1059 * The set will not contain the specified character once the call
1060 * returns.
1061 * @param the source string
1062 * @return the modified set, for chaining
1063 */
1064 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1065 if (s.length() == 0) return *this;
1066 int32_t cp = getSingleCP(s);
1067 if (cp < 0) {
1068 strings->removeElement((void*) &s);
1069 pat.truncate(0);
1070 } else {
1071 remove((UChar32)cp, (UChar32)cp);
1072 }
1073 return *this;
1074 }
1075
1076 /**
1077 * Complements the specified range in this set. Any character in
1078 * the range will be removed if it is in this set, or will be
1079 * added if it is not in this set. If <code>end > start</code>
1080 * then an empty range is xor'ed, leaving the set unchanged.
1081 *
1082 * @param start first character, inclusive, of range to be removed
1083 * from this set.
1084 * @param end last character, inclusive, of range to be removed
1085 * from this set.
1086 */
1087 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1088 if (pinCodePoint(start) <= pinCodePoint(end)) {
1089 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1090 exclusiveOr(range, 2, 0);
1091 }
1092 pat.truncate(0);
1093 return *this;
1094 }
1095
1096 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1097 return complement(c, c);
1098 }
1099
1100 /**
1101 * This is equivalent to
1102 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1103 */
1104 UnicodeSet& UnicodeSet::complement(void) {
1105 if (list[0] == UNICODESET_LOW) {
1106 ensureBufferCapacity(len-1);
1107 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1108 --len;
1109 } else {
1110 ensureBufferCapacity(len+1);
1111 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1112 buffer[0] = UNICODESET_LOW;
1113 ++len;
1114 }
1115 swapBuffers();
1116 pat.truncate(0);
1117 return *this;
1118 }
1119
1120 /**
1121 * Complement the specified string in this set.
1122 * The set will not contain the specified string once the call
1123 * returns.
1124 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1125 * @param s the string to complement
1126 * @return this object, for chaining
1127 */
1128 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1129 if (s.length() == 0) return *this;
1130 int32_t cp = getSingleCP(s);
1131 if (cp < 0) {
1132 if (strings->contains((void*) &s)) {
1133 strings->removeElement((void*) &s);
1134 } else {
1135 _add(s);
1136 }
1137 pat.truncate(0);
1138 } else {
1139 complement((UChar32)cp, (UChar32)cp);
1140 }
1141 return *this;
1142 }
1143
1144 /**
1145 * Adds all of the elements in the specified set to this set if
1146 * they're not already present. This operation effectively
1147 * modifies this set so that its value is the <i>union</i> of the two
1148 * sets. The behavior of this operation is unspecified if the specified
1149 * collection is modified while the operation is in progress.
1150 *
1151 * @param c set whose elements are to be added to this set.
1152 * @see #add(char, char)
1153 */
1154 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1155 add(c.list, c.len, 0);
1156
1157 // Add strings in order
1158 for (int32_t i=0; i<c.strings->size(); ++i) {
1159 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1160 if (!strings->contains((void*) s)) {
1161 _add(*s);
1162 }
1163 }
1164 return *this;
1165 }
1166
1167 /**
1168 * Retains only the elements in this set that are contained in the
1169 * specified set. In other words, removes from this set all of
1170 * its elements that are not contained in the specified set. This
1171 * operation effectively modifies this set so that its value is
1172 * the <i>intersection</i> of the two sets.
1173 *
1174 * @param c set that defines which elements this set will retain.
1175 */
1176 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1177 retain(c.list, c.len, 0);
1178 strings->retainAll(*c.strings);
1179 return *this;
1180 }
1181
1182 /**
1183 * Removes from this set all of its elements that are contained in the
1184 * specified set. This operation effectively modifies this
1185 * set so that its value is the <i>asymmetric set difference</i> of
1186 * the two sets.
1187 *
1188 * @param c set that defines which elements will be removed from
1189 * this set.
1190 */
1191 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1192 retain(c.list, c.len, 2);
1193 strings->removeAll(*c.strings);
1194 return *this;
1195 }
1196
1197 /**
1198 * Complements in this set all elements contained in the specified
1199 * set. Any character in the other set will be removed if it is
1200 * in this set, or will be added if it is not in this set.
1201 *
1202 * @param c set that defines which elements will be xor'ed from
1203 * this set.
1204 */
1205 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1206 exclusiveOr(c.list, c.len, 0);
1207
1208 for (int32_t i=0; i<c.strings->size(); ++i) {
1209 void* e = c.strings->elementAt(i);
1210 if (!strings->removeElement(e)) {
1211 _add(*(const UnicodeString*)e);
1212 }
1213 }
1214 return *this;
1215 }
1216
1217 /**
1218 * Removes all of the elements from this set. This set will be
1219 * empty after this call returns.
1220 */
1221 UnicodeSet& UnicodeSet::clear(void) {
1222 list[0] = UNICODESET_HIGH;
1223 len = 1;
1224 pat.truncate(0);
1225 strings->removeAllElements();
1226 return *this;
1227 }
1228
1229 /**
1230 * Iteration method that returns the number of ranges contained in
1231 * this set.
1232 * @see #getRangeStart
1233 * @see #getRangeEnd
1234 */
1235 int32_t UnicodeSet::getRangeCount() const {
1236 return len/2;
1237 }
1238
1239 /**
1240 * Iteration method that returns the first character in the
1241 * specified range of this set.
1242 * @see #getRangeCount
1243 * @see #getRangeEnd
1244 */
1245 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1246 return list[index*2];
1247 }
1248
1249 /**
1250 * Iteration method that returns the last character in the
1251 * specified range of this set.
1252 * @see #getRangeStart
1253 * @see #getRangeEnd
1254 */
1255 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1256 return list[index*2 + 1] - 1;
1257 }
1258
1259 int32_t UnicodeSet::getStringCount() const {
1260 return strings->size();
1261 }
1262
1263 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1264 return (const UnicodeString*) strings->elementAt(index);
1265 }
1266
1267 /**
1268 * Reallocate this objects internal structures to take up the least
1269 * possible space, without changing this object's value.
1270 */
1271 UnicodeSet& UnicodeSet::compact() {
1272 if (len != capacity) {
1273 capacity = len;
1274 UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1275 uprv_memcpy(temp, list, len*sizeof(UChar32));
1276 uprv_free(list);
1277 list = temp;
1278 }
1279 uprv_free(buffer);
1280 buffer = NULL;
1281 return *this;
1282 }
1283
1284 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1285 int32_t bmpLength, length, destLength;
1286
1287 if (U_FAILURE(ec)) {
1288 return 0;
1289 }
1290
1291 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1292 ec=U_ILLEGAL_ARGUMENT_ERROR;
1293 return 0;
1294 }
1295
1296 /* count necessary 16-bit units */
1297 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1298 // assert(length>=0);
1299 if (length==0) {
1300 /* empty set */
1301 if (destCapacity>0) {
1302 *dest=0;
1303 } else {
1304 ec=U_BUFFER_OVERFLOW_ERROR;
1305 }
1306 return 1;
1307 }
1308 /* now length>0 */
1309
1310 if (this->list[length-1]<=0xffff) {
1311 /* all BMP */
1312 bmpLength=length;
1313 } else if (this->list[0]>=0x10000) {
1314 /* all supplementary */
1315 bmpLength=0;
1316 length*=2;
1317 } else {
1318 /* some BMP, some supplementary */
1319 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1320 length=bmpLength+2*(length-bmpLength);
1321 }
1322
1323 /* length: number of 16-bit array units */
1324 if (length>0x7fff) {
1325 /* there are only 15 bits for the length in the first serialized word */
1326 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1327 return 0;
1328 }
1329
1330 /*
1331 * total serialized length:
1332 * number of 16-bit array units (length) +
1333 * 1 length unit (always) +
1334 * 1 bmpLength unit (if there are supplementary values)
1335 */
1336 destLength=length+((length>bmpLength)?2:1);
1337 if (destLength<=destCapacity) {
1338 const UChar32 *p;
1339 int32_t i;
1340
1341 *dest=(uint16_t)length;
1342 if (length>bmpLength) {
1343 *dest|=0x8000;
1344 *++dest=(uint16_t)bmpLength;
1345 }
1346 ++dest;
1347
1348 /* write the BMP part of the array */
1349 p=this->list;
1350 for (i=0; i<bmpLength; ++i) {
1351 *dest++=(uint16_t)*p++;
1352 }
1353
1354 /* write the supplementary part of the array */
1355 for (; i<length; i+=2) {
1356 *dest++=(uint16_t)(*p>>16);
1357 *dest++=(uint16_t)*p++;
1358 }
1359 } else {
1360 ec=U_BUFFER_OVERFLOW_ERROR;
1361 }
1362 return destLength;
1363 }
1364
1365 //----------------------------------------------------------------
1366 // Implementation: Utility methods
1367 //----------------------------------------------------------------
1368
1369 /**
1370 * Allocate our strings vector and return TRUE if successful.
1371 */
1372 UBool UnicodeSet::allocateStrings() {
1373 UErrorCode ec = U_ZERO_ERROR;
1374 strings = new UVector(uhash_deleteUnicodeString,
1375 uhash_compareUnicodeString, ec);
1376 if (U_FAILURE(ec)) {
1377 delete strings;
1378 strings = NULL;
1379 return FALSE;
1380 }
1381 return TRUE;
1382 }
1383
1384 void UnicodeSet::ensureCapacity(int32_t newLen) {
1385 if (newLen <= capacity)
1386 return;
1387 capacity = newLen + GROW_EXTRA;
1388 UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1389 uprv_memcpy(temp, list, len*sizeof(UChar32));
1390 uprv_free(list);
1391 list = temp;
1392 }
1393
1394 void UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1395 if (buffer != NULL && newLen <= bufferCapacity)
1396 return;
1397 if (buffer) {
1398 uprv_free(buffer);
1399 }
1400 bufferCapacity = newLen + GROW_EXTRA;
1401 buffer = (UChar32*) uprv_malloc(sizeof(UChar32) * bufferCapacity);
1402 }
1403
1404 /**
1405 * Swap list and buffer.
1406 */
1407 void UnicodeSet::swapBuffers(void) {
1408 // swap list and buffer
1409 UChar32* temp = list;
1410 list = buffer;
1411 buffer = temp;
1412
1413 int32_t c = capacity;
1414 capacity = bufferCapacity;
1415 bufferCapacity = c;
1416 }
1417
1418 //----------------------------------------------------------------
1419 // Implementation: Fundamental operators
1420 //----------------------------------------------------------------
1421
1422 static inline UChar32 max(UChar32 a, UChar32 b) {
1423 return (a > b) ? a : b;
1424 }
1425
1426 // polarity = 0, 3 is normal: x xor y
1427 // polarity = 1, 2: x xor ~y == x === y
1428
1429 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1430 ensureBufferCapacity(len + otherLen);
1431 int32_t i = 0, j = 0, k = 0;
1432 UChar32 a = list[i++];
1433 UChar32 b;
1434 if (polarity == 1 || polarity == 2) {
1435 b = UNICODESET_LOW;
1436 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1437 ++j;
1438 b = other[j];
1439 }
1440 } else {
1441 b = other[j++];
1442 }
1443 // simplest of all the routines
1444 // sort the values, discarding identicals!
1445 for (;;) {
1446 if (a < b) {
1447 buffer[k++] = a;
1448 a = list[i++];
1449 } else if (b < a) {
1450 buffer[k++] = b;
1451 b = other[j++];
1452 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1453 // discard both values!
1454 a = list[i++];
1455 b = other[j++];
1456 } else { // DONE!
1457 buffer[k++] = UNICODESET_HIGH;
1458 len = k;
1459 break;
1460 }
1461 }
1462 swapBuffers();
1463 pat.truncate(0);
1464 }
1465
1466 // polarity = 0 is normal: x union y
1467 // polarity = 2: x union ~y
1468 // polarity = 1: ~x union y
1469 // polarity = 3: ~x union ~y
1470
1471 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1472 ensureBufferCapacity(len + otherLen);
1473 int32_t i = 0, j = 0, k = 0;
1474 UChar32 a = list[i++];
1475 UChar32 b = other[j++];
1476 // change from xor is that we have to check overlapping pairs
1477 // polarity bit 1 means a is second, bit 2 means b is.
1478 for (;;) {
1479 switch (polarity) {
1480 case 0: // both first; take lower if unequal
1481 if (a < b) { // take a
1482 // Back up over overlapping ranges in buffer[]
1483 if (k > 0 && a <= buffer[k-1]) {
1484 // Pick latter end value in buffer[] vs. list[]
1485 a = max(list[i], buffer[--k]);
1486 } else {
1487 // No overlap
1488 buffer[k++] = a;
1489 a = list[i];
1490 }
1491 i++; // Common if/else code factored out
1492 polarity ^= 1;
1493 } else if (b < a) { // take b
1494 if (k > 0 && b <= buffer[k-1]) {
1495 b = max(other[j], buffer[--k]);
1496 } else {
1497 buffer[k++] = b;
1498 b = other[j];
1499 }
1500 j++;
1501 polarity ^= 2;
1502 } else { // a == b, take a, drop b
1503 if (a == UNICODESET_HIGH) goto loop_end;
1504 // This is symmetrical; it doesn't matter if
1505 // we backtrack with a or b. - liu
1506 if (k > 0 && a <= buffer[k-1]) {
1507 a = max(list[i], buffer[--k]);
1508 } else {
1509 // No overlap
1510 buffer[k++] = a;
1511 a = list[i];
1512 }
1513 i++;
1514 polarity ^= 1;
1515 b = other[j++];
1516 polarity ^= 2;
1517 }
1518 break;
1519 case 3: // both second; take higher if unequal, and drop other
1520 if (b <= a) { // take a
1521 if (a == UNICODESET_HIGH) goto loop_end;
1522 buffer[k++] = a;
1523 } else { // take b
1524 if (b == UNICODESET_HIGH) goto loop_end;
1525 buffer[k++] = b;
1526 }
1527 a = list[i++];
1528 polarity ^= 1; // factored common code
1529 b = other[j++];
1530 polarity ^= 2;
1531 break;
1532 case 1: // a second, b first; if b < a, overlap
1533 if (a < b) { // no overlap, take a
1534 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1535 } else if (b < a) { // OVERLAP, drop b
1536 b = other[j++];
1537 polarity ^= 2;
1538 } else { // a == b, drop both!
1539 if (a == UNICODESET_HIGH) goto loop_end;
1540 a = list[i++];
1541 polarity ^= 1;
1542 b = other[j++];
1543 polarity ^= 2;
1544 }
1545 break;
1546 case 2: // a first, b second; if a < b, overlap
1547 if (b < a) { // no overlap, take b
1548 buffer[k++] = b;
1549 b = other[j++];
1550 polarity ^= 2;
1551 } else if (a < b) { // OVERLAP, drop a
1552 a = list[i++];
1553 polarity ^= 1;
1554 } else { // a == b, drop both!
1555 if (a == UNICODESET_HIGH) goto loop_end;
1556 a = list[i++];
1557 polarity ^= 1;
1558 b = other[j++];
1559 polarity ^= 2;
1560 }
1561 break;
1562 }
1563 }
1564 loop_end:
1565 buffer[k++] = UNICODESET_HIGH; // terminate
1566 len = k;
1567 swapBuffers();
1568 pat.truncate(0);
1569 }
1570
1571 // polarity = 0 is normal: x intersect y
1572 // polarity = 2: x intersect ~y == set-minus
1573 // polarity = 1: ~x intersect y
1574 // polarity = 3: ~x intersect ~y
1575
1576 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1577 ensureBufferCapacity(len + otherLen);
1578 int32_t i = 0, j = 0, k = 0;
1579 UChar32 a = list[i++];
1580 UChar32 b = other[j++];
1581 // change from xor is that we have to check overlapping pairs
1582 // polarity bit 1 means a is second, bit 2 means b is.
1583 for (;;) {
1584 switch (polarity) {
1585 case 0: // both first; drop the smaller
1586 if (a < b) { // drop a
1587 a = list[i++];
1588 polarity ^= 1;
1589 } else if (b < a) { // drop b
1590 b = other[j++];
1591 polarity ^= 2;
1592 } else { // a == b, take one, drop other
1593 if (a == UNICODESET_HIGH) goto loop_end;
1594 buffer[k++] = a;
1595 a = list[i++];
1596 polarity ^= 1;
1597 b = other[j++];
1598 polarity ^= 2;
1599 }
1600 break;
1601 case 3: // both second; take lower if unequal
1602 if (a < b) { // take a
1603 buffer[k++] = a;
1604 a = list[i++];
1605 polarity ^= 1;
1606 } else if (b < a) { // take b
1607 buffer[k++] = b;
1608 b = other[j++];
1609 polarity ^= 2;
1610 } else { // a == b, take one, drop other
1611 if (a == UNICODESET_HIGH) goto loop_end;
1612 buffer[k++] = a;
1613 a = list[i++];
1614 polarity ^= 1;
1615 b = other[j++];
1616 polarity ^= 2;
1617 }
1618 break;
1619 case 1: // a second, b first;
1620 if (a < b) { // NO OVERLAP, drop a
1621 a = list[i++];
1622 polarity ^= 1;
1623 } else if (b < a) { // OVERLAP, take b
1624 buffer[k++] = b;
1625 b = other[j++];
1626 polarity ^= 2;
1627 } else { // a == b, drop both!
1628 if (a == UNICODESET_HIGH) goto loop_end;
1629 a = list[i++];
1630 polarity ^= 1;
1631 b = other[j++];
1632 polarity ^= 2;
1633 }
1634 break;
1635 case 2: // a first, b second; if a < b, overlap
1636 if (b < a) { // no overlap, drop b
1637 b = other[j++];
1638 polarity ^= 2;
1639 } else if (a < b) { // OVERLAP, take a
1640 buffer[k++] = a;
1641 a = list[i++];
1642 polarity ^= 1;
1643 } else { // a == b, drop both!
1644 if (a == UNICODESET_HIGH) goto loop_end;
1645 a = list[i++];
1646 polarity ^= 1;
1647 b = other[j++];
1648 polarity ^= 2;
1649 }
1650 break;
1651 }
1652 }
1653 loop_end:
1654 buffer[k++] = UNICODESET_HIGH; // terminate
1655 len = k;
1656 swapBuffers();
1657 pat.truncate(0);
1658 }
1659
1660 /**
1661 * Append the <code>toPattern()</code> representation of a
1662 * string to the given <code>StringBuffer</code>.
1663 */
1664 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1665 escapeUnprintable) {
1666 UChar32 cp;
1667 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1668 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1669 }
1670 }
1671
1672 /**
1673 * Append the <code>toPattern()</code> representation of a
1674 * character to the given <code>StringBuffer</code>.
1675 */
1676 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1677 escapeUnprintable) {
1678 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1679 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1680 // unprintable
1681 if (ICU_Utility::escapeUnprintable(buf, c)) {
1682 return;
1683 }
1684 }
1685 // Okay to let ':' pass through
1686 switch (c) {
1687 case SET_OPEN:
1688 case SET_CLOSE:
1689 case HYPHEN:
1690 case COMPLEMENT:
1691 case INTERSECTION:
1692 case BACKSLASH:
1693 case OPEN_BRACE:
1694 case CLOSE_BRACE:
1695 case COLON:
1696 case SymbolTable::SYMBOL_REF:
1697 buf.append(BACKSLASH);
1698 break;
1699 default:
1700 // Escape whitespace
1701 if (uprv_isRuleWhiteSpace(c)) {
1702 buf.append(BACKSLASH);
1703 }
1704 break;
1705 }
1706 buf.append(c);
1707 }
1708
1709 /**
1710 * Append a string representation of this set to result. This will be
1711 * a cleaned version of the string passed to applyPattern(), if there
1712 * is one. Otherwise it will be generated.
1713 */
1714 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1715 UBool escapeUnprintable) const {
1716 if (pat.length() > 0) {
1717 int32_t i;
1718 int32_t backslashCount = 0;
1719 for (i=0; i<pat.length(); ) {
1720 UChar32 c = pat.char32At(i);
1721 i += UTF_CHAR_LENGTH(c);
1722 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1723 // If the unprintable character is preceded by an odd
1724 // number of backslashes, then it has been escaped.
1725 // Before unescaping it, we delete the final
1726 // backslash.
1727 if ((backslashCount % 2) == 1) {
1728 result.truncate(result.length() - 1);
1729 }
1730 ICU_Utility::escapeUnprintable(result, c);
1731 backslashCount = 0;
1732 } else {
1733 result.append(c);
1734 if (c == BACKSLASH) {
1735 ++backslashCount;
1736 } else {
1737 backslashCount = 0;
1738 }
1739 }
1740 }
1741 return result;
1742 }
1743
1744 return _generatePattern(result, escapeUnprintable);
1745 }
1746
1747 /**
1748 * Returns a string representation of this set. If the result of
1749 * calling this function is passed to a UnicodeSet constructor, it
1750 * will produce another set that is equal to this one.
1751 */
1752 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1753 UBool escapeUnprintable) const {
1754 result.truncate(0);
1755 return _toPattern(result, escapeUnprintable);
1756 }
1757
1758 /**
1759 * Generate and append a string representation of this set to result.
1760 * This does not use this.pat, the cleaned up copy of the string
1761 * passed to applyPattern().
1762 */
1763 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1764 UBool escapeUnprintable) const {
1765 result.append(SET_OPEN);
1766
1767 // // Check against the predefined categories. We implicitly build
1768 // // up ALL category sets the first time toPattern() is called.
1769 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
1770 // if (*this == getCategorySet(cat)) {
1771 // result.append(COLON);
1772 // result.append(CATEGORY_NAMES, cat*2, 2);
1773 // return result.append(CATEGORY_CLOSE);
1774 // }
1775 // }
1776
1777 int32_t count = getRangeCount();
1778
1779 // If the set contains at least 2 intervals and includes both
1780 // MIN_VALUE and MAX_VALUE, then the inverse representation will
1781 // be more economical.
1782 if (count > 1 &&
1783 getRangeStart(0) == MIN_VALUE &&
1784 getRangeEnd(count-1) == MAX_VALUE) {
1785
1786 // Emit the inverse
1787 result.append(COMPLEMENT);
1788
1789 for (int32_t i = 1; i < count; ++i) {
1790 UChar32 start = getRangeEnd(i-1)+1;
1791 UChar32 end = getRangeStart(i)-1;
1792 _appendToPat(result, start, escapeUnprintable);
1793 if (start != end) {
1794 if ((start+1) != end) {
1795 result.append(HYPHEN);
1796 }
1797 _appendToPat(result, end, escapeUnprintable);
1798 }
1799 }
1800 }
1801
1802 // Default; emit the ranges as pairs
1803 else {
1804 for (int32_t i = 0; i < count; ++i) {
1805 UChar32 start = getRangeStart(i);
1806 UChar32 end = getRangeEnd(i);
1807 _appendToPat(result, start, escapeUnprintable);
1808 if (start != end) {
1809 if ((start+1) != end) {
1810 result.append(HYPHEN);
1811 }
1812 _appendToPat(result, end, escapeUnprintable);
1813 }
1814 }
1815 }
1816
1817 for (int32_t i = 0; i<strings->size(); ++i) {
1818 result.append(OPEN_BRACE);
1819 _appendToPat(result,
1820 *(const UnicodeString*) strings->elementAt(i),
1821 escapeUnprintable);
1822 result.append(CLOSE_BRACE);
1823 }
1824 return result.append(SET_CLOSE);
1825 }
1826
1827
1828 U_NAMESPACE_END