]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/uniset.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / uniset.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
374ca955 3* Copyright (C) 1999-2004, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* Date Name Description
7* 10/20/99 alan Creation.
8**********************************************************************
9*/
10
374ca955 11#include "unicode/utypes.h"
b75a7d8f
A
12#include "unicode/uniset.h"
13#include "unicode/parsepos.h"
374ca955
A
14#include "unicode/symtable.h"
15#include "ruleiter.h"
b75a7d8f
A
16#include "cmemory.h"
17#include "uhash.h"
18#include "util.h"
19#include "uvector.h"
b75a7d8f
A
20#include "charstr.h"
21#include "ustrfmt.h"
22#include "mutex.h"
23#include "uassert.h"
24#include "hash.h"
b75a7d8f
A
25
26// Define UChar constants using hex for EBCDIC compatibility
27// Used #define to reduce private static exports and memory access time.
28#define SET_OPEN ((UChar)0x005B) /*[*/
29#define SET_CLOSE ((UChar)0x005D) /*]*/
30#define HYPHEN ((UChar)0x002D) /*-*/
31#define COMPLEMENT ((UChar)0x005E) /*^*/
32#define COLON ((UChar)0x003A) /*:*/
33#define BACKSLASH ((UChar)0x005C) /*\*/
34#define INTERSECTION ((UChar)0x0026) /*&*/
35#define UPPER_U ((UChar)0x0055) /*U*/
36#define LOWER_U ((UChar)0x0075) /*u*/
37#define OPEN_BRACE ((UChar)123) /*{*/
38#define CLOSE_BRACE ((UChar)125) /*}*/
39#define UPPER_P ((UChar)0x0050) /*P*/
40#define LOWER_P ((UChar)0x0070) /*p*/
41#define UPPER_N ((UChar)78) /*N*/
42#define EQUALS ((UChar)0x003D) /*=*/
43
374ca955
A
44// HIGH_VALUE > all valid values. 110000 for codepoints
45#define UNICODESET_HIGH 0x0110000
b75a7d8f 46
374ca955
A
47// LOW <= all valid values. ZERO for codepoints
48#define UNICODESET_LOW 0x000000
49
50// initial storage. Must be >= 0
51#define START_EXTRA 16
52
53// extra amount for growth. Must be >= 0
54#define GROW_EXTRA START_EXTRA
b75a7d8f
A
55
56U_NAMESPACE_BEGIN
57
374ca955 58SymbolTable::~SymbolTable() {}
b75a7d8f
A
59
60/**
61 * Minimum value that can be stored in a UnicodeSet.
62 */
63const UChar32 UnicodeSet::MIN_VALUE = UNICODESET_LOW;
64
65/**
66 * Maximum value that can be stored in a UnicodeSet.
67 */
68const UChar32 UnicodeSet::MAX_VALUE = UNICODESET_HIGH - 1;
69
374ca955 70UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
b75a7d8f
A
71
72/**
73 * Modify the given UChar32 variable so that it is in range, by
74 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
75 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
76 * It modifies its argument in-place and also returns it.
77 */
78static inline UChar32 pinCodePoint(UChar32& c) {
79 if (c < UNICODESET_LOW) {
80 c = UNICODESET_LOW;
81 } else if (c > (UNICODESET_HIGH-1)) {
82 c = (UNICODESET_HIGH-1);
83 }
84 return c;
85}
86
87//----------------------------------------------------------------
88// Debugging
89//----------------------------------------------------------------
90
91// DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
92// To enable the debugging, define the symbol DEBUG_MEM in the line
93// below. This will result in text being sent to stdout that looks
94// like this:
95// DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
96// DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
97// Each line lists a construction (ct) or destruction (dt) event, the
98// object address, the number of outstanding objects after the event,
99// and the pattern of the object in question.
100
101// #define DEBUG_MEM
102
103#ifdef DEBUG_MEM
104#include <stdio.h>
105static int32_t _dbgCount = 0;
106
107static inline void _dbgct(UnicodeSet* set) {
108 UnicodeString str;
109 set->toPattern(str, TRUE);
110 char buf[40];
111 str.extract(0, 39, buf, "");
112 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
113}
114
115static inline void _dbgdt(UnicodeSet* set) {
116 UnicodeString str;
117 set->toPattern(str, TRUE);
118 char buf[40];
119 str.extract(0, 39, buf, "");
120 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
121}
122
123#else
124
125#define _dbgct(set)
126#define _dbgdt(set)
127
128#endif
129
130//----------------------------------------------------------------
131// UnicodeString in UVector support
132//----------------------------------------------------------------
133
134static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
135 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
136}
137
138static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
139 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
140 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
141 return a.compare(b);
142}
143
144//----------------------------------------------------------------
145// Constructors &c
146//----------------------------------------------------------------
147
148/**
149 * Constructs an empty set.
150 */
151UnicodeSet::UnicodeSet() :
152 len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
153 list(0), buffer(0), strings(0)
154{
155 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
156 if(list!=NULL){
157 list[0] = UNICODESET_HIGH;
158 }
159 allocateStrings();
160 _dbgct(this);
161}
162
163/**
164 * Constructs a set containing the given range. If <code>end >
165 * start</code> then an empty set is created.
166 *
167 * @param start first character, inclusive, of range
168 * @param end last character, inclusive, of range
169 */
170UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
171 len(1), capacity(1 + START_EXTRA), bufferCapacity(0),
172 list(0), buffer(0), strings(0)
173{
174 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
175 if(list!=NULL){
176 list[0] = UNICODESET_HIGH;
177 }
178 allocateStrings();
179 complement(start, end);
180 _dbgct(this);
181}
182
b75a7d8f
A
183/**
184 * Constructs a set that is identical to the given UnicodeSet.
185 */
186UnicodeSet::UnicodeSet(const UnicodeSet& o) :
187 UnicodeFilter(o),
188 len(0), capacity(o.len + GROW_EXTRA), bufferCapacity(0),
189 list(0), buffer(0), strings(0)
190{
191 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
192 if(list!=NULL){
193 allocateStrings();
194 *this = o;
195 }
196 _dbgct(this);
197}
198
199/**
200 * Destructs the set.
201 */
202UnicodeSet::~UnicodeSet() {
203 _dbgdt(this); // first!
204 uprv_free(list);
205 if (buffer) {
206 uprv_free(buffer);
207 }
208 delete strings;
209}
210
211/**
212 * Assigns this object to be a copy of another.
213 */
214UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
215 ensureCapacity(o.len);
216 len = o.len;
217 uprv_memcpy(list, o.list, len*sizeof(UChar32));
218 UErrorCode ec = U_ZERO_ERROR;
219 strings->assign(*o.strings, cloneUnicodeString, ec);
220 pat = o.pat;
221 return *this;
222}
223
224/**
225 * Compares the specified object with this set for equality. Returns
226 * <tt>true</tt> if the two sets
227 * have the same size, and every member of the specified set is
228 * contained in this set (or equivalently, every member of this set is
229 * contained in the specified set).
230 *
231 * @param o set to be compared for equality with this set.
232 * @return <tt>true</tt> if the specified set is equal to this set.
233 */
234UBool UnicodeSet::operator==(const UnicodeSet& o) const {
235 if (len != o.len) return FALSE;
236 for (int32_t i = 0; i < len; ++i) {
237 if (list[i] != o.list[i]) return FALSE;
238 }
239 if (*strings != *o.strings) return FALSE;
240 return TRUE;
241}
242
243/**
244 * Returns a copy of this object. All UnicodeMatcher objects have
245 * to support cloning in order to allow classes using
246 * UnicodeMatchers, such as Transliterator, to implement cloning.
247 */
248UnicodeFunctor* UnicodeSet::clone() const {
249 return new UnicodeSet(*this);
250}
251
252/**
253 * Returns the hash code value for this set.
254 *
255 * @return the hash code value for this set.
256 * @see Object#hashCode()
257 */
258int32_t UnicodeSet::hashCode(void) const {
259 int32_t result = len;
260 for (int32_t i = 0; i < len; ++i) {
261 result *= 1000003;
262 result += list[i];
263 }
264 return result;
265}
266
267//----------------------------------------------------------------
268// Public API
269//----------------------------------------------------------------
270
271/**
272 * Make this object represent the range <code>start - end</code>.
273 * If <code>end > start</code> then this object is set to an
274 * an empty range.
275 *
276 * @param start first character in the set, inclusive
277 * @rparam end last character in the set, inclusive
278 */
279UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
280 clear();
281 complement(start, end);
282 return *this;
283}
284
b75a7d8f
A
285/**
286 * Returns the number of elements in this set (its cardinality),
374ca955
A
287 * Note than the elements of a set may include both individual
288 * codepoints and strings.
b75a7d8f
A
289 *
290 * @return the number of elements in this set (its cardinality).
291 */
292int32_t UnicodeSet::size(void) const {
293 int32_t n = 0;
294 int32_t count = getRangeCount();
295 for (int32_t i = 0; i < count; ++i) {
296 n += getRangeEnd(i) - getRangeStart(i) + 1;
297 }
298 return n + strings->size();
299}
300
301/**
302 * Returns <tt>true</tt> if this set contains no elements.
303 *
304 * @return <tt>true</tt> if this set contains no elements.
305 */
306UBool UnicodeSet::isEmpty(void) const {
307 return len == 1 && strings->size() == 0;
308}
309
310/**
311 * Returns true if this set contains the given character.
312 * @param c character to be checked for containment
313 * @return true if the test condition is met
314 */
315UBool UnicodeSet::contains(UChar32 c) const {
316 // Set i to the index of the start item greater than ch
317 // We know we will terminate without length test!
318 // LATER: for large sets, add binary search
319 //int32_t i = -1;
320 //for (;;) {
321 // if (c < list[++i]) break;
322 //}
323 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
324 return FALSE;
325 }
326 int32_t i = findCodePoint(c);
327 return ((i & 1) != 0); // return true if odd
328}
329
330/**
331 * Returns the smallest value i such that c < list[i]. Caller
332 * must ensure that c is a legal value or this method will enter
333 * an infinite loop. This method performs a binary search.
334 * @param c a character in the range MIN_VALUE..MAX_VALUE
335 * inclusive
336 * @return the smallest integer i in the range 0..len-1,
337 * inclusive, such that c < list[i]
338 */
339int32_t UnicodeSet::findCodePoint(UChar32 c) const {
340 /* Examples:
341 findCodePoint(c)
342 set list[] c=0 1 3 4 7 8
343 === ============== ===========
344 [] [110000] 0 0 0 0 0 0
345 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
346 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
347 [:Any:] [0, 110000] 1 1 1 1 1 1
348 */
349
350 // Return the smallest i such that c < list[i]. Assume
351 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
352 if (c < list[0]) return 0;
353 // High runner test. c is often after the last range, so an
374ca955 354 // initial check for this condition pays off.
b75a7d8f
A
355 if (len >= 2 && c >= list[len-2]) return len-1;
356 int32_t lo = 0;
357 int32_t hi = len - 1;
358 // invariant: c >= list[lo]
359 // invariant: c < list[hi]
360 for (;;) {
361 int32_t i = (lo + hi) >> 1;
362 if (i == lo) return hi;
363 if (c < list[i]) {
364 hi = i;
365 } else {
366 lo = i;
367 }
368 }
369 return 0; // To make compiler happy; never reached
370}
371
372/**
373 * Returns true if this set contains every character
374 * of the given range.
375 * @param start first character, inclusive, of the range
376 * @param end last character, inclusive, of the range
377 * @return true if the test condition is met
378 */
379UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
380 //int32_t i = -1;
381 //for (;;) {
382 // if (start < list[++i]) break;
383 //}
384 int32_t i = findCodePoint(start);
385 return ((i & 1) != 0 && end < list[i]);
386}
387
388/**
389 * Returns <tt>true</tt> if this set contains the given
390 * multicharacter string.
391 * @param s string to be checked for containment
392 * @return <tt>true</tt> if this set contains the specified string
393 */
394UBool UnicodeSet::contains(const UnicodeString& s) const {
395 if (s.length() == 0) return FALSE;
396 int32_t cp = getSingleCP(s);
397 if (cp < 0) {
398 return strings->contains((void*) &s);
399 } else {
400 return contains((UChar32) cp);
401 }
402}
403
404/**
405 * Returns true if this set contains all the characters and strings
406 * of the given set.
407 * @param c set to be checked for containment
408 * @return true if the test condition is met
409 */
410UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
411 // The specified set is a subset if all of its pairs are contained in
412 // this set. It's possible to code this more efficiently in terms of
413 // direct manipulation of the inversion lists if the need arises.
414 int32_t n = c.getRangeCount();
415 for (int i=0; i<n; ++i) {
416 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
417 return FALSE;
418 }
419 }
420 if (!strings->containsAll(*c.strings)) return FALSE;
421 return TRUE;
422}
374ca955 423
b75a7d8f
A
424/**
425 * Returns true if this set contains all the characters
426 * of the given string.
427 * @param s string containing characters to be checked for containment
428 * @return true if the test condition is met
429 */
430UBool UnicodeSet::containsAll(const UnicodeString& s) const {
431 UChar32 cp;
432 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
433 cp = s.char32At(i);
434 if (!contains(cp)) return FALSE;
435 }
436 return TRUE;
437}
374ca955 438
b75a7d8f
A
439/**
440 * Returns true if this set contains none of the characters
441 * of the given range.
442 * @param start first character, inclusive, of the range
443 * @param end last character, inclusive, of the range
444 * @return true if the test condition is met
445 */
446UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
447 //int32_t i = -1;
448 //for (;;) {
449 // if (start < list[++i]) break;
450 //}
451 int32_t i = findCodePoint(start);
452 return ((i & 1) == 0 && end < list[i]);
453}
454
455/**
456 * Returns true if this set contains none of the characters and strings
457 * of the given set.
458 * @param c set to be checked for containment
459 * @return true if the test condition is met
460 */
461UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
462 // The specified set is a subset if all of its pairs are contained in
463 // this set. It's possible to code this more efficiently in terms of
464 // direct manipulation of the inversion lists if the need arises.
465 int32_t n = c.getRangeCount();
466 for (int32_t i=0; i<n; ++i) {
467 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
468 return FALSE;
469 }
470 }
471 if (!strings->containsNone(*c.strings)) return FALSE;
472 return TRUE;
473}
374ca955 474
b75a7d8f
A
475/**
476 * Returns true if this set contains none of the characters
477 * of the given string.
478 * @param s string containing characters to be checked for containment
479 * @return true if the test condition is met
480 */
481UBool UnicodeSet::containsNone(const UnicodeString& s) const {
482 UChar32 cp;
483 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
484 cp = s.char32At(i);
485 if (contains(cp)) return FALSE;
486 }
487 return TRUE;
488}
489
490/**
491 * Returns <tt>true</tt> if this set contains any character whose low byte
492 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
493 * indexing.
494 */
495UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
496 /* The index value v, in the range [0,255], is contained in this set if
497 * it is contained in any pair of this set. Pairs either have the high
498 * bytes equal, or unequal. If the high bytes are equal, then we have
499 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
500 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
501 * Then v is contained if xx <= v || v <= yy. (This is identical to the
502 * time zone month containment logic.)
503 */
504 int32_t i;
505 for (i=0; i<getRangeCount(); ++i) {
506 UChar32 low = getRangeStart(i);
507 UChar32 high = getRangeEnd(i);
508 if ((low & ~0xFF) == (high & ~0xFF)) {
509 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
510 return TRUE;
511 }
512 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
513 return TRUE;
514 }
515 }
516 if (strings->size() != 0) {
517 for (i=0; i<strings->size(); ++i) {
518 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
519 //if (s.length() == 0) {
520 // // Empty strings match everything
521 // return TRUE;
522 //}
523 // assert(s.length() != 0); // We enforce this elsewhere
524 UChar32 c = s.char32At(0);
525 if ((c & 0xFF) == v) {
526 return TRUE;
527 }
528 }
529 }
530 return FALSE;
531}
532
533/**
374ca955
A
534 * Implementation of UnicodeMatcher::matches(). Always matches the
535 * longest possible multichar string.
b75a7d8f
A
536 */
537UMatchDegree UnicodeSet::matches(const Replaceable& text,
538 int32_t& offset,
539 int32_t limit,
540 UBool incremental) {
541 if (offset == limit) {
542 // Strings, if any, have length != 0, so we don't worry
543 // about them here. If we ever allow zero-length strings
544 // we much check for them here.
545 if (contains(U_ETHER)) {
546 return incremental ? U_PARTIAL_MATCH : U_MATCH;
547 } else {
548 return U_MISMATCH;
549 }
550 } else {
551 if (strings->size() != 0) { // try strings first
374ca955 552
b75a7d8f
A
553 // might separate forward and backward loops later
554 // for now they are combined
555
556 // TODO Improve efficiency of this, at least in the forward
557 // direction, if not in both. In the forward direction we
558 // can assume the strings are sorted.
559
560 int32_t i;
561 UBool forward = offset < limit;
562
563 // firstChar is the leftmost char to match in the
564 // forward direction or the rightmost char to match in
565 // the reverse direction.
566 UChar firstChar = text.charAt(offset);
567
568 // If there are multiple strings that can match we
569 // return the longest match.
570 int32_t highWaterLength = 0;
571
572 for (i=0; i<strings->size(); ++i) {
573 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
574
575 //if (trial.length() == 0) {
576 // return U_MATCH; // null-string always matches
577 //}
578 // assert(trial.length() != 0); // We ensure this elsewhere
579
580 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
581
582 // Strings are sorted, so we can optimize in the
583 // forward direction.
584 if (forward && c > firstChar) break;
585 if (c != firstChar) continue;
374ca955 586
b75a7d8f
A
587 int32_t matchLen = matchRest(text, offset, limit, trial);
588
589 if (incremental) {
590 int32_t maxLen = forward ? limit-offset : offset-limit;
591 if (matchLen == maxLen) {
592 // We have successfully matched but only up to limit.
593 return U_PARTIAL_MATCH;
594 }
595 }
596
597 if (matchLen == trial.length()) {
598 // We have successfully matched the whole string.
599 if (matchLen > highWaterLength) {
600 highWaterLength = matchLen;
601 }
602 // In the forward direction we know strings
603 // are sorted so we can bail early.
604 if (forward && matchLen < highWaterLength) {
605 break;
606 }
607 continue;
608 }
609 }
610
611 // We've checked all strings without a partial match.
612 // If we have full matches, return the longest one.
613 if (highWaterLength != 0) {
614 offset += forward ? highWaterLength : -highWaterLength;
615 return U_MATCH;
616 }
617 }
618 return UnicodeFilter::matches(text, offset, limit, incremental);
619 }
620}
621
622/**
623 * Returns the longest match for s in text at the given position.
624 * If limit > start then match forward from start+1 to limit
625 * matching all characters except s.charAt(0). If limit < start,
626 * go backward starting from start-1 matching all characters
627 * except s.charAt(s.length()-1). This method assumes that the
628 * first character, text.charAt(start), matches s, so it does not
629 * check it.
630 * @param text the text to match
631 * @param start the first character to match. In the forward
632 * direction, text.charAt(start) is matched against s.charAt(0).
633 * In the reverse direction, it is matched against
634 * s.charAt(s.length()-1).
635 * @param limit the limit offset for matching, either last+1 in
636 * the forward direction, or last-1 in the reverse direction,
637 * where last is the index of the last character to match.
638 * @return If part of s matches up to the limit, return |limit -
639 * start|. If all of s matches before reaching the limit, return
640 * s.length(). If there is a mismatch between s and text, return
641 * 0
642 */
643int32_t UnicodeSet::matchRest(const Replaceable& text,
644 int32_t start, int32_t limit,
645 const UnicodeString& s) {
646 int32_t i;
647 int32_t maxLen;
648 int32_t slen = s.length();
649 if (start < limit) {
650 maxLen = limit - start;
651 if (maxLen > slen) maxLen = slen;
652 for (i = 1; i < maxLen; ++i) {
653 if (text.charAt(start + i) != s.charAt(i)) return 0;
654 }
655 } else {
656 maxLen = start - limit;
657 if (maxLen > slen) maxLen = slen;
658 --slen; // <=> slen = s.length() - 1;
659 for (i = 1; i < maxLen; ++i) {
660 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
661 }
662 }
663 return maxLen;
664}
665
666/**
667 * Implement of UnicodeMatcher
668 */
669void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
670 toUnionTo.addAll(*this);
671}
672
673/**
674 * Returns the index of the given character within this set, where
675 * the set is ordered by ascending code point. If the character
676 * is not in this set, return -1. The inverse of this method is
677 * <code>charAt()</code>.
678 * @return an index from 0..size()-1, or -1
679 */
680int32_t UnicodeSet::indexOf(UChar32 c) const {
681 if (c < MIN_VALUE || c > MAX_VALUE) {
682 return -1;
683 }
684 int32_t i = 0;
685 int32_t n = 0;
686 for (;;) {
687 UChar32 start = list[i++];
688 if (c < start) {
689 return -1;
690 }
691 UChar32 limit = list[i++];
692 if (c < limit) {
693 return n + c - start;
694 }
695 n += limit - start;
696 }
697}
698
699/**
700 * Returns the character at the given index within this set, where
701 * the set is ordered by ascending code point. If the index is
702 * out of range, return (UChar32)-1. The inverse of this method is
703 * <code>indexOf()</code>.
704 * @param index an index from 0..size()-1
705 * @return the character at the given index, or (UChar32)-1.
706 */
707UChar32 UnicodeSet::charAt(int32_t index) const {
708 if (index >= 0) {
709 // len2 is the largest even integer <= len, that is, it is len
710 // for even values and len-1 for odd values. With odd values
711 // the last entry is UNICODESET_HIGH.
712 int32_t len2 = len & ~1;
713 for (int32_t i=0; i < len2;) {
714 UChar32 start = list[i++];
715 int32_t count = list[i++] - start;
716 if (index < count) {
717 return (UChar32)(start + index);
718 }
719 index -= count;
720 }
721 }
722 return (UChar32)-1;
723}
724
725/**
726 * Adds the specified range to this set if it is not already
727 * present. If this set already contains the specified range,
728 * the call leaves this set unchanged. If <code>end > start</code>
729 * then an empty range is added, leaving the set unchanged.
730 *
731 * @param start first character, inclusive, of range to be added
732 * to this set.
733 * @param end last character, inclusive, of range to be added
734 * to this set.
735 */
736UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
737 if (pinCodePoint(start) < pinCodePoint(end)) {
738 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
739 add(range, 2, 0);
740 } else if (start == end) {
741 add(start);
742 }
743 return *this;
744}
745
746// #define DEBUG_US_ADD
747
748#ifdef DEBUG_US_ADD
749#include <stdio.h>
750void dump(UChar32 c) {
751 if (c <= 0xFF) {
752 printf("%c", (char)c);
753 } else {
754 printf("U+%04X", c);
755 }
756}
757void dump(const UChar32* list, int32_t len) {
758 printf("[");
759 for (int32_t i=0; i<len; ++i) {
760 if (i != 0) printf(", ");
761 dump(list[i]);
762 }
763 printf("]");
764}
765#endif
766
767/**
768 * Adds the specified character to this set if it is not already
769 * present. If this set already contains the specified character,
770 * the call leaves this set unchanged.
771 */
772UnicodeSet& UnicodeSet::add(UChar32 c) {
773 // find smallest i such that c < list[i]
774 // if odd, then it is IN the set
775 // if even, then it is OUT of the set
776 int32_t i = findCodePoint(pinCodePoint(c));
777
778 // already in set?
779 if ((i & 1) != 0) return *this;
374ca955 780
b75a7d8f
A
781 // HIGH is 0x110000
782 // assert(list[len-1] == HIGH);
374ca955 783
b75a7d8f
A
784 // empty = [HIGH]
785 // [start_0, limit_0, start_1, limit_1, HIGH]
374ca955 786
b75a7d8f
A
787 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
788 // ^
789 // list[i]
790
791 // i == 0 means c is before the first range
792
793#ifdef DEBUG_US_ADD
794 printf("Add of ");
795 dump(c);
796 printf(" found at %d", i);
797 printf(": ");
798 dump(list, len);
799 printf(" => ");
800#endif
801
802 if (c == list[i]-1) {
803 // c is before start of next range
804 list[i] = c;
805 // if we touched the HIGH mark, then add a new one
806 if (c == (UNICODESET_HIGH - 1)) {
807 ensureCapacity(len+1);
808 list[len++] = UNICODESET_HIGH;
809 }
810 if (i > 0 && c == list[i-1]) {
811 // collapse adjacent ranges
812
813 // [..., start_k-1, c, c, limit_k, ..., HIGH]
814 // ^
815 // list[i]
816
817 //for (int32_t k=i-1; k<len-2; ++k) {
818 // list[k] = list[k+2];
819 //}
820 UChar32* dst = list + i - 1;
821 UChar32* src = dst + 2;
822 UChar32* srclimit = list + len;
823 while (src < srclimit) *(dst++) = *(src++);
824
825 len -= 2;
826 }
827 }
828
829 else if (i > 0 && c == list[i-1]) {
830 // c is after end of prior range
831 list[i-1]++;
832 // no need to check for collapse here
833 }
834
835 else {
836 // At this point we know the new char is not adjacent to
837 // any existing ranges, and it is not 10FFFF.
838
839
840 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
841 // ^
842 // list[i]
843
844 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
845 // ^
846 // list[i]
847
848 ensureCapacity(len+2);
849
850 //for (int32_t k=len-1; k>=i; --k) {
851 // list[k+2] = list[k];
852 //}
853 UChar32* src = list + len;
854 UChar32* dst = src + 2;
855 UChar32* srclimit = list + i;
856 while (src > srclimit) *(--dst) = *(--src);
857
858 list[i] = c;
859 list[i+1] = c+1;
860 len += 2;
861 }
374ca955 862
b75a7d8f
A
863#ifdef DEBUG_US_ADD
864 dump(list, len);
865 printf("\n");
866
867 for (i=1; i<len; ++i) {
868 if (list[i] <= list[i-1]) {
869 // Corrupt array!
870 printf("ERROR: list has been corrupted\n");
871 exit(1);
872 }
374ca955 873 }
b75a7d8f 874#endif
374ca955 875
b75a7d8f
A
876 pat.truncate(0);
877 return *this;
878}
879
880/**
881 * Adds the specified multicharacter to this set if it is not already
882 * present. If this set already contains the multicharacter,
883 * the call leaves this set unchanged.
884 * Thus "ch" => {"ch"}
885 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
886 * @param s the source string
887 * @return the modified set, for chaining
888 */
889UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
890 if (s.length() == 0) return *this;
891 int32_t cp = getSingleCP(s);
892 if (cp < 0) {
893 if (!strings->contains((void*) &s)) {
894 _add(s);
895 pat.truncate(0);
896 }
897 } else {
898 add((UChar32)cp, (UChar32)cp);
899 }
900 return *this;
901}
902
903/**
904 * Adds the given string, in order, to 'strings'. The given string
905 * must have been checked by the caller to not be empty and to not
906 * already be in 'strings'.
907 */
908void UnicodeSet::_add(const UnicodeString& s) {
909 UnicodeString* t = new UnicodeString(s);
910 UErrorCode ec = U_ZERO_ERROR;
911 strings->sortedInsert(t, compareUnicodeString, ec);
912}
913
914/**
915 * @return a code point IF the string consists of a single one.
916 * otherwise returns -1.
917 * @param string to test
918 */
919int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
920 //if (s.length() < 1) {
921 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
922 //}
923 if (s.length() > 2) return -1;
924 if (s.length() == 1) return s.charAt(0);
925
926 // at this point, len = 2
927 UChar32 cp = s.char32At(0);
928 if (cp > 0xFFFF) { // is surrogate pair
929 return cp;
930 }
931 return -1;
932}
933
934/**
935 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
936 * If this set already any particular character, it has no effect on that character.
937 * @param the source string
938 * @return the modified set, for chaining
939 */
940UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
941 UChar32 cp;
942 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
943 cp = s.char32At(i);
944 add(cp, cp);
945 }
946 return *this;
947}
948
949/**
950 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
951 * If this set already any particular character, it has no effect on that character.
952 * @param the source string
953 * @return the modified set, for chaining
954 */
955UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
956 UnicodeSet set;
957 set.addAll(s);
958 retainAll(set);
959 return *this;
960}
961
962/**
963 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
964 * If this set already any particular character, it has no effect on that character.
965 * @param the source string
966 * @return the modified set, for chaining
967 */
968UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
969 UnicodeSet set;
970 set.addAll(s);
971 complementAll(set);
972 return *this;
973}
974
975/**
976 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
977 * If this set already any particular character, it has no effect on that character.
978 * @param the source string
979 * @return the modified set, for chaining
980 */
981UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
982 UnicodeSet set;
983 set.addAll(s);
984 removeAll(set);
985 return *this;
986}
987
988/**
989 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
990 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
991 * @param the source string
992 * @return a newly created set containing the given string
993 */
374ca955 994UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
b75a7d8f
A
995 UnicodeSet *set = new UnicodeSet();
996 set->add(s);
997 return set;
998}
999
1000
1001/**
1002 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1003 * @param the source string
1004 * @return a newly created set containing the given characters
1005 */
374ca955 1006UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
b75a7d8f
A
1007 UnicodeSet *set = new UnicodeSet();
1008 set->addAll(s);
1009 return set;
1010}
1011
1012/**
1013 * Retain only the elements in this set that are contained in the
1014 * specified range. If <code>end > start</code> then an empty range is
1015 * retained, leaving the set empty.
1016 *
1017 * @param start first character, inclusive, of range to be retained
1018 * to this set.
1019 * @param end last character, inclusive, of range to be retained
1020 * to this set.
1021 */
1022UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1023 if (pinCodePoint(start) <= pinCodePoint(end)) {
1024 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1025 retain(range, 2, 0);
1026 } else {
1027 clear();
1028 }
1029 return *this;
1030}
1031
1032UnicodeSet& UnicodeSet::retain(UChar32 c) {
1033 return retain(c, c);
1034}
1035
1036/**
1037 * Removes the specified range from this set if it is present.
1038 * The set will not contain the specified range once the call
1039 * returns. If <code>end > start</code> then an empty range is
1040 * removed, leaving the set unchanged.
374ca955 1041 *
b75a7d8f
A
1042 * @param start first character, inclusive, of range to be removed
1043 * from this set.
1044 * @param end last character, inclusive, of range to be removed
1045 * from this set.
1046 */
1047UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1048 if (pinCodePoint(start) <= pinCodePoint(end)) {
1049 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1050 retain(range, 2, 2);
1051 }
1052 return *this;
1053}
1054
1055/**
1056 * Removes the specified character from this set if it is present.
1057 * The set will not contain the specified range once the call
1058 * returns.
1059 */
1060UnicodeSet& UnicodeSet::remove(UChar32 c) {
1061 return remove(c, c);
1062}
1063
1064/**
1065 * Removes the specified string from this set if it is present.
1066 * The set will not contain the specified character once the call
1067 * returns.
1068 * @param the source string
1069 * @return the modified set, for chaining
1070 */
1071UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1072 if (s.length() == 0) return *this;
1073 int32_t cp = getSingleCP(s);
1074 if (cp < 0) {
1075 strings->removeElement((void*) &s);
1076 pat.truncate(0);
1077 } else {
1078 remove((UChar32)cp, (UChar32)cp);
1079 }
1080 return *this;
1081}
1082
1083/**
1084 * Complements the specified range in this set. Any character in
1085 * the range will be removed if it is in this set, or will be
1086 * added if it is not in this set. If <code>end > start</code>
1087 * then an empty range is xor'ed, leaving the set unchanged.
1088 *
1089 * @param start first character, inclusive, of range to be removed
1090 * from this set.
1091 * @param end last character, inclusive, of range to be removed
1092 * from this set.
1093 */
1094UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1095 if (pinCodePoint(start) <= pinCodePoint(end)) {
1096 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1097 exclusiveOr(range, 2, 0);
1098 }
1099 pat.truncate(0);
1100 return *this;
1101}
1102
1103UnicodeSet& UnicodeSet::complement(UChar32 c) {
1104 return complement(c, c);
1105}
1106
1107/**
1108 * This is equivalent to
1109 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1110 */
1111UnicodeSet& UnicodeSet::complement(void) {
374ca955 1112 if (list[0] == UNICODESET_LOW) {
b75a7d8f
A
1113 ensureBufferCapacity(len-1);
1114 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1115 --len;
1116 } else {
1117 ensureBufferCapacity(len+1);
1118 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1119 buffer[0] = UNICODESET_LOW;
1120 ++len;
1121 }
1122 swapBuffers();
1123 pat.truncate(0);
1124 return *this;
1125}
1126
1127/**
1128 * Complement the specified string in this set.
1129 * The set will not contain the specified string once the call
1130 * returns.
1131 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1132 * @param s the string to complement
1133 * @return this object, for chaining
1134 */
1135UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1136 if (s.length() == 0) return *this;
1137 int32_t cp = getSingleCP(s);
1138 if (cp < 0) {
1139 if (strings->contains((void*) &s)) {
1140 strings->removeElement((void*) &s);
1141 } else {
1142 _add(s);
1143 }
1144 pat.truncate(0);
1145 } else {
1146 complement((UChar32)cp, (UChar32)cp);
1147 }
1148 return *this;
374ca955 1149}
b75a7d8f
A
1150
1151/**
1152 * Adds all of the elements in the specified set to this set if
1153 * they're not already present. This operation effectively
1154 * modifies this set so that its value is the <i>union</i> of the two
1155 * sets. The behavior of this operation is unspecified if the specified
1156 * collection is modified while the operation is in progress.
1157 *
1158 * @param c set whose elements are to be added to this set.
1159 * @see #add(char, char)
1160 */
1161UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1162 add(c.list, c.len, 0);
1163
374ca955 1164 // Add strings in order
b75a7d8f
A
1165 for (int32_t i=0; i<c.strings->size(); ++i) {
1166 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1167 if (!strings->contains((void*) s)) {
1168 _add(*s);
1169 }
1170 }
1171 return *this;
1172}
1173
1174/**
1175 * Retains only the elements in this set that are contained in the
1176 * specified set. In other words, removes from this set all of
1177 * its elements that are not contained in the specified set. This
1178 * operation effectively modifies this set so that its value is
1179 * the <i>intersection</i> of the two sets.
1180 *
1181 * @param c set that defines which elements this set will retain.
1182 */
1183UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1184 retain(c.list, c.len, 0);
1185 strings->retainAll(*c.strings);
1186 return *this;
1187}
1188
1189/**
1190 * Removes from this set all of its elements that are contained in the
1191 * specified set. This operation effectively modifies this
1192 * set so that its value is the <i>asymmetric set difference</i> of
1193 * the two sets.
1194 *
1195 * @param c set that defines which elements will be removed from
1196 * this set.
1197 */
1198UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1199 retain(c.list, c.len, 2);
1200 strings->removeAll(*c.strings);
1201 return *this;
1202}
1203
1204/**
1205 * Complements in this set all elements contained in the specified
1206 * set. Any character in the other set will be removed if it is
1207 * in this set, or will be added if it is not in this set.
1208 *
1209 * @param c set that defines which elements will be xor'ed from
1210 * this set.
1211 */
1212UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1213 exclusiveOr(c.list, c.len, 0);
1214
1215 for (int32_t i=0; i<c.strings->size(); ++i) {
1216 void* e = c.strings->elementAt(i);
1217 if (!strings->removeElement(e)) {
1218 _add(*(const UnicodeString*)e);
1219 }
1220 }
1221 return *this;
1222}
1223
1224/**
1225 * Removes all of the elements from this set. This set will be
1226 * empty after this call returns.
1227 */
1228UnicodeSet& UnicodeSet::clear(void) {
1229 list[0] = UNICODESET_HIGH;
1230 len = 1;
1231 pat.truncate(0);
1232 strings->removeAllElements();
1233 return *this;
1234}
1235
1236/**
1237 * Iteration method that returns the number of ranges contained in
1238 * this set.
1239 * @see #getRangeStart
1240 * @see #getRangeEnd
1241 */
1242int32_t UnicodeSet::getRangeCount() const {
1243 return len/2;
1244}
1245
1246/**
1247 * Iteration method that returns the first character in the
1248 * specified range of this set.
1249 * @see #getRangeCount
1250 * @see #getRangeEnd
1251 */
1252UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1253 return list[index*2];
1254}
1255
1256/**
1257 * Iteration method that returns the last character in the
1258 * specified range of this set.
1259 * @see #getRangeStart
1260 * @see #getRangeEnd
1261 */
1262UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1263 return list[index*2 + 1] - 1;
1264}
1265
1266int32_t UnicodeSet::getStringCount() const {
1267 return strings->size();
1268}
1269
1270const UnicodeString* UnicodeSet::getString(int32_t index) const {
1271 return (const UnicodeString*) strings->elementAt(index);
1272}
1273
1274/**
1275 * Reallocate this objects internal structures to take up the least
1276 * possible space, without changing this object's value.
1277 */
1278UnicodeSet& UnicodeSet::compact() {
1279 if (len != capacity) {
1280 capacity = len;
1281 UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1282 uprv_memcpy(temp, list, len*sizeof(UChar32));
1283 uprv_free(list);
1284 list = temp;
1285 }
1286 uprv_free(buffer);
1287 buffer = NULL;
1288 return *this;
1289}
1290
1291int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1292 int32_t bmpLength, length, destLength;
1293
1294 if (U_FAILURE(ec)) {
1295 return 0;
1296 }
1297
1298 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1299 ec=U_ILLEGAL_ARGUMENT_ERROR;
1300 return 0;
1301 }
1302
1303 /* count necessary 16-bit units */
1304 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1305 // assert(length>=0);
1306 if (length==0) {
1307 /* empty set */
1308 if (destCapacity>0) {
1309 *dest=0;
1310 } else {
1311 ec=U_BUFFER_OVERFLOW_ERROR;
1312 }
1313 return 1;
1314 }
1315 /* now length>0 */
1316
1317 if (this->list[length-1]<=0xffff) {
1318 /* all BMP */
1319 bmpLength=length;
1320 } else if (this->list[0]>=0x10000) {
1321 /* all supplementary */
1322 bmpLength=0;
1323 length*=2;
1324 } else {
1325 /* some BMP, some supplementary */
1326 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1327 length=bmpLength+2*(length-bmpLength);
1328 }
1329
1330 /* length: number of 16-bit array units */
1331 if (length>0x7fff) {
1332 /* there are only 15 bits for the length in the first serialized word */
1333 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1334 return 0;
1335 }
1336
1337 /*
1338 * total serialized length:
1339 * number of 16-bit array units (length) +
1340 * 1 length unit (always) +
1341 * 1 bmpLength unit (if there are supplementary values)
1342 */
1343 destLength=length+((length>bmpLength)?2:1);
1344 if (destLength<=destCapacity) {
1345 const UChar32 *p;
1346 int32_t i;
1347
1348 *dest=(uint16_t)length;
1349 if (length>bmpLength) {
1350 *dest|=0x8000;
1351 *++dest=(uint16_t)bmpLength;
1352 }
1353 ++dest;
1354
1355 /* write the BMP part of the array */
1356 p=this->list;
1357 for (i=0; i<bmpLength; ++i) {
1358 *dest++=(uint16_t)*p++;
1359 }
1360
1361 /* write the supplementary part of the array */
1362 for (; i<length; i+=2) {
1363 *dest++=(uint16_t)(*p>>16);
1364 *dest++=(uint16_t)*p++;
1365 }
1366 } else {
1367 ec=U_BUFFER_OVERFLOW_ERROR;
1368 }
1369 return destLength;
1370}
1371
b75a7d8f
A
1372//----------------------------------------------------------------
1373// Implementation: Utility methods
1374//----------------------------------------------------------------
1375
1376/**
1377 * Allocate our strings vector and return TRUE if successful.
1378 */
1379UBool UnicodeSet::allocateStrings() {
1380 UErrorCode ec = U_ZERO_ERROR;
1381 strings = new UVector(uhash_deleteUnicodeString,
1382 uhash_compareUnicodeString, ec);
1383 if (U_FAILURE(ec)) {
1384 delete strings;
1385 strings = NULL;
1386 return FALSE;
1387 }
1388 return TRUE;
1389}
1390
1391void UnicodeSet::ensureCapacity(int32_t newLen) {
1392 if (newLen <= capacity)
1393 return;
1394 capacity = newLen + GROW_EXTRA;
1395 UChar32* temp = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1396 uprv_memcpy(temp, list, len*sizeof(UChar32));
1397 uprv_free(list);
1398 list = temp;
1399}
1400
1401void UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1402 if (buffer != NULL && newLen <= bufferCapacity)
1403 return;
1404 if (buffer) {
1405 uprv_free(buffer);
1406 }
1407 bufferCapacity = newLen + GROW_EXTRA;
1408 buffer = (UChar32*) uprv_malloc(sizeof(UChar32) * bufferCapacity);
1409}
1410
1411/**
1412 * Swap list and buffer.
1413 */
1414void UnicodeSet::swapBuffers(void) {
1415 // swap list and buffer
1416 UChar32* temp = list;
1417 list = buffer;
1418 buffer = temp;
1419
1420 int32_t c = capacity;
1421 capacity = bufferCapacity;
1422 bufferCapacity = c;
1423}
1424
1425//----------------------------------------------------------------
1426// Implementation: Fundamental operators
1427//----------------------------------------------------------------
1428
1429static inline UChar32 max(UChar32 a, UChar32 b) {
1430 return (a > b) ? a : b;
1431}
1432
1433// polarity = 0, 3 is normal: x xor y
1434// polarity = 1, 2: x xor ~y == x === y
1435
1436void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1437 ensureBufferCapacity(len + otherLen);
1438 int32_t i = 0, j = 0, k = 0;
1439 UChar32 a = list[i++];
1440 UChar32 b;
1441 if (polarity == 1 || polarity == 2) {
1442 b = UNICODESET_LOW;
1443 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1444 ++j;
1445 b = other[j];
1446 }
1447 } else {
1448 b = other[j++];
1449 }
1450 // simplest of all the routines
1451 // sort the values, discarding identicals!
1452 for (;;) {
1453 if (a < b) {
1454 buffer[k++] = a;
1455 a = list[i++];
1456 } else if (b < a) {
1457 buffer[k++] = b;
1458 b = other[j++];
1459 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1460 // discard both values!
1461 a = list[i++];
1462 b = other[j++];
1463 } else { // DONE!
1464 buffer[k++] = UNICODESET_HIGH;
1465 len = k;
1466 break;
1467 }
1468 }
1469 swapBuffers();
1470 pat.truncate(0);
1471}
1472
1473// polarity = 0 is normal: x union y
1474// polarity = 2: x union ~y
1475// polarity = 1: ~x union y
1476// polarity = 3: ~x union ~y
1477
1478void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1479 ensureBufferCapacity(len + otherLen);
1480 int32_t i = 0, j = 0, k = 0;
1481 UChar32 a = list[i++];
1482 UChar32 b = other[j++];
1483 // change from xor is that we have to check overlapping pairs
1484 // polarity bit 1 means a is second, bit 2 means b is.
1485 for (;;) {
1486 switch (polarity) {
1487 case 0: // both first; take lower if unequal
1488 if (a < b) { // take a
1489 // Back up over overlapping ranges in buffer[]
1490 if (k > 0 && a <= buffer[k-1]) {
1491 // Pick latter end value in buffer[] vs. list[]
1492 a = max(list[i], buffer[--k]);
1493 } else {
1494 // No overlap
1495 buffer[k++] = a;
1496 a = list[i];
1497 }
1498 i++; // Common if/else code factored out
1499 polarity ^= 1;
1500 } else if (b < a) { // take b
1501 if (k > 0 && b <= buffer[k-1]) {
1502 b = max(other[j], buffer[--k]);
1503 } else {
1504 buffer[k++] = b;
1505 b = other[j];
1506 }
1507 j++;
1508 polarity ^= 2;
1509 } else { // a == b, take a, drop b
1510 if (a == UNICODESET_HIGH) goto loop_end;
1511 // This is symmetrical; it doesn't matter if
1512 // we backtrack with a or b. - liu
1513 if (k > 0 && a <= buffer[k-1]) {
1514 a = max(list[i], buffer[--k]);
1515 } else {
1516 // No overlap
1517 buffer[k++] = a;
1518 a = list[i];
1519 }
1520 i++;
1521 polarity ^= 1;
1522 b = other[j++];
1523 polarity ^= 2;
1524 }
1525 break;
1526 case 3: // both second; take higher if unequal, and drop other
1527 if (b <= a) { // take a
1528 if (a == UNICODESET_HIGH) goto loop_end;
1529 buffer[k++] = a;
1530 } else { // take b
1531 if (b == UNICODESET_HIGH) goto loop_end;
1532 buffer[k++] = b;
1533 }
1534 a = list[i++];
1535 polarity ^= 1; // factored common code
1536 b = other[j++];
1537 polarity ^= 2;
1538 break;
1539 case 1: // a second, b first; if b < a, overlap
1540 if (a < b) { // no overlap, take a
1541 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1542 } else if (b < a) { // OVERLAP, drop b
1543 b = other[j++];
1544 polarity ^= 2;
1545 } else { // a == b, drop both!
1546 if (a == UNICODESET_HIGH) goto loop_end;
1547 a = list[i++];
1548 polarity ^= 1;
1549 b = other[j++];
1550 polarity ^= 2;
1551 }
1552 break;
1553 case 2: // a first, b second; if a < b, overlap
1554 if (b < a) { // no overlap, take b
1555 buffer[k++] = b;
1556 b = other[j++];
1557 polarity ^= 2;
1558 } else if (a < b) { // OVERLAP, drop a
1559 a = list[i++];
1560 polarity ^= 1;
1561 } else { // a == b, drop both!
1562 if (a == UNICODESET_HIGH) goto loop_end;
1563 a = list[i++];
1564 polarity ^= 1;
1565 b = other[j++];
1566 polarity ^= 2;
1567 }
1568 break;
1569 }
1570 }
1571 loop_end:
1572 buffer[k++] = UNICODESET_HIGH; // terminate
1573 len = k;
1574 swapBuffers();
1575 pat.truncate(0);
1576}
1577
1578// polarity = 0 is normal: x intersect y
1579// polarity = 2: x intersect ~y == set-minus
1580// polarity = 1: ~x intersect y
1581// polarity = 3: ~x intersect ~y
1582
1583void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1584 ensureBufferCapacity(len + otherLen);
1585 int32_t i = 0, j = 0, k = 0;
1586 UChar32 a = list[i++];
1587 UChar32 b = other[j++];
1588 // change from xor is that we have to check overlapping pairs
1589 // polarity bit 1 means a is second, bit 2 means b is.
1590 for (;;) {
1591 switch (polarity) {
1592 case 0: // both first; drop the smaller
1593 if (a < b) { // drop a
1594 a = list[i++];
1595 polarity ^= 1;
1596 } else if (b < a) { // drop b
1597 b = other[j++];
1598 polarity ^= 2;
1599 } else { // a == b, take one, drop other
1600 if (a == UNICODESET_HIGH) goto loop_end;
1601 buffer[k++] = a;
1602 a = list[i++];
1603 polarity ^= 1;
1604 b = other[j++];
1605 polarity ^= 2;
1606 }
1607 break;
1608 case 3: // both second; take lower if unequal
1609 if (a < b) { // take a
1610 buffer[k++] = a;
1611 a = list[i++];
1612 polarity ^= 1;
1613 } else if (b < a) { // take b
1614 buffer[k++] = b;
1615 b = other[j++];
1616 polarity ^= 2;
1617 } else { // a == b, take one, drop other
1618 if (a == UNICODESET_HIGH) goto loop_end;
1619 buffer[k++] = a;
1620 a = list[i++];
1621 polarity ^= 1;
1622 b = other[j++];
1623 polarity ^= 2;
1624 }
1625 break;
1626 case 1: // a second, b first;
1627 if (a < b) { // NO OVERLAP, drop a
1628 a = list[i++];
1629 polarity ^= 1;
1630 } else if (b < a) { // OVERLAP, take b
1631 buffer[k++] = b;
1632 b = other[j++];
1633 polarity ^= 2;
1634 } else { // a == b, drop both!
1635 if (a == UNICODESET_HIGH) goto loop_end;
1636 a = list[i++];
1637 polarity ^= 1;
1638 b = other[j++];
1639 polarity ^= 2;
1640 }
1641 break;
1642 case 2: // a first, b second; if a < b, overlap
1643 if (b < a) { // no overlap, drop b
1644 b = other[j++];
1645 polarity ^= 2;
1646 } else if (a < b) { // OVERLAP, take a
1647 buffer[k++] = a;
1648 a = list[i++];
1649 polarity ^= 1;
1650 } else { // a == b, drop both!
1651 if (a == UNICODESET_HIGH) goto loop_end;
1652 a = list[i++];
1653 polarity ^= 1;
1654 b = other[j++];
1655 polarity ^= 2;
1656 }
1657 break;
1658 }
1659 }
1660 loop_end:
1661 buffer[k++] = UNICODESET_HIGH; // terminate
1662 len = k;
1663 swapBuffers();
1664 pat.truncate(0);
1665}
1666
b75a7d8f 1667/**
374ca955
A
1668 * Append the <code>toPattern()</code> representation of a
1669 * string to the given <code>StringBuffer</code>.
b75a7d8f 1670 */
374ca955
A
1671void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1672escapeUnprintable) {
1673 UChar32 cp;
1674 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1675 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
b75a7d8f
A
1676 }
1677}
1678
374ca955
A
1679/**
1680 * Append the <code>toPattern()</code> representation of a
1681 * character to the given <code>StringBuffer</code>.
1682 */
1683void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1684escapeUnprintable) {
1685 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1686 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1687 // unprintable
1688 if (ICU_Utility::escapeUnprintable(buf, c)) {
1689 return;
b75a7d8f 1690 }
b75a7d8f 1691 }
374ca955
A
1692 // Okay to let ':' pass through
1693 switch (c) {
1694 case SET_OPEN:
1695 case SET_CLOSE:
1696 case HYPHEN:
1697 case COMPLEMENT:
1698 case INTERSECTION:
1699 case BACKSLASH:
1700 case OPEN_BRACE:
1701 case CLOSE_BRACE:
1702 case COLON:
1703 case SymbolTable::SYMBOL_REF:
1704 buf.append(BACKSLASH);
1705 break;
1706 default:
1707 // Escape whitespace
1708 if (uprv_isRuleWhiteSpace(c)) {
1709 buf.append(BACKSLASH);
b75a7d8f 1710 }
374ca955 1711 break;
b75a7d8f 1712 }
374ca955 1713 buf.append(c);
b75a7d8f
A
1714}
1715
374ca955
A
1716/**
1717 * Append a string representation of this set to result. This will be
1718 * a cleaned version of the string passed to applyPattern(), if there
1719 * is one. Otherwise it will be generated.
1720 */
1721UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1722 UBool escapeUnprintable) const {
1723 if (pat.length() > 0) {
1724 int32_t i;
1725 int32_t backslashCount = 0;
1726 for (i=0; i<pat.length(); ) {
1727 UChar32 c = pat.char32At(i);
1728 i += UTF_CHAR_LENGTH(c);
1729 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1730 // If the unprintable character is preceded by an odd
1731 // number of backslashes, then it has been escaped.
1732 // Before unescaping it, we delete the final
1733 // backslash.
1734 if ((backslashCount % 2) == 1) {
1735 result.truncate(result.length() - 1);
b75a7d8f 1736 }
374ca955
A
1737 ICU_Utility::escapeUnprintable(result, c);
1738 backslashCount = 0;
1739 } else {
1740 result.append(c);
1741 if (c == BACKSLASH) {
1742 ++backslashCount;
b75a7d8f 1743 } else {
374ca955 1744 backslashCount = 0;
b75a7d8f
A
1745 }
1746 }
1747 }
374ca955 1748 return result;
b75a7d8f
A
1749 }
1750
374ca955 1751 return _generatePattern(result, escapeUnprintable);
b75a7d8f
A
1752}
1753
1754/**
374ca955
A
1755 * Returns a string representation of this set. If the result of
1756 * calling this function is passed to a UnicodeSet constructor, it
1757 * will produce another set that is equal to this one.
b75a7d8f 1758 */
374ca955
A
1759UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1760 UBool escapeUnprintable) const {
1761 result.truncate(0);
1762 return _toPattern(result, escapeUnprintable);
b75a7d8f
A
1763}
1764
b75a7d8f 1765/**
374ca955
A
1766 * Generate and append a string representation of this set to result.
1767 * This does not use this.pat, the cleaned up copy of the string
1768 * passed to applyPattern().
b75a7d8f 1769 */
374ca955
A
1770UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1771 UBool escapeUnprintable) const {
1772 result.append(SET_OPEN);
b75a7d8f 1773
374ca955
A
1774// // Check against the predefined categories. We implicitly build
1775// // up ALL category sets the first time toPattern() is called.
1776// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
1777// if (*this == getCategorySet(cat)) {
1778// result.append(COLON);
1779// result.append(CATEGORY_NAMES, cat*2, 2);
1780// return result.append(CATEGORY_CLOSE);
1781// }
1782// }
b75a7d8f 1783
374ca955 1784 int32_t count = getRangeCount();
b75a7d8f 1785
374ca955
A
1786 // If the set contains at least 2 intervals and includes both
1787 // MIN_VALUE and MAX_VALUE, then the inverse representation will
1788 // be more economical.
1789 if (count > 1 &&
1790 getRangeStart(0) == MIN_VALUE &&
1791 getRangeEnd(count-1) == MAX_VALUE) {
b75a7d8f 1792
374ca955
A
1793 // Emit the inverse
1794 result.append(COMPLEMENT);
b75a7d8f 1795
374ca955
A
1796 for (int32_t i = 1; i < count; ++i) {
1797 UChar32 start = getRangeEnd(i-1)+1;
1798 UChar32 end = getRangeStart(i)-1;
1799 _appendToPat(result, start, escapeUnprintable);
1800 if (start != end) {
1801 if ((start+1) != end) {
1802 result.append(HYPHEN);
b75a7d8f 1803 }
374ca955 1804 _appendToPat(result, end, escapeUnprintable);
b75a7d8f 1805 }
b75a7d8f
A
1806 }
1807 }
1808
374ca955
A
1809 // Default; emit the ranges as pairs
1810 else {
1811 for (int32_t i = 0; i < count; ++i) {
1812 UChar32 start = getRangeStart(i);
1813 UChar32 end = getRangeEnd(i);
1814 _appendToPat(result, start, escapeUnprintable);
1815 if (start != end) {
1816 if ((start+1) != end) {
1817 result.append(HYPHEN);
b75a7d8f 1818 }
374ca955 1819 _appendToPat(result, end, escapeUnprintable);
b75a7d8f 1820 }
b75a7d8f
A
1821 }
1822 }
1823
374ca955
A
1824 for (int32_t i = 0; i<strings->size(); ++i) {
1825 result.append(OPEN_BRACE);
1826 _appendToPat(result,
1827 *(const UnicodeString*) strings->elementAt(i),
1828 escapeUnprintable);
1829 result.append(CLOSE_BRACE);
b75a7d8f 1830 }
374ca955 1831 return result.append(SET_CLOSE);
b75a7d8f
A
1832}
1833
374ca955 1834
b75a7d8f 1835U_NAMESPACE_END