]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uset.cpp
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / common / uset.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uset.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002mar07
14 * created by: Markus W. Scherer
15 *
16 * There are functions to efficiently serialize a USet into an array of uint16_t
17 * and functions to use such a serialized form efficiently without
18 * instantiating a new USet.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uobject.h"
23 #include "unicode/uset.h"
24 #include "unicode/uniset.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/parsepos.h"
28
29 U_NAMESPACE_USE
30
31 U_CAPI USet* U_EXPORT2
32 uset_openEmpty() {
33 return (USet*) new UnicodeSet();
34 }
35
36 U_CAPI USet* U_EXPORT2
37 uset_open(UChar32 start, UChar32 end) {
38 return (USet*) new UnicodeSet(start, end);
39 }
40
41 U_CAPI void U_EXPORT2
42 uset_close(USet* set) {
43 delete (UnicodeSet*) set;
44 }
45
46 U_CAPI USet * U_EXPORT2
47 uset_clone(const USet *set) {
48 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone());
49 }
50
51 U_CAPI UBool U_EXPORT2
52 uset_isFrozen(const USet *set) {
53 return ((UnicodeSet*) set)->UnicodeSet::isFrozen();
54 }
55
56 U_CAPI void U_EXPORT2
57 uset_freeze(USet *set) {
58 ((UnicodeSet*) set)->UnicodeSet::freeze();
59 }
60
61 U_CAPI USet * U_EXPORT2
62 uset_cloneAsThawed(const USet *set) {
63 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed());
64 }
65
66 U_CAPI void U_EXPORT2
67 uset_set(USet* set,
68 UChar32 start, UChar32 end) {
69 ((UnicodeSet*) set)->UnicodeSet::set(start, end);
70 }
71
72 U_CAPI void U_EXPORT2
73 uset_addAll(USet* set, const USet *additionalSet) {
74 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet));
75 }
76
77 U_CAPI void U_EXPORT2
78 uset_add(USet* set, UChar32 c) {
79 ((UnicodeSet*) set)->UnicodeSet::add(c);
80 }
81
82 U_CAPI void U_EXPORT2
83 uset_addRange(USet* set, UChar32 start, UChar32 end) {
84 ((UnicodeSet*) set)->UnicodeSet::add(start, end);
85 }
86
87 U_CAPI void U_EXPORT2
88 uset_addString(USet* set, const UChar* str, int32_t strLen) {
89 // UnicodeString handles -1 for strLen
90 UnicodeString s(strLen<0, str, strLen);
91 ((UnicodeSet*) set)->UnicodeSet::add(s);
92 }
93
94 U_CAPI void U_EXPORT2
95 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) {
96 // UnicodeString handles -1 for strLen
97 UnicodeString s(str, strLen);
98 ((UnicodeSet*) set)->UnicodeSet::addAll(s);
99 }
100
101 U_CAPI void U_EXPORT2
102 uset_remove(USet* set, UChar32 c) {
103 ((UnicodeSet*) set)->UnicodeSet::remove(c);
104 }
105
106 U_CAPI void U_EXPORT2
107 uset_removeRange(USet* set, UChar32 start, UChar32 end) {
108 ((UnicodeSet*) set)->UnicodeSet::remove(start, end);
109 }
110
111 U_CAPI void U_EXPORT2
112 uset_removeString(USet* set, const UChar* str, int32_t strLen) {
113 UnicodeString s(strLen==-1, str, strLen);
114 ((UnicodeSet*) set)->UnicodeSet::remove(s);
115 }
116
117 U_CAPI void U_EXPORT2
118 uset_removeAll(USet* set, const USet* remove) {
119 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove);
120 }
121
122 U_CAPI void U_EXPORT2
123 uset_retain(USet* set, UChar32 start, UChar32 end) {
124 ((UnicodeSet*) set)->UnicodeSet::retain(start, end);
125 }
126
127 U_CAPI void U_EXPORT2
128 uset_retainAll(USet* set, const USet* retain) {
129 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain);
130 }
131
132 U_CAPI void U_EXPORT2
133 uset_compact(USet* set) {
134 ((UnicodeSet*) set)->UnicodeSet::compact();
135 }
136
137 U_CAPI void U_EXPORT2
138 uset_complement(USet* set) {
139 ((UnicodeSet*) set)->UnicodeSet::complement();
140 }
141
142 U_CAPI void U_EXPORT2
143 uset_complementAll(USet* set, const USet* complement) {
144 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement);
145 }
146
147 U_CAPI void U_EXPORT2
148 uset_clear(USet* set) {
149 ((UnicodeSet*) set)->UnicodeSet::clear();
150 }
151
152 U_CAPI void U_EXPORT2
153 uset_removeAllStrings(USet* set) {
154 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings();
155 }
156
157 U_CAPI UBool U_EXPORT2
158 uset_isEmpty(const USet* set) {
159 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty();
160 }
161
162 U_CAPI UBool U_EXPORT2
163 uset_contains(const USet* set, UChar32 c) {
164 return ((const UnicodeSet*) set)->UnicodeSet::contains(c);
165 }
166
167 U_CAPI UBool U_EXPORT2
168 uset_containsRange(const USet* set, UChar32 start, UChar32 end) {
169 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end);
170 }
171
172 U_CAPI UBool U_EXPORT2
173 uset_containsString(const USet* set, const UChar* str, int32_t strLen) {
174 UnicodeString s(strLen==-1, str, strLen);
175 return ((const UnicodeSet*) set)->UnicodeSet::contains(s);
176 }
177
178 U_CAPI UBool U_EXPORT2
179 uset_containsAll(const USet* set1, const USet* set2) {
180 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2);
181 }
182
183 U_CAPI UBool U_EXPORT2
184 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) {
185 // Create a string alias, since nothing is being added to the set.
186 UnicodeString s(strLen==-1, str, strLen);
187 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s);
188 }
189
190 U_CAPI UBool U_EXPORT2
191 uset_containsNone(const USet* set1, const USet* set2) {
192 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2);
193 }
194
195 U_CAPI UBool U_EXPORT2
196 uset_containsSome(const USet* set1, const USet* set2) {
197 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2);
198 }
199
200 U_CAPI int32_t U_EXPORT2
201 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
202 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition);
203 }
204
205 U_CAPI int32_t U_EXPORT2
206 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
207 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition);
208 }
209
210 U_CAPI int32_t U_EXPORT2
211 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
212 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition);
213 }
214
215 U_CAPI int32_t U_EXPORT2
216 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
217 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition);
218 }
219
220 U_CAPI UBool U_EXPORT2
221 uset_equals(const USet* set1, const USet* set2) {
222 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2;
223 }
224
225 U_CAPI int32_t U_EXPORT2
226 uset_indexOf(const USet* set, UChar32 c) {
227 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c);
228 }
229
230 U_CAPI UChar32 U_EXPORT2
231 uset_charAt(const USet* set, int32_t index) {
232 return ((UnicodeSet*) set)->UnicodeSet::charAt(index);
233 }
234
235 U_CAPI int32_t U_EXPORT2
236 uset_size(const USet* set) {
237 return ((const UnicodeSet*) set)->UnicodeSet::size();
238 }
239
240 U_NAMESPACE_BEGIN
241 /**
242 * This class only exists to provide access to the UnicodeSet private
243 * USet support API. Declaring a class a friend is more portable than
244 * trying to declare extern "C" functions as friends.
245 */
246 class USetAccess /* not : public UObject because all methods are static */ {
247 public:
248 /* Try to have the compiler inline these*/
249 inline static int32_t getStringCount(const UnicodeSet& set) {
250 return set.getStringCount();
251 }
252 inline static const UnicodeString* getString(const UnicodeSet& set,
253 int32_t i) {
254 return set.getString(i);
255 }
256 private:
257 /* do not instantiate*/
258 USetAccess();
259 };
260 U_NAMESPACE_END
261
262 U_CAPI int32_t U_EXPORT2
263 uset_getItemCount(const USet* uset) {
264 const UnicodeSet& set = *(const UnicodeSet*)uset;
265 return set.getRangeCount() + USetAccess::getStringCount(set);
266 }
267
268 U_CAPI int32_t U_EXPORT2
269 uset_getItem(const USet* uset, int32_t itemIndex,
270 UChar32* start, UChar32* end,
271 UChar* str, int32_t strCapacity,
272 UErrorCode* ec) {
273 if (U_FAILURE(*ec)) return 0;
274 const UnicodeSet& set = *(const UnicodeSet*)uset;
275 int32_t rangeCount;
276
277 if (itemIndex < 0) {
278 *ec = U_ILLEGAL_ARGUMENT_ERROR;
279 return -1;
280 } else if (itemIndex < (rangeCount = set.getRangeCount())) {
281 *start = set.getRangeStart(itemIndex);
282 *end = set.getRangeEnd(itemIndex);
283 return 0;
284 } else {
285 itemIndex -= rangeCount;
286 if (itemIndex < USetAccess::getStringCount(set)) {
287 const UnicodeString* s = USetAccess::getString(set, itemIndex);
288 return s->extract(str, strCapacity, *ec);
289 } else {
290 *ec = U_INDEX_OUTOFBOUNDS_ERROR;
291 return -1;
292 }
293 }
294 }
295
296 //U_CAPI int32_t U_EXPORT2
297 //uset_getRangeCount(const USet* set) {
298 // return ((const UnicodeSet*) set)->getRangeCount();
299 //}
300 //
301 //U_CAPI UBool U_EXPORT2
302 //uset_getRange(const USet* set, int32_t rangeIndex,
303 // UChar32* pStart, UChar32* pEnd) {
304 // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
305 // return FALSE;
306 // }
307 // const UnicodeSet* us = (const UnicodeSet*) set;
308 // *pStart = us->getRangeStart(rangeIndex);
309 // *pEnd = us->getRangeEnd(rangeIndex);
310 // return TRUE;
311 //}
312
313 /*
314 * Serialize a USet into 16-bit units.
315 * Store BMP code points as themselves with one 16-bit unit each.
316 *
317 * Important: the code points in the array are in ascending order,
318 * therefore all BMP code points precede all supplementary code points.
319 *
320 * Store each supplementary code point in 2 16-bit units,
321 * simply with higher-then-lower 16-bit halfs.
322 *
323 * Precede the entire list with the length.
324 * If there are supplementary code points, then set bit 15 in the length
325 * and add the bmpLength between it and the array.
326 *
327 * In other words:
328 * - all BMP: (length=bmpLength) BMP, .., BMP
329 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
330 */
331 U_CAPI int32_t U_EXPORT2
332 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
333 if (ec==NULL || U_FAILURE(*ec)) {
334 return 0;
335 }
336
337 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec);
338 }
339
340 U_CAPI UBool U_EXPORT2
341 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
342 int32_t length;
343
344 if(fillSet==NULL) {
345 return FALSE;
346 }
347 if(src==NULL || srcLength<=0) {
348 fillSet->length=fillSet->bmpLength=0;
349 return FALSE;
350 }
351
352 length=*src++;
353 if(length&0x8000) {
354 /* there are supplementary values */
355 length&=0x7fff;
356 if(srcLength<(2+length)) {
357 fillSet->length=fillSet->bmpLength=0;
358 return FALSE;
359 }
360 fillSet->bmpLength=*src++;
361 } else {
362 /* only BMP values */
363 if(srcLength<(1+length)) {
364 fillSet->length=fillSet->bmpLength=0;
365 return FALSE;
366 }
367 fillSet->bmpLength=length;
368 }
369 fillSet->array=src;
370 fillSet->length=length;
371 return TRUE;
372 }
373
374 U_CAPI void U_EXPORT2
375 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
376 if(fillSet==NULL || (uint32_t)c>0x10ffff) {
377 return;
378 }
379
380 fillSet->array=fillSet->staticArray;
381 if(c<0xffff) {
382 fillSet->bmpLength=fillSet->length=2;
383 fillSet->staticArray[0]=(uint16_t)c;
384 fillSet->staticArray[1]=(uint16_t)c+1;
385 } else if(c==0xffff) {
386 fillSet->bmpLength=1;
387 fillSet->length=3;
388 fillSet->staticArray[0]=0xffff;
389 fillSet->staticArray[1]=1;
390 fillSet->staticArray[2]=0;
391 } else if(c<0x10ffff) {
392 fillSet->bmpLength=0;
393 fillSet->length=4;
394 fillSet->staticArray[0]=(uint16_t)(c>>16);
395 fillSet->staticArray[1]=(uint16_t)c;
396 ++c;
397 fillSet->staticArray[2]=(uint16_t)(c>>16);
398 fillSet->staticArray[3]=(uint16_t)c;
399 } else /* c==0x10ffff */ {
400 fillSet->bmpLength=0;
401 fillSet->length=2;
402 fillSet->staticArray[0]=0x10;
403 fillSet->staticArray[1]=0xffff;
404 }
405 }
406
407 U_CAPI UBool U_EXPORT2
408 uset_serializedContains(const USerializedSet* set, UChar32 c) {
409 const uint16_t* array;
410
411 if(set==NULL || (uint32_t)c>0x10ffff) {
412 return FALSE;
413 }
414
415 array=set->array;
416 if(c<=0xffff) {
417 /* find c in the BMP part */
418 int32_t lo = 0;
419 int32_t hi = set->bmpLength-1;
420 if (c < array[0]) {
421 hi = 0;
422 } else if (c < array[hi]) {
423 for(;;) {
424 int32_t i = (lo + hi) >> 1;
425 if (i == lo) {
426 break; // Done!
427 } else if (c < array[i]) {
428 hi = i;
429 } else {
430 lo = i;
431 }
432 }
433 } else {
434 hi += 1;
435 }
436 return (UBool)(hi&1);
437 } else {
438 /* find c in the supplementary part */
439 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c;
440 int32_t base = set->bmpLength;
441 int32_t lo = 0;
442 int32_t hi = set->length - 2 - base;
443 if (high < array[base] || (high==array[base] && low<array[base+1])) {
444 hi = 0;
445 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) {
446 for (;;) {
447 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result
448 int32_t iabs = i + base;
449 if (i == lo) {
450 break; // Done!
451 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) {
452 hi = i;
453 } else {
454 lo = i;
455 }
456 }
457 } else {
458 hi += 2;
459 }
460 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
461 return (UBool)(((hi+(base<<1))&2)!=0);
462 }
463 }
464
465 U_CAPI int32_t U_EXPORT2
466 uset_getSerializedRangeCount(const USerializedSet* set) {
467 if(set==NULL) {
468 return 0;
469 }
470
471 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2;
472 }
473
474 U_CAPI UBool U_EXPORT2
475 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
476 UChar32* pStart, UChar32* pEnd) {
477 const uint16_t* array;
478 int32_t bmpLength, length;
479
480 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) {
481 return FALSE;
482 }
483
484 array=set->array;
485 length=set->length;
486 bmpLength=set->bmpLength;
487
488 rangeIndex*=2; /* address start/limit pairs */
489 if(rangeIndex<bmpLength) {
490 *pStart=array[rangeIndex++];
491 if(rangeIndex<bmpLength) {
492 *pEnd=array[rangeIndex]-1;
493 } else if(rangeIndex<length) {
494 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
495 } else {
496 *pEnd=0x10ffff;
497 }
498 return TRUE;
499 } else {
500 rangeIndex-=bmpLength;
501 rangeIndex*=2; /* address pairs of pairs of units */
502 length-=bmpLength;
503 if(rangeIndex<length) {
504 array+=bmpLength;
505 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
506 rangeIndex+=2;
507 if(rangeIndex<length) {
508 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
509 } else {
510 *pEnd=0x10ffff;
511 }
512 return TRUE;
513 } else {
514 return FALSE;
515 }
516 }
517 }
518
519 // TODO The old, internal uset.c had an efficient uset_containsOne function.
520 // Returned the one and only code point, or else -1 or something.
521 // Consider adding such a function to both C and C++ UnicodeSet/uset.
522 // See tools/gennorm/store.c for usage, now usetContainsOne there.
523
524 // TODO Investigate incorporating this code into UnicodeSet to improve
525 // efficiency.
526 // ---
527 // #define USET_GROW_DELTA 20
528 //
529 // static int32_t
530 // findChar(const UChar32* array, int32_t length, UChar32 c) {
531 // int32_t i;
532 //
533 // /* check the last range limit first for more efficient appending */
534 // if(length>0) {
535 // if(c>=array[length-1]) {
536 // return length;
537 // }
538 //
539 // /* do not check the last range limit again in the loop below */
540 // --length;
541 // }
542 //
543 // for(i=0; i<length && c>=array[i]; ++i) {}
544 // return i;
545 // }
546 //
547 // static UBool
548 // addRemove(USet* set, UChar32 c, int32_t doRemove) {
549 // int32_t i, length, more;
550 //
551 // if(set==NULL || (uint32_t)c>0x10ffff) {
552 // return FALSE;
553 // }
554 //
555 // length=set->length;
556 // i=findChar(set->array, length, c);
557 // if((i&1)^doRemove) {
558 // /* c is already in the set */
559 // return TRUE;
560 // }
561 //
562 // /* how many more array items do we need? */
563 // if(i<length && (c+1)==set->array[i]) {
564 // /* c is just before the following range, extend that in-place by one */
565 // set->array[i]=c;
566 // if(i>0) {
567 // --i;
568 // if(c==set->array[i]) {
569 // /* the previous range collapsed, remove it */
570 // set->length=length-=2;
571 // if(i<length) {
572 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
573 // }
574 // }
575 // }
576 // return TRUE;
577 // } else if(i>0 && c==set->array[i-1]) {
578 // /* c is just after the previous range, extend that in-place by one */
579 // if(++c<=0x10ffff) {
580 // set->array[i-1]=c;
581 // if(i<length && c==set->array[i]) {
582 // /* the following range collapsed, remove it */
583 // --i;
584 // set->length=length-=2;
585 // if(i<length) {
586 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
587 // }
588 // }
589 // } else {
590 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
591 // set->length=i-1;
592 // }
593 // return TRUE;
594 // } else if(i==length && c==0x10ffff) {
595 // /* insert one range limit c */
596 // more=1;
597 // } else {
598 // /* insert two range limits c, c+1 */
599 // more=2;
600 // }
601 //
602 // /* insert <more> range limits */
603 // if(length+more>set->capacity) {
604 // /* reallocate */
605 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
606 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
607 // if(newArray==NULL) {
608 // return FALSE;
609 // }
610 // set->capacity=newCapacity;
611 // uprv_memcpy(newArray, set->array, length*4);
612 //
613 // if(set->array!=set->staticBuffer) {
614 // uprv_free(set->array);
615 // }
616 // set->array=newArray;
617 // }
618 //
619 // if(i<length) {
620 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
621 // }
622 // set->array[i]=c;
623 // if(more==2) {
624 // set->array[i+1]=c+1;
625 // }
626 // set->length+=more;
627 //
628 // return TRUE;
629 // }
630 //
631 // U_CAPI UBool U_EXPORT2
632 // uset_add(USet* set, UChar32 c) {
633 // return addRemove(set, c, 0);
634 // }
635 //
636 // U_CAPI void U_EXPORT2
637 // uset_remove(USet* set, UChar32 c) {
638 // addRemove(set, c, 1);
639 // }