]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uset.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / common / uset.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uset.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002mar07
14 * created by: Markus W. Scherer
15 *
16 * There are functions to efficiently serialize a USet into an array of uint16_t
17 * and functions to use such a serialized form efficiently without
18 * instantiating a new USet.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uobject.h"
23 #include "unicode/uset.h"
24 #include "unicode/uniset.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/parsepos.h"
28
29 U_NAMESPACE_USE
30
31 U_CAPI USet* U_EXPORT2
32 uset_openEmpty() {
33 return (USet*) new UnicodeSet();
34 }
35
36 U_CAPI USet* U_EXPORT2
37 uset_open(UChar32 start, UChar32 end) {
38 return (USet*) new UnicodeSet(start, end);
39 }
40
41 U_CAPI void U_EXPORT2
42 uset_close(USet* set) {
43 delete (UnicodeSet*) set;
44 }
45
46 U_CAPI USet * U_EXPORT2
47 uset_clone(const USet *set) {
48 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone());
49 }
50
51 U_CAPI UBool U_EXPORT2
52 uset_isFrozen(const USet *set) {
53 return ((UnicodeSet*) set)->UnicodeSet::isFrozen();
54 }
55
56 U_CAPI void U_EXPORT2
57 uset_freeze(USet *set) {
58 ((UnicodeSet*) set)->UnicodeSet::freeze();
59 }
60
61 U_CAPI USet * U_EXPORT2
62 uset_cloneAsThawed(const USet *set) {
63 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed());
64 }
65
66 U_CAPI void U_EXPORT2
67 uset_set(USet* set,
68 UChar32 start, UChar32 end) {
69 ((UnicodeSet*) set)->UnicodeSet::set(start, end);
70 }
71
72 U_CAPI void U_EXPORT2
73 uset_addAll(USet* set, const USet *additionalSet) {
74 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet));
75 }
76
77 U_CAPI void U_EXPORT2
78 uset_add(USet* set, UChar32 c) {
79 ((UnicodeSet*) set)->UnicodeSet::add(c);
80 }
81
82 U_CAPI void U_EXPORT2
83 uset_addRange(USet* set, UChar32 start, UChar32 end) {
84 ((UnicodeSet*) set)->UnicodeSet::add(start, end);
85 }
86
87 U_CAPI void U_EXPORT2
88 uset_addString(USet* set, const UChar* str, int32_t strLen) {
89 // UnicodeString handles -1 for strLen
90 UnicodeString s(strLen<0, str, strLen);
91 ((UnicodeSet*) set)->UnicodeSet::add(s);
92 }
93
94 U_CAPI void U_EXPORT2
95 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) {
96 // UnicodeString handles -1 for strLen
97 UnicodeString s(str, strLen);
98 ((UnicodeSet*) set)->UnicodeSet::addAll(s);
99 }
100
101 U_CAPI void U_EXPORT2
102 uset_remove(USet* set, UChar32 c) {
103 ((UnicodeSet*) set)->UnicodeSet::remove(c);
104 }
105
106 U_CAPI void U_EXPORT2
107 uset_removeRange(USet* set, UChar32 start, UChar32 end) {
108 ((UnicodeSet*) set)->UnicodeSet::remove(start, end);
109 }
110
111 U_CAPI void U_EXPORT2
112 uset_removeString(USet* set, const UChar* str, int32_t strLen) {
113 UnicodeString s(strLen==-1, str, strLen);
114 ((UnicodeSet*) set)->UnicodeSet::remove(s);
115 }
116
117 U_CAPI void U_EXPORT2
118 uset_removeAll(USet* set, const USet* remove) {
119 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove);
120 }
121
122 U_CAPI void U_EXPORT2
123 uset_retain(USet* set, UChar32 start, UChar32 end) {
124 ((UnicodeSet*) set)->UnicodeSet::retain(start, end);
125 }
126
127 U_CAPI void U_EXPORT2
128 uset_retainAll(USet* set, const USet* retain) {
129 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain);
130 }
131
132 U_CAPI void U_EXPORT2
133 uset_compact(USet* set) {
134 ((UnicodeSet*) set)->UnicodeSet::compact();
135 }
136
137 U_CAPI void U_EXPORT2
138 uset_complement(USet* set) {
139 ((UnicodeSet*) set)->UnicodeSet::complement();
140 }
141
142 U_CAPI void U_EXPORT2
143 uset_complementAll(USet* set, const USet* complement) {
144 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement);
145 }
146
147 U_CAPI void U_EXPORT2
148 uset_clear(USet* set) {
149 ((UnicodeSet*) set)->UnicodeSet::clear();
150 }
151
152 U_CAPI void U_EXPORT2
153 uset_closeOver(USet* set, int32_t attributes) {
154 ((UnicodeSet*) set)->UnicodeSet::closeOver(attributes);
155 }
156
157 U_CAPI void U_EXPORT2
158 uset_removeAllStrings(USet* set) {
159 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings();
160 }
161
162 U_CAPI UBool U_EXPORT2
163 uset_isEmpty(const USet* set) {
164 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty();
165 }
166
167 U_CAPI UBool U_EXPORT2
168 uset_contains(const USet* set, UChar32 c) {
169 return ((const UnicodeSet*) set)->UnicodeSet::contains(c);
170 }
171
172 U_CAPI UBool U_EXPORT2
173 uset_containsRange(const USet* set, UChar32 start, UChar32 end) {
174 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end);
175 }
176
177 U_CAPI UBool U_EXPORT2
178 uset_containsString(const USet* set, const UChar* str, int32_t strLen) {
179 UnicodeString s(strLen==-1, str, strLen);
180 return ((const UnicodeSet*) set)->UnicodeSet::contains(s);
181 }
182
183 U_CAPI UBool U_EXPORT2
184 uset_containsAll(const USet* set1, const USet* set2) {
185 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2);
186 }
187
188 U_CAPI UBool U_EXPORT2
189 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) {
190 // Create a string alias, since nothing is being added to the set.
191 UnicodeString s(strLen==-1, str, strLen);
192 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s);
193 }
194
195 U_CAPI UBool U_EXPORT2
196 uset_containsNone(const USet* set1, const USet* set2) {
197 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2);
198 }
199
200 U_CAPI UBool U_EXPORT2
201 uset_containsSome(const USet* set1, const USet* set2) {
202 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2);
203 }
204
205 U_CAPI int32_t U_EXPORT2
206 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
207 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition);
208 }
209
210 U_CAPI int32_t U_EXPORT2
211 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
212 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition);
213 }
214
215 U_CAPI int32_t U_EXPORT2
216 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
217 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition);
218 }
219
220 U_CAPI int32_t U_EXPORT2
221 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
222 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition);
223 }
224
225 U_CAPI UBool U_EXPORT2
226 uset_equals(const USet* set1, const USet* set2) {
227 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2;
228 }
229
230 U_CAPI int32_t U_EXPORT2
231 uset_indexOf(const USet* set, UChar32 c) {
232 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c);
233 }
234
235 U_CAPI UChar32 U_EXPORT2
236 uset_charAt(const USet* set, int32_t index) {
237 return ((UnicodeSet*) set)->UnicodeSet::charAt(index);
238 }
239
240 U_CAPI int32_t U_EXPORT2
241 uset_size(const USet* set) {
242 return ((const UnicodeSet*) set)->UnicodeSet::size();
243 }
244
245 U_NAMESPACE_BEGIN
246 /**
247 * This class only exists to provide access to the UnicodeSet private
248 * USet support API. Declaring a class a friend is more portable than
249 * trying to declare extern "C" functions as friends.
250 */
251 class USetAccess /* not : public UObject because all methods are static */ {
252 public:
253 /* Try to have the compiler inline these*/
254 inline static int32_t getStringCount(const UnicodeSet& set) {
255 return set.getStringCount();
256 }
257 inline static const UnicodeString* getString(const UnicodeSet& set,
258 int32_t i) {
259 return set.getString(i);
260 }
261 private:
262 /* do not instantiate*/
263 USetAccess();
264 };
265 U_NAMESPACE_END
266
267 U_CAPI int32_t U_EXPORT2
268 uset_getItemCount(const USet* uset) {
269 const UnicodeSet& set = *(const UnicodeSet*)uset;
270 return set.getRangeCount() + USetAccess::getStringCount(set);
271 }
272
273 U_CAPI int32_t U_EXPORT2
274 uset_getItem(const USet* uset, int32_t itemIndex,
275 UChar32* start, UChar32* end,
276 UChar* str, int32_t strCapacity,
277 UErrorCode* ec) {
278 if (U_FAILURE(*ec)) return 0;
279 const UnicodeSet& set = *(const UnicodeSet*)uset;
280 int32_t rangeCount;
281
282 if (itemIndex < 0) {
283 *ec = U_ILLEGAL_ARGUMENT_ERROR;
284 return -1;
285 } else if (itemIndex < (rangeCount = set.getRangeCount())) {
286 *start = set.getRangeStart(itemIndex);
287 *end = set.getRangeEnd(itemIndex);
288 return 0;
289 } else {
290 itemIndex -= rangeCount;
291 if (itemIndex < USetAccess::getStringCount(set)) {
292 const UnicodeString* s = USetAccess::getString(set, itemIndex);
293 return s->extract(str, strCapacity, *ec);
294 } else {
295 *ec = U_INDEX_OUTOFBOUNDS_ERROR;
296 return -1;
297 }
298 }
299 }
300
301 //U_CAPI int32_t U_EXPORT2
302 //uset_getRangeCount(const USet* set) {
303 // return ((const UnicodeSet*) set)->getRangeCount();
304 //}
305 //
306 //U_CAPI UBool U_EXPORT2
307 //uset_getRange(const USet* set, int32_t rangeIndex,
308 // UChar32* pStart, UChar32* pEnd) {
309 // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
310 // return FALSE;
311 // }
312 // const UnicodeSet* us = (const UnicodeSet*) set;
313 // *pStart = us->getRangeStart(rangeIndex);
314 // *pEnd = us->getRangeEnd(rangeIndex);
315 // return TRUE;
316 //}
317
318 /*
319 * Serialize a USet into 16-bit units.
320 * Store BMP code points as themselves with one 16-bit unit each.
321 *
322 * Important: the code points in the array are in ascending order,
323 * therefore all BMP code points precede all supplementary code points.
324 *
325 * Store each supplementary code point in 2 16-bit units,
326 * simply with higher-then-lower 16-bit halves.
327 *
328 * Precede the entire list with the length.
329 * If there are supplementary code points, then set bit 15 in the length
330 * and add the bmpLength between it and the array.
331 *
332 * In other words:
333 * - all BMP: (length=bmpLength) BMP, .., BMP
334 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
335 */
336 U_CAPI int32_t U_EXPORT2
337 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
338 if (ec==NULL || U_FAILURE(*ec)) {
339 return 0;
340 }
341
342 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec);
343 }
344
345 U_CAPI UBool U_EXPORT2
346 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
347 int32_t length;
348
349 if(fillSet==NULL) {
350 return FALSE;
351 }
352 if(src==NULL || srcLength<=0) {
353 fillSet->length=fillSet->bmpLength=0;
354 return FALSE;
355 }
356
357 length=*src++;
358 if(length&0x8000) {
359 /* there are supplementary values */
360 length&=0x7fff;
361 if(srcLength<(2+length)) {
362 fillSet->length=fillSet->bmpLength=0;
363 return FALSE;
364 }
365 fillSet->bmpLength=*src++;
366 } else {
367 /* only BMP values */
368 if(srcLength<(1+length)) {
369 fillSet->length=fillSet->bmpLength=0;
370 return FALSE;
371 }
372 fillSet->bmpLength=length;
373 }
374 fillSet->array=src;
375 fillSet->length=length;
376 return TRUE;
377 }
378
379 U_CAPI void U_EXPORT2
380 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
381 if(fillSet==NULL || (uint32_t)c>0x10ffff) {
382 return;
383 }
384
385 fillSet->array=fillSet->staticArray;
386 if(c<0xffff) {
387 fillSet->bmpLength=fillSet->length=2;
388 fillSet->staticArray[0]=(uint16_t)c;
389 fillSet->staticArray[1]=(uint16_t)c+1;
390 } else if(c==0xffff) {
391 fillSet->bmpLength=1;
392 fillSet->length=3;
393 fillSet->staticArray[0]=0xffff;
394 fillSet->staticArray[1]=1;
395 fillSet->staticArray[2]=0;
396 } else if(c<0x10ffff) {
397 fillSet->bmpLength=0;
398 fillSet->length=4;
399 fillSet->staticArray[0]=(uint16_t)(c>>16);
400 fillSet->staticArray[1]=(uint16_t)c;
401 ++c;
402 fillSet->staticArray[2]=(uint16_t)(c>>16);
403 fillSet->staticArray[3]=(uint16_t)c;
404 } else /* c==0x10ffff */ {
405 fillSet->bmpLength=0;
406 fillSet->length=2;
407 fillSet->staticArray[0]=0x10;
408 fillSet->staticArray[1]=0xffff;
409 }
410 }
411
412 U_CAPI UBool U_EXPORT2
413 uset_serializedContains(const USerializedSet* set, UChar32 c) {
414 const uint16_t* array;
415
416 if(set==NULL || (uint32_t)c>0x10ffff) {
417 return FALSE;
418 }
419
420 array=set->array;
421 if(c<=0xffff) {
422 /* find c in the BMP part */
423 int32_t lo = 0;
424 int32_t hi = set->bmpLength-1;
425 if (c < array[0]) {
426 hi = 0;
427 } else if (c < array[hi]) {
428 for(;;) {
429 int32_t i = (lo + hi) >> 1;
430 if (i == lo) {
431 break; // Done!
432 } else if (c < array[i]) {
433 hi = i;
434 } else {
435 lo = i;
436 }
437 }
438 } else {
439 hi += 1;
440 }
441 return (UBool)(hi&1);
442 } else {
443 /* find c in the supplementary part */
444 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c;
445 int32_t base = set->bmpLength;
446 int32_t lo = 0;
447 int32_t hi = set->length - 2 - base;
448 if (high < array[base] || (high==array[base] && low<array[base+1])) {
449 hi = 0;
450 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) {
451 for (;;) {
452 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result
453 int32_t iabs = i + base;
454 if (i == lo) {
455 break; // Done!
456 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) {
457 hi = i;
458 } else {
459 lo = i;
460 }
461 }
462 } else {
463 hi += 2;
464 }
465 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
466 return (UBool)(((hi+(base<<1))&2)!=0);
467 }
468 }
469
470 U_CAPI int32_t U_EXPORT2
471 uset_getSerializedRangeCount(const USerializedSet* set) {
472 if(set==NULL) {
473 return 0;
474 }
475
476 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2;
477 }
478
479 U_CAPI UBool U_EXPORT2
480 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
481 UChar32* pStart, UChar32* pEnd) {
482 const uint16_t* array;
483 int32_t bmpLength, length;
484
485 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) {
486 return FALSE;
487 }
488
489 array=set->array;
490 length=set->length;
491 bmpLength=set->bmpLength;
492
493 rangeIndex*=2; /* address start/limit pairs */
494 if(rangeIndex<bmpLength) {
495 *pStart=array[rangeIndex++];
496 if(rangeIndex<bmpLength) {
497 *pEnd=array[rangeIndex]-1;
498 } else if(rangeIndex<length) {
499 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
500 } else {
501 *pEnd=0x10ffff;
502 }
503 return TRUE;
504 } else {
505 rangeIndex-=bmpLength;
506 rangeIndex*=2; /* address pairs of pairs of units */
507 length-=bmpLength;
508 if(rangeIndex<length) {
509 array+=bmpLength;
510 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
511 rangeIndex+=2;
512 if(rangeIndex<length) {
513 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
514 } else {
515 *pEnd=0x10ffff;
516 }
517 return TRUE;
518 } else {
519 return FALSE;
520 }
521 }
522 }
523
524 // TODO The old, internal uset.c had an efficient uset_containsOne function.
525 // Returned the one and only code point, or else -1 or something.
526 // Consider adding such a function to both C and C++ UnicodeSet/uset.
527 // See tools/gennorm/store.c for usage, now usetContainsOne there.
528
529 // TODO Investigate incorporating this code into UnicodeSet to improve
530 // efficiency.
531 // ---
532 // #define USET_GROW_DELTA 20
533 //
534 // static U_INLINE int32_t
535 // findChar(const UChar32* array, int32_t length, UChar32 c) {
536 // int32_t i;
537 //
538 // /* check the last range limit first for more efficient appending */
539 // if(length>0) {
540 // if(c>=array[length-1]) {
541 // return length;
542 // }
543 //
544 // /* do not check the last range limit again in the loop below */
545 // --length;
546 // }
547 //
548 // for(i=0; i<length && c>=array[i]; ++i) {}
549 // return i;
550 // }
551 //
552 // static UBool
553 // addRemove(USet* set, UChar32 c, int32_t doRemove) {
554 // int32_t i, length, more;
555 //
556 // if(set==NULL || (uint32_t)c>0x10ffff) {
557 // return FALSE;
558 // }
559 //
560 // length=set->length;
561 // i=findChar(set->array, length, c);
562 // if((i&1)^doRemove) {
563 // /* c is already in the set */
564 // return TRUE;
565 // }
566 //
567 // /* how many more array items do we need? */
568 // if(i<length && (c+1)==set->array[i]) {
569 // /* c is just before the following range, extend that in-place by one */
570 // set->array[i]=c;
571 // if(i>0) {
572 // --i;
573 // if(c==set->array[i]) {
574 // /* the previous range collapsed, remove it */
575 // set->length=length-=2;
576 // if(i<length) {
577 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
578 // }
579 // }
580 // }
581 // return TRUE;
582 // } else if(i>0 && c==set->array[i-1]) {
583 // /* c is just after the previous range, extend that in-place by one */
584 // if(++c<=0x10ffff) {
585 // set->array[i-1]=c;
586 // if(i<length && c==set->array[i]) {
587 // /* the following range collapsed, remove it */
588 // --i;
589 // set->length=length-=2;
590 // if(i<length) {
591 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
592 // }
593 // }
594 // } else {
595 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
596 // set->length=i-1;
597 // }
598 // return TRUE;
599 // } else if(i==length && c==0x10ffff) {
600 // /* insert one range limit c */
601 // more=1;
602 // } else {
603 // /* insert two range limits c, c+1 */
604 // more=2;
605 // }
606 //
607 // /* insert <more> range limits */
608 // if(length+more>set->capacity) {
609 // /* reallocate */
610 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
611 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
612 // if(newArray==NULL) {
613 // return FALSE;
614 // }
615 // set->capacity=newCapacity;
616 // uprv_memcpy(newArray, set->array, length*4);
617 //
618 // if(set->array!=set->staticBuffer) {
619 // uprv_free(set->array);
620 // }
621 // set->array=newArray;
622 // }
623 //
624 // if(i<length) {
625 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
626 // }
627 // set->array[i]=c;
628 // if(more==2) {
629 // set->array[i+1]=c+1;
630 // }
631 // set->length+=more;
632 //
633 // return TRUE;
634 // }
635 //
636 // U_CAPI UBool U_EXPORT2
637 // uset_add(USet* set, UChar32 c) {
638 // return addRemove(set, c, 0);
639 // }
640 //
641 // U_CAPI void U_EXPORT2
642 // uset_remove(USet* set, UChar32 c) {
643 // addRemove(set, c, 1);
644 // }