]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uset.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / common / uset.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uset.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002mar07
14 * created by: Markus W. Scherer
15 *
16 * The serialized structure, the array of range limits, is
17 * the same as in UnicodeSet, except that the HIGH value is not stored.
18 *
19 * There are functions to efficiently serialize a USet into an array of uint16_t
20 * and functions to use such a serialized form efficiently without
21 * instantiating a new USet.
22 */
23
24 #include "unicode/utypes.h"
25 #include "unicode/uobject.h"
26 #include "unicode/uset.h"
27 #include "unicode/uniset.h"
28 #include "cmemory.h"
29 #include "unicode/ustring.h"
30 #include "unicode/parsepos.h"
31
32 U_NAMESPACE_USE
33
34 U_CAPI USet* U_EXPORT2
35 uset_open(UChar32 start, UChar32 end) {
36 return (USet*) new UnicodeSet(start, end);
37 }
38
39 U_CAPI void U_EXPORT2
40 uset_close(USet* set) {
41 delete (UnicodeSet*) set;
42 }
43
44 U_CAPI USet * U_EXPORT2
45 uset_clone(const USet *set) {
46 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone());
47 }
48
49 U_CAPI UBool U_EXPORT2
50 uset_isFrozen(const USet *set) {
51 return ((UnicodeSet*) set)->UnicodeSet::isFrozen();
52 }
53
54 U_CAPI void U_EXPORT2
55 uset_freeze(USet *set) {
56 ((UnicodeSet*) set)->UnicodeSet::freeze();
57 }
58
59 U_CAPI USet * U_EXPORT2
60 uset_cloneAsThawed(const USet *set) {
61 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed());
62 }
63
64 U_CAPI void U_EXPORT2
65 uset_set(USet* set,
66 UChar32 start, UChar32 end) {
67 ((UnicodeSet*) set)->UnicodeSet::set(start, end);
68 }
69
70 U_CAPI void U_EXPORT2
71 uset_addAll(USet* set, const USet *additionalSet) {
72 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet));
73 }
74
75 U_CAPI void U_EXPORT2
76 uset_add(USet* set, UChar32 c) {
77 ((UnicodeSet*) set)->UnicodeSet::add(c);
78 }
79
80 U_CAPI void U_EXPORT2
81 uset_addRange(USet* set, UChar32 start, UChar32 end) {
82 ((UnicodeSet*) set)->UnicodeSet::add(start, end);
83 }
84
85 U_CAPI void U_EXPORT2
86 uset_addString(USet* set, const UChar* str, int32_t strLen) {
87 // UnicodeString handles -1 for strLen
88 UnicodeString s(strLen<0, str, strLen);
89 ((UnicodeSet*) set)->UnicodeSet::add(s);
90 }
91
92 U_CAPI void U_EXPORT2
93 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) {
94 // UnicodeString handles -1 for strLen
95 UnicodeString s(str, strLen);
96 ((UnicodeSet*) set)->UnicodeSet::addAll(s);
97 }
98
99 U_CAPI void U_EXPORT2
100 uset_remove(USet* set, UChar32 c) {
101 ((UnicodeSet*) set)->UnicodeSet::remove(c);
102 }
103
104 U_CAPI void U_EXPORT2
105 uset_removeRange(USet* set, UChar32 start, UChar32 end) {
106 ((UnicodeSet*) set)->UnicodeSet::remove(start, end);
107 }
108
109 U_CAPI void U_EXPORT2
110 uset_removeString(USet* set, const UChar* str, int32_t strLen) {
111 UnicodeString s(strLen==-1, str, strLen);
112 ((UnicodeSet*) set)->UnicodeSet::remove(s);
113 }
114
115 U_CAPI void U_EXPORT2
116 uset_removeAll(USet* set, const USet* remove) {
117 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove);
118 }
119
120 U_CAPI void U_EXPORT2
121 uset_retain(USet* set, UChar32 start, UChar32 end) {
122 ((UnicodeSet*) set)->UnicodeSet::retain(start, end);
123 }
124
125 U_CAPI void U_EXPORT2
126 uset_retainAll(USet* set, const USet* retain) {
127 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain);
128 }
129
130 U_CAPI void U_EXPORT2
131 uset_compact(USet* set) {
132 ((UnicodeSet*) set)->UnicodeSet::compact();
133 }
134
135 U_CAPI void U_EXPORT2
136 uset_complement(USet* set) {
137 ((UnicodeSet*) set)->UnicodeSet::complement();
138 }
139
140 U_CAPI void U_EXPORT2
141 uset_complementAll(USet* set, const USet* complement) {
142 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement);
143 }
144
145 U_CAPI void U_EXPORT2
146 uset_clear(USet* set) {
147 ((UnicodeSet*) set)->UnicodeSet::clear();
148 }
149
150 U_CAPI UBool U_EXPORT2
151 uset_isEmpty(const USet* set) {
152 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty();
153 }
154
155 U_CAPI UBool U_EXPORT2
156 uset_contains(const USet* set, UChar32 c) {
157 return ((const UnicodeSet*) set)->UnicodeSet::contains(c);
158 }
159
160 U_CAPI UBool U_EXPORT2
161 uset_containsRange(const USet* set, UChar32 start, UChar32 end) {
162 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end);
163 }
164
165 U_CAPI UBool U_EXPORT2
166 uset_containsString(const USet* set, const UChar* str, int32_t strLen) {
167 UnicodeString s(strLen==-1, str, strLen);
168 return ((const UnicodeSet*) set)->UnicodeSet::contains(s);
169 }
170
171 U_CAPI UBool U_EXPORT2
172 uset_containsAll(const USet* set1, const USet* set2) {
173 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2);
174 }
175
176 U_CAPI UBool U_EXPORT2
177 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) {
178 // Create a string alias, since nothing is being added to the set.
179 UnicodeString s(strLen==-1, str, strLen);
180 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s);
181 }
182
183 U_CAPI UBool U_EXPORT2
184 uset_containsNone(const USet* set1, const USet* set2) {
185 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2);
186 }
187
188 U_CAPI UBool U_EXPORT2
189 uset_containsSome(const USet* set1, const USet* set2) {
190 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2);
191 }
192
193 U_CAPI int32_t U_EXPORT2
194 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
195 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition);
196 }
197
198 U_CAPI int32_t U_EXPORT2
199 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) {
200 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition);
201 }
202
203 U_CAPI int32_t U_EXPORT2
204 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
205 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition);
206 }
207
208 U_CAPI int32_t U_EXPORT2
209 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) {
210 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition);
211 }
212
213 U_CAPI UBool U_EXPORT2
214 uset_equals(const USet* set1, const USet* set2) {
215 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2;
216 }
217
218 U_CAPI int32_t U_EXPORT2
219 uset_indexOf(const USet* set, UChar32 c) {
220 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c);
221 }
222
223 U_CAPI UChar32 U_EXPORT2
224 uset_charAt(const USet* set, int32_t index) {
225 return ((UnicodeSet*) set)->UnicodeSet::charAt(index);
226 }
227
228 U_CAPI int32_t U_EXPORT2
229 uset_size(const USet* set) {
230 return ((const UnicodeSet*) set)->UnicodeSet::size();
231 }
232
233 U_NAMESPACE_BEGIN
234 /**
235 * This class only exists to provide access to the UnicodeSet private
236 * USet support API. Declaring a class a friend is more portable than
237 * trying to declare extern "C" functions as friends.
238 */
239 class USetAccess /* not : public UObject because all methods are static */ {
240 public:
241 /* Try to have the compiler inline these*/
242 inline static int32_t getStringCount(const UnicodeSet& set) {
243 return set.getStringCount();
244 }
245 inline static const UnicodeString* getString(const UnicodeSet& set,
246 int32_t i) {
247 return set.getString(i);
248 }
249 private:
250 /* do not instantiate*/
251 USetAccess();
252 };
253 U_NAMESPACE_END
254
255 U_CAPI int32_t U_EXPORT2
256 uset_getItemCount(const USet* uset) {
257 const UnicodeSet& set = *(const UnicodeSet*)uset;
258 return set.getRangeCount() + USetAccess::getStringCount(set);
259 }
260
261 U_CAPI int32_t U_EXPORT2
262 uset_getItem(const USet* uset, int32_t itemIndex,
263 UChar32* start, UChar32* end,
264 UChar* str, int32_t strCapacity,
265 UErrorCode* ec) {
266 if (U_FAILURE(*ec)) return 0;
267 const UnicodeSet& set = *(const UnicodeSet*)uset;
268 int32_t rangeCount;
269
270 if (itemIndex < 0) {
271 *ec = U_ILLEGAL_ARGUMENT_ERROR;
272 return -1;
273 } else if (itemIndex < (rangeCount = set.getRangeCount())) {
274 *start = set.getRangeStart(itemIndex);
275 *end = set.getRangeEnd(itemIndex);
276 return 0;
277 } else {
278 itemIndex -= rangeCount;
279 if (itemIndex < USetAccess::getStringCount(set)) {
280 const UnicodeString* s = USetAccess::getString(set, itemIndex);
281 return s->extract(str, strCapacity, *ec);
282 } else {
283 *ec = U_INDEX_OUTOFBOUNDS_ERROR;
284 return -1;
285 }
286 }
287 }
288
289 //U_CAPI int32_t U_EXPORT2
290 //uset_getRangeCount(const USet* set) {
291 // return ((const UnicodeSet*) set)->getRangeCount();
292 //}
293 //
294 //U_CAPI UBool U_EXPORT2
295 //uset_getRange(const USet* set, int32_t rangeIndex,
296 // UChar32* pStart, UChar32* pEnd) {
297 // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
298 // return FALSE;
299 // }
300 // const UnicodeSet* us = (const UnicodeSet*) set;
301 // *pStart = us->getRangeStart(rangeIndex);
302 // *pEnd = us->getRangeEnd(rangeIndex);
303 // return TRUE;
304 //}
305
306 /*
307 * Serialize a USet into 16-bit units.
308 * Store BMP code points as themselves with one 16-bit unit each.
309 *
310 * Important: the code points in the array are in ascending order,
311 * therefore all BMP code points precede all supplementary code points.
312 *
313 * Store each supplementary code point in 2 16-bit units,
314 * simply with higher-then-lower 16-bit halfs.
315 *
316 * Precede the entire list with the length.
317 * If there are supplementary code points, then set bit 15 in the length
318 * and add the bmpLength between it and the array.
319 *
320 * In other words:
321 * - all BMP: (length=bmpLength) BMP, .., BMP
322 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
323 */
324 U_CAPI int32_t U_EXPORT2
325 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
326 if (ec==NULL || U_FAILURE(*ec)) {
327 return 0;
328 }
329
330 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec);
331 }
332
333 U_CAPI UBool U_EXPORT2
334 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
335 int32_t length;
336
337 if(fillSet==NULL) {
338 return FALSE;
339 }
340 if(src==NULL || srcLength<=0) {
341 fillSet->length=fillSet->bmpLength=0;
342 return FALSE;
343 }
344
345 length=*src++;
346 if(length&0x8000) {
347 /* there are supplementary values */
348 length&=0x7fff;
349 if(srcLength<(2+length)) {
350 fillSet->length=fillSet->bmpLength=0;
351 return FALSE;
352 }
353 fillSet->bmpLength=*src++;
354 } else {
355 /* only BMP values */
356 if(srcLength<(1+length)) {
357 fillSet->length=fillSet->bmpLength=0;
358 return FALSE;
359 }
360 fillSet->bmpLength=length;
361 }
362 fillSet->array=src;
363 fillSet->length=length;
364 return TRUE;
365 }
366
367 U_CAPI void U_EXPORT2
368 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
369 if(fillSet==NULL || (uint32_t)c>0x10ffff) {
370 return;
371 }
372
373 fillSet->array=fillSet->staticArray;
374 if(c<0xffff) {
375 fillSet->bmpLength=fillSet->length=2;
376 fillSet->staticArray[0]=(uint16_t)c;
377 fillSet->staticArray[1]=(uint16_t)c+1;
378 } else if(c==0xffff) {
379 fillSet->bmpLength=1;
380 fillSet->length=3;
381 fillSet->staticArray[0]=0xffff;
382 fillSet->staticArray[1]=1;
383 fillSet->staticArray[2]=0;
384 } else if(c<0x10ffff) {
385 fillSet->bmpLength=0;
386 fillSet->length=4;
387 fillSet->staticArray[0]=(uint16_t)(c>>16);
388 fillSet->staticArray[1]=(uint16_t)c;
389 ++c;
390 fillSet->staticArray[2]=(uint16_t)(c>>16);
391 fillSet->staticArray[3]=(uint16_t)c;
392 } else /* c==0x10ffff */ {
393 fillSet->bmpLength=0;
394 fillSet->length=2;
395 fillSet->staticArray[0]=0x10;
396 fillSet->staticArray[1]=0xffff;
397 }
398 }
399
400 U_CAPI UBool U_EXPORT2
401 uset_serializedContains(const USerializedSet* set, UChar32 c) {
402 const uint16_t* array;
403
404 if(set==NULL || (uint32_t)c>0x10ffff) {
405 return FALSE;
406 }
407
408 array=set->array;
409 if(c<=0xffff) {
410 /* find c in the BMP part */
411 int32_t lo = 0;
412 int32_t hi = set->bmpLength-1;
413 if (c < array[0]) {
414 hi = 0;
415 } else if (c < array[hi]) {
416 for(;;) {
417 int32_t i = (lo + hi) >> 1;
418 if (i == lo) {
419 break; // Done!
420 } else if (c < array[i]) {
421 hi = i;
422 } else {
423 lo = i;
424 }
425 }
426 } else {
427 hi += 1;
428 }
429 return (UBool)(hi&1);
430 } else {
431 /* find c in the supplementary part */
432 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c;
433 int32_t base = set->bmpLength;
434 int32_t lo = 0;
435 int32_t hi = set->length - 2 - base;
436 if (high < array[base] || (high==array[base] && low<array[base+1])) {
437 hi = 0;
438 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) {
439 for (;;) {
440 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result
441 int32_t iabs = i + base;
442 if (i == lo) {
443 break; // Done!
444 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) {
445 hi = i;
446 } else {
447 lo = i;
448 }
449 }
450 } else {
451 hi += 2;
452 }
453 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
454 return (UBool)(((hi+(base<<1))&2)!=0);
455 }
456 }
457
458 U_CAPI int32_t U_EXPORT2
459 uset_getSerializedRangeCount(const USerializedSet* set) {
460 if(set==NULL) {
461 return 0;
462 }
463
464 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2;
465 }
466
467 U_CAPI UBool U_EXPORT2
468 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
469 UChar32* pStart, UChar32* pEnd) {
470 const uint16_t* array;
471 int32_t bmpLength, length;
472
473 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) {
474 return FALSE;
475 }
476
477 array=set->array;
478 length=set->length;
479 bmpLength=set->bmpLength;
480
481 rangeIndex*=2; /* address start/limit pairs */
482 if(rangeIndex<bmpLength) {
483 *pStart=array[rangeIndex++];
484 if(rangeIndex<bmpLength) {
485 *pEnd=array[rangeIndex];
486 } else if(rangeIndex<length) {
487 *pEnd=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
488 } else {
489 *pEnd=0x110000;
490 }
491 --*pEnd;
492 return TRUE;
493 } else {
494 rangeIndex-=bmpLength;
495 rangeIndex*=2; /* address pairs of pairs of units */
496 length-=bmpLength;
497 if(rangeIndex<length) {
498 array+=bmpLength;
499 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
500 rangeIndex+=2;
501 if(rangeIndex<length) {
502 *pEnd=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
503 } else {
504 *pEnd=0x110000;
505 }
506 --*pEnd;
507 return TRUE;
508 } else {
509 return FALSE;
510 }
511 }
512 }
513
514 // TODO The old, internal uset.c had an efficient uset_containsOne function.
515 // Returned the one and only code point, or else -1 or something.
516 // Consider adding such a function to both C and C++ UnicodeSet/uset.
517 // See tools/gennorm/store.c for usage, now usetContainsOne there.
518
519 // TODO Investigate incorporating this code into UnicodeSet to improve
520 // efficiency.
521 // ---
522 // #define USET_GROW_DELTA 20
523 //
524 // static U_INLINE int32_t
525 // findChar(const UChar32* array, int32_t length, UChar32 c) {
526 // int32_t i;
527 //
528 // /* check the last range limit first for more efficient appending */
529 // if(length>0) {
530 // if(c>=array[length-1]) {
531 // return length;
532 // }
533 //
534 // /* do not check the last range limit again in the loop below */
535 // --length;
536 // }
537 //
538 // for(i=0; i<length && c>=array[i]; ++i) {}
539 // return i;
540 // }
541 //
542 // static UBool
543 // addRemove(USet* set, UChar32 c, int32_t doRemove) {
544 // int32_t i, length, more;
545 //
546 // if(set==NULL || (uint32_t)c>0x10ffff) {
547 // return FALSE;
548 // }
549 //
550 // length=set->length;
551 // i=findChar(set->array, length, c);
552 // if((i&1)^doRemove) {
553 // /* c is already in the set */
554 // return TRUE;
555 // }
556 //
557 // /* how many more array items do we need? */
558 // if(i<length && (c+1)==set->array[i]) {
559 // /* c is just before the following range, extend that in-place by one */
560 // set->array[i]=c;
561 // if(i>0) {
562 // --i;
563 // if(c==set->array[i]) {
564 // /* the previous range collapsed, remove it */
565 // set->length=length-=2;
566 // if(i<length) {
567 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
568 // }
569 // }
570 // }
571 // return TRUE;
572 // } else if(i>0 && c==set->array[i-1]) {
573 // /* c is just after the previous range, extend that in-place by one */
574 // if(++c<=0x10ffff) {
575 // set->array[i-1]=c;
576 // if(i<length && c==set->array[i]) {
577 // /* the following range collapsed, remove it */
578 // --i;
579 // set->length=length-=2;
580 // if(i<length) {
581 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
582 // }
583 // }
584 // } else {
585 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
586 // set->length=i-1;
587 // }
588 // return TRUE;
589 // } else if(i==length && c==0x10ffff) {
590 // /* insert one range limit c */
591 // more=1;
592 // } else {
593 // /* insert two range limits c, c+1 */
594 // more=2;
595 // }
596 //
597 // /* insert <more> range limits */
598 // if(length+more>set->capacity) {
599 // /* reallocate */
600 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
601 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
602 // if(newArray==NULL) {
603 // return FALSE;
604 // }
605 // set->capacity=newCapacity;
606 // uprv_memcpy(newArray, set->array, length*4);
607 //
608 // if(set->array!=set->staticBuffer) {
609 // uprv_free(set->array);
610 // }
611 // set->array=newArray;
612 // }
613 //
614 // if(i<length) {
615 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
616 // }
617 // set->array[i]=c;
618 // if(more==2) {
619 // set->array[i+1]=c+1;
620 // }
621 // set->length+=more;
622 //
623 // return TRUE;
624 // }
625 //
626 // U_CAPI UBool U_EXPORT2
627 // uset_add(USet* set, UChar32 c) {
628 // return addRemove(set, c, 0);
629 // }
630 //
631 // U_CAPI void U_EXPORT2
632 // uset_remove(USet* set, UChar32 c) {
633 // addRemove(set, c, 1);
634 // }