]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uset.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / uset.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uset.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002mar07
14 * created by: Markus W. Scherer
15 *
16 * The serialized structure, the array of range limits, is
17 * the same as in UnicodeSet, except that the HIGH value is not stored.
18 *
19 * There are functions to efficiently serialize a USet into an array of uint16_t
20 * and functions to use such a serialized form efficiently without
21 * instantiating a new USet.
22 */
23
24 #include "unicode/utypes.h"
25 #include "unicode/uobject.h"
26 #include "unicode/uset.h"
27 #include "unicode/uniset.h"
28 #include "cmemory.h"
29 #include "unicode/ustring.h"
30 #include "unicode/parsepos.h"
31
32 U_CAPI USet* U_EXPORT2
33 uset_open(UChar32 start, UChar32 end) {
34 return (USet*) new UnicodeSet(start, end);
35 }
36
37 U_CAPI void U_EXPORT2
38 uset_close(USet* set) {
39 delete (UnicodeSet*) set;
40 }
41
42 U_CAPI void U_EXPORT2
43 uset_set(USet* set,
44 UChar32 start, UChar32 end) {
45 ((UnicodeSet*) set)->set(start, end);
46 }
47
48 U_CAPI void U_EXPORT2
49 uset_addAll(USet* set, const USet *additionalSet) {
50 ((UnicodeSet*) set)->addAll(*((const UnicodeSet*)additionalSet));
51 }
52
53 U_CAPI void U_EXPORT2
54 uset_add(USet* set, UChar32 c) {
55 ((UnicodeSet*) set)->add(c);
56 }
57
58 U_CAPI void U_EXPORT2
59 uset_addRange(USet* set, UChar32 start, UChar32 end) {
60 ((UnicodeSet*) set)->add(start, end);
61 }
62
63 U_CAPI void U_EXPORT2
64 uset_addString(USet* set, const UChar* str, int32_t strLen) {
65 // WRONG! Do not alias, it will stay aliased, even after
66 // copying. TODO: do we need a copy ctor that unaliases
67 //UnicodeString s(strLen==-1, str, strLen);
68 // We promised -1 for zero terminated
69 if(strLen == -1) {
70 strLen = u_strlen(str);
71 }
72 UnicodeString s(str, strLen);
73 ((UnicodeSet*) set)->add(s);
74 }
75
76 U_CAPI void U_EXPORT2
77 uset_remove(USet* set, UChar32 c) {
78 ((UnicodeSet*) set)->remove(c);
79 }
80
81 U_CAPI void U_EXPORT2
82 uset_removeRange(USet* set, UChar32 start, UChar32 end) {
83 ((UnicodeSet*) set)->remove(start, end);
84 }
85
86 U_CAPI void U_EXPORT2
87 uset_removeString(USet* set, const UChar* str, int32_t strLen) {
88 UnicodeString s(strLen==-1, str, strLen);
89 ((UnicodeSet*) set)->remove(s);
90 }
91
92 U_CAPI void U_EXPORT2
93 uset_removeAll(USet* set, const USet* remove) {
94 ((UnicodeSet*) set)->removeAll(*(const UnicodeSet*)remove);
95 }
96
97 U_CAPI void U_EXPORT2
98 uset_retain(USet* set, UChar32 start, UChar32 end) {
99 ((UnicodeSet*) set)->retain(start, end);
100 }
101
102 U_CAPI void U_EXPORT2
103 uset_retainAll(USet* set, const USet* retain) {
104 ((UnicodeSet*) set)->retainAll(*(const UnicodeSet*)retain);
105 }
106
107 U_CAPI void U_EXPORT2
108 uset_compact(USet* set) {
109 ((UnicodeSet*) set)->compact();
110 }
111
112 U_CAPI void U_EXPORT2
113 uset_complement(USet* set) {
114 ((UnicodeSet*) set)->complement();
115 }
116
117 U_CAPI void U_EXPORT2
118 uset_complementAll(USet* set, const USet* complement) {
119 ((UnicodeSet*) set)->complementAll(*(const UnicodeSet*)complement);
120 }
121
122 U_CAPI void U_EXPORT2
123 uset_clear(USet* set) {
124 ((UnicodeSet*) set)->clear();
125 }
126
127 U_CAPI UBool U_EXPORT2
128 uset_isEmpty(const USet* set) {
129 return ((const UnicodeSet*) set)->isEmpty();
130 }
131
132 U_CAPI UBool U_EXPORT2
133 uset_contains(const USet* set, UChar32 c) {
134 return ((const UnicodeSet*) set)->contains(c);
135 }
136
137 U_CAPI UBool U_EXPORT2
138 uset_containsRange(const USet* set, UChar32 start, UChar32 end) {
139 return ((const UnicodeSet*) set)->contains(start, end);
140 }
141
142 U_CAPI UBool U_EXPORT2
143 uset_containsString(const USet* set, const UChar* str, int32_t strLen) {
144 UnicodeString s(strLen==-1, str, strLen);
145 return ((const UnicodeSet*) set)->contains(s);
146 }
147
148 U_CAPI UBool U_EXPORT2
149 uset_containsAll(const USet* set1, const USet* set2) {
150 return ((const UnicodeSet*) set1)->containsAll(* (const UnicodeSet*) set2);
151 }
152
153 U_CAPI UBool U_EXPORT2
154 uset_containsNone(const USet* set1, const USet* set2) {
155 return ((const UnicodeSet*) set1)->containsNone(* (const UnicodeSet*) set2);
156 }
157
158 U_CAPI UBool U_EXPORT2
159 uset_containsSome(const USet* set1, const USet* set2) {
160 return ((const UnicodeSet*) set1)->containsSome(* (const UnicodeSet*) set2);
161 }
162
163 U_CAPI UBool U_EXPORT2
164 uset_equals(const USet* set1, const USet* set2) {
165 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2;
166 }
167
168 U_CAPI int32_t U_EXPORT2
169 uset_indexOf(const USet* set, UChar32 c) {
170 return ((UnicodeSet*) set)->indexOf(c);
171 }
172
173 U_CAPI UChar32 U_EXPORT2
174 uset_charAt(const USet* set, int32_t index) {
175 return ((UnicodeSet*) set)->charAt(index);
176 }
177
178 U_CAPI int32_t U_EXPORT2
179 uset_size(const USet* set) {
180 return ((const UnicodeSet*) set)->size();
181 }
182
183 U_NAMESPACE_BEGIN
184 /**
185 * This class only exists to provide access to the UnicodeSet private
186 * USet support API. Declaring a class a friend is more portable than
187 * trying to declare extern "C" functions as friends.
188 */
189 class USetAccess /* not : public UObject because all methods are static */ {
190 public:
191 /* Try to have the compiler inline these*/
192 inline static int32_t getStringCount(const UnicodeSet& set) {
193 return set.getStringCount();
194 }
195 inline static const UnicodeString* getString(const UnicodeSet& set,
196 int32_t i) {
197 return set.getString(i);
198 }
199 private:
200 /* do not instantiate*/
201 USetAccess();
202 };
203 U_NAMESPACE_END
204
205 U_CAPI int32_t U_EXPORT2
206 uset_getItemCount(const USet* uset) {
207 const UnicodeSet& set = *(const UnicodeSet*)uset;
208 return set.getRangeCount() + USetAccess::getStringCount(set);
209 }
210
211 U_CAPI int32_t U_EXPORT2
212 uset_getItem(const USet* uset, int32_t itemIndex,
213 UChar32* start, UChar32* end,
214 UChar* str, int32_t strCapacity,
215 UErrorCode* ec) {
216 if (U_FAILURE(*ec)) return 0;
217 const UnicodeSet& set = *(const UnicodeSet*)uset;
218 int32_t rangeCount;
219
220 if (itemIndex < 0) {
221 *ec = U_ILLEGAL_ARGUMENT_ERROR;
222 return -1;
223 } else if (itemIndex < (rangeCount = set.getRangeCount())) {
224 *start = set.getRangeStart(itemIndex);
225 *end = set.getRangeEnd(itemIndex);
226 return 0;
227 } else {
228 itemIndex -= rangeCount;
229 if (itemIndex < USetAccess::getStringCount(set)) {
230 const UnicodeString* s = USetAccess::getString(set, itemIndex);
231 return s->extract(str, strCapacity, *ec);
232 } else {
233 *ec = U_INDEX_OUTOFBOUNDS_ERROR;
234 return -1;
235 }
236 }
237 }
238
239 //U_CAPI int32_t U_EXPORT2
240 //uset_getRangeCount(const USet* set) {
241 // return ((const UnicodeSet*) set)->getRangeCount();
242 //}
243 //
244 //U_CAPI UBool U_EXPORT2
245 //uset_getRange(const USet* set, int32_t rangeIndex,
246 // UChar32* pStart, UChar32* pEnd) {
247 // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
248 // return FALSE;
249 // }
250 // const UnicodeSet* us = (const UnicodeSet*) set;
251 // *pStart = us->getRangeStart(rangeIndex);
252 // *pEnd = us->getRangeEnd(rangeIndex);
253 // return TRUE;
254 //}
255
256 /*
257 * Serialize a USet into 16-bit units.
258 * Store BMP code points as themselves with one 16-bit unit each.
259 *
260 * Important: the code points in the array are in ascending order,
261 * therefore all BMP code points precede all supplementary code points.
262 *
263 * Store each supplementary code point in 2 16-bit units,
264 * simply with higher-then-lower 16-bit halves.
265 *
266 * Precede the entire list with the length.
267 * If there are supplementary code points, then set bit 15 in the length
268 * and add the bmpLength between it and the array.
269 *
270 * In other words:
271 * - all BMP: (length=bmpLength) BMP, .., BMP
272 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
273 */
274 U_CAPI int32_t U_EXPORT2
275 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
276 if (ec==NULL || U_FAILURE(*ec)) {
277 return 0;
278 }
279
280 return ((const UnicodeSet*) set)->serialize(dest, destCapacity,* ec);
281 }
282
283 U_CAPI UBool U_EXPORT2
284 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
285 int32_t length;
286
287 if(fillSet==NULL) {
288 return FALSE;
289 }
290 if(src==NULL || srcLength<=0) {
291 fillSet->length=fillSet->bmpLength=0;
292 return FALSE;
293 }
294
295 length=*src++;
296 if(length&0x8000) {
297 /* there are supplementary values */
298 length&=0x7fff;
299 if(srcLength<(2+length)) {
300 fillSet->length=fillSet->bmpLength=0;
301 return FALSE;
302 }
303 fillSet->bmpLength=*src++;
304 } else {
305 /* only BMP values */
306 if(srcLength<(1+length)) {
307 fillSet->length=fillSet->bmpLength=0;
308 return FALSE;
309 }
310 fillSet->bmpLength=length;
311 }
312 fillSet->array=src;
313 fillSet->length=length;
314 return TRUE;
315 }
316
317 U_CAPI void U_EXPORT2
318 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
319 if(fillSet==NULL || (uint32_t)c>0x10ffff) {
320 return;
321 }
322
323 fillSet->array=fillSet->staticArray;
324 if(c<0xffff) {
325 fillSet->bmpLength=fillSet->length=2;
326 fillSet->staticArray[0]=(uint16_t)c;
327 fillSet->staticArray[1]=(uint16_t)c+1;
328 } else if(c==0xffff) {
329 fillSet->bmpLength=1;
330 fillSet->length=3;
331 fillSet->staticArray[0]=0xffff;
332 fillSet->staticArray[1]=1;
333 fillSet->staticArray[2]=0;
334 } else if(c<0x10ffff) {
335 fillSet->bmpLength=0;
336 fillSet->length=4;
337 fillSet->staticArray[0]=(uint16_t)(c>>16);
338 fillSet->staticArray[1]=(uint16_t)c;
339 ++c;
340 fillSet->staticArray[2]=(uint16_t)(c>>16);
341 fillSet->staticArray[3]=(uint16_t)c;
342 } else /* c==0x10ffff */ {
343 fillSet->bmpLength=0;
344 fillSet->length=2;
345 fillSet->staticArray[0]=0x10;
346 fillSet->staticArray[1]=0xffff;
347 }
348 }
349
350 U_CAPI UBool U_EXPORT2
351 uset_serializedContains(const USerializedSet* set, UChar32 c) {
352 const uint16_t* array;
353
354 if(set==NULL || (uint32_t)c>0x10ffff) {
355 return FALSE;
356 }
357
358 array=set->array;
359 if(c<=0xffff) {
360 /* find c in the BMP part */
361 int32_t i, bmpLength=set->bmpLength;
362 for(i=0; i<bmpLength && (uint16_t)c>=array[i]; ++i) {}
363 return (UBool)(i&1);
364 } else {
365 /* find c in the supplementary part */
366 int32_t i, length=set->length;
367 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c;
368 for(i=set->bmpLength;
369 i<length && (high>array[i] || (high==array[i] && low>=array[i+1]));
370 i+=2) {}
371
372 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
373 return (UBool)(((i+set->bmpLength)&2)!=0);
374 }
375 }
376
377 U_CAPI int32_t U_EXPORT2
378 uset_getSerializedRangeCount(const USerializedSet* set) {
379 if(set==NULL) {
380 return 0;
381 }
382
383 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2;
384 }
385
386 U_CAPI UBool U_EXPORT2
387 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
388 UChar32* pStart, UChar32* pEnd) {
389 const uint16_t* array;
390 int32_t bmpLength, length;
391
392 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) {
393 return FALSE;
394 }
395
396 array=set->array;
397 length=set->length;
398 bmpLength=set->bmpLength;
399
400 rangeIndex*=2; /* address start/limit pairs */
401 if(rangeIndex<bmpLength) {
402 *pStart=array[rangeIndex++];
403 if(rangeIndex<bmpLength) {
404 *pEnd=array[rangeIndex];
405 } else if(rangeIndex<length) {
406 *pEnd=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
407 } else {
408 *pEnd=0x110000;
409 }
410 --*pEnd;
411 return TRUE;
412 } else {
413 rangeIndex-=bmpLength;
414 rangeIndex*=2; /* address pairs of pairs of units */
415 length-=bmpLength;
416 if(rangeIndex<length) {
417 array+=bmpLength;
418 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
419 rangeIndex+=2;
420 if(rangeIndex<length) {
421 *pEnd=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
422 } else {
423 *pEnd=0x110000;
424 }
425 --*pEnd;
426 return TRUE;
427 } else {
428 return FALSE;
429 }
430 }
431 }
432
433 // TODO The old, internal uset.c had an efficient uset_containsOne function.
434 // Returned the one and only code point, or else -1 or something.
435 // Consider adding such a function to both C and C++ UnicodeSet/uset.
436 // See tools/gennorm/store.c for usage, now usetContainsOne there.
437
438 // TODO Investigate incorporating this code into UnicodeSet to improve
439 // efficiency.
440 // ---
441 // #define USET_GROW_DELTA 20
442 //
443 // static U_INLINE int32_t
444 // findChar(const UChar32* array, int32_t length, UChar32 c) {
445 // int32_t i;
446 //
447 // /* check the last range limit first for more efficient appending */
448 // if(length>0) {
449 // if(c>=array[length-1]) {
450 // return length;
451 // }
452 //
453 // /* do not check the last range limit again in the loop below */
454 // --length;
455 // }
456 //
457 // for(i=0; i<length && c>=array[i]; ++i) {}
458 // return i;
459 // }
460 //
461 // static UBool
462 // addRemove(USet* set, UChar32 c, int32_t doRemove) {
463 // int32_t i, length, more;
464 //
465 // if(set==NULL || (uint32_t)c>0x10ffff) {
466 // return FALSE;
467 // }
468 //
469 // length=set->length;
470 // i=findChar(set->array, length, c);
471 // if((i&1)^doRemove) {
472 // /* c is already in the set */
473 // return TRUE;
474 // }
475 //
476 // /* how many more array items do we need? */
477 // if(i<length && (c+1)==set->array[i]) {
478 // /* c is just before the following range, extend that in-place by one */
479 // set->array[i]=c;
480 // if(i>0) {
481 // --i;
482 // if(c==set->array[i]) {
483 // /* the previous range collapsed, remove it */
484 // set->length=length-=2;
485 // if(i<length) {
486 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
487 // }
488 // }
489 // }
490 // return TRUE;
491 // } else if(i>0 && c==set->array[i-1]) {
492 // /* c is just after the previous range, extend that in-place by one */
493 // if(++c<=0x10ffff) {
494 // set->array[i-1]=c;
495 // if(i<length && c==set->array[i]) {
496 // /* the following range collapsed, remove it */
497 // --i;
498 // set->length=length-=2;
499 // if(i<length) {
500 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
501 // }
502 // }
503 // } else {
504 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
505 // set->length=i-1;
506 // }
507 // return TRUE;
508 // } else if(i==length && c==0x10ffff) {
509 // /* insert one range limit c */
510 // more=1;
511 // } else {
512 // /* insert two range limits c, c+1 */
513 // more=2;
514 // }
515 //
516 // /* insert <more> range limits */
517 // if(length+more>set->capacity) {
518 // /* reallocate */
519 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
520 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
521 // if(newArray==NULL) {
522 // return FALSE;
523 // }
524 // set->capacity=newCapacity;
525 // uprv_memcpy(newArray, set->array, length*4);
526 //
527 // if(set->array!=set->staticBuffer) {
528 // uprv_free(set->array);
529 // }
530 // set->array=newArray;
531 // }
532 //
533 // if(i<length) {
534 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
535 // }
536 // set->array[i]=c;
537 // if(more==2) {
538 // set->array[i+1]=c+1;
539 // }
540 // set->length+=more;
541 //
542 // return TRUE;
543 // }
544 //
545 // U_CAPI UBool U_EXPORT2
546 // uset_add(USet* set, UChar32 c) {
547 // return addRemove(set, c, 0);
548 // }
549 //
550 // U_CAPI void U_EXPORT2
551 // uset_remove(USet* set, UChar32 c) {
552 // addRemove(set, c, 1);
553 // }