1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2002-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   tab size:   8 (not used)
*
*   created on: 2002mar07
*   created by: Markus W. Scherer
*
*   There are functions to efficiently serialize a USet into an array of uint16_t
*   and functions to use such a serialized form efficiently without
*   instantiating a new USet.
*/
23 #include "unicode/utypes.h"
24 #include "unicode/uobject.h"
25 #include "unicode/uset.h"
26 #include "unicode/uniset.h"
28 #include "unicode/ustring.h"
29 #include "unicode/parsepos.h"
33 U_CAPI USet
* U_EXPORT2
35 return (USet
*) new UnicodeSet();
38 U_CAPI USet
* U_EXPORT2
39 uset_open(UChar32 start
, UChar32 end
) {
40 return (USet
*) new UnicodeSet(start
, end
);
44 uset_close(USet
* set
) {
45 delete (UnicodeSet
*) set
;
48 U_CAPI USet
* U_EXPORT2
49 uset_clone(const USet
*set
) {
50 return (USet
*) (((UnicodeSet
*) set
)->UnicodeSet::clone());
53 U_CAPI UBool U_EXPORT2
54 uset_isFrozen(const USet
*set
) {
55 return ((UnicodeSet
*) set
)->UnicodeSet::isFrozen();
59 uset_freeze(USet
*set
) {
60 ((UnicodeSet
*) set
)->UnicodeSet::freeze();
63 U_CAPI USet
* U_EXPORT2
64 uset_cloneAsThawed(const USet
*set
) {
65 return (USet
*) (((UnicodeSet
*) set
)->UnicodeSet::cloneAsThawed());
70 UChar32 start
, UChar32 end
) {
71 ((UnicodeSet
*) set
)->UnicodeSet::set(start
, end
);
75 uset_addAll(USet
* set
, const USet
*additionalSet
) {
76 ((UnicodeSet
*) set
)->UnicodeSet::addAll(*((const UnicodeSet
*)additionalSet
));
80 uset_add(USet
* set
, UChar32 c
) {
81 ((UnicodeSet
*) set
)->UnicodeSet::add(c
);
85 uset_addRange(USet
* set
, UChar32 start
, UChar32 end
) {
86 ((UnicodeSet
*) set
)->UnicodeSet::add(start
, end
);
90 uset_addString(USet
* set
, const UChar
* str
, int32_t strLen
) {
91 // UnicodeString handles -1 for strLen
92 UnicodeString
s(strLen
<0, str
, strLen
);
93 ((UnicodeSet
*) set
)->UnicodeSet::add(s
);
97 uset_addAllCodePoints(USet
* set
, const UChar
*str
, int32_t strLen
) {
98 // UnicodeString handles -1 for strLen
99 UnicodeString
s(str
, strLen
);
100 ((UnicodeSet
*) set
)->UnicodeSet::addAll(s
);
103 U_CAPI
void U_EXPORT2
104 uset_remove(USet
* set
, UChar32 c
) {
105 ((UnicodeSet
*) set
)->UnicodeSet::remove(c
);
108 U_CAPI
void U_EXPORT2
109 uset_removeRange(USet
* set
, UChar32 start
, UChar32 end
) {
110 ((UnicodeSet
*) set
)->UnicodeSet::remove(start
, end
);
113 U_CAPI
void U_EXPORT2
114 uset_removeString(USet
* set
, const UChar
* str
, int32_t strLen
) {
115 UnicodeString
s(strLen
==-1, str
, strLen
);
116 ((UnicodeSet
*) set
)->UnicodeSet::remove(s
);
119 U_CAPI
void U_EXPORT2
120 uset_removeAll(USet
* set
, const USet
* remove
) {
121 ((UnicodeSet
*) set
)->UnicodeSet::removeAll(*(const UnicodeSet
*)remove
);
124 U_CAPI
void U_EXPORT2
125 uset_retain(USet
* set
, UChar32 start
, UChar32 end
) {
126 ((UnicodeSet
*) set
)->UnicodeSet::retain(start
, end
);
129 U_CAPI
void U_EXPORT2
130 uset_retainAll(USet
* set
, const USet
* retain
) {
131 ((UnicodeSet
*) set
)->UnicodeSet::retainAll(*(const UnicodeSet
*)retain
);
134 U_CAPI
void U_EXPORT2
135 uset_compact(USet
* set
) {
136 ((UnicodeSet
*) set
)->UnicodeSet::compact();
139 U_CAPI
void U_EXPORT2
140 uset_complement(USet
* set
) {
141 ((UnicodeSet
*) set
)->UnicodeSet::complement();
144 U_CAPI
void U_EXPORT2
145 uset_complementAll(USet
* set
, const USet
* complement
) {
146 ((UnicodeSet
*) set
)->UnicodeSet::complementAll(*(const UnicodeSet
*)complement
);
149 U_CAPI
void U_EXPORT2
150 uset_clear(USet
* set
) {
151 ((UnicodeSet
*) set
)->UnicodeSet::clear();
154 U_CAPI
void U_EXPORT2
155 uset_removeAllStrings(USet
* set
) {
156 ((UnicodeSet
*) set
)->UnicodeSet::removeAllStrings();
159 U_CAPI UBool U_EXPORT2
160 uset_isEmpty(const USet
* set
) {
161 return ((const UnicodeSet
*) set
)->UnicodeSet::isEmpty();
164 U_CAPI UBool U_EXPORT2
165 uset_contains(const USet
* set
, UChar32 c
) {
166 return ((const UnicodeSet
*) set
)->UnicodeSet::contains(c
);
169 U_CAPI UBool U_EXPORT2
170 uset_containsRange(const USet
* set
, UChar32 start
, UChar32 end
) {
171 return ((const UnicodeSet
*) set
)->UnicodeSet::contains(start
, end
);
174 U_CAPI UBool U_EXPORT2
175 uset_containsString(const USet
* set
, const UChar
* str
, int32_t strLen
) {
176 UnicodeString
s(strLen
==-1, str
, strLen
);
177 return ((const UnicodeSet
*) set
)->UnicodeSet::contains(s
);
180 U_CAPI UBool U_EXPORT2
181 uset_containsAll(const USet
* set1
, const USet
* set2
) {
182 return ((const UnicodeSet
*) set1
)->UnicodeSet::containsAll(* (const UnicodeSet
*) set2
);
185 U_CAPI UBool U_EXPORT2
186 uset_containsAllCodePoints(const USet
* set
, const UChar
*str
, int32_t strLen
) {
187 // Create a string alias, since nothing is being added to the set.
188 UnicodeString
s(strLen
==-1, str
, strLen
);
189 return ((const UnicodeSet
*) set
)->UnicodeSet::containsAll(s
);
192 U_CAPI UBool U_EXPORT2
193 uset_containsNone(const USet
* set1
, const USet
* set2
) {
194 return ((const UnicodeSet
*) set1
)->UnicodeSet::containsNone(* (const UnicodeSet
*) set2
);
197 U_CAPI UBool U_EXPORT2
198 uset_containsSome(const USet
* set1
, const USet
* set2
) {
199 return ((const UnicodeSet
*) set1
)->UnicodeSet::containsSome(* (const UnicodeSet
*) set2
);
202 U_CAPI
int32_t U_EXPORT2
203 uset_span(const USet
*set
, const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) {
204 return ((UnicodeSet
*) set
)->UnicodeSet::span(s
, length
, spanCondition
);
207 U_CAPI
int32_t U_EXPORT2
208 uset_spanBack(const USet
*set
, const UChar
*s
, int32_t length
, USetSpanCondition spanCondition
) {
209 return ((UnicodeSet
*) set
)->UnicodeSet::spanBack(s
, length
, spanCondition
);
212 U_CAPI
int32_t U_EXPORT2
213 uset_spanUTF8(const USet
*set
, const char *s
, int32_t length
, USetSpanCondition spanCondition
) {
214 return ((UnicodeSet
*) set
)->UnicodeSet::spanUTF8(s
, length
, spanCondition
);
217 U_CAPI
int32_t U_EXPORT2
218 uset_spanBackUTF8(const USet
*set
, const char *s
, int32_t length
, USetSpanCondition spanCondition
) {
219 return ((UnicodeSet
*) set
)->UnicodeSet::spanBackUTF8(s
, length
, spanCondition
);
222 U_CAPI UBool U_EXPORT2
223 uset_equals(const USet
* set1
, const USet
* set2
) {
224 return *(const UnicodeSet
*)set1
== *(const UnicodeSet
*)set2
;
227 U_CAPI
int32_t U_EXPORT2
228 uset_indexOf(const USet
* set
, UChar32 c
) {
229 return ((UnicodeSet
*) set
)->UnicodeSet::indexOf(c
);
232 U_CAPI UChar32 U_EXPORT2
233 uset_charAt(const USet
* set
, int32_t index
) {
234 return ((UnicodeSet
*) set
)->UnicodeSet::charAt(index
);
237 U_CAPI
int32_t U_EXPORT2
238 uset_size(const USet
* set
) {
239 return ((const UnicodeSet
*) set
)->UnicodeSet::size();
244 * This class only exists to provide access to the UnicodeSet private
245 * USet support API. Declaring a class a friend is more portable than
246 * trying to declare extern "C" functions as friends.
248 class USetAccess
/* not : public UObject because all methods are static */ {
250 /* Try to have the compiler inline these*/
251 inline static int32_t getStringCount(const UnicodeSet
& set
) {
252 return set
.getStringCount();
254 inline static const UnicodeString
* getString(const UnicodeSet
& set
,
256 return set
.getString(i
);
259 /* do not instantiate*/
264 U_CAPI
int32_t U_EXPORT2
265 uset_getItemCount(const USet
* uset
) {
266 const UnicodeSet
& set
= *(const UnicodeSet
*)uset
;
267 return set
.getRangeCount() + USetAccess::getStringCount(set
);
270 U_CAPI
int32_t U_EXPORT2
271 uset_getItem(const USet
* uset
, int32_t itemIndex
,
272 UChar32
* start
, UChar32
* end
,
273 UChar
* str
, int32_t strCapacity
,
275 if (U_FAILURE(*ec
)) return 0;
276 const UnicodeSet
& set
= *(const UnicodeSet
*)uset
;
280 *ec
= U_ILLEGAL_ARGUMENT_ERROR
;
282 } else if (itemIndex
< (rangeCount
= set
.getRangeCount())) {
283 *start
= set
.getRangeStart(itemIndex
);
284 *end
= set
.getRangeEnd(itemIndex
);
287 itemIndex
-= rangeCount
;
288 if (itemIndex
< USetAccess::getStringCount(set
)) {
289 const UnicodeString
* s
= USetAccess::getString(set
, itemIndex
);
290 return s
->extract(str
, strCapacity
, *ec
);
292 *ec
= U_INDEX_OUTOFBOUNDS_ERROR
;
298 //U_CAPI int32_t U_EXPORT2
299 //uset_getRangeCount(const USet* set) {
300 // return ((const UnicodeSet*) set)->getRangeCount();
303 //U_CAPI UBool U_EXPORT2
304 //uset_getRange(const USet* set, int32_t rangeIndex,
305 // UChar32* pStart, UChar32* pEnd) {
306 // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
309 // const UnicodeSet* us = (const UnicodeSet*) set;
310 // *pStart = us->getRangeStart(rangeIndex);
311 // *pEnd = us->getRangeEnd(rangeIndex);
316 * Serialize a USet into 16-bit units.
317 * Store BMP code points as themselves with one 16-bit unit each.
319 * Important: the code points in the array are in ascending order,
320 * therefore all BMP code points precede all supplementary code points.
322 * Store each supplementary code point in 2 16-bit units,
323 * simply with higher-then-lower 16-bit halfs.
325 * Precede the entire list with the length.
326 * If there are supplementary code points, then set bit 15 in the length
327 * and add the bmpLength between it and the array.
330 * - all BMP: (length=bmpLength) BMP, .., BMP
331 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
333 U_CAPI
int32_t U_EXPORT2
334 uset_serialize(const USet
* set
, uint16_t* dest
, int32_t destCapacity
, UErrorCode
* ec
) {
335 if (ec
==NULL
|| U_FAILURE(*ec
)) {
339 return ((const UnicodeSet
*) set
)->UnicodeSet::serialize(dest
, destCapacity
,* ec
);
342 U_CAPI UBool U_EXPORT2
343 uset_getSerializedSet(USerializedSet
* fillSet
, const uint16_t* src
, int32_t srcLength
) {
349 if(src
==NULL
|| srcLength
<=0) {
350 fillSet
->length
=fillSet
->bmpLength
=0;
356 /* there are supplementary values */
358 if(srcLength
<(2+length
)) {
359 fillSet
->length
=fillSet
->bmpLength
=0;
362 fillSet
->bmpLength
=*src
++;
364 /* only BMP values */
365 if(srcLength
<(1+length
)) {
366 fillSet
->length
=fillSet
->bmpLength
=0;
369 fillSet
->bmpLength
=length
;
372 fillSet
->length
=length
;
376 U_CAPI
void U_EXPORT2
377 uset_setSerializedToOne(USerializedSet
* fillSet
, UChar32 c
) {
378 if(fillSet
==NULL
|| (uint32_t)c
>0x10ffff) {
382 fillSet
->array
=fillSet
->staticArray
;
384 fillSet
->bmpLength
=fillSet
->length
=2;
385 fillSet
->staticArray
[0]=(uint16_t)c
;
386 fillSet
->staticArray
[1]=(uint16_t)c
+1;
387 } else if(c
==0xffff) {
388 fillSet
->bmpLength
=1;
390 fillSet
->staticArray
[0]=0xffff;
391 fillSet
->staticArray
[1]=1;
392 fillSet
->staticArray
[2]=0;
393 } else if(c
<0x10ffff) {
394 fillSet
->bmpLength
=0;
396 fillSet
->staticArray
[0]=(uint16_t)(c
>>16);
397 fillSet
->staticArray
[1]=(uint16_t)c
;
399 fillSet
->staticArray
[2]=(uint16_t)(c
>>16);
400 fillSet
->staticArray
[3]=(uint16_t)c
;
401 } else /* c==0x10ffff */ {
402 fillSet
->bmpLength
=0;
404 fillSet
->staticArray
[0]=0x10;
405 fillSet
->staticArray
[1]=0xffff;
409 U_CAPI UBool U_EXPORT2
410 uset_serializedContains(const USerializedSet
* set
, UChar32 c
) {
411 const uint16_t* array
;
413 if(set
==NULL
|| (uint32_t)c
>0x10ffff) {
419 /* find c in the BMP part */
421 int32_t hi
= set
->bmpLength
-1;
424 } else if (c
< array
[hi
]) {
426 int32_t i
= (lo
+ hi
) >> 1;
429 } else if (c
< array
[i
]) {
438 return (UBool
)(hi
&1);
440 /* find c in the supplementary part */
441 uint16_t high
=(uint16_t)(c
>>16), low
=(uint16_t)c
;
442 int32_t base
= set
->bmpLength
;
444 int32_t hi
= set
->length
- 2 - base
;
445 if (high
< array
[base
] || (high
==array
[base
] && low
<array
[base
+1])) {
447 } else if (high
< array
[base
+hi
] || (high
==array
[base
+hi
] && low
<array
[base
+hi
+1])) {
449 int32_t i
= ((lo
+ hi
) >> 1) & ~1; // Guarantee even result
450 int32_t iabs
= i
+ base
;
453 } else if (high
< array
[iabs
] || (high
==array
[iabs
] && low
<array
[iabs
+1])) {
462 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
463 return (UBool
)(((hi
+(base
<<1))&2)!=0);
467 U_CAPI
int32_t U_EXPORT2
468 uset_getSerializedRangeCount(const USerializedSet
* set
) {
473 return (set
->bmpLength
+(set
->length
-set
->bmpLength
)/2+1)/2;
476 U_CAPI UBool U_EXPORT2
477 uset_getSerializedRange(const USerializedSet
* set
, int32_t rangeIndex
,
478 UChar32
* pStart
, UChar32
* pEnd
) {
479 const uint16_t* array
;
480 int32_t bmpLength
, length
;
482 if(set
==NULL
|| rangeIndex
<0 || pStart
==NULL
|| pEnd
==NULL
) {
488 bmpLength
=set
->bmpLength
;
490 rangeIndex
*=2; /* address start/limit pairs */
491 if(rangeIndex
<bmpLength
) {
492 *pStart
=array
[rangeIndex
++];
493 if(rangeIndex
<bmpLength
) {
494 *pEnd
=array
[rangeIndex
]-1;
495 } else if(rangeIndex
<length
) {
496 *pEnd
=((((int32_t)array
[rangeIndex
])<<16)|array
[rangeIndex
+1])-1;
502 rangeIndex
-=bmpLength
;
503 rangeIndex
*=2; /* address pairs of pairs of units */
505 if(rangeIndex
<length
) {
507 *pStart
=(((int32_t)array
[rangeIndex
])<<16)|array
[rangeIndex
+1];
509 if(rangeIndex
<length
) {
510 *pEnd
=((((int32_t)array
[rangeIndex
])<<16)|array
[rangeIndex
+1])-1;
521 // TODO The old, internal uset.c had an efficient uset_containsOne function.
522 // Returned the one and only code point, or else -1 or something.
523 // Consider adding such a function to both C and C++ UnicodeSet/uset.
524 // See tools/gennorm/store.c for usage, now usetContainsOne there.
526 // TODO Investigate incorporating this code into UnicodeSet to improve
529 // #define USET_GROW_DELTA 20
532 // findChar(const UChar32* array, int32_t length, UChar32 c) {
535 // /* check the last range limit first for more efficient appending */
537 // if(c>=array[length-1]) {
541 // /* do not check the last range limit again in the loop below */
545 // for(i=0; i<length && c>=array[i]; ++i) {}
550 // addRemove(USet* set, UChar32 c, int32_t doRemove) {
551 // int32_t i, length, more;
553 // if(set==NULL || (uint32_t)c>0x10ffff) {
557 // length=set->length;
558 // i=findChar(set->array, length, c);
559 // if((i&1)^doRemove) {
560 // /* c is already in the set */
564 // /* how many more array items do we need? */
565 // if(i<length && (c+1)==set->array[i]) {
566 // /* c is just before the following range, extend that in-place by one */
570 // if(c==set->array[i]) {
571 // /* the previous range collapsed, remove it */
572 // set->length=length-=2;
574 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
579 // } else if(i>0 && c==set->array[i-1]) {
580 // /* c is just after the previous range, extend that in-place by one */
581 // if(++c<=0x10ffff) {
582 // set->array[i-1]=c;
583 // if(i<length && c==set->array[i]) {
584 // /* the following range collapsed, remove it */
586 // set->length=length-=2;
588 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4);
592 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */
596 // } else if(i==length && c==0x10ffff) {
597 // /* insert one range limit c */
600 // /* insert two range limits c, c+1 */
604 // /* insert <more> range limits */
605 // if(length+more>set->capacity) {
607 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
608 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
609 // if(newArray==NULL) {
612 // set->capacity=newCapacity;
613 // uprv_memcpy(newArray, set->array, length*4);
615 // if(set->array!=set->staticBuffer) {
616 // uprv_free(set->array);
618 // set->array=newArray;
622 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4);
626 // set->array[i+1]=c+1;
628 // set->length+=more;
633 // U_CAPI UBool U_EXPORT2
634 // uset_add(USet* set, UChar32 c) {
635 // return addRemove(set, c, 0);
638 // U_CAPI void U_EXPORT2
639 // uset_remove(USet* set, UChar32 c) {
640 // addRemove(set, c, 1);