2 ******************************************************************************* 
   4 *   Copyright (C) 2002-2010, International Business Machines 
   5 *   Corporation and others.  All Rights Reserved. 
   7 ******************************************************************************* 
  10 *   tab size:   8 (not used) 
  13 *   created on: 2002mar07 
  14 *   created by: Markus W. Scherer 
  16 *   There are functions to efficiently serialize a USet into an array of uint16_t 
  17 *   and functions to use such a serialized form efficiently without 
  18 *   instantiating a new USet. 
  21 #include "unicode/utypes.h" 
  22 #include "unicode/uobject.h" 
  23 #include "unicode/uset.h" 
  24 #include "unicode/uniset.h" 
  26 #include "unicode/ustring.h" 
  27 #include "unicode/parsepos.h" 
  /*
   * Allocates a default-constructed UnicodeSet with new and returns the pointer
   * cast to the opaque USet* handle type.
   * NOTE(review): the line carrying this function's name and its closing brace
   * are not present in this listing - confirm against the full file.
   */
  31 U_CAPI USet
* U_EXPORT2
 
  33     return (USet
*) new UnicodeSet(); 
  36 U_CAPI USet
* U_EXPORT2
 
  37 uset_open(UChar32 start
, UChar32 end
) { 
  38     return (USet
*) new UnicodeSet(start
, end
); 
  42 uset_close(USet
* set
) { 
  43     delete (UnicodeSet
*) set
; 
  46 U_CAPI USet 
* U_EXPORT2
 
  47 uset_clone(const USet 
*set
) { 
  48     return (USet
*) (((UnicodeSet
*) set
)->UnicodeSet::clone()); 
  51 U_CAPI UBool U_EXPORT2
 
  52 uset_isFrozen(const USet 
*set
) { 
  53     return ((UnicodeSet
*) set
)->UnicodeSet::isFrozen(); 
  57 uset_freeze(USet 
*set
) { 
  58     ((UnicodeSet
*) set
)->UnicodeSet::freeze(); 
  61 U_CAPI USet 
* U_EXPORT2
 
  62 uset_cloneAsThawed(const USet 
*set
) { 
  63     return (USet
*) (((UnicodeSet
*) set
)->UnicodeSet::cloneAsThawed()); 
  /*
   * Tail of a wrapper whose header lines are not visible in this listing:
   * forwards (start, end) to UnicodeSet::set on the set behind the handle.
   * NOTE(review): function name, return type and first parameter are assumed to
   * be on the missing preceding lines - confirm against the full file.
   */
  68      UChar32 start
, UChar32 end
) { 
  69     ((UnicodeSet
*) set
)->UnicodeSet::set(start
, end
); 
  73 uset_addAll(USet
* set
, const USet 
*additionalSet
) { 
  74     ((UnicodeSet
*) set
)->UnicodeSet::addAll(*((const UnicodeSet
*)additionalSet
)); 
  78 uset_add(USet
* set
, UChar32 c
) { 
  79     ((UnicodeSet
*) set
)->UnicodeSet::add(c
); 
  83 uset_addRange(USet
* set
, UChar32 start
, UChar32 end
) { 
  84     ((UnicodeSet
*) set
)->UnicodeSet::add(start
, end
);     
  88 uset_addString(USet
* set
, const UChar
* str
, int32_t strLen
) { 
  89     // UnicodeString handles -1 for strLen 
  90     UnicodeString 
s(strLen
<0, str
, strLen
); 
  91     ((UnicodeSet
*) set
)->UnicodeSet::add(s
); 
  95 uset_addAllCodePoints(USet
* set
, const UChar 
*str
, int32_t strLen
) { 
  96     // UnicodeString handles -1 for strLen 
  97     UnicodeString 
s(str
, strLen
); 
  98     ((UnicodeSet
*) set
)->UnicodeSet::addAll(s
); 
 101 U_CAPI 
void U_EXPORT2
 
 102 uset_remove(USet
* set
, UChar32 c
) { 
 103     ((UnicodeSet
*) set
)->UnicodeSet::remove(c
); 
 106 U_CAPI 
void U_EXPORT2
 
 107 uset_removeRange(USet
* set
, UChar32 start
, UChar32 end
) { 
 108     ((UnicodeSet
*) set
)->UnicodeSet::remove(start
, end
); 
 111 U_CAPI 
void U_EXPORT2
 
 112 uset_removeString(USet
* set
, const UChar
* str
, int32_t strLen
) { 
 113     UnicodeString 
s(strLen
==-1, str
, strLen
); 
 114     ((UnicodeSet
*) set
)->UnicodeSet::remove(s
); 
 117 U_CAPI 
void U_EXPORT2
 
 118 uset_removeAll(USet
* set
, const USet
* remove
) { 
 119     ((UnicodeSet
*) set
)->UnicodeSet::removeAll(*(const UnicodeSet
*)remove
); 
 122 U_CAPI 
void U_EXPORT2
 
 123 uset_retain(USet
* set
, UChar32 start
, UChar32 end
) { 
 124     ((UnicodeSet
*) set
)->UnicodeSet::retain(start
, end
); 
 127 U_CAPI 
void U_EXPORT2
 
 128 uset_retainAll(USet
* set
, const USet
* retain
) { 
 129     ((UnicodeSet
*) set
)->UnicodeSet::retainAll(*(const UnicodeSet
*)retain
); 
 132 U_CAPI 
void U_EXPORT2
 
 133 uset_compact(USet
* set
) { 
 134     ((UnicodeSet
*) set
)->UnicodeSet::compact(); 
 137 U_CAPI 
void U_EXPORT2
 
 138 uset_complement(USet
* set
) { 
 139     ((UnicodeSet
*) set
)->UnicodeSet::complement(); 
 142 U_CAPI 
void U_EXPORT2
 
 143 uset_complementAll(USet
* set
, const USet
* complement
) { 
 144     ((UnicodeSet
*) set
)->UnicodeSet::complementAll(*(const UnicodeSet
*)complement
); 
 147 U_CAPI 
void U_EXPORT2
 
 148 uset_clear(USet
* set
) { 
 149     ((UnicodeSet
*) set
)->UnicodeSet::clear(); 
 152 U_CAPI 
void U_EXPORT2
 
 153 uset_closeOver(USet
* set
, int32_t attributes
) { 
 154     ((UnicodeSet
*) set
)->UnicodeSet::closeOver(attributes
); 
 157 U_CAPI 
void U_EXPORT2
 
 158 uset_removeAllStrings(USet
* set
) { 
 159     ((UnicodeSet
*) set
)->UnicodeSet::removeAllStrings(); 
 162 U_CAPI UBool U_EXPORT2
 
 163 uset_isEmpty(const USet
* set
) { 
 164     return ((const UnicodeSet
*) set
)->UnicodeSet::isEmpty(); 
 167 U_CAPI UBool U_EXPORT2
 
 168 uset_contains(const USet
* set
, UChar32 c
) { 
 169     return ((const UnicodeSet
*) set
)->UnicodeSet::contains(c
); 
 172 U_CAPI UBool U_EXPORT2
 
 173 uset_containsRange(const USet
* set
, UChar32 start
, UChar32 end
) { 
 174     return ((const UnicodeSet
*) set
)->UnicodeSet::contains(start
, end
); 
 177 U_CAPI UBool U_EXPORT2
 
 178 uset_containsString(const USet
* set
, const UChar
* str
, int32_t strLen
) { 
 179     UnicodeString 
s(strLen
==-1, str
, strLen
); 
 180     return ((const UnicodeSet
*) set
)->UnicodeSet::contains(s
); 
 183 U_CAPI UBool U_EXPORT2
 
 184 uset_containsAll(const USet
* set1
, const USet
* set2
) { 
 185     return ((const UnicodeSet
*) set1
)->UnicodeSet::containsAll(* (const UnicodeSet
*) set2
); 
 188 U_CAPI UBool U_EXPORT2
 
 189 uset_containsAllCodePoints(const USet
* set
, const UChar 
*str
, int32_t strLen
) { 
 190     // Create a string alias, since nothing is being added to the set. 
 191     UnicodeString 
s(strLen
==-1, str
, strLen
); 
 192     return ((const UnicodeSet
*) set
)->UnicodeSet::containsAll(s
); 
 195 U_CAPI UBool U_EXPORT2
 
 196 uset_containsNone(const USet
* set1
, const USet
* set2
) { 
 197     return ((const UnicodeSet
*) set1
)->UnicodeSet::containsNone(* (const UnicodeSet
*) set2
); 
 200 U_CAPI UBool U_EXPORT2
 
 201 uset_containsSome(const USet
* set1
, const USet
* set2
) { 
 202     return ((const UnicodeSet
*) set1
)->UnicodeSet::containsSome(* (const UnicodeSet
*) set2
); 
 205 U_CAPI 
int32_t U_EXPORT2
 
 206 uset_span(const USet 
*set
, const UChar 
*s
, int32_t length
, USetSpanCondition spanCondition
) { 
 207     return ((UnicodeSet
*) set
)->UnicodeSet::span(s
, length
, spanCondition
); 
 210 U_CAPI 
int32_t U_EXPORT2
 
 211 uset_spanBack(const USet 
*set
, const UChar 
*s
, int32_t length
, USetSpanCondition spanCondition
) { 
 212     return ((UnicodeSet
*) set
)->UnicodeSet::spanBack(s
, length
, spanCondition
); 
 215 U_CAPI 
int32_t U_EXPORT2
 
 216 uset_spanUTF8(const USet 
*set
, const char *s
, int32_t length
, USetSpanCondition spanCondition
) { 
 217     return ((UnicodeSet
*) set
)->UnicodeSet::spanUTF8(s
, length
, spanCondition
); 
 220 U_CAPI 
int32_t U_EXPORT2
 
 221 uset_spanBackUTF8(const USet 
*set
, const char *s
, int32_t length
, USetSpanCondition spanCondition
) { 
 222     return ((UnicodeSet
*) set
)->UnicodeSet::spanBackUTF8(s
, length
, spanCondition
); 
 225 U_CAPI UBool U_EXPORT2
 
 226 uset_equals(const USet
* set1
, const USet
* set2
) { 
 227     return *(const UnicodeSet
*)set1 
== *(const UnicodeSet
*)set2
; 
 230 U_CAPI 
int32_t U_EXPORT2
 
 231 uset_indexOf(const USet
* set
, UChar32 c
) { 
 232     return ((UnicodeSet
*) set
)->UnicodeSet::indexOf(c
); 
 235 U_CAPI UChar32 U_EXPORT2
 
 236 uset_charAt(const USet
* set
, int32_t index
) { 
 237     return ((UnicodeSet
*) set
)->UnicodeSet::charAt(index
); 
 240 U_CAPI 
int32_t U_EXPORT2
 
 241 uset_size(const USet
* set
) { 
 242     return ((const UnicodeSet
*) set
)->UnicodeSet::size(); 
 247  * This class only exists to provide access to the UnicodeSet private 
 248  * USet support API.  Declaring a class a friend is more portable than 
 249  * trying to declare extern "C" functions as friends. 
 /*
  * Static-only helper class that forwards to UnicodeSet::getStringCount() and
  * UnicodeSet::getString(i) so the C wrappers in this file can reach them.
  * NOTE(review): several lines of this class (access specifiers, the rest of
  * getString's parameter list, closing braces, and whatever enforces
  * "do not instantiate") are not present in this listing.
  */
 251 class USetAccess 
/* not : public UObject because all methods are static */ { 
 253     /* Try to have the compiler inline these*/ 
 /* Returns set.getStringCount(). */
 254     inline static int32_t getStringCount(const UnicodeSet
& set
) { 
 255         return set
.getStringCount(); 
 /* Returns set.getString(i); the declaration of i is on a line not shown here. */
 257     inline static const UnicodeString
* getString(const UnicodeSet
& set
, 
 259         return set
.getString(i
); 
 262     /* do not instantiate*/ 
 267 U_CAPI 
int32_t U_EXPORT2
 
 268 uset_getItemCount(const USet
* uset
) { 
 269     const UnicodeSet
& set 
= *(const UnicodeSet
*)uset
; 
 270     return set
.getRangeCount() + USetAccess::getStringCount(set
); 
 /*
  * Fetches one item of the set by index; items are ranges first, then strings.
  *
  * Visible behavior:
  *  - returns 0 immediately when *ec already indicates failure (ec itself is
  *    dereferenced without a NULL check);
  *  - one branch sets *ec = U_ILLEGAL_ARGUMENT_ERROR (its condition is on a
  *    line not shown in this listing);
  *  - when itemIndex < getRangeCount(), writes the range bounds to *start and
  *    *end;
  *  - otherwise itemIndex is rebased by the range count and, if it is below
  *    the string count, the string is extracted into str (capacity
  *    strCapacity) and extract()'s result is returned; else *ec is set to
  *    U_INDEX_OUTOFBOUNDS_ERROR.
  * NOTE(review): the ec parameter declaration, the rangeCount declaration and
  * the return statements of the other branches are not present in this
  * listing - confirm against the full file.
  */
 273 U_CAPI 
int32_t U_EXPORT2
 
 274 uset_getItem(const USet
* uset
, int32_t itemIndex
, 
 275              UChar32
* start
, UChar32
* end
, 
 276              UChar
* str
, int32_t strCapacity
, 
 278     if (U_FAILURE(*ec
)) return 0; 
 279     const UnicodeSet
& set 
= *(const UnicodeSet
*)uset
; 
 283         *ec 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 285     } else if (itemIndex 
< (rangeCount 
= set
.getRangeCount())) { 
 286         *start 
= set
.getRangeStart(itemIndex
); 
 287         *end 
= set
.getRangeEnd(itemIndex
); 
 290         itemIndex 
-= rangeCount
; 
 291         if (itemIndex 
< USetAccess::getStringCount(set
)) { 
 292             const UnicodeString
* s 
= USetAccess::getString(set
, itemIndex
); 
 293             return s
->extract(str
, strCapacity
, *ec
); 
 295             *ec 
= U_INDEX_OUTOFBOUNDS_ERROR
; 
 301 //U_CAPI int32_t U_EXPORT2 
 302 //uset_getRangeCount(const USet* set) { 
 303 //    return ((const UnicodeSet*) set)->getRangeCount(); 
 306 //U_CAPI UBool U_EXPORT2 
 307 //uset_getRange(const USet* set, int32_t rangeIndex, 
 308 //              UChar32* pStart, UChar32* pEnd) { 
 309 //    if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { 
 312 //    const UnicodeSet* us = (const UnicodeSet*) set; 
 313 //    *pStart = us->getRangeStart(rangeIndex); 
 314 //    *pEnd = us->getRangeEnd(rangeIndex); 
 319  * Serialize a USet into 16-bit units. 
 320  * Store BMP code points as themselves with one 16-bit unit each. 
 322  * Important: the code points in the array are in ascending order, 
 323  * therefore all BMP code points precede all supplementary code points. 
 325  * Store each supplementary code point in 2 16-bit units, 
 326  * simply with higher-then-lower 16-bit halves. 
 328  * Precede the entire list with the length. 
 329  * If there are supplementary code points, then set bit 15 in the length 
 330  * and add the bmpLength between it and the array. 
 333  * - all BMP:            (length=bmpLength) BMP, .., BMP 
 334  * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, .. 
 /*
  * Serializes the set into dest by delegating to UnicodeSet::serialize(dest,
  * destCapacity, *ec).
  * The guard checks for ec==NULL or an already-failed *ec before delegating.
  * NOTE(review): the body of that guard (what is returned on the early exit)
  * is on lines not present in this listing.
  */
 336 U_CAPI 
int32_t U_EXPORT2
 
 337 uset_serialize(const USet
* set
, uint16_t* dest
, int32_t destCapacity
, UErrorCode
* ec
) { 
 338     if (ec
==NULL 
|| U_FAILURE(*ec
)) { 
 342     return ((const UnicodeSet
*) set
)->UnicodeSet::serialize(dest
, destCapacity
,* ec
); 
 /*
  * Fills a USerializedSet from a serialized uint16_t array (src, srcLength).
  *
  * Visible behavior:
  *  - a NULL src or srcLength<=0 resets fillSet->length and
  *    fillSet->bmpLength to 0;
  *  - in the "supplementary values" branch, srcLength must be at least
  *    2+length, otherwise both lengths are reset to 0; bmpLength is then read
  *    from the next src unit;
  *  - in the "only BMP values" branch, srcLength must be at least 1+length,
  *    otherwise both lengths are reset to 0; bmpLength equals length;
  *  - finally fillSet->length is set to length.
  * NOTE(review): the declaration and initial read of length, the test that
  * selects between the two branches, the assignment of fillSet->array and all
  * return statements are on lines not present in this listing.
  */
 345 U_CAPI UBool U_EXPORT2
 
 346 uset_getSerializedSet(USerializedSet
* fillSet
, const uint16_t* src
, int32_t srcLength
) { 
 352     if(src
==NULL 
|| srcLength
<=0) { 
 353         fillSet
->length
=fillSet
->bmpLength
=0; 
 359         /* there are supplementary values */ 
 361         if(srcLength
<(2+length
)) { 
 362             fillSet
->length
=fillSet
->bmpLength
=0; 
 365         fillSet
->bmpLength
=*src
++; 
 367         /* only BMP values */ 
 368         if(srcLength
<(1+length
)) { 
 369             fillSet
->length
=fillSet
->bmpLength
=0; 
 372         fillSet
->bmpLength
=length
; 
 375     fillSet
->length
=length
; 
 /*
  * Initializes fillSet so that it holds a serialized set built from the single
  * code point c, using fillSet->staticArray as storage.
  *
  * Visible behavior:
  *  - the guard tests for a NULL fillSet or c outside 0..0x10ffff (its body is
  *    not shown in this listing);
  *  - fillSet->array is pointed at fillSet->staticArray;
  *  - first branch (condition not shown): bmpLength=length=2, units {c, c+1};
  *  - c==0xffff: bmpLength=1, units {0xffff, 1, 0};
  *  - c<0x10ffff: bmpLength=0, units 0..1 hold the high/low 16 bits of c and
  *    units 2..3 are written the same way;
  *  - c==0x10ffff: bmpLength=0, units {0x10, 0xffff}.
  * NOTE(review): the assignments to fillSet->length in the last three
  * branches, and any statement between the writes of units 1 and 2 in the
  * c<0x10ffff branch, are on lines not present in this listing.
  */
 379 U_CAPI 
void U_EXPORT2
 
 380 uset_setSerializedToOne(USerializedSet
* fillSet
, UChar32 c
) { 
 381     if(fillSet
==NULL 
|| (uint32_t)c
>0x10ffff) { 
 385     fillSet
->array
=fillSet
->staticArray
; 
 387         fillSet
->bmpLength
=fillSet
->length
=2; 
 388         fillSet
->staticArray
[0]=(uint16_t)c
; 
 389         fillSet
->staticArray
[1]=(uint16_t)c
+1; 
 390     } else if(c
==0xffff) { 
 391         fillSet
->bmpLength
=1; 
 393         fillSet
->staticArray
[0]=0xffff; 
 394         fillSet
->staticArray
[1]=1; 
 395         fillSet
->staticArray
[2]=0; 
 396     } else if(c
<0x10ffff) { 
 397         fillSet
->bmpLength
=0; 
 399         fillSet
->staticArray
[0]=(uint16_t)(c
>>16); 
 400         fillSet
->staticArray
[1]=(uint16_t)c
; 
 402         fillSet
->staticArray
[2]=(uint16_t)(c
>>16); 
 403         fillSet
->staticArray
[3]=(uint16_t)c
; 
 404     } else /* c==0x10ffff */ { 
 405         fillSet
->bmpLength
=0; 
 407         fillSet
->staticArray
[0]=0x10; 
 408         fillSet
->staticArray
[1]=0xffff; 
 /*
  * Tests whether code point c is in a serialized set.
  *
  * Visible behavior:
  *  - the guard tests for a NULL set or c outside 0..0x10ffff (its body is not
  *    shown in this listing);
  *  - BMP part: searches array[0..bmpLength-1] by repeatedly halving the
  *    interval between lo and hi, then returns the low bit of hi;
  *  - supplementary part: c is split into high/low 16-bit halves and compared
  *    against pairs of units starting at offset base=bmpLength; the midpoint
  *    is forced even so that it always addresses the first unit of a pair; the
  *    result is derived from bit 1 of hi+(base<<1).
  * NOTE(review): the assignment of array, the test that selects the BMP or
  * supplementary path, the lo declarations, the loop headers and the loop
  * bodies are on lines not present in this listing, so the search invariants
  * cannot be confirmed from here.
  */
 412 U_CAPI UBool U_EXPORT2
 
 413 uset_serializedContains(const USerializedSet
* set
, UChar32 c
) { 
 414     const uint16_t* array
; 
 416     if(set
==NULL 
|| (uint32_t)c
>0x10ffff) { 
 422         /* find c in the BMP part */ 
 424         int32_t hi 
= set
->bmpLength
-1; 
 427         } else if (c 
< array
[hi
]) { 
 429                 int32_t i 
= (lo 
+ hi
) >> 1; 
 432                 } else if (c 
< array
[i
]) { 
 441         return (UBool
)(hi
&1); 
 443         /* find c in the supplementary part */ 
 444         uint16_t high
=(uint16_t)(c
>>16), low
=(uint16_t)c
; 
 445         int32_t base 
= set
->bmpLength
; 
 447         int32_t hi 
= set
->length 
- 2 - base
; 
 448         if (high 
< array
[base
] || (high
==array
[base
] && low
<array
[base
+1])) { 
 450         } else if (high 
< array
[base
+hi
] || (high
==array
[base
+hi
] && low
<array
[base
+hi
+1])) { 
 452                 int32_t i 
= ((lo 
+ hi
) >> 1) & ~1;  // Guarantee even result 
 453                 int32_t iabs 
= i 
+ base
; 
 456                 } else if (high 
< array
[iabs
] || (high
==array
[iabs
] && low
<array
[iabs
+1])) { 
 465         /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */ 
 466         return (UBool
)(((hi
+(base
<<1))&2)!=0); 
 /*
  * Returns the number of ranges in a serialized set, computed as
  * (bmpLength + (length - bmpLength)/2 + 1) / 2: the units after the BMP part
  * are counted in pairs, and the total is halved rounding up.
  * NOTE(review): lines between the signature and the return statement are not
  * present in this listing (possibly a NULL check on set) - confirm.
  */
 470 U_CAPI 
int32_t U_EXPORT2
 
 471 uset_getSerializedRangeCount(const USerializedSet
* set
) { 
 476     return (set
->bmpLength
+(set
->length
-set
->bmpLength
)/2+1)/2; 
 /*
  * Reads range number rangeIndex from a serialized set into *pStart / *pEnd.
  *
  * Visible behavior:
  *  - the guard tests for a NULL set, a negative rangeIndex, or a NULL pStart
  *    or pEnd (its body is not shown in this listing);
  *  - rangeIndex is doubled to address start/limit pairs;
  *  - if the start lies in the BMP part, *pStart is one 16-bit unit; the limit
  *    is either the next BMP unit or, when the BMP part is exhausted, a 32-bit
  *    value assembled from two units; *pEnd is the limit minus 1;
  *  - otherwise rangeIndex is rebased past the BMP part and doubled again, and
  *    both start and limit are assembled from two 16-bit units each (high unit
  *    shifted left by 16, OR-ed with the low unit); *pEnd is the limit minus 1.
  * NOTE(review): the assignments of array and length, the pointer/index
  * adjustments between the supplementary reads, the fallbacks used when no
  * limit unit is available, and all return statements are on lines not
  * present in this listing.
  */
 479 U_CAPI UBool U_EXPORT2
 
 480 uset_getSerializedRange(const USerializedSet
* set
, int32_t rangeIndex
, 
 481                         UChar32
* pStart
, UChar32
* pEnd
) { 
 482     const uint16_t* array
; 
 483     int32_t bmpLength
, length
; 
 485     if(set
==NULL 
|| rangeIndex
<0 || pStart
==NULL 
|| pEnd
==NULL
) { 
 491     bmpLength
=set
->bmpLength
; 
 493     rangeIndex
*=2; /* address start/limit pairs */ 
 494     if(rangeIndex
<bmpLength
) { 
 495         *pStart
=array
[rangeIndex
++]; 
 496         if(rangeIndex
<bmpLength
) { 
 497             *pEnd
=array
[rangeIndex
]-1; 
 498         } else if(rangeIndex
<length
) { 
 499             *pEnd
=((((int32_t)array
[rangeIndex
])<<16)|array
[rangeIndex
+1])-1; 
 505         rangeIndex
-=bmpLength
; 
 506         rangeIndex
*=2; /* address pairs of pairs of units */ 
 508         if(rangeIndex
<length
) { 
 510             *pStart
=(((int32_t)array
[rangeIndex
])<<16)|array
[rangeIndex
+1]; 
 512             if(rangeIndex
<length
) { 
 513                 *pEnd
=((((int32_t)array
[rangeIndex
])<<16)|array
[rangeIndex
+1])-1; 
 524 // TODO The old, internal uset.c had an efficient uset_containsOne function. 
 525 // Returned the one and only code point, or else -1 or something. 
 526 // Consider adding such a function to both C and C++ UnicodeSet/uset. 
 527 // See tools/gennorm/store.c for usage, now usetContainsOne there. 
 529 // TODO Investigate incorporating this code into UnicodeSet to improve 
 532 // #define USET_GROW_DELTA 20 
 534 // static U_INLINE int32_t 
 535 // findChar(const UChar32* array, int32_t length, UChar32 c) { 
 538 //     /* check the last range limit first for more efficient appending */ 
 540 //         if(c>=array[length-1]) { 
 544 //         /* do not check the last range limit again in the loop below */ 
 548 //     for(i=0; i<length && c>=array[i]; ++i) {} 
 553 // addRemove(USet* set, UChar32 c, int32_t doRemove) { 
 554 //     int32_t i, length, more; 
 556 //     if(set==NULL || (uint32_t)c>0x10ffff) { 
 560 //     length=set->length; 
 561 //     i=findChar(set->array, length, c); 
 562 //     if((i&1)^doRemove) { 
 563 //         /* c is already in the set */ 
 567 //     /* how many more array items do we need? */ 
 568 //     if(i<length && (c+1)==set->array[i]) { 
 569 //         /* c is just before the following range, extend that in-place by one */ 
 573 //             if(c==set->array[i]) { 
 574 //                 /* the previous range collapsed, remove it */ 
 575 //                 set->length=length-=2; 
 577 //                     uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 
 582 //     } else if(i>0 && c==set->array[i-1]) { 
 583 //         /* c is just after the previous range, extend that in-place by one */ 
 584 //         if(++c<=0x10ffff) { 
 585 //             set->array[i-1]=c; 
 586 //             if(i<length && c==set->array[i]) { 
 587 //                 /* the following range collapsed, remove it */ 
 589 //                 set->length=length-=2; 
 591 //                     uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 
 595 //             /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ 
 599 //     } else if(i==length && c==0x10ffff) { 
 600 //         /* insert one range limit c */ 
 603 //         /* insert two range limits c, c+1 */ 
 607 //     /* insert <more> range limits */ 
 608 //     if(length+more>set->capacity) { 
 610 //         int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; 
 611 //         UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); 
 612 //         if(newArray==NULL) { 
 615 //         set->capacity=newCapacity; 
 616 //         uprv_memcpy(newArray, set->array, length*4); 
 618 //         if(set->array!=set->staticBuffer) { 
 619 //             uprv_free(set->array); 
 621 //         set->array=newArray; 
 625 //         uprv_memmove(set->array+i+more, set->array+i, (length-i)*4); 
 629 //         set->array[i+1]=c+1; 
 631 //     set->length+=more; 
 636 // U_CAPI UBool U_EXPORT2 
 637 // uset_add(USet* set, UChar32 c) { 
 638 //     return addRemove(set, c, 0); 
 641 // U_CAPI void U_EXPORT2 
 642 // uset_remove(USet* set, UChar32 c) { 
 643 //     addRemove(set, c, 1);