2 **********************************************************************
3 * Copyright (C) 2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: trieset.cpp
8 * tab size: 8 (not used)
11 * created on: 2007jan15
12 * created by: Markus Scherer
14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15 * using a UTrie with 8-bit (byte) results per code point.
16 * Modifies the trie index to make the BMP linear, and uses the original set
17 * for supplementary code points.
20 #include "unicode/utypes.h"
// Yields a byte pointer into the trie's data: (trie)->data32 is cast to
// const uint8_t * and then advanced by UTRIE_DATA_BLOCK_LENGTH bytes.
// TrieSet stores the result in latin1 and indexes it directly with
// code points <=0xff.
23 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
// Reads one byte from the trie's data, viewed as a uint8_t array, for the
// 16-bit code unit c16. The array subscript starts with the index entry
// for c16's block, (trie)->index[(c16)>>UTRIE_SHIFT], shifted left by
// UTRIE_INDEX_SHIFT.
// NOTE(review): the subscript continues after the trailing '+'; presumably
// it adds the offset of c16 within its block -- confirm against the full
// macro text.
25 #define UTRIE_GET8_FROM_LEAD(trie, c16) \
26 ((const uint8_t *)(trie)->data32)[ \
27 ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
31 class TrieSet
: public UObject
, public UnicodeContainable
{
// Constructs a read-only lookup structure from `set`.
//
// Steps visible in this constructor:
//  - clones `set` into restSet;
//  - opens a build-time trie and marks every code point range delivered by
//    a UnicodeSetIterator (stopping at the first string item) with TRUE;
//  - preflights, allocates, serializes and unserializes the trie with
//    8-bit values;
//  - on success, copies index entries over the surrogate range so the BMP
//    can be indexed linearly, and caches the Latin-1 data pointer;
//  - removes 0..0xffff from restSet.
//
// @param set       The set to compile.
// @param errorCode In/out ICU error code; the initial U_FAILURE check
//                  guards the rest of the work, and the code is set to
//                  U_MEMORY_ALLOCATION_ERROR or U_INTERNAL_PROGRAM_ERROR
//                  on the failure paths below.
33 TrieSet(const UnicodeSet
&set
, UErrorCode
&errorCode
)
34 : trieData(NULL
), latin1(NULL
), restSet(set
.clone()) {
35 if(U_FAILURE(errorCode
)) {
39 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
// Build-time trie that receives the membership flags.
43 UNewTrie
*newTrie
=utrie_open(NULL
, NULL
, 0x11000, 0, 0, TRUE
);
46 UnicodeSetIterator
iter(set
);
// Walk the code point ranges of the set; the loop ends when the iterator
// is exhausted or reaches a string item.
48 while(iter
.nextRange() && !iter
.isString()) {
49 start
=iter
.getCodepoint();
50 end
=iter
.getCodepointEnd();
// The range end is inclusive, so end+1 is passed as the limit.
57 if(!utrie_setRange32(newTrie
, start
, end
+1, TRUE
, TRUE
)) {
58 errorCode
=U_INTERNAL_PROGRAM_ERROR
;
63 // Preflight the trie length.
64 int32_t length
=utrie_serialize(newTrie
, NULL
, 0, NULL
, 8, &errorCode
);
// A preflight call is expected to report U_BUFFER_OVERFLOW_ERROR;
// any other code is handled as a failure here.
65 if(errorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
// Allocate exactly the preflighted number of bytes.
69 trieData
=(uint32_t *)uprv_malloc(length
);
71 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
// Clear the expected overflow code before the real serialization.
75 errorCode
=U_ZERO_ERROR
;
76 utrie_serialize(newTrie
, trieData
, length
, NULL
, 8, &errorCode
);
77 utrie_unserialize(&trie
, trieData
, length
, &errorCode
); // TODO: Implement for 8-bit UTrie!
79 if(U_SUCCESS(errorCode
)) {
80 // Copy the indexes for surrogate code points into the BMP range
81 // for simple access across the entire BMP.
// The const is cast away from trie.index so the serialized index can be
// patched in place; the byte count is (0x800>>UTRIE_SHIFT) entries * 2.
82 uprv_memcpy((uint16_t *)trie
.index
+(0xd800>>UTRIE_SHIFT
),
83 trie
.index
+UTRIE_BMP_INDEX_LENGTH
,
84 (0x800>>UTRIE_SHIFT
)*2);
85 latin1
=UTRIE_GET8_LATIN1(&trie
);
// The BMP is answered from the trie, so only supplementary code points
// need to stay in restSet.
// NOTE(review): restSet is dereferenced with '->' in contains() but
// accessed with '.' here -- confirm the member type; if it is a pointer
// this should be restSet->remove(0, 0xffff).
88 restSet
.remove(0, 0xffff);
96 UBool
contains(UChar32 c
) const {
97 if((uint32_t)c
<=0xff) {
98 return (UBool
)latin1
[c
];
99 } else if((uint32_t)c
<0xffff) {
100 return (UBool
)UTRIE_GET8_FROM_LEAD(&trie
, c
);
102 return restSet
->contains(c
);
108 const uint8_t *latin1
;