2 *************************************************************************
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 *************************************************************************
6 *************************************************************************
7 * Copyright (C) 2007, International Business Machines
8 * Corporation and others. All Rights Reserved.
9 *************************************************************************
10 * file name: trieset.cpp
12 * tab size: 8 (not used)
15 * created on: 2007jan15
16 * created by: Markus Scherer
18 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
19 * using a UTrie with 8-bit (byte) results per code point.
20 * Modifies the trie index to make the BMP linear, and uses the original set
21 * for supplementary code points.
24 #include "unicode/utypes.h"
// Yields a const uint8_t pointer obtained by viewing the trie's data32 array as
// bytes and advancing it by UTRIE_DATA_BLOCK_LENGTH bytes. TrieSet stores this
// pointer in its latin1 member and indexes it directly for code points <= 0xff.
// The argument must be a pointer to the trie (it is dereferenced with "->").
27 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
29 #define UTRIE_GET8_FROM_LEAD(trie, c16) \
30 ((const uint8_t *)(trie)->data32)[ \
31 ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
35 class TrieSet
: public UObject
, public UnicodeContainable
{
// Constructs a TrieSet from a UnicodeSet.
//
// Visible steps:
//  - initializes trieData and latin1 to NULL and restSet from set.clone();
//  - opens a build-time trie with utrie_open() and, for each range delivered by a
//    UnicodeSetIterator (stopping at the first string entry), calls
//    utrie_setRange32() with start and end+1;
//  - calls utrie_serialize() with a NULL buffer to obtain the serialized length,
//    allocates that many bytes with uprv_malloc(), serializes into the buffer and
//    unserializes it into the trie member;
//  - on success, copies index entries so the surrogate range can be looked up
//    like the rest of the BMP, and caches the Latin-1 byte table in latin1;
//  - removes U+0000..U+FFFF from restSet.
//
// @param set       the source set; it is cloned and iterated here.
// @param errorCode in/out ICU error code; this constructor writes
//                  U_MEMORY_ALLOCATION_ERROR, U_INTERNAL_PROGRAM_ERROR and
//                  U_ZERO_ERROR to it at the points shown below.
//
// NOTE(review): several statements (conditions, early returns, closing braces and
// the declarations of start/end) are not present in this copy of the file, so
// the exact error paths cannot be confirmed from here.
// NOTE(review): no utrie_close(newTrie) is visible in this copy - confirm the
// build-time trie is released on every path.
37 TrieSet(const UnicodeSet
&set
, UErrorCode
&errorCode
)
38 : trieData(NULL
), latin1(NULL
), restSet(set
.clone()) {
39 if(U_FAILURE(errorCode
)) {
43 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
// Build-time trie that receives TRUE for every range taken from the set.
47 UNewTrie
*newTrie
=utrie_open(NULL
, NULL
, 0x11000, 0, 0, TRUE
);
50 UnicodeSetIterator
iter(set
);
// Walk the set's ranges; the loop ends when the iterator is exhausted or
// reports a string entry.
52 while(iter
.nextRange() && !iter
.isString()) {
53 start
=iter
.getCodepoint();
54 end
=iter
.getCodepointEnd();
61 if(!utrie_setRange32(newTrie
, start
, end
+1, TRUE
, TRUE
)) {
62 errorCode
=U_INTERNAL_PROGRAM_ERROR
;
67 // Preflight the trie length.
68 int32_t length
=utrie_serialize(newTrie
, NULL
, 0, NULL
, 8, &errorCode
);
69 if(errorCode
!=U_BUFFER_OVERFLOW_ERROR
) {
// Buffer for the serialized trie, sized by the preflight call above.
73 trieData
=(uint32_t *)uprv_malloc(length
);
75 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
79 errorCode
=U_ZERO_ERROR
;
80 utrie_serialize(newTrie
, trieData
, length
, NULL
, 8, &errorCode
);
81 utrie_unserialize(&trie
, trieData
, length
, &errorCode
); // TODO: Implement for 8-bit UTrie!
83 if(U_SUCCESS(errorCode
)) {
84 // Copy the indexes for surrogate code points into the BMP range
85 // for simple access across the entire BMP.
86 uprv_memcpy((uint16_t *)trie
.index
+(0xd800>>UTRIE_SHIFT
),
87 trie
.index
+UTRIE_BMP_INDEX_LENGTH
,
88 (0x800>>UTRIE_SHIFT
)*2);
89 latin1
=UTRIE_GET8_LATIN1(&trie
);
// Drop U+0000..U+FFFF from the fallback set.
// NOTE(review): restSet is accessed with "." here but with "->" in contains(),
// and it is initialized from set.clone(); confirm the member's type.
92 restSet
.remove(0, 0xffff);
100 UBool
contains(UChar32 c
) const {
101 if((uint32_t)c
<=0xff) {
102 return (UBool
)latin1
[c
];
103 } else if((uint32_t)c
<0xffff) {
104 return (UBool
)UTRIE_GET8_FROM_LEAD(&trie
, c
);
106 return restSet
->contains(c
);
112 const uint8_t *latin1
;