]>
Commit | Line | Data |
---|---|---|
51004dcb A |
1 | /* |
2 | ******************************************************************************* | |
57a6839d | 3 | * Copyright (C) 2014, International Business Machines |
51004dcb A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * dictionarydata.h | |
7 | * | |
8 | * created on: 2012may31 | |
9 | * created by: Markus W. Scherer & Maxime Serrano | |
10 | */ | |
11 | ||
12 | #include "dictionarydata.h" | |
13 | #include "unicode/ucharstrie.h" | |
14 | #include "unicode/bytestrie.h" | |
15 | #include "unicode/udata.h" | |
16 | #include "cmemory.h" | |
17 | ||
18 | #if !UCONFIG_NO_BREAK_ITERATION | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
57a6839d A |
22 | const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; |
23 | const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; | |
24 | const int32_t DictionaryData::TRIE_TYPE_MASK = 7; | |
25 | const int32_t DictionaryData::TRIE_HAS_VALUES = 8; | |
51004dcb | 26 | |
57a6839d A |
27 | const int32_t DictionaryData::TRANSFORM_NONE = 0; |
28 | const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; | |
29 | const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; | |
30 | const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; | |
31 | ||
51004dcb A |
32 | DictionaryMatcher::~DictionaryMatcher() { |
33 | } | |
34 | ||
35 | UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { | |
36 | udata_close(file); | |
37 | } | |
38 | ||
39 | int32_t UCharsDictionaryMatcher::getType() const { | |
40 | return DictionaryData::TRIE_TYPE_UCHARS; | |
41 | } | |
42 | ||
b331163b A |
43 | int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
44 | int32_t *lengths, int32_t *cpLengths, int32_t *values, | |
45 | int32_t *prefix) const { | |
46 | ||
51004dcb | 47 | UCharsTrie uct(characters); |
b331163b A |
48 | int32_t startingTextIndex = utext_getNativeIndex(text); |
49 | int32_t wordCount = 0; | |
50 | int32_t codePointsMatched = 0; | |
51 | ||
52 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { | |
53 | UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); | |
54 | int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex; | |
55 | codePointsMatched += 1; | |
51004dcb | 56 | if (USTRINGTRIE_HAS_VALUE(result)) { |
b331163b | 57 | if (wordCount < limit) { |
51004dcb | 58 | if (values != NULL) { |
b331163b A |
59 | values[wordCount] = uct.getValue(); |
60 | } | |
61 | if (lengths != NULL) { | |
62 | lengths[wordCount] = lengthMatched; | |
63 | } | |
64 | if (cpLengths != NULL) { | |
65 | cpLengths[wordCount] = codePointsMatched; | |
51004dcb | 66 | } |
b331163b | 67 | ++wordCount; |
51004dcb A |
68 | } |
69 | if (result == USTRINGTRIE_FINAL_VALUE) { | |
70 | break; | |
71 | } | |
72 | } | |
73 | else if (result == USTRINGTRIE_NO_MATCH) { | |
74 | break; | |
75 | } | |
b331163b | 76 | if (lengthMatched >= maxLength) { |
51004dcb A |
77 | break; |
78 | } | |
b331163b | 79 | } |
51004dcb | 80 | |
b331163b A |
81 | if (prefix != NULL) { |
82 | *prefix = codePointsMatched; | |
51004dcb | 83 | } |
b331163b | 84 | return wordCount; |
51004dcb A |
85 | } |
86 | ||
87 | BytesDictionaryMatcher::~BytesDictionaryMatcher() { | |
88 | udata_close(file); | |
89 | } | |
90 | ||
91 | UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { | |
92 | if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { | |
93 | if (c == 0x200D) { | |
94 | return 0xFF; | |
95 | } else if (c == 0x200C) { | |
96 | return 0xFE; | |
97 | } | |
98 | int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); | |
99 | if (delta < 0 || 0xFD < delta) { | |
100 | return U_SENTINEL; | |
101 | } | |
102 | return (UChar32)delta; | |
103 | } | |
104 | return c; | |
105 | } | |
106 | ||
107 | int32_t BytesDictionaryMatcher::getType() const { | |
108 | return DictionaryData::TRIE_TYPE_BYTES; | |
109 | } | |
110 | ||
b331163b A |
111 | int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
112 | int32_t *lengths, int32_t *cpLengths, int32_t *values, | |
113 | int32_t *prefix) const { | |
51004dcb | 114 | BytesTrie bt(characters); |
b331163b A |
115 | int32_t startingTextIndex = utext_getNativeIndex(text); |
116 | int32_t wordCount = 0; | |
117 | int32_t codePointsMatched = 0; | |
118 | ||
119 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { | |
120 | UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); | |
121 | int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex; | |
122 | codePointsMatched += 1; | |
51004dcb | 123 | if (USTRINGTRIE_HAS_VALUE(result)) { |
b331163b | 124 | if (wordCount < limit) { |
51004dcb | 125 | if (values != NULL) { |
b331163b A |
126 | values[wordCount] = bt.getValue(); |
127 | } | |
128 | if (lengths != NULL) { | |
129 | lengths[wordCount] = lengthMatched; | |
57a6839d | 130 | } |
b331163b A |
131 | if (cpLengths != NULL) { |
132 | cpLengths[wordCount] = codePointsMatched; | |
133 | } | |
134 | ++wordCount; | |
51004dcb A |
135 | } |
136 | if (result == USTRINGTRIE_FINAL_VALUE) { | |
137 | break; | |
138 | } | |
139 | } | |
140 | else if (result == USTRINGTRIE_NO_MATCH) { | |
141 | break; | |
142 | } | |
b331163b | 143 | if (lengthMatched >= maxLength) { |
51004dcb A |
144 | break; |
145 | } | |
b331163b | 146 | } |
51004dcb | 147 | |
b331163b A |
148 | if (prefix != NULL) { |
149 | *prefix = codePointsMatched; | |
51004dcb | 150 | } |
b331163b | 151 | return wordCount; |
51004dcb A |
152 | } |
153 | ||
154 | ||
155 | U_NAMESPACE_END | |
156 | ||
157 | U_NAMESPACE_USE | |
158 | ||
159 | U_CAPI int32_t U_EXPORT2 | |
160 | udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, | |
161 | void *outData, UErrorCode *pErrorCode) { | |
162 | const UDataInfo *pInfo; | |
163 | int32_t headerSize; | |
164 | const uint8_t *inBytes; | |
165 | uint8_t *outBytes; | |
166 | const int32_t *inIndexes; | |
167 | int32_t indexes[DictionaryData::IX_COUNT]; | |
168 | int32_t i, offset, size; | |
169 | ||
170 | headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); | |
171 | if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; | |
172 | pInfo = (const UDataInfo *)((const char *)inData + 4); | |
173 | if (!(pInfo->dataFormat[0] == 0x44 && | |
174 | pInfo->dataFormat[1] == 0x69 && | |
175 | pInfo->dataFormat[2] == 0x63 && | |
176 | pInfo->dataFormat[3] == 0x74 && | |
177 | pInfo->formatVersion[0] == 1)) { | |
178 | udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", | |
179 | pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); | |
180 | *pErrorCode = U_UNSUPPORTED_ERROR; | |
181 | return 0; | |
182 | } | |
183 | ||
184 | inBytes = (const uint8_t *)inData + headerSize; | |
185 | outBytes = (uint8_t *)outData + headerSize; | |
186 | ||
187 | inIndexes = (const int32_t *)inBytes; | |
188 | if (length >= 0) { | |
189 | length -= headerSize; | |
190 | if (length < (int32_t)(sizeof(indexes))) { | |
191 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); | |
192 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
193 | return 0; | |
194 | } | |
195 | } | |
196 | ||
197 | for (i = 0; i < DictionaryData::IX_COUNT; i++) { | |
198 | indexes[i] = udata_readInt32(ds, inIndexes[i]); | |
199 | } | |
200 | ||
201 | size = indexes[DictionaryData::IX_TOTAL_SIZE]; | |
202 | ||
203 | if (length >= 0) { | |
204 | if (length < size) { | |
205 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); | |
206 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
207 | return 0; | |
208 | } | |
209 | ||
210 | if (inBytes != outBytes) { | |
211 | uprv_memcpy(outBytes, inBytes, size); | |
212 | } | |
213 | ||
214 | offset = 0; | |
215 | ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); | |
216 | offset = (int32_t)sizeof(indexes); | |
217 | int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; | |
218 | int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; | |
219 | ||
220 | if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { | |
221 | ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); | |
222 | } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { | |
223 | // nothing to do | |
224 | } else { | |
225 | udata_printError(ds, "udict_swap(): unknown trie type!\n"); | |
226 | *pErrorCode = U_UNSUPPORTED_ERROR; | |
227 | return 0; | |
228 | } | |
229 | ||
230 | // these next two sections are empty in the current format, | |
231 | // but may be used later. | |
232 | offset = nextOffset; | |
233 | nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; | |
234 | offset = nextOffset; | |
235 | nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; | |
236 | offset = nextOffset; | |
237 | } | |
238 | return headerSize + size; | |
239 | } | |
240 | #endif |