]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
51004dcb A |
3 | /* |
4 | ******************************************************************************* | |
2ca993e8 | 5 | * Copyright (C) 2014-2016, International Business Machines |
51004dcb A |
6 | * Corporation and others. All Rights Reserved. |
7 | ******************************************************************************* | |
8 | * dictionarydata.cpp | |
9 | * | |
10 | * created on: 2012may31 | |
11 | * created by: Markus W. Scherer & Maxime Serrano | |
12 | */ | |
13 | ||
14 | #include "dictionarydata.h" | |
15 | #include "unicode/ucharstrie.h" | |
16 | #include "unicode/bytestrie.h" | |
17 | #include "unicode/udata.h" | |
18 | #include "cmemory.h" | |
19 | ||
20 | #if !UCONFIG_NO_BREAK_ITERATION | |
21 | ||
22 | U_NAMESPACE_BEGIN | |
23 | ||
57a6839d A |
// Trie-type constants. udict_swap() masks the IX_TRIE_TYPE index word with
// TRIE_TYPE_MASK and compares the result against TRIE_TYPE_BYTES /
// TRIE_TYPE_UCHARS; the matchers' getType() methods return the same two values.
// NOTE(review): TRIE_HAS_VALUES is not referenced in this file; presumably a
// flag bit stored in the same index word outside the type mask -- confirm.
24 | const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; |
25 | const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; | |
26 | const int32_t DictionaryData::TRIE_TYPE_MASK = 7; | |
27 | const int32_t DictionaryData::TRIE_HAS_VALUES = 8; | |
51004dcb | 28 | |
57a6839d A |
// Transform constants, interpreted by BytesDictionaryMatcher::transform():
// the transform constant masked with TRANSFORM_TYPE_MASK selects the transform
// kind (TRANSFORM_TYPE_OFFSET enables offset mapping), and the bits under
// TRANSFORM_OFFSET_MASK hold the base value subtracted from each code point.
// NOTE(review): TRANSFORM_NONE is not referenced in this file; presumably it
// marks "no transform" -- confirm against the header.
29 | const int32_t DictionaryData::TRANSFORM_NONE = 0; |
30 | const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; | |
31 | const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; | |
32 | const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; | |
33 | ||
51004dcb A |
34 | DictionaryMatcher::~DictionaryMatcher() { |
35 | } | |
36 | ||
37 | UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { | |
38 | udata_close(file); | |
39 | } | |
40 | ||
41 | int32_t UCharsDictionaryMatcher::getType() const { | |
42 | return DictionaryData::TRIE_TYPE_UCHARS; | |
43 | } | |
44 | ||
b331163b A |
45 | int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
46 | int32_t *lengths, int32_t *cpLengths, int32_t *values, | |
47 | int32_t *prefix) const { | |
48 | ||
51004dcb | 49 | UCharsTrie uct(characters); |
2ca993e8 | 50 | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); |
b331163b A |
51 | int32_t wordCount = 0; |
52 | int32_t codePointsMatched = 0; | |
53 | ||
54 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { | |
55 | UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); | |
2ca993e8 | 56 | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; |
b331163b | 57 | codePointsMatched += 1; |
51004dcb | 58 | if (USTRINGTRIE_HAS_VALUE(result)) { |
b331163b | 59 | if (wordCount < limit) { |
51004dcb | 60 | if (values != NULL) { |
b331163b A |
61 | values[wordCount] = uct.getValue(); |
62 | } | |
63 | if (lengths != NULL) { | |
64 | lengths[wordCount] = lengthMatched; | |
65 | } | |
66 | if (cpLengths != NULL) { | |
67 | cpLengths[wordCount] = codePointsMatched; | |
51004dcb | 68 | } |
b331163b | 69 | ++wordCount; |
51004dcb A |
70 | } |
71 | if (result == USTRINGTRIE_FINAL_VALUE) { | |
72 | break; | |
73 | } | |
74 | } | |
75 | else if (result == USTRINGTRIE_NO_MATCH) { | |
76 | break; | |
77 | } | |
b331163b | 78 | if (lengthMatched >= maxLength) { |
51004dcb A |
79 | break; |
80 | } | |
b331163b | 81 | } |
51004dcb | 82 | |
b331163b A |
83 | if (prefix != NULL) { |
84 | *prefix = codePointsMatched; | |
51004dcb | 85 | } |
b331163b | 86 | return wordCount; |
51004dcb A |
87 | } |
88 | ||
89 | BytesDictionaryMatcher::~BytesDictionaryMatcher() { | |
90 | udata_close(file); | |
91 | } | |
92 | ||
93 | UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { | |
94 | if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { | |
95 | if (c == 0x200D) { | |
96 | return 0xFF; | |
97 | } else if (c == 0x200C) { | |
98 | return 0xFE; | |
99 | } | |
100 | int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); | |
101 | if (delta < 0 || 0xFD < delta) { | |
102 | return U_SENTINEL; | |
103 | } | |
104 | return (UChar32)delta; | |
105 | } | |
106 | return c; | |
107 | } | |
108 | ||
109 | int32_t BytesDictionaryMatcher::getType() const { | |
110 | return DictionaryData::TRIE_TYPE_BYTES; | |
111 | } | |
112 | ||
b331163b A |
113 | int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
114 | int32_t *lengths, int32_t *cpLengths, int32_t *values, | |
115 | int32_t *prefix) const { | |
51004dcb | 116 | BytesTrie bt(characters); |
2ca993e8 | 117 | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); |
b331163b A |
118 | int32_t wordCount = 0; |
119 | int32_t codePointsMatched = 0; | |
120 | ||
121 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { | |
122 | UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); | |
2ca993e8 | 123 | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; |
b331163b | 124 | codePointsMatched += 1; |
51004dcb | 125 | if (USTRINGTRIE_HAS_VALUE(result)) { |
b331163b | 126 | if (wordCount < limit) { |
51004dcb | 127 | if (values != NULL) { |
b331163b A |
128 | values[wordCount] = bt.getValue(); |
129 | } | |
130 | if (lengths != NULL) { | |
131 | lengths[wordCount] = lengthMatched; | |
57a6839d | 132 | } |
b331163b A |
133 | if (cpLengths != NULL) { |
134 | cpLengths[wordCount] = codePointsMatched; | |
135 | } | |
136 | ++wordCount; | |
51004dcb A |
137 | } |
138 | if (result == USTRINGTRIE_FINAL_VALUE) { | |
139 | break; | |
140 | } | |
141 | } | |
142 | else if (result == USTRINGTRIE_NO_MATCH) { | |
143 | break; | |
144 | } | |
b331163b | 145 | if (lengthMatched >= maxLength) { |
51004dcb A |
146 | break; |
147 | } | |
b331163b | 148 | } |
51004dcb | 149 | |
b331163b A |
150 | if (prefix != NULL) { |
151 | *prefix = codePointsMatched; | |
51004dcb | 152 | } |
b331163b | 153 | return wordCount; |
51004dcb A |
154 | } |
155 | ||
156 | ||
157 | U_NAMESPACE_END | |
158 | ||
159 | U_NAMESPACE_USE | |
160 | ||
161 | U_CAPI int32_t U_EXPORT2 | |
162 | udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, | |
163 | void *outData, UErrorCode *pErrorCode) { | |
164 | const UDataInfo *pInfo; | |
165 | int32_t headerSize; | |
166 | const uint8_t *inBytes; | |
167 | uint8_t *outBytes; | |
168 | const int32_t *inIndexes; | |
169 | int32_t indexes[DictionaryData::IX_COUNT]; | |
170 | int32_t i, offset, size; | |
171 | ||
172 | headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); | |
173 | if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; | |
174 | pInfo = (const UDataInfo *)((const char *)inData + 4); | |
175 | if (!(pInfo->dataFormat[0] == 0x44 && | |
176 | pInfo->dataFormat[1] == 0x69 && | |
177 | pInfo->dataFormat[2] == 0x63 && | |
178 | pInfo->dataFormat[3] == 0x74 && | |
179 | pInfo->formatVersion[0] == 1)) { | |
180 | udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", | |
181 | pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); | |
182 | *pErrorCode = U_UNSUPPORTED_ERROR; | |
183 | return 0; | |
184 | } | |
185 | ||
186 | inBytes = (const uint8_t *)inData + headerSize; | |
187 | outBytes = (uint8_t *)outData + headerSize; | |
188 | ||
189 | inIndexes = (const int32_t *)inBytes; | |
190 | if (length >= 0) { | |
191 | length -= headerSize; | |
192 | if (length < (int32_t)(sizeof(indexes))) { | |
193 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); | |
194 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
195 | return 0; | |
196 | } | |
197 | } | |
198 | ||
199 | for (i = 0; i < DictionaryData::IX_COUNT; i++) { | |
200 | indexes[i] = udata_readInt32(ds, inIndexes[i]); | |
201 | } | |
202 | ||
203 | size = indexes[DictionaryData::IX_TOTAL_SIZE]; | |
204 | ||
205 | if (length >= 0) { | |
206 | if (length < size) { | |
207 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); | |
208 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
209 | return 0; | |
210 | } | |
211 | ||
212 | if (inBytes != outBytes) { | |
213 | uprv_memcpy(outBytes, inBytes, size); | |
214 | } | |
215 | ||
216 | offset = 0; | |
217 | ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); | |
218 | offset = (int32_t)sizeof(indexes); | |
219 | int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; | |
220 | int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; | |
221 | ||
222 | if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { | |
223 | ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); | |
224 | } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { | |
225 | // nothing to do | |
226 | } else { | |
227 | udata_printError(ds, "udict_swap(): unknown trie type!\n"); | |
228 | *pErrorCode = U_UNSUPPORTED_ERROR; | |
229 | return 0; | |
230 | } | |
231 | ||
232 | // these next two sections are empty in the current format, | |
233 | // but may be used later. | |
234 | offset = nextOffset; | |
235 | nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; | |
236 | offset = nextOffset; | |
237 | nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; | |
238 | offset = nextOffset; | |
239 | } | |
240 | return headerSize + size; | |
241 | } | |
242 | #endif |