]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/dictionarydata.cpp
ICU-551.24.tar.gz
[apple/icu.git] / icuSources / common / dictionarydata.cpp
CommitLineData
51004dcb
A
1/*
2*******************************************************************************
57a6839d 3* Copyright (C) 2014, International Business Machines
51004dcb
A
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* dictionarydata.cpp
7*
8* created on: 2012may31
9* created by: Markus W. Scherer & Maxime Serrano
10*/
11
12#include "dictionarydata.h"
13#include "unicode/ucharstrie.h"
14#include "unicode/bytestrie.h"
15#include "unicode/udata.h"
16#include "cmemory.h"
17
18#if !UCONFIG_NO_BREAK_ITERATION
19
20U_NAMESPACE_BEGIN
21
57a6839d
A
22const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
23const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
24const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
25const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
51004dcb 26
57a6839d
A
27const int32_t DictionaryData::TRANSFORM_NONE = 0;
28const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
29const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
30const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
31
51004dcb
A
32DictionaryMatcher::~DictionaryMatcher() {
33}
34
35UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
36 udata_close(file);
37}
38
39int32_t UCharsDictionaryMatcher::getType() const {
40 return DictionaryData::TRIE_TYPE_UCHARS;
41}
42
b331163b
A
43int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
44 int32_t *lengths, int32_t *cpLengths, int32_t *values,
45 int32_t *prefix) const {
46
51004dcb 47 UCharsTrie uct(characters);
b331163b
A
48 int32_t startingTextIndex = utext_getNativeIndex(text);
49 int32_t wordCount = 0;
50 int32_t codePointsMatched = 0;
51
52 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
53 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
54 int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
55 codePointsMatched += 1;
51004dcb 56 if (USTRINGTRIE_HAS_VALUE(result)) {
b331163b 57 if (wordCount < limit) {
51004dcb 58 if (values != NULL) {
b331163b
A
59 values[wordCount] = uct.getValue();
60 }
61 if (lengths != NULL) {
62 lengths[wordCount] = lengthMatched;
63 }
64 if (cpLengths != NULL) {
65 cpLengths[wordCount] = codePointsMatched;
51004dcb 66 }
b331163b 67 ++wordCount;
51004dcb
A
68 }
69 if (result == USTRINGTRIE_FINAL_VALUE) {
70 break;
71 }
72 }
73 else if (result == USTRINGTRIE_NO_MATCH) {
74 break;
75 }
b331163b 76 if (lengthMatched >= maxLength) {
51004dcb
A
77 break;
78 }
b331163b 79 }
51004dcb 80
b331163b
A
81 if (prefix != NULL) {
82 *prefix = codePointsMatched;
51004dcb 83 }
b331163b 84 return wordCount;
51004dcb
A
85}
86
87BytesDictionaryMatcher::~BytesDictionaryMatcher() {
88 udata_close(file);
89}
90
91UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
92 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
93 if (c == 0x200D) {
94 return 0xFF;
95 } else if (c == 0x200C) {
96 return 0xFE;
97 }
98 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
99 if (delta < 0 || 0xFD < delta) {
100 return U_SENTINEL;
101 }
102 return (UChar32)delta;
103 }
104 return c;
105}
106
107int32_t BytesDictionaryMatcher::getType() const {
108 return DictionaryData::TRIE_TYPE_BYTES;
109}
110
b331163b
A
111int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
112 int32_t *lengths, int32_t *cpLengths, int32_t *values,
113 int32_t *prefix) const {
51004dcb 114 BytesTrie bt(characters);
b331163b
A
115 int32_t startingTextIndex = utext_getNativeIndex(text);
116 int32_t wordCount = 0;
117 int32_t codePointsMatched = 0;
118
119 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
120 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
121 int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
122 codePointsMatched += 1;
51004dcb 123 if (USTRINGTRIE_HAS_VALUE(result)) {
b331163b 124 if (wordCount < limit) {
51004dcb 125 if (values != NULL) {
b331163b
A
126 values[wordCount] = bt.getValue();
127 }
128 if (lengths != NULL) {
129 lengths[wordCount] = lengthMatched;
57a6839d 130 }
b331163b
A
131 if (cpLengths != NULL) {
132 cpLengths[wordCount] = codePointsMatched;
133 }
134 ++wordCount;
51004dcb
A
135 }
136 if (result == USTRINGTRIE_FINAL_VALUE) {
137 break;
138 }
139 }
140 else if (result == USTRINGTRIE_NO_MATCH) {
141 break;
142 }
b331163b 143 if (lengthMatched >= maxLength) {
51004dcb
A
144 break;
145 }
b331163b 146 }
51004dcb 147
b331163b
A
148 if (prefix != NULL) {
149 *prefix = codePointsMatched;
51004dcb 150 }
b331163b 151 return wordCount;
51004dcb
A
152}
153
154
155U_NAMESPACE_END
156
157U_NAMESPACE_USE
158
159U_CAPI int32_t U_EXPORT2
160udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
161 void *outData, UErrorCode *pErrorCode) {
162 const UDataInfo *pInfo;
163 int32_t headerSize;
164 const uint8_t *inBytes;
165 uint8_t *outBytes;
166 const int32_t *inIndexes;
167 int32_t indexes[DictionaryData::IX_COUNT];
168 int32_t i, offset, size;
169
170 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
171 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
172 pInfo = (const UDataInfo *)((const char *)inData + 4);
173 if (!(pInfo->dataFormat[0] == 0x44 &&
174 pInfo->dataFormat[1] == 0x69 &&
175 pInfo->dataFormat[2] == 0x63 &&
176 pInfo->dataFormat[3] == 0x74 &&
177 pInfo->formatVersion[0] == 1)) {
178 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
179 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
180 *pErrorCode = U_UNSUPPORTED_ERROR;
181 return 0;
182 }
183
184 inBytes = (const uint8_t *)inData + headerSize;
185 outBytes = (uint8_t *)outData + headerSize;
186
187 inIndexes = (const int32_t *)inBytes;
188 if (length >= 0) {
189 length -= headerSize;
190 if (length < (int32_t)(sizeof(indexes))) {
191 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
192 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
193 return 0;
194 }
195 }
196
197 for (i = 0; i < DictionaryData::IX_COUNT; i++) {
198 indexes[i] = udata_readInt32(ds, inIndexes[i]);
199 }
200
201 size = indexes[DictionaryData::IX_TOTAL_SIZE];
202
203 if (length >= 0) {
204 if (length < size) {
205 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
206 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
207 return 0;
208 }
209
210 if (inBytes != outBytes) {
211 uprv_memcpy(outBytes, inBytes, size);
212 }
213
214 offset = 0;
215 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
216 offset = (int32_t)sizeof(indexes);
217 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
218 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
219
220 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
221 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
222 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
223 // nothing to do
224 } else {
225 udata_printError(ds, "udict_swap(): unknown trie type!\n");
226 *pErrorCode = U_UNSUPPORTED_ERROR;
227 return 0;
228 }
229
230 // these next two sections are empty in the current format,
231 // but may be used later.
232 offset = nextOffset;
233 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
234 offset = nextOffset;
235 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
236 offset = nextOffset;
237 }
238 return headerSize + size;
239}
240#endif