]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/dictionarydata.cpp
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / common / dictionarydata.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
51004dcb
A
3/*
4*******************************************************************************
2ca993e8 5* Copyright (C) 2014-2016, International Business Machines
51004dcb
A
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* dictionarydata.cpp
9*
10* created on: 2012may31
11* created by: Markus W. Scherer & Maxime Serrano
12*/
13
14#include "dictionarydata.h"
15#include "unicode/ucharstrie.h"
16#include "unicode/bytestrie.h"
17#include "unicode/udata.h"
18#include "cmemory.h"
19
20#if !UCONFIG_NO_BREAK_ITERATION
21
22U_NAMESPACE_BEGIN
23
57a6839d
A
24const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
25const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
26const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
27const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
51004dcb 28
57a6839d
A
29const int32_t DictionaryData::TRANSFORM_NONE = 0;
30const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
31const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
32const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
33
51004dcb
A
34DictionaryMatcher::~DictionaryMatcher() {
35}
36
37UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
38 udata_close(file);
39}
40
41int32_t UCharsDictionaryMatcher::getType() const {
42 return DictionaryData::TRIE_TYPE_UCHARS;
43}
44
b331163b
A
45int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
46 int32_t *lengths, int32_t *cpLengths, int32_t *values,
47 int32_t *prefix) const {
48
51004dcb 49 UCharsTrie uct(characters);
2ca993e8 50 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
b331163b
A
51 int32_t wordCount = 0;
52 int32_t codePointsMatched = 0;
53
54 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
55 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
2ca993e8 56 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
b331163b 57 codePointsMatched += 1;
51004dcb 58 if (USTRINGTRIE_HAS_VALUE(result)) {
b331163b 59 if (wordCount < limit) {
51004dcb 60 if (values != NULL) {
b331163b
A
61 values[wordCount] = uct.getValue();
62 }
63 if (lengths != NULL) {
64 lengths[wordCount] = lengthMatched;
65 }
66 if (cpLengths != NULL) {
67 cpLengths[wordCount] = codePointsMatched;
51004dcb 68 }
b331163b 69 ++wordCount;
51004dcb
A
70 }
71 if (result == USTRINGTRIE_FINAL_VALUE) {
72 break;
73 }
74 }
75 else if (result == USTRINGTRIE_NO_MATCH) {
76 break;
77 }
b331163b 78 if (lengthMatched >= maxLength) {
51004dcb
A
79 break;
80 }
b331163b 81 }
51004dcb 82
b331163b
A
83 if (prefix != NULL) {
84 *prefix = codePointsMatched;
51004dcb 85 }
b331163b 86 return wordCount;
51004dcb
A
87}
88
89BytesDictionaryMatcher::~BytesDictionaryMatcher() {
90 udata_close(file);
91}
92
93UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
94 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
95 if (c == 0x200D) {
96 return 0xFF;
97 } else if (c == 0x200C) {
98 return 0xFE;
99 }
100 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
101 if (delta < 0 || 0xFD < delta) {
102 return U_SENTINEL;
103 }
104 return (UChar32)delta;
105 }
106 return c;
107}
108
109int32_t BytesDictionaryMatcher::getType() const {
110 return DictionaryData::TRIE_TYPE_BYTES;
111}
112
b331163b
A
113int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
114 int32_t *lengths, int32_t *cpLengths, int32_t *values,
115 int32_t *prefix) const {
51004dcb 116 BytesTrie bt(characters);
2ca993e8 117 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
b331163b
A
118 int32_t wordCount = 0;
119 int32_t codePointsMatched = 0;
120
121 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
122 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
2ca993e8 123 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
b331163b 124 codePointsMatched += 1;
51004dcb 125 if (USTRINGTRIE_HAS_VALUE(result)) {
b331163b 126 if (wordCount < limit) {
51004dcb 127 if (values != NULL) {
b331163b
A
128 values[wordCount] = bt.getValue();
129 }
130 if (lengths != NULL) {
131 lengths[wordCount] = lengthMatched;
57a6839d 132 }
b331163b
A
133 if (cpLengths != NULL) {
134 cpLengths[wordCount] = codePointsMatched;
135 }
136 ++wordCount;
51004dcb
A
137 }
138 if (result == USTRINGTRIE_FINAL_VALUE) {
139 break;
140 }
141 }
142 else if (result == USTRINGTRIE_NO_MATCH) {
143 break;
144 }
b331163b 145 if (lengthMatched >= maxLength) {
51004dcb
A
146 break;
147 }
b331163b 148 }
51004dcb 149
b331163b
A
150 if (prefix != NULL) {
151 *prefix = codePointsMatched;
51004dcb 152 }
b331163b 153 return wordCount;
51004dcb
A
154}
155
156
157U_NAMESPACE_END
158
159U_NAMESPACE_USE
160
161U_CAPI int32_t U_EXPORT2
162udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
163 void *outData, UErrorCode *pErrorCode) {
164 const UDataInfo *pInfo;
165 int32_t headerSize;
166 const uint8_t *inBytes;
167 uint8_t *outBytes;
168 const int32_t *inIndexes;
169 int32_t indexes[DictionaryData::IX_COUNT];
170 int32_t i, offset, size;
171
172 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
173 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
174 pInfo = (const UDataInfo *)((const char *)inData + 4);
175 if (!(pInfo->dataFormat[0] == 0x44 &&
176 pInfo->dataFormat[1] == 0x69 &&
177 pInfo->dataFormat[2] == 0x63 &&
178 pInfo->dataFormat[3] == 0x74 &&
179 pInfo->formatVersion[0] == 1)) {
180 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
182 *pErrorCode = U_UNSUPPORTED_ERROR;
183 return 0;
184 }
185
186 inBytes = (const uint8_t *)inData + headerSize;
187 outBytes = (uint8_t *)outData + headerSize;
188
189 inIndexes = (const int32_t *)inBytes;
190 if (length >= 0) {
191 length -= headerSize;
192 if (length < (int32_t)(sizeof(indexes))) {
193 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
194 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195 return 0;
196 }
197 }
198
199 for (i = 0; i < DictionaryData::IX_COUNT; i++) {
200 indexes[i] = udata_readInt32(ds, inIndexes[i]);
201 }
202
203 size = indexes[DictionaryData::IX_TOTAL_SIZE];
204
205 if (length >= 0) {
206 if (length < size) {
207 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
208 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
209 return 0;
210 }
211
212 if (inBytes != outBytes) {
213 uprv_memcpy(outBytes, inBytes, size);
214 }
215
216 offset = 0;
217 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218 offset = (int32_t)sizeof(indexes);
219 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221
222 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225 // nothing to do
226 } else {
227 udata_printError(ds, "udict_swap(): unknown trie type!\n");
228 *pErrorCode = U_UNSUPPORTED_ERROR;
229 return 0;
230 }
231
232 // these next two sections are empty in the current format,
233 // but may be used later.
234 offset = nextOffset;
235 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236 offset = nextOffset;
237 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238 offset = nextOffset;
239 }
240 return headerSize + size;
241}
242#endif