// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/
14 #include "dictionarydata.h"
15 #include "unicode/ucharstrie.h"
16 #include "unicode/bytestrie.h"
17 #include "unicode/udata.h"
20 #if !UCONFIG_NO_BREAK_ITERATION
24 const int32_t DictionaryData::TRIE_TYPE_BYTES
= 0;
25 const int32_t DictionaryData::TRIE_TYPE_UCHARS
= 1;
26 const int32_t DictionaryData::TRIE_TYPE_MASK
= 7;
27 const int32_t DictionaryData::TRIE_HAS_VALUES
= 8;
29 const int32_t DictionaryData::TRANSFORM_NONE
= 0;
30 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET
= 0x1000000;
31 const int32_t DictionaryData::TRANSFORM_TYPE_MASK
= 0x7f000000;
32 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK
= 0x1fffff;
34 DictionaryMatcher::~DictionaryMatcher() {
37 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
41 int32_t UCharsDictionaryMatcher::getType() const {
42 return DictionaryData::TRIE_TYPE_UCHARS
;
45 int32_t UCharsDictionaryMatcher::matches(UText
*text
, int32_t maxLength
, int32_t limit
,
46 int32_t *lengths
, int32_t *cpLengths
, int32_t *values
,
47 int32_t *prefix
) const {
49 UCharsTrie
uct(characters
);
50 int32_t startingTextIndex
= (int32_t)utext_getNativeIndex(text
);
51 int32_t wordCount
= 0;
52 int32_t codePointsMatched
= 0;
54 for (UChar32 c
= utext_next32(text
); c
>= 0; c
=utext_next32(text
)) {
55 UStringTrieResult result
= (codePointsMatched
== 0) ? uct
.first(c
) : uct
.next(c
);
56 int32_t lengthMatched
= (int32_t)utext_getNativeIndex(text
) - startingTextIndex
;
57 codePointsMatched
+= 1;
58 if (USTRINGTRIE_HAS_VALUE(result
)) {
59 if (wordCount
< limit
) {
61 values
[wordCount
] = uct
.getValue();
63 if (lengths
!= NULL
) {
64 lengths
[wordCount
] = lengthMatched
;
66 if (cpLengths
!= NULL
) {
67 cpLengths
[wordCount
] = codePointsMatched
;
71 if (result
== USTRINGTRIE_FINAL_VALUE
) {
75 else if (result
== USTRINGTRIE_NO_MATCH
) {
78 if (lengthMatched
>= maxLength
) {
84 *prefix
= codePointsMatched
;
89 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
93 UChar32
BytesDictionaryMatcher::transform(UChar32 c
) const {
94 if ((transformConstant
& DictionaryData::TRANSFORM_TYPE_MASK
) == DictionaryData::TRANSFORM_TYPE_OFFSET
) {
97 } else if (c
== 0x200C) {
100 int32_t delta
= c
- (transformConstant
& DictionaryData::TRANSFORM_OFFSET_MASK
);
101 if (delta
< 0 || 0xFD < delta
) {
104 return (UChar32
)delta
;
109 int32_t BytesDictionaryMatcher::getType() const {
110 return DictionaryData::TRIE_TYPE_BYTES
;
113 int32_t BytesDictionaryMatcher::matches(UText
*text
, int32_t maxLength
, int32_t limit
,
114 int32_t *lengths
, int32_t *cpLengths
, int32_t *values
,
115 int32_t *prefix
) const {
116 BytesTrie
bt(characters
);
117 int32_t startingTextIndex
= (int32_t)utext_getNativeIndex(text
);
118 int32_t wordCount
= 0;
119 int32_t codePointsMatched
= 0;
121 for (UChar32 c
= utext_next32(text
); c
>= 0; c
=utext_next32(text
)) {
122 UStringTrieResult result
= (codePointsMatched
== 0) ? bt
.first(transform(c
)) : bt
.next(transform(c
));
123 int32_t lengthMatched
= (int32_t)utext_getNativeIndex(text
) - startingTextIndex
;
124 codePointsMatched
+= 1;
125 if (USTRINGTRIE_HAS_VALUE(result
)) {
126 if (wordCount
< limit
) {
127 if (values
!= NULL
) {
128 values
[wordCount
] = bt
.getValue();
130 if (lengths
!= NULL
) {
131 lengths
[wordCount
] = lengthMatched
;
133 if (cpLengths
!= NULL
) {
134 cpLengths
[wordCount
] = codePointsMatched
;
138 if (result
== USTRINGTRIE_FINAL_VALUE
) {
142 else if (result
== USTRINGTRIE_NO_MATCH
) {
145 if (lengthMatched
>= maxLength
) {
150 if (prefix
!= NULL
) {
151 *prefix
= codePointsMatched
;
161 U_CAPI
int32_t U_EXPORT2
162 udict_swap(const UDataSwapper
*ds
, const void *inData
, int32_t length
,
163 void *outData
, UErrorCode
*pErrorCode
) {
164 const UDataInfo
*pInfo
;
166 const uint8_t *inBytes
;
168 const int32_t *inIndexes
;
169 int32_t indexes
[DictionaryData::IX_COUNT
];
170 int32_t i
, offset
, size
;
172 headerSize
= udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
173 if (pErrorCode
== NULL
|| U_FAILURE(*pErrorCode
)) return 0;
174 pInfo
= (const UDataInfo
*)((const char *)inData
+ 4);
175 if (!(pInfo
->dataFormat
[0] == 0x44 &&
176 pInfo
->dataFormat
[1] == 0x69 &&
177 pInfo
->dataFormat
[2] == 0x63 &&
178 pInfo
->dataFormat
[3] == 0x74 &&
179 pInfo
->formatVersion
[0] == 1)) {
180 udata_printError(ds
, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1], pInfo
->dataFormat
[2], pInfo
->dataFormat
[3], pInfo
->formatVersion
[0]);
182 *pErrorCode
= U_UNSUPPORTED_ERROR
;
186 inBytes
= (const uint8_t *)inData
+ headerSize
;
187 outBytes
= (uint8_t *)outData
+ headerSize
;
189 inIndexes
= (const int32_t *)inBytes
;
191 length
-= headerSize
;
192 if (length
< (int32_t)(sizeof(indexes
))) {
193 udata_printError(ds
, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length
);
194 *pErrorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
199 for (i
= 0; i
< DictionaryData::IX_COUNT
; i
++) {
200 indexes
[i
] = udata_readInt32(ds
, inIndexes
[i
]);
203 size
= indexes
[DictionaryData::IX_TOTAL_SIZE
];
207 udata_printError(ds
, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length
);
208 *pErrorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
212 if (inBytes
!= outBytes
) {
213 uprv_memcpy(outBytes
, inBytes
, size
);
217 ds
->swapArray32(ds
, inBytes
, sizeof(indexes
), outBytes
, pErrorCode
);
218 offset
= (int32_t)sizeof(indexes
);
219 int32_t trieType
= indexes
[DictionaryData::IX_TRIE_TYPE
] & DictionaryData::TRIE_TYPE_MASK
;
220 int32_t nextOffset
= indexes
[DictionaryData::IX_RESERVED1_OFFSET
];
222 if (trieType
== DictionaryData::TRIE_TYPE_UCHARS
) {
223 ds
->swapArray16(ds
, inBytes
+ offset
, nextOffset
- offset
, outBytes
+ offset
, pErrorCode
);
224 } else if (trieType
== DictionaryData::TRIE_TYPE_BYTES
) {
227 udata_printError(ds
, "udict_swap(): unknown trie type!\n");
228 *pErrorCode
= U_UNSUPPORTED_ERROR
;
232 // these next two sections are empty in the current format,
233 // but may be used later.
235 nextOffset
= indexes
[DictionaryData::IX_RESERVED2_OFFSET
];
237 nextOffset
= indexes
[DictionaryData::IX_TOTAL_SIZE
];
240 return headerSize
+ size
;