]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrsbcs.cpp
2 **********************************************************************
3 * Copyright (C) 2005-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
16 #define N_GRAM_MASK 0xFFFFFF
// Constructor: zero-initializes byteIndex and ngram, stores the caller's
// n-gram list, and clears ngramCount and hitCount.
// NOTE(review): theCharMap is accepted here, but the statement that stores it
// is not visible in this excerpt.
20 NGramParser::NGramParser(const int32_t *theNgramList
, const uint8_t *theCharMap
)
21 :byteIndex(0), ngram(0)
23 ngramList
= theNgramList
;
26 ngramCount
= hitCount
= 0;
30 * Binary search for value in table, which must have exactly 64 entries.
// Searches `table` for `value` with an unrolled binary search: the visible
// conditions probe table[index + k] for k = 32, 16, 8, 4, 2, 1, which only
// works if the table is in ascending order.
// NOTE(review): the branch bodies and return statements are missing from this
// excerpt; presumably each successful probe advances index by k. lookup()
// treats a result >= 0 as "found".
33 int32_t NGramParser::search(const int32_t *table
, int32_t value
)
37 if (table
[index
+ 32] <= value
) {
41 if (table
[index
+ 16] <= value
) {
45 if (table
[index
+ 8] <= value
) {
49 if (table
[index
+ 4] <= value
) {
53 if (table
[index
+ 2] <= value
) {
57 if (table
[index
+ 1] <= value
) {
// Handles the case where even the current slot is larger than value.
61 if (table
[index
] > value
) {
// Miss test: index fell below the table, or the slot holds another value.
65 if (index
< 0 || table
[index
] != value
) {
// Checks whether thisNgram occurs in ngramList via search(); a result >= 0
// is the "found" case.
// NOTE(review): the statements that update the counters are not visible in
// this excerpt.
72 void NGramParser::lookup(int32_t thisNgram
)
76 if (search(ngramList
, thisNgram
) >= 0) {
// Folds byte b into the rolling n-gram: the old value is shifted left by
// 8 bits, b is added, and the result is masked with N_GRAM_MASK (0xFFFFFF)
// so only the low 24 bits are kept.
82 void NGramParser::addByte(int32_t b
)
84 ngram
= ((ngram
<< 8) + b
) & N_GRAM_MASK
;
// Returns the next input byte from det and advances byteIndex.
// NOTE(review): the body of the end-of-input branch is not visible here;
// parse() loops while the result is >= 0, so presumably it returns a
// negative value.
88 int32_t NGramParser::nextByte(InputText
*det
)
// End of input: byteIndex has reached the input length.
90 if (byteIndex
>= det
->fInputLen
) {
94 return det
->fInputBytes
[byteIndex
++];
// Walks the input in det byte by byte, maps each byte through charMap, and
// returns an integer confidence derived from hitCount / ngramCount.
// NOTE(review): several original statements, including the bodies of the
// visible if-statements, are missing from this excerpt.
97 int32_t NGramParser::parse(InputText
*det
)
100 bool ignoreSpace
= FALSE
;
// Consume input until nextByte() reports a negative value.
102 while ((b
= nextByte(det
)) >= 0) {
// Translate the raw byte through the per-charset byte map.
103 uint8_t mb
= charMap
[b
];
105 // TODO: 0x20 might not be a space in all character sets...
// True for every mapped byte except a 0x20 that directly follows another
// 0x20; the guarded statement is not visible here (presumably addByte(mb)).
107 if (!(mb
== 0x20 && ignoreSpace
)) {
// Remember whether this mapped byte was 0x20 for the next iteration.
111 ignoreSpace
= (mb
== 0x20);
115 // TODO: Is this OK? The buffer could have ended in the middle of a word...
// Despite its name, this is a plain hits-per-n-gram ratio, not scaled to 100.
118 double rawPercent
= (double) hitCount
/ (double) ngramCount
;
120 // if (rawPercent <= 2.0) {
124 // TODO - This is a bit of a hack to take care of a case
125 // where we were getting a confidence of 135...
126 if (rawPercent
> 0.33) {
// Scale the ratio by 300 and truncate to an integer confidence.
130 return (int32_t) (rawPercent
* 300.0);
// Default constructor.
// NOTE(review): any initializer list and the braces are not visible in this
// excerpt.
133 CharsetRecog_sbcs::CharsetRecog_sbcs()
136 // nothing else to do
// Destructor. NOTE(review): the body is not visible in this excerpt.
139 CharsetRecog_sbcs::~CharsetRecog_sbcs()
// Scores the input in det against one n-gram table / byte-map pair: builds
// an NGramParser from the two tables, copies det->fC1Bytes into
// haveC1Bytes, and stores the parser's result in `result`.
// NOTE(review): the declaration of `result` and the return statement are not
// visible in this excerpt.
144 int32_t CharsetRecog_sbcs::match_sbcs(InputText
*det
, const int32_t ngrams
[], const uint8_t byteMap
[])
146 NGramParser
parser(ngrams
, byteMap
);
149 haveC1Bytes
= det
->fC1Bytes
;
150 result
= parser
.parse(det
);
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// In the upper half, bytes 0xC0-0xDE map onto 0xE0-0xFE, except 0xD7, which
// maps to 0x20.
155 static const uint8_t charMap_8859_1
[] = {
156 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
157 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
158 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
159 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
160 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
161 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
162 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
163 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
164 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
165 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
166 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
167 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
168 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
169 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
170 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
171 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
172 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
173 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
174 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
175 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
176 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
177 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
178 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
179 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
180 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
181 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
182 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
183 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
184 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
185 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
186 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
187 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// The upper half (0x80-0xFF) is specific to this table.
190 static const uint8_t charMap_8859_2
[] = {
191 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
192 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
193 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
194 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
195 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
196 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
197 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
198 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
199 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
200 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
201 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
202 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
203 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
204 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
205 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
206 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
207 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
208 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
209 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
210 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
211 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
212 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
213 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
214 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
215 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
216 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
217 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
218 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
219 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
220 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
221 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
222 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// The upper half (0x80-0xFF) is specific to this table.
225 static const uint8_t charMap_8859_5
[] = {
226 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
227 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
228 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
229 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
230 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
231 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
232 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
233 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
234 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
235 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
236 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
237 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
238 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
239 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
240 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
241 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
242 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
243 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
244 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
245 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
246 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
247 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
248 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
249 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
250 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
251 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
252 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
253 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
254 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
255 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
256 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
257 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// The upper half (0x80-0xFF) is specific to this table.
260 static const uint8_t charMap_8859_6
[] = {
261 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
262 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
263 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
264 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
265 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
266 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
268 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
269 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
270 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
271 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
272 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
273 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
274 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
275 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
276 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
277 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
278 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
279 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
280 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
281 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
282 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
283 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
284 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
285 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
286 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
287 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
288 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
289 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
290 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
291 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
292 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// The upper half (0x80-0xFF) is specific to this table.
295 static const uint8_t charMap_8859_7
[] = {
296 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
297 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
298 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
299 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
300 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
301 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
303 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
304 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
305 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
306 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
307 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
308 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
309 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
310 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
311 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
312 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
313 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
314 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
315 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
316 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
317 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
318 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
319 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
320 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
321 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
322 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
323 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
324 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
325 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
326 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
327 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// The upper half (0x80-0xFF) is specific to this table.
330 static const uint8_t charMap_8859_8
[] = {
331 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
332 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
333 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
334 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
335 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
336 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
338 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
339 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
340 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
341 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
342 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
343 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
344 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
345 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
346 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
347 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
348 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
349 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
350 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
351 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
352 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
353 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
354 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
355 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
356 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
357 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
358 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
359 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
360 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
361 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
362 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// The upper half (0x80-0xFF) is specific to this table; note that byte 0xDD
// maps to 0x69 here.
365 static const uint8_t charMap_8859_9
[] = {
366 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
367 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
368 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
369 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
370 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
371 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
373 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
374 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
375 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
376 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
377 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
378 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
379 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
380 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
381 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
382 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
383 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
384 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
385 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
386 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
388 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
389 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
390 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
391 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
392 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
393 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
394 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
395 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
396 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
397 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// N-gram table paired by name with charMap_windows_1251. Each entry is a
// three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
400 static const int32_t ngrams_windows_1251
[] = {
401 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
402 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
403 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
404 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// In the upper half, bytes 0xC0-0xDF map onto 0xE0-0xFF.
407 static const uint8_t charMap_windows_1251
[] = {
408 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
409 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
410 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
411 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
412 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
413 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
414 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
415 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
416 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
417 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
418 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
419 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
420 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
421 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
422 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
423 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
424 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
425 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
426 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
427 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
428 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
429 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
430 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
431 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
432 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
433 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
434 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
435 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
436 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
437 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
438 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
439 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// N-gram table paired by name with charMap_windows_1256. Each entry is a
// three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
442 static const int32_t ngrams_windows_1256
[] = {
443 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
444 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
445 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
446 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// The upper half (0x80-0xFF) is specific to this table.
449 static const uint8_t charMap_windows_1256
[] = {
450 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
451 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
452 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
453 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
454 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
455 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
456 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
459 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
460 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
461 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
462 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
463 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
464 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
465 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
466 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
467 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
468 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
469 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
470 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
471 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
472 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
473 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
474 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
475 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
476 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
477 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
478 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
479 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
480 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
481 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
// N-gram table paired by name with charMap_KOI8_R. Each entry is a
// three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
484 static const int32_t ngrams_KOI8_R
[] = {
485 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
486 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
487 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
488 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
// Byte map with 32 rows of 8 entries (256 total), one per possible input
// byte; NGramParser::parse reads such a table as charMap[b].
// Bytes 0x41-0x5A share the values 0x61-0x7A with bytes 0x61-0x7A, byte 0x27
// maps to 0x00, and every other byte below 0x80 maps to 0x20.
// In the upper half, bytes 0xE0-0xFF map onto 0xC0-0xDF.
491 static const uint8_t charMap_KOI8_R
[] = {
492 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
493 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
494 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
495 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
496 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
497 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
498 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
499 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
500 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
501 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
502 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
503 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
504 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
505 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
506 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
507 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
508 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
509 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
510 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
511 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
512 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
513 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
514 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
515 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
516 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
517 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
518 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
519 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
520 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
521 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
522 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
523 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
// N-gram table; the name suggests the right-to-left variant paired with
// charMap_IBM424_he. Each entry is a three-byte value (it fits N_GRAM_MASK,
// 0xFFFFFF). NGramParser::search requires a table of exactly 64 entries in
// ascending order.
526 static const int32_t ngrams_IBM424_he_rtl
[] = {
527 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
528 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
529 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
530 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
// N-gram table; the name suggests the left-to-right variant paired with
// charMap_IBM424_he. Each entry is a three-byte value (it fits N_GRAM_MASK,
// 0xFFFFFF). NGramParser::search requires a table of exactly 64 entries in
// ascending order.
533 static const int32_t ngrams_IBM424_he_ltr
[] = {
534 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
535 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
536 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
537 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
// Byte map with 16 rows of 16 entries (256 total); the row and column labels
// in the existing comments give the high and low nibble of the input byte.
// 0x40 is the filler value used for most bytes, and byte 0x7D maps to 0x00.
// Columns 1-9 of rows C-, D- and E- carry the same values as rows 8-, 9-
// and A-.
540 static const uint8_t charMap_IBM424_he
[] = {
541 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
542 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
543 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
544 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
545 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
546 /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
547 /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
548 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
549 /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
550 /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
551 /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
552 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
553 /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
554 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
555 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
556 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
557 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
// N-gram table; the name suggests the right-to-left variant paired with
// charMap_IBM420_ar. Each entry is a three-byte value (it fits N_GRAM_MASK,
// 0xFFFFFF). NGramParser::search requires a table of exactly 64 entries in
// ascending order.
560 static const int32_t ngrams_IBM420_ar_rtl
[] = {
561 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
562 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
563 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
564 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
// N-gram table; the name suggests the left-to-right variant paired with
// charMap_IBM420_ar. Each entry is a three-byte value (it fits N_GRAM_MASK,
// 0xFFFFFF). NGramParser::search requires a table of exactly 64 entries in
// ascending order.
567 static const int32_t ngrams_IBM420_ar_ltr
[] = {
568 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
569 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
570 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
571 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
// Byte map with 16 rows of 16 entries (256 total); the row and column labels
// in the existing comments give the high and low nibble of the input byte.
// 0x40 is the filler value; rows 0- through 3- are entirely 0x40.
// Columns 1-9 of rows C-, D- and E- carry the same values as rows 8-, 9-
// and A-.
574 static const uint8_t charMap_IBM420_ar
[]= {
575 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
576 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
577 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
578 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
579 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
580 /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
581 /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
582 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
583 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
584 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
585 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
586 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
587 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
588 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
589 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
590 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
591 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
594 //ISO-8859-1,2,5,6,7,8,9 Ngrams
// N-gram table; the name suggests English text in ISO-8859-1. Each entry is
// a three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
595 static const int32_t ngrams_8859_1_en
[] = {
596 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
597 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
598 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
599 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
// N-gram table; the name suggests Danish text in ISO-8859-1. Each entry is
// a three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
602 static const int32_t ngrams_8859_1_da
[] = {
603 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
604 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
605 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
606 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
// N-gram table; the name suggests German text in ISO-8859-1. Each entry is
// a three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
609 static const int32_t ngrams_8859_1_de
[] = {
610 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
611 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
612 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
613 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
// N-gram table; the name suggests Spanish text in ISO-8859-1. Each entry is
// a three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
616 static const int32_t ngrams_8859_1_es
[] = {
617 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
618 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
619 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
620 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
// N-gram table; the name suggests French text in ISO-8859-1. Each entry is
// a three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
623 static const int32_t ngrams_8859_1_fr
[] = {
624 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
625 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
626 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
627 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
// N-gram table; the name suggests Italian text in ISO-8859-1. Each entry is
// a three-byte value (it fits N_GRAM_MASK, 0xFFFFFF). NGramParser::search
// requires a table of exactly 64 entries in ascending order.
630 static const int32_t ngrams_8859_1_it
[] = {
631 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
632 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
633 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
634 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
/*
 * Table that CharsetRecog_8859_1_nl::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
637 static const int32_t ngrams_8859_1_nl
[] = {
638 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
639 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
640 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
641 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
/*
 * Table that CharsetRecog_8859_1_no::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
644 static const int32_t ngrams_8859_1_no
[] = {
645 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
646 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
647 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
648 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
/*
 * Table that CharsetRecog_8859_1_pt::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
651 static const int32_t ngrams_8859_1_pt
[] = {
652 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
653 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
654 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
655 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
/*
 * Table that CharsetRecog_8859_1_sv::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
658 static const int32_t ngrams_8859_1_sv
[] = {
659 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
660 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
661 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
662 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
/*
 * Table that CharsetRecog_8859_2_cs::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
665 static const int32_t ngrams_8859_2_cs
[] = {
666 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
667 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
668 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
669 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
/*
 * Table that CharsetRecog_8859_2_hu::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
672 static const int32_t ngrams_8859_2_hu
[] = {
673 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
674 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
675 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
676 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
/*
 * Table that CharsetRecog_8859_2_pl::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
679 static const int32_t ngrams_8859_2_pl
[] = {
680 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
681 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
682 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
683 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
/*
 * Table that CharsetRecog_8859_2_ro::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
686 static const int32_t ngrams_8859_2_ro
[] = {
687 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
688 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
689 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
690 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
/*
 * Table that CharsetRecog_8859_5_ru::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
693 static const int32_t ngrams_8859_5_ru
[] = {
694 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
695 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
696 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
697 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
/*
 * Table that CharsetRecog_8859_6_ar::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
700 static const int32_t ngrams_8859_6_ar
[] = {
701 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
702 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
703 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
704 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
/*
 * Table that CharsetRecog_8859_7_el::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
707 static const int32_t ngrams_8859_7_el
[] = {
708 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
709 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
710 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
711 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
/*
 * Table that CharsetRecog_8859_8_I_he::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
714 static const int32_t ngrams_8859_8_I_he
[] = {
715 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
716 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
717 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
718 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
/*
 * Table that CharsetRecog_8859_8_he::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
721 static const int32_t ngrams_8859_8_he
[] = {
722 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
723 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
724 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
725 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
/*
 * Table that CharsetRecog_8859_9_tr::match() hands to match_sbcs(): four rows
 * of 16 values, each value fitting the 24-bit N_GRAM_MASK (0xFFFFFF).
 */
728 static const int32_t ngrams_8859_9_tr
[] = {
729 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
730 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
731 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
732 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
735 CharsetRecog_8859_1::~CharsetRecog_8859_1()
740 const char *CharsetRecog_8859_1::getName() const
742 return haveC1Bytes
? "windows-1252" : "ISO-8859-1";
745 const char *CharsetRecog_8859_1_en::getLanguage() const
750 CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()
755 int32_t CharsetRecog_8859_1_en::match(InputText
*textIn
)
757 int32_t result
= match_sbcs(textIn
, ngrams_8859_1_en
, charMap_8859_1
);
759 // printf("8859_1_en: result = %d\n", result);
760 return result
; //match_sbcs(textIn, ngrams, charMap);
763 CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()
768 const char *CharsetRecog_8859_1_da::getLanguage() const
773 int32_t CharsetRecog_8859_1_da::match(InputText
*textIn
)
775 return match_sbcs(textIn
, ngrams_8859_1_da
, charMap_8859_1
);
778 CharsetRecog_8859_1_de::~CharsetRecog_8859_1_de() {}
780 const char *CharsetRecog_8859_1_de::getLanguage() const
785 int32_t CharsetRecog_8859_1_de::match(InputText
*textIn
)
787 return match_sbcs(textIn
, ngrams_8859_1_de
, charMap_8859_1
);
790 CharsetRecog_8859_1_es::~CharsetRecog_8859_1_es()
795 const char *CharsetRecog_8859_1_es::getLanguage() const
800 int32_t CharsetRecog_8859_1_es::match(InputText
*textIn
)
802 return match_sbcs(textIn
, ngrams_8859_1_es
, charMap_8859_1
);
805 CharsetRecog_8859_1_fr::~CharsetRecog_8859_1_fr()
810 const char *CharsetRecog_8859_1_fr::getLanguage() const
815 int32_t CharsetRecog_8859_1_fr::match(InputText
*textIn
)
817 return match_sbcs(textIn
, ngrams_8859_1_fr
, charMap_8859_1
);
820 CharsetRecog_8859_1_it::~CharsetRecog_8859_1_it()
825 const char *CharsetRecog_8859_1_it::getLanguage() const
830 int32_t CharsetRecog_8859_1_it::match(InputText
*textIn
)
832 return match_sbcs(textIn
, ngrams_8859_1_it
, charMap_8859_1
);
835 CharsetRecog_8859_1_nl::~CharsetRecog_8859_1_nl()
840 const char *CharsetRecog_8859_1_nl::getLanguage() const
845 int32_t CharsetRecog_8859_1_nl::match(InputText
*textIn
)
847 return match_sbcs(textIn
, ngrams_8859_1_nl
, charMap_8859_1
);
850 CharsetRecog_8859_1_no::~CharsetRecog_8859_1_no() {}
852 const char *CharsetRecog_8859_1_no::getLanguage() const
857 int32_t CharsetRecog_8859_1_no::match(InputText
*textIn
)
859 return match_sbcs(textIn
, ngrams_8859_1_no
, charMap_8859_1
);
862 CharsetRecog_8859_1_pt::~CharsetRecog_8859_1_pt()
867 const char *CharsetRecog_8859_1_pt::getLanguage() const
872 int32_t CharsetRecog_8859_1_pt::match(InputText
*textIn
)
874 return match_sbcs(textIn
, ngrams_8859_1_pt
, charMap_8859_1
);
877 CharsetRecog_8859_1_sv::~CharsetRecog_8859_1_sv() {}
879 const char *CharsetRecog_8859_1_sv::getLanguage() const
884 int32_t CharsetRecog_8859_1_sv::match(InputText
*textIn
)
886 return match_sbcs(textIn
, ngrams_8859_1_sv
, charMap_8859_1
);
889 CharsetRecog_8859_2::~CharsetRecog_8859_2()
894 const char *CharsetRecog_8859_2::getName() const
896 return haveC1Bytes
? "windows-1250" : "ISO-8859-2";
899 CharsetRecog_8859_2_cs::~CharsetRecog_8859_2_cs()
904 const char *CharsetRecog_8859_2_cs::getLanguage() const
909 int32_t CharsetRecog_8859_2_cs::match(InputText
*textIn
)
911 return match_sbcs(textIn
, ngrams_8859_2_cs
, charMap_8859_2
);
914 CharsetRecog_8859_2_hu::~CharsetRecog_8859_2_hu()
919 const char *CharsetRecog_8859_2_hu::getLanguage() const
924 int32_t CharsetRecog_8859_2_hu::match(InputText
*textIn
)
926 return match_sbcs(textIn
, ngrams_8859_2_hu
, charMap_8859_2
);
929 CharsetRecog_8859_2_pl::~CharsetRecog_8859_2_pl()
934 const char *CharsetRecog_8859_2_pl::getLanguage() const
939 int32_t CharsetRecog_8859_2_pl::match(InputText
*textIn
)
941 return match_sbcs(textIn
, ngrams_8859_2_pl
, charMap_8859_2
);
944 CharsetRecog_8859_2_ro::~CharsetRecog_8859_2_ro()
949 const char *CharsetRecog_8859_2_ro::getLanguage() const
954 int32_t CharsetRecog_8859_2_ro::match(InputText
*textIn
)
956 return match_sbcs(textIn
, ngrams_8859_2_ro
, charMap_8859_2
);
959 CharsetRecog_8859_5::~CharsetRecog_8859_5()
964 const char *CharsetRecog_8859_5::getName() const
969 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
974 const char *CharsetRecog_8859_5_ru::getLanguage() const
979 int32_t CharsetRecog_8859_5_ru::match(InputText
*textIn
)
981 return match_sbcs(textIn
, ngrams_8859_5_ru
, charMap_8859_5
);
984 CharsetRecog_8859_6::~CharsetRecog_8859_6()
989 const char *CharsetRecog_8859_6::getName() const
994 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
999 const char *CharsetRecog_8859_6_ar::getLanguage() const
1004 int32_t CharsetRecog_8859_6_ar::match(InputText
*textIn
)
1006 return match_sbcs(textIn
, ngrams_8859_6_ar
, charMap_8859_6
);
1009 CharsetRecog_8859_7::~CharsetRecog_8859_7()
1014 const char *CharsetRecog_8859_7::getName() const
1016 return haveC1Bytes
? "windows-1253" : "ISO-8859-7";
1019 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1024 const char *CharsetRecog_8859_7_el::getLanguage() const
1029 int32_t CharsetRecog_8859_7_el::match(InputText
*textIn
)
1031 return match_sbcs(textIn
, ngrams_8859_7_el
, charMap_8859_7
);
1034 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1039 const char *CharsetRecog_8859_8::getName() const
1041 return haveC1Bytes
? "windows-1255" : "ISO-8859-8";
1044 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1049 const char *CharsetRecog_8859_8_I_he::getName() const
1051 return haveC1Bytes
? "windows-1255" : "ISO-8859-8-I";
1054 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1059 int32_t CharsetRecog_8859_8_I_he::match(InputText
*textIn
)
1061 return match_sbcs(textIn
, ngrams_8859_8_I_he
, charMap_8859_8
);
1064 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1069 const char *CharsetRecog_8859_8_he::getLanguage() const
1074 int32_t CharsetRecog_8859_8_he::match(InputText
*textIn
)
1076 return match_sbcs(textIn
, ngrams_8859_8_he
, charMap_8859_8
);
1079 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1084 const char *CharsetRecog_8859_9::getName() const
1086 return haveC1Bytes
? "windows-1254" : "ISO-8859-9";
1089 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1094 const char *CharsetRecog_8859_9_tr::getLanguage() const
1099 int32_t CharsetRecog_8859_9_tr::match(InputText
*textIn
)
1101 return match_sbcs(textIn
, ngrams_8859_9_tr
, charMap_8859_9
);
1104 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1109 const char *CharsetRecog_windows_1256::getName() const
1111 return "windows-1256";
1114 const char *CharsetRecog_windows_1256::getLanguage() const
1119 int32_t CharsetRecog_windows_1256::match(InputText
*textIn
)
1121 return match_sbcs(textIn
, ngrams_windows_1256
, charMap_windows_1256
);
1124 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1129 const char *CharsetRecog_windows_1251::getName() const
1131 return "windows-1251";
1134 const char *CharsetRecog_windows_1251::getLanguage() const
1139 int32_t CharsetRecog_windows_1251::match(InputText
*textIn
)
1141 return match_sbcs(textIn
, ngrams_windows_1251
, charMap_windows_1251
);
1144 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1149 const char *CharsetRecog_KOI8_R::getName() const
1154 const char *CharsetRecog_KOI8_R::getLanguage() const
1159 int32_t CharsetRecog_KOI8_R::match(InputText
*textIn
)
1161 return match_sbcs(textIn
, ngrams_KOI8_R
, charMap_KOI8_R
);
1164 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1169 const char *CharsetRecog_IBM424_he::getLanguage() const
1174 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1179 const char *CharsetRecog_IBM424_he_rtl::getName() const
1181 return "IBM424_rtl";
1184 int32_t CharsetRecog_IBM424_he_rtl::match(InputText
*textIn
)
1186 return match_sbcs(textIn
, ngrams_IBM424_he_rtl
, charMap_IBM424_he
);
1189 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1194 const char *CharsetRecog_IBM424_he_ltr::getName() const
1196 return "IBM424_ltr";
1199 int32_t CharsetRecog_IBM424_he_ltr::match(InputText
*textIn
)
1201 return match_sbcs(textIn
, ngrams_IBM424_he_ltr
, charMap_IBM424_he
);
/*
 * Byte-to-byte lookup table with one entry per byte value (16 rows of 16).
 * CharsetRecog_IBM420_ar::unshape() replaces a byte b with
 * unshapeMap_IBM420[b]. Rows 0- through 3- all yield 0x40; elsewhere most
 * entries equal their own index, and the exceptions fold a byte onto another
 * code (for example 0x43 yields 0x42).
 */
1204 static const uint8_t unshapeMap_IBM420
[] = {
1205 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
1206 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1207 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1208 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1209 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1210 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
1211 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
1212 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
1213 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
1214 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
1215 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
1216 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
1217 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
1218 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
1219 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
1220 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
1221 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
1224 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1229 const char *CharsetRecog_IBM420_ar::getLanguage() const
/*
 * Saves textIn's current byte pointer and length in prev_fInputBytes and
 * prev_fInputBytesLength, runs unshape() over them, and points textIn at the
 * buffer and length that unshape() produced. deleteBuffer records which of the
 * two outcomes below was taken.
 *
 * NOTE(review): the declaration of `length` and the conditions selecting
 * between deleteBuffer = TRUE and deleteBuffer = FALSE are not visible in this
 * copy of the file; presumably textIn is only redirected when unshape()
 * returns non-NULL -- confirm against the complete source.
 */
1234 void CharsetRecog_IBM420_ar::matchInit(InputText
*textIn
) {
1235 prev_fInputBytesLength
= textIn
->fInputLen
;
1236 prev_fInputBytes
= textIn
->fInputBytes
;
1239 uint8_t *bb
= unshape(prev_fInputBytes
, prev_fInputBytesLength
, length
);
1242 textIn
->fInputBytes
= bb
;
1243 textIn
->fInputLen
= length
;
1245 deleteBuffer
= TRUE
;
1247 deleteBuffer
= FALSE
;
1251 uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes
, int32_t inputBytesLength
, int32_t &length
) {
1252 uint8_t *resultArray
= unshapeLamAlef(inputBytes
, inputBytesLength
, length
);
1254 if (resultArray
!= NULL
) {
1255 for (int32_t i
= 0; i
< inputBytesLength
; i
++) {
1256 resultArray
[i
] = unshapeMap_IBM420
[resultArray
[i
]];
1263 uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes
, int32_t inputBytesLength
, int32_t &length
) {
1264 int32_t bigBufferLength
= inputBytesLength
* 2;
1265 uint8_t *bigBuffer
= (uint8_t *)uprv_malloc(bigBufferLength
);
1266 uint8_t *resultBuffer
= NULL
;
1268 if (bigBuffer
!= NULL
) {
1269 int32_t bufferIndex
;
1270 uint8_t unshapedLamAlef
[] = { 0xb1, 0x56 };
1272 for (int32_t i
= bufferIndex
= 0; i
< inputBytesLength
; i
++) {
1273 if (isLamAlef(inputBytes
[i
])) {
1274 bigBuffer
[bufferIndex
++] = unshapedLamAlef
[0];
1275 bigBuffer
[bufferIndex
++] = unshapedLamAlef
[1];
1277 bigBuffer
[bufferIndex
++] = inputBytes
[i
];
1281 length
= bufferIndex
;
1282 resultBuffer
= (uint8_t *)uprv_malloc(length
);
1283 if (resultBuffer
!= NULL
) {
1284 uprv_memcpy(resultBuffer
, bigBuffer
, length
);
1288 if (bigBuffer
!= NULL
) {
1289 uprv_free(bigBuffer
);
1292 return resultBuffer
;
/*
 * Releases textIn->fInputBytes with uprv_free() and puts back the byte pointer
 * and length that were saved in prev_fInputBytes / prev_fInputBytesLength.
 *
 * NOTE(review): any guard around these statements (for example a test of
 * deleteBuffer) is not visible in this copy of the file; confirm against the
 * complete source that the free only runs for a buffer installed by
 * matchInit().
 */
1295 void CharsetRecog_IBM420_ar::matchFinish(InputText
*textIn
) {
1297 uprv_free(textIn
->fInputBytes
);
1299 textIn
->fInputBytes
= prev_fInputBytes
;
1300 textIn
->fInputLen
= prev_fInputBytesLength
;
/*
 * Compares b against each of the six byte values listed in shapedLamAlef
 * (0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8).
 *
 * NOTE(review): the return statements are not visible in this copy of the
 * file; presumably TRUE is returned on a match and FALSE otherwise -- confirm
 * against the complete source.
 */
1304 UBool
CharsetRecog_IBM420_ar::isLamAlef(uint8_t b
) {
1305 uint8_t shapedLamAlef
[] = {
1306 0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
// sizeof() counts bytes, which equals the element count for a uint8_t array.
1309 for (uint32_t i
= 0; i
< sizeof(shapedLamAlef
); i
++) {
1310 if (b
== shapedLamAlef
[i
]) {
1318 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1323 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1325 return "IBM420_rtl";
1328 int32_t CharsetRecog_IBM420_ar_rtl::match(InputText
*textIn
)
1330 return match_sbcs(textIn
, ngrams_IBM420_ar_rtl
, charMap_IBM420_ar
);
1333 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1338 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1340 return "IBM420_ltr";
1343 int32_t CharsetRecog_IBM420_ar_ltr::match(InputText
*textIn
)
1345 return match_sbcs(textIn
, ngrams_IBM420_ar_ltr
, charMap_IBM420_ar
);