2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
// Mask selecting the low 24 bits; addByte() applies it after shifting in a
// new byte, so the running n-gram value never exceeds 0xFFFFFF.
17 #define N_GRAM_MASK 0xFFFFFF
// Element count of an array object: total size divided by the size of one
// element. Only meaningful for true arrays, not for pointers.
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// Constructs a parser for one n-gram table / byte map pair.
//   theNgramList - n-gram table that lookup() later hands to search().
//   theCharMap   - byte mapping table.
// The initializer list zeroes ngram and byteIndex; the visible assignments
// store theNgramList and reset ngramCount and hitCount to 0.
// NOTE(review): no statement storing theCharMap is visible in this listing,
// although parseCharacters() reads charMap - confirm against the full source.
22 NGramParser::NGramParser(const int32_t *theNgramList
, const uint8_t *theCharMap
)
23 : ngram(0), byteIndex(0)
25 ngramList
= theNgramList
;
28 ngramCount
= hitCount
= 0;
32 * Binary search for value in table, which must have exactly 64 entries.
// Looks for `value` in `table` with an unrolled binary search: the visible
// comparisons probe table[index + k] for k = 32, 16, 8, 4, 2, 1, which is why
// the table has to hold exactly 64 entries.
// NOTE(review): the declaration of `index`, the statement inside each branch
// and the return statements are not visible in this listing. The caller
// lookup() treats a result >= 0 as "found".
35 int32_t NGramParser::search(const int32_t *table
, int32_t value
)
// Probe at decreasing power-of-two offsets from the current index.
39 if (table
[index
+ 32] <= value
) {
43 if (table
[index
+ 16] <= value
) {
47 if (table
[index
+ 8] <= value
) {
51 if (table
[index
+ 4] <= value
) {
55 if (table
[index
+ 2] <= value
) {
59 if (table
[index
+ 1] <= value
) {
// Handles the case where the entry at the final index is still larger than
// the value being searched for.
63 if (table
[index
] > value
) {
// Miss check: index dropped below 0, or the entry is not an exact match.
67 if (index
< 0 || table
[index
] != value
) {
// Checks whether thisNgram occurs in ngramList by calling search(); a
// non-negative result enters the branch below.
// NOTE(review): the branch body is not visible in this listing - presumably
// it records a hit; confirm against the full source.
74 void NGramParser::lookup(int32_t thisNgram
)
78 if (search(ngramList
, thisNgram
) >= 0) {
// Appends b to the running n-gram: shifts the current value left by 8 bits,
// adds b, then masks with N_GRAM_MASK so only the low 24 bits are kept.
84 void NGramParser::addByte(int32_t b
)
86 ngram
= ((ngram
<< 8) + b
) & N_GRAM_MASK
;
// Returns the input byte at byteIndex from det->fInputBytes and advances
// byteIndex by one.
// The guard covers byteIndex having reached det->fInputLen.
// NOTE(review): the guarded branch body is not visible here; since
// parseCharacters() loops while the result is >= 0, it presumably returns a
// negative end-of-input marker - confirm.
90 int32_t NGramParser::nextByte(InputText
*det
)
92 if (byteIndex
>= det
->fInputLen
) {
96 return det
->fInputBytes
[byteIndex
++];
// Consumes the input one byte at a time until nextByte() returns a negative
// value, translating each byte through charMap.
// ignoreSpace remembers whether the previously mapped byte was 0x20, so a
// mapped 0x20 that directly follows another one fails the condition below
// (runs of 0x20 are collapsed).
// NOTE(review): the declaration of b and the body of the inner `if` are not
// visible in this listing.
99 void NGramParser::parseCharacters(InputText
*det
)
102 bool ignoreSpace
= FALSE
;
104 while ((b
= nextByte(det
)) >= 0) {
105 uint8_t mb
= charMap
[b
];
107 // TODO: 0x20 might not be a space in all character sets...
109 if (!(mb
== 0x20 && ignoreSpace
)) {
// Remember whether this mapped byte was 0x20 for the next iteration.
113 ignoreSpace
= (mb
== 0x20);
// Runs parseCharacters() over the input and converts the outcome into an
// integer score: rawPercent is the ratio hitCount / ngramCount, and the
// default result is that ratio scaled by 300 and truncated to int32_t.
// Ratios above 0.33 (0.33 * 300 = 99) take a separate branch.
// NOTE(review): that branch body, and any statements between the visible
// lines, are not shown in this listing.
118 int32_t NGramParser::parse(InputText
*det
)
120 parseCharacters(det
);
122 // TODO: Is this OK? The buffer could have ended in the middle of a word...
125 double rawPercent
= (double) hitCount
/ (double) ngramCount
;
127 // if (rawPercent <= 2.0) {
131 // TODO - This is a bit of a hack to take care of a case
132 // where we were getting a confidence of 135...
133 if (rawPercent
> 0.33) {
137 return (int32_t) (rawPercent
* 300.0);
// 256-entry byte-to-byte table (16 rows of 16; row = high nibble, column =
// low nibble), indexed by the raw input byte in NGramParser_IBM420::nextByte().
// Bytes 0x00-0x3F all map to 0x40; most later bytes map to themselves, while
// some map to a neighbouring value (e.g. 0x43 -> 0x42, 0x48 -> 0x47).
// NOTE(review): presumably this folds the positional (shaped) Arabic letter
// forms of code page 420 onto one form per letter - confirm against the
// code chart.
140 static const uint8_t unshapeMap_IBM420
[] = {
141 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
142 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
143 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
144 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
145 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
146 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
147 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
148 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
149 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
150 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
151 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
152 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
153 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
154 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
155 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
156 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
157 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// Constructs the IBM420 parser by forwarding both tables unchanged to the
// NGramParser base-class constructor.
// NOTE(review): the constructor body is not visible in this listing.
160 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList
, const uint8_t *theCharMap
):NGramParser(theNgramList
, theCharMap
)
// Classifies b against three pairs of byte values: 0xB2/0xB3, 0xB4/0xB5 and
// 0xB8/0xB9. nextByte() stores the result in `alef`.
// NOTE(review): the value returned by each branch, and the fall-through
// result, are not visible in this listing. Going by the name, these pairs
// are presumably the lam-alef ligature codes of code page 420 - confirm.
166 int32_t NGramParser_IBM420::isLamAlef(int32_t b
)
168 if(b
== 0xB2 || b
== 0xB3){
170 }else if(b
== 0xB4 || b
== 0xB5){
172 }else if(b
== 0xB8 || b
== 0xB9){
179 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
180 * because CharsetDetector is dealing with bytes not Unicode code points. We could
181 * convert the bytes to Unicode code points but that would leave us dependent
182 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
183 * of JDK can produce different results and therefore is also avoided.
// IBM420 variant of nextByte(): besides the end-of-input check against
// det->fInputLen, it also stops when the current input byte is 0.
// Otherwise it classifies the current byte with isLamAlef() (result kept in
// `alef`) and translates the same byte through unshapeMap_IBM420 into `next`.
// NOTE(review): the branch body, the declaration of `next`, the advance of
// byteIndex and the return statement are not visible in this listing.
185 int32_t NGramParser_IBM420::nextByte(InputText
*det
)
188 if (byteIndex
>= det
->fInputLen
|| det
->fInputBytes
[byteIndex
] == 0) {
193 alef
= isLamAlef(det
->fInputBytes
[byteIndex
]);
// "& 0xFF" on the index keeps it within 0..255, the size of the table.
197 next
= unshapeMap_IBM420
[det
->fInputBytes
[byteIndex
]& 0xFF] & 0xFF;
// IBM420 variant of parseCharacters(): reads bytes via nextByte() until it
// returns a negative value and maps each through charMap, using ignoreSpace
// to detect a mapped 0x20 that directly follows another one.
// A second stage maps `alef` (set by nextByte()) through charMap and applies
// the same 0x20 handling to the result.
// NOTE(review): the declaration of b, the bodies of the inner `if`
// statements and the condition guarding the `alef` stage are not visible in
// this listing.
204 void NGramParser_IBM420::parseCharacters(InputText
*det
)
207 bool ignoreSpace
= FALSE
;
209 while ((b
= nextByte(det
)) >= 0) {
210 uint8_t mb
= charMap
[b
];
212 // TODO: 0x20 might not be a space in all character sets...
214 if (!(mb
== 0x20 && ignoreSpace
)) {
217 ignoreSpace
= (mb
== 0x20);
// Second stage: map the value recorded in alef; "& 0xFF" keeps the index
// within 0..255.
221 mb
= charMap
[alef
& 0xFF];
223 // TODO: 0x20 might not be a space in all character sets...
225 if (!(mb
== 0x20 && ignoreSpace
)) {
229 ignoreSpace
= (mb
== 0x20);
// Default constructor; takes no arguments and, per the comment below, does
// no additional work of its own.
236 CharsetRecog_sbcs::CharsetRecog_sbcs()
238 // nothing else to do
// Destructor.
// NOTE(review): the body is not visible in this listing.
241 CharsetRecog_sbcs::~CharsetRecog_sbcs()
// Scores the input against one n-gram table: builds an NGramParser from
// `ngrams` and `byteMap`, runs parse() on `det` and stores its value in
// `result`.
// NOTE(review): the declaration of `result` and the return statement are not
// visible in this listing.
246 int32_t CharsetRecog_sbcs::match_sbcs(InputText
*det
, const int32_t ngrams
[], const uint8_t byteMap
[]) const
248 NGramParser
parser(ngrams
, byteMap
);
251 result
= parser
.parse(det
);
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// ISO-8859-1 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: 0xC0-0xDE map onto 0xE0-0xFE except 0xD7 (and 0xF7), which map
// to 0x20; 0xAA, 0xB5, 0xBA and 0xDF-0xFF otherwise map to themselves.
// NOTE(review): how a mapped value of 0x00 is treated by the parser is not
// visible in this listing.
256 static const uint8_t charMap_8859_1
[] = {
257 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
258 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
259 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
260 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
261 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
262 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
263 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
264 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
265 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
266 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
267 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
268 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
269 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
270 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
271 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
272 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
273 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
274 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
275 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
276 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
277 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
278 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
279 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
280 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
281 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
282 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
283 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
284 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
285 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
286 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
287 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
288 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// ISO-8859-2 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Bytes 0x80-0x9F all map to 0x20; the remaining upper-half bytes are mapped
// as listed in the rows below.
291 static const uint8_t charMap_8859_2
[] = {
292 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
293 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
294 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
295 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
296 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
297 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
298 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
299 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
300 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
301 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
302 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
303 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
304 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
305 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
306 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
307 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
308 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
309 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
310 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
311 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
312 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
313 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
314 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
315 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
316 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
317 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
318 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
319 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
320 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
321 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
322 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
323 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// ISO-8859-5 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: 0x80-0x9F map to 0x20; 0xB0-0xCF map onto 0xD0-0xEF, the same
// values that 0xD0-0xEF map to themselves.
326 static const uint8_t charMap_8859_5
[] = {
327 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
328 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
329 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
330 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
331 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
332 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
333 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
334 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
335 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
336 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
337 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
338 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
339 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
340 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
341 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
342 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
343 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
344 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
345 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
346 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
347 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
348 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
349 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
350 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
351 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
352 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
353 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
354 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
355 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
356 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
357 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
358 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// ISO-8859-6 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: only 0xC1-0xDA and 0xE0-0xEA map to themselves; every other
// byte from 0x80 upward maps to 0x20.
361 static const uint8_t charMap_8859_6
[] = {
362 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
363 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
364 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
365 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
366 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
367 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
368 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
369 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
370 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
371 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
372 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
373 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
374 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
375 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
376 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
377 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
378 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
379 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
380 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
381 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
382 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
383 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
384 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
385 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
386 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
387 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
388 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
389 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
390 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
391 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
392 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
393 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// ISO-8859-7 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Bytes 0x80-0x9F all map to 0x20; the remaining upper-half bytes are mapped
// as listed in the rows below.
396 static const uint8_t charMap_8859_7
[] = {
397 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
398 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
399 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
400 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
401 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
402 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
403 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
404 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
405 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
406 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
407 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
408 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
409 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
410 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
411 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
412 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
413 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
414 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
415 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
416 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
417 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
418 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
419 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
420 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
421 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
422 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
423 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
424 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
425 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
426 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
427 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
428 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// ISO-8859-8 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: only 0xB5 and 0xE0-0xFA map to themselves; every other byte
// from 0x80 upward maps to 0x20.
431 static const uint8_t charMap_8859_8
[] = {
432 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
433 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
434 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
435 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
436 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
437 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
438 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
439 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
440 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
441 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
442 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
443 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
444 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
445 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
446 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
447 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
448 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
449 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
450 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
451 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
452 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
453 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
454 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
455 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
456 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
459 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
460 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
461 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
462 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
463 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// ISO-8859-9 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: 0xC0-0xDE map onto 0xE0-0xFE, with two exceptions: 0xD7 maps
// to 0x20 and 0xDD maps to ASCII 0x69.
466 static const uint8_t charMap_8859_9
[] = {
467 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
468 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
469 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
470 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
471 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
472 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
473 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
474 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
475 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
476 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
477 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
478 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
479 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
480 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
481 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
482 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
483 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
484 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
485 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
486 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
487 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
488 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
489 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
490 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
491 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
492 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
493 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
494 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
495 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
496 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
497 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
498 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// N-gram table named for windows-1251: 64 values (4 rows of 16), each a
// three-byte sequence packed into 24 bits, listed in ascending order.
// 64 is the entry count NGramParser::search() is written for.
501 static const int32_t ngrams_windows_1251
[] = {
502 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
503 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
504 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
505 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// windows-1251 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: 0xC0-0xDF map onto 0xE0-0xFF, which map to themselves; bytes
// 0x80-0xBF are mapped as listed in the rows below.
508 static const uint8_t charMap_windows_1251
[] = {
509 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
510 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
511 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
512 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
513 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
514 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
515 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
516 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
517 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
518 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
519 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
520 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
521 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
522 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
523 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
524 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
525 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
526 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
527 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
528 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
529 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
530 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
531 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
532 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
533 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
534 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
535 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
536 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
537 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
538 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
539 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
540 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
// N-gram table named for windows-1256: 64 values (4 rows of 16), each a
// three-byte sequence packed into 24 bits, listed in ascending order.
// 64 is the entry count NGramParser::search() is written for.
543 static const int32_t ngrams_windows_1256
[] = {
544 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
545 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
546 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
547 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// windows-1256 and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: 0xC0-0xEF map to themselves except 0xD7, which maps to 0x20;
// the other upper-half bytes are mapped as listed in the rows below.
550 static const uint8_t charMap_windows_1256
[] = {
551 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
552 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
553 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
554 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
555 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
556 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
557 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
558 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
559 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
560 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
561 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
562 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
563 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
564 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
565 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
566 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
567 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
568 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
569 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
570 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
571 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
572 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
573 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
574 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
575 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
576 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
577 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
578 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
579 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
580 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
581 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
582 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
// N-gram table named for KOI8-R: 64 values (4 rows of 16), each a three-byte
// sequence packed into 24 bits, listed in ascending order.
// 64 is the entry count NGramParser::search() is written for.
585 static const int32_t ngrams_KOI8_R
[] = {
586 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
587 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
588 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
589 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
// Byte-to-byte mapping table with 256 entries (32 rows of 8), named for
// KOI8-R and indexed by input byte value.
// ASCII half: 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A, 0x27 maps to
// 0x00, and every other byte below 0x80 maps to 0x20.
// Upper half: 0xE0-0xFF map onto 0xC0-0xDF, which map to themselves; 0xA3
// and 0xB3 both map to 0xA3; every other byte in 0x80-0xBF maps to 0x20.
592 static const uint8_t charMap_KOI8_R
[] = {
593 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
594 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
595 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
596 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
597 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
598 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
599 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
600 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
601 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
602 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
603 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
604 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
605 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
606 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
607 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
608 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
609 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
610 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
611 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
612 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
613 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
614 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
615 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
616 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
617 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
618 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
619 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
620 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
621 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
622 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
623 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
624 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
// N-gram table named for IBM424 Hebrew, right-to-left variant: 64 values
// (4 rows of 16), each a three-byte sequence packed into 24 bits, listed in
// ascending order. 64 is the entry count NGramParser::search() is written for.
627 static const int32_t ngrams_IBM424_he_rtl
[] = {
628 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
629 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
630 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
631 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
// N-gram table named for IBM424 Hebrew, left-to-right variant: 64 values
// (4 rows of 16), each a three-byte sequence packed into 24 bits, listed in
// ascending order. 64 is the entry count NGramParser::search() is written for.
634 static const int32_t ngrams_IBM424_he_ltr
[] = {
635 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
636 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
637 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
638 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
// Byte-to-byte mapping table for IBM424 Hebrew (per the name): 256 entries,
// 16 rows of 16, row = high nibble and column = low nibble of the input byte.
// 0x40 is the filler value here: bytes 0x00-0x3F and most punctuation-range
// positions map to it, and 0x7D maps to 0x00.
// 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 map onto 0x81-0x89, 0x91-0x99 and
// 0xA2-0xA9, the same values those lower ranges map to themselves.
// NOTE(review): 0x40 is presumably the EBCDIC space, playing the role 0x20
// plays in the ASCII-based tables - confirm against the code page chart.
641 static const uint8_t charMap_IBM424_he
[] = {
642 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
643 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
644 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
645 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
646 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
647 /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
648 /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
649 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
650 /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
651 /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
652 /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
653 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
654 /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
655 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
656 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
657 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
658 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
// N-gram table named for IBM420 Arabic, right-to-left variant: 64 values
// (4 rows of 16), each a three-byte sequence packed into 24 bits, listed in
// ascending order. 64 is the entry count NGramParser::search() is written for.
661 static const int32_t ngrams_IBM420_ar_rtl
[] = {
662 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
663 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
664 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
665 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
// N-gram table named for IBM420 Arabic, left-to-right variant: 64 values
// (4 rows of 16), each a three-byte sequence packed into 24 bits, listed in
// ascending order. 64 is the entry count NGramParser::search() is written for.
668 static const int32_t ngrams_IBM420_ar_ltr
[] = {
669 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
670 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
671 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
672 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
// Byte-to-byte mapping table for IBM420 Arabic (per the name): 256 entries,
// 16 rows of 16, row = high nibble and column = low nibble of the input byte.
// 0x40 is the filler value: bytes 0x00-0x3F and many later positions map to
// it, while most bytes in 0x70-0xBF map to themselves.
// 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 map onto 0x81-0x89, 0x91-0x99 and
// 0xA2-0xA9, the same values those lower ranges map to themselves.
// NOTE(review): 0x40 is presumably the EBCDIC space, playing the role 0x20
// plays in the ASCII-based tables - confirm against the code page chart.
675 static const uint8_t charMap_IBM420_ar
[]= {
676 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
677 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
678 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
679 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
680 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
681 /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
682 /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
683 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
684 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
685 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
686 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
687 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
688 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
689 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
690 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
691 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
692 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
695 //ISO-8859-1,2,5,6,7,8,9 Ngrams
// Record type for the per-language n-gram tables that follow; the visible
// member is a fixed array of 64 n-gram values.
// NOTE(review): the name suggests a language tag member as well, but neither
// further members nor the closing brace are visible in this listing.
697 struct NGramsPlusLang
{
698 const int32_t ngrams
[64];
// Language models tried by CharsetRecog_8859_1::match(): one NGramsPlusLang
// entry per language, each contributing 64 ascending n-gram values (four rows
// of sixteen below).
// NOTE(review): the braces and language-tag strings that delimit the entries
// are missing from this listing - restore them from the full source.
702 static const NGramsPlusLang ngrams_8859_1
[] = {
// Group 1. NOTE(review): looks like English (" th", "the", "ing") - confirm the tag.
705 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
706 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
707 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
708 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
// Group 2. NOTE(review): looks like Danish (" og", "det", "af ") - confirm the tag.
714 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
715 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
716 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
717 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
// Group 3. NOTE(review): looks like German ("der", "die", "und") - confirm the tag.
723 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
724 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
725 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
726 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
// Group 4. NOTE(review): looks like Spanish ("que", "los", "el ") - confirm the tag.
732 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
733 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
734 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
735 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
// Group 5. NOTE(review): looks like French (" et", "les", "que") - confirm the tag.
741 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
742 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
743 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
744 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
// Group 6. NOTE(review): looks like Italian ("che", "di ", "zio") - confirm the tag.
750 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
751 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
752 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
753 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
// Group 7. NOTE(review): looks like Dutch ("het", "van", "een") - confirm the tag.
759 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
760 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
761 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
762 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
// Group 8. NOTE(review): looks like Norwegian (" og", "ikk", "som") - confirm the tag.
768 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
769 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
770 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
771 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
// Group 9. NOTE(review): looks like Portuguese ("que", " um", "dos") - confirm the tag.
777 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
778 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
779 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
780 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
// Group 10. NOTE(review): looks like Swedish ("och", "som", "att") - confirm the tag.
786 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
787 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
788 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
789 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
// Language models tried by CharsetRecog_8859_2::match(): one NGramsPlusLang
// entry per language, each contributing 64 ascending n-gram values (four rows
// of sixteen below).
// NOTE(review): the braces and language-tag strings that delimit the entries
// are missing from this listing - restore them from the full source.
796 static const NGramsPlusLang ngrams_8859_2
[] = {
// Group 1. NOTE(review): looks like Czech (" je", "pro", "pod") - confirm the tag.
799 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
800 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
801 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
802 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
// Group 2. NOTE(review): looks like Hungarian ("hog", "egy", "nem") - confirm the tag.
808 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
809 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
810 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
811 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
// Group 3. NOTE(review): looks like Polish ("nie", "prz", "ych") - confirm the tag.
817 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
818 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
819 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
820 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
// Group 4. NOTE(review): looks like Romanian ("lui", "lor", "car") - confirm the tag.
826 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
827 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
828 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
829 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
/*
 * N-gram table that CharsetRecog_8859_5_ru::match() hands to match_sbcs().
 *
 * Holds 64 values in ascending order, as NGramParser::search() requires for
 * its fixed-size binary search. Each value is three bytes packed into the low
 * 24 bits, the way NGramParser::addByte() accumulates them; 0x20 is the byte
 * the parser treats as a space.
 */
static const int32_t ngrams_8859_5_ru[] = {
    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
};
/*
 * N-gram table that CharsetRecog_8859_6_ar::match() hands to match_sbcs().
 *
 * Holds 64 values in ascending order, as NGramParser::search() requires for
 * its fixed-size binary search. Each value is three bytes packed into the low
 * 24 bits, the way NGramParser::addByte() accumulates them; 0x20 is the byte
 * the parser treats as a space.
 */
static const int32_t ngrams_8859_6_ar[] = {
    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
};
/*
 * N-gram table that CharsetRecog_8859_7_el::match() hands to match_sbcs().
 *
 * Holds 64 values in ascending order, as NGramParser::search() requires for
 * its fixed-size binary search. Each value is three bytes packed into the low
 * 24 bits, the way NGramParser::addByte() accumulates them; 0x20 is the byte
 * the parser treats as a space.
 */
static const int32_t ngrams_8859_7_el[] = {
    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
};
/*
 * N-gram table that CharsetRecog_8859_8_I_he::match() hands to match_sbcs().
 *
 * Holds 64 values in ascending order, as NGramParser::search() requires for
 * its fixed-size binary search. Each value is three bytes packed into the low
 * 24 bits, the way NGramParser::addByte() accumulates them; 0x20 is the byte
 * the parser treats as a space.
 */
static const int32_t ngrams_8859_8_I_he[] = {
    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
};
/*
 * N-gram table that CharsetRecog_8859_8_he::match() hands to match_sbcs().
 *
 * Holds 64 values in ascending order, as NGramParser::search() requires for
 * its fixed-size binary search. Each value is three bytes packed into the low
 * 24 bits, the way NGramParser::addByte() accumulates them; 0x20 is the byte
 * the parser treats as a space.
 */
static const int32_t ngrams_8859_8_he[] = {
    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
};
/*
 * N-gram table that CharsetRecog_8859_9_tr::match() hands to match_sbcs().
 *
 * Holds 64 values in ascending order, as NGramParser::search() requires for
 * its fixed-size binary search. Each value is three bytes packed into the low
 * 24 bits, the way NGramParser::addByte() accumulates them; 0x20 is the byte
 * the parser treats as a space.
 */
static const int32_t ngrams_8859_9_tr[] = {
    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
};
877 CharsetRecog_8859_1::~CharsetRecog_8859_1()
882 UBool
CharsetRecog_8859_1::match(InputText
*textIn
, CharsetMatch
*results
) const {
883 const char *name
= textIn
->fC1Bytes
? "windows-1252" : "ISO-8859-1";
885 int32_t bestConfidenceSoFar
= -1;
886 for (i
=0; i
< ARRAY_SIZE(ngrams_8859_1
) ; i
++) {
887 const int32_t *ngrams
= ngrams_8859_1
[i
].ngrams
;
888 const char *lang
= ngrams_8859_1
[i
].lang
;
889 int32_t confidence
= match_sbcs(textIn
, ngrams
, charMap_8859_1
);
890 if (confidence
> bestConfidenceSoFar
) {
891 results
->set(textIn
, this, confidence
, name
, lang
);
892 bestConfidenceSoFar
= confidence
;
895 return (bestConfidenceSoFar
> 0);
898 const char *CharsetRecog_8859_1::getName() const
904 CharsetRecog_8859_2::~CharsetRecog_8859_2()
909 UBool
CharsetRecog_8859_2::match(InputText
*textIn
, CharsetMatch
*results
) const {
910 const char *name
= textIn
->fC1Bytes
? "windows-1250" : "ISO-8859-2";
912 int32_t bestConfidenceSoFar
= -1;
913 for (i
=0; i
< ARRAY_SIZE(ngrams_8859_2
) ; i
++) {
914 const int32_t *ngrams
= ngrams_8859_2
[i
].ngrams
;
915 const char *lang
= ngrams_8859_2
[i
].lang
;
916 int32_t confidence
= match_sbcs(textIn
, ngrams
, charMap_8859_2
);
917 if (confidence
> bestConfidenceSoFar
) {
918 results
->set(textIn
, this, confidence
, name
, lang
);
919 bestConfidenceSoFar
= confidence
;
922 return (bestConfidenceSoFar
> 0);
925 const char *CharsetRecog_8859_2::getName() const
931 CharsetRecog_8859_5::~CharsetRecog_8859_5()
936 const char *CharsetRecog_8859_5::getName() const
941 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
946 const char *CharsetRecog_8859_5_ru::getLanguage() const
951 UBool
CharsetRecog_8859_5_ru::match(InputText
*textIn
, CharsetMatch
*results
) const
953 int32_t confidence
= match_sbcs(textIn
, ngrams_8859_5_ru
, charMap_8859_5
);
954 results
->set(textIn
, this, confidence
);
955 return (confidence
> 0);
958 CharsetRecog_8859_6::~CharsetRecog_8859_6()
963 const char *CharsetRecog_8859_6::getName() const
968 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
973 const char *CharsetRecog_8859_6_ar::getLanguage() const
978 UBool
CharsetRecog_8859_6_ar::match(InputText
*textIn
, CharsetMatch
*results
) const
980 int32_t confidence
= match_sbcs(textIn
, ngrams_8859_6_ar
, charMap_8859_6
);
981 results
->set(textIn
, this, confidence
);
982 return (confidence
> 0);
985 CharsetRecog_8859_7::~CharsetRecog_8859_7()
990 const char *CharsetRecog_8859_7::getName() const
995 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1000 const char *CharsetRecog_8859_7_el::getLanguage() const
1005 UBool
CharsetRecog_8859_7_el::match(InputText
*textIn
, CharsetMatch
*results
) const
1007 const char *name
= textIn
->fC1Bytes
? "windows-1253" : "ISO-8859-7";
1008 int32_t confidence
= match_sbcs(textIn
, ngrams_8859_7_el
, charMap_8859_7
);
1009 results
->set(textIn
, this, confidence
, name
, "el");
1010 return (confidence
> 0);
1013 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1018 const char *CharsetRecog_8859_8::getName() const
1020 return "ISO-8859-8";
1023 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1028 const char *CharsetRecog_8859_8_I_he::getName() const
1030 return "ISO-8859-8-I";
1033 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1038 UBool
CharsetRecog_8859_8_I_he::match(InputText
*textIn
, CharsetMatch
*results
) const
1040 const char *name
= textIn
->fC1Bytes
? "windows-1255" : "ISO-8859-8-I";
1041 int32_t confidence
= match_sbcs(textIn
, ngrams_8859_8_I_he
, charMap_8859_8
);
1042 results
->set(textIn
, this, confidence
, name
, "he");
1043 return (confidence
> 0);
1046 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1051 const char *CharsetRecog_8859_8_he::getLanguage() const
1056 UBool
CharsetRecog_8859_8_he::match(InputText
*textIn
, CharsetMatch
*results
) const
1058 const char *name
= textIn
->fC1Bytes
? "windows-1255" : "ISO-8859-8";
1059 int32_t confidence
= match_sbcs(textIn
, ngrams_8859_8_he
, charMap_8859_8
);
1060 results
->set(textIn
, this, confidence
, name
, "he");
1061 return (confidence
> 0);
1064 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1069 const char *CharsetRecog_8859_9::getName() const
1071 return "ISO-8859-9";
1074 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1079 const char *CharsetRecog_8859_9_tr::getLanguage() const
1084 UBool
CharsetRecog_8859_9_tr::match(InputText
*textIn
, CharsetMatch
*results
) const
1086 const char *name
= textIn
->fC1Bytes
? "windows-1254" : "ISO-8859-9";
1087 int32_t confidence
= match_sbcs(textIn
, ngrams_8859_9_tr
, charMap_8859_9
);
1088 results
->set(textIn
, this, confidence
, name
, "tr");
1089 return (confidence
> 0);
1092 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1097 const char *CharsetRecog_windows_1256::getName() const
1099 return "windows-1256";
1102 const char *CharsetRecog_windows_1256::getLanguage() const
1107 UBool
CharsetRecog_windows_1256::match(InputText
*textIn
, CharsetMatch
*results
) const
1109 int32_t confidence
= match_sbcs(textIn
, ngrams_windows_1256
, charMap_windows_1256
);
1110 results
->set(textIn
, this, confidence
);
1111 return (confidence
> 0);
1114 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1119 const char *CharsetRecog_windows_1251::getName() const
1121 return "windows-1251";
1124 const char *CharsetRecog_windows_1251::getLanguage() const
1129 UBool
CharsetRecog_windows_1251::match(InputText
*textIn
, CharsetMatch
*results
) const
1131 int32_t confidence
= match_sbcs(textIn
, ngrams_windows_1251
, charMap_windows_1251
);
1132 results
->set(textIn
, this, confidence
);
1133 return (confidence
> 0);
1136 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1141 const char *CharsetRecog_KOI8_R::getName() const
1146 const char *CharsetRecog_KOI8_R::getLanguage() const
1151 UBool
CharsetRecog_KOI8_R::match(InputText
*textIn
, CharsetMatch
*results
) const
1153 int32_t confidence
= match_sbcs(textIn
, ngrams_KOI8_R
, charMap_KOI8_R
);
1154 results
->set(textIn
, this, confidence
);
1155 return (confidence
> 0);
1158 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1163 const char *CharsetRecog_IBM424_he::getLanguage() const
1168 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1173 const char *CharsetRecog_IBM424_he_rtl::getName() const
1175 return "IBM424_rtl";
1178 UBool
CharsetRecog_IBM424_he_rtl::match(InputText
*textIn
, CharsetMatch
*results
) const
1180 int32_t confidence
= match_sbcs(textIn
, ngrams_IBM424_he_rtl
, charMap_IBM424_he
);
1181 results
->set(textIn
, this, confidence
);
1182 return (confidence
> 0);
1185 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1190 const char *CharsetRecog_IBM424_he_ltr::getName() const
1192 return "IBM424_ltr";
1195 UBool
CharsetRecog_IBM424_he_ltr::match(InputText
*textIn
, CharsetMatch
*results
) const
1197 int32_t confidence
= match_sbcs(textIn
, ngrams_IBM424_he_ltr
, charMap_IBM424_he
);
1198 results
->set(textIn
, this, confidence
);
1199 return (confidence
> 0);
1202 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1207 const char *CharsetRecog_IBM420_ar::getLanguage() const
1213 int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText
*det
, const int32_t ngrams
[], const uint8_t byteMap
[]) const
1215 NGramParser_IBM420
parser(ngrams
, byteMap
);
1218 result
= parser
.parse(det
);
1223 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1228 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1230 return "IBM420_rtl";
1233 UBool
CharsetRecog_IBM420_ar_rtl::match(InputText
*textIn
, CharsetMatch
*results
) const
1235 int32_t confidence
= match_sbcs(textIn
, ngrams_IBM420_ar_rtl
, charMap_IBM420_ar
);
1236 results
->set(textIn
, this, confidence
);
1237 return (confidence
> 0);
1240 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1245 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1247 return "IBM420_ltr";
1250 UBool
CharsetRecog_IBM420_ar_ltr::match(InputText
*textIn
, CharsetMatch
*results
) const
1252 int32_t confidence
= match_sbcs(textIn
, ngrams_IBM420_ar_ltr
, charMap_IBM420_ar
);
1253 results
->set(textIn
, this, confidence
);
1254 return (confidence
> 0);