]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csrsbcs.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / csrsbcs.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/*
4 **********************************************************************
2ca993e8 5 * Copyright (C) 2005-2016, International Business Machines
73c04bcf
A
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10#include "unicode/utypes.h"
11
729e4ab9
A
12#include "cmemory.h"
13
73c04bcf
A
14#if !UCONFIG_NO_CONVERSION
15#include "csrsbcs.h"
51004dcb 16#include "csmatch.h"
73c04bcf 17
73c04bcf
A
18#define N_GRAM_SIZE 3
19#define N_GRAM_MASK 0xFFFFFF
20
21U_NAMESPACE_BEGIN
22
23NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
57a6839d 24 : ngram(0), byteIndex(0)
73c04bcf
A
25{
26 ngramList = theNgramList;
27 charMap = theCharMap;
28
29 ngramCount = hitCount = 0;
30}
31
b331163b
A
32NGramParser::~NGramParser()
33{
34}
35
73c04bcf
A
36/*
37 * Binary search for value in table, which must have exactly 64 entries.
38 */
39
40int32_t NGramParser::search(const int32_t *table, int32_t value)
41{
42 int32_t index = 0;
43
44 if (table[index + 32] <= value) {
45 index += 32;
46 }
47
48 if (table[index + 16] <= value) {
49 index += 16;
50 }
51
52 if (table[index + 8] <= value) {
53 index += 8;
54 }
55
56 if (table[index + 4] <= value) {
57 index += 4;
58 }
59
60 if (table[index + 2] <= value) {
61 index += 2;
62 }
63
64 if (table[index + 1] <= value) {
65 index += 1;
66 }
67
68 if (table[index] > value) {
69 index -= 1;
70 }
71
72 if (index < 0 || table[index] != value) {
73 return -1;
74 }
75
76 return index;
77}
78
79void NGramParser::lookup(int32_t thisNgram)
80{
81 ngramCount += 1;
82
83 if (search(ngramList, thisNgram) >= 0) {
84 hitCount += 1;
85 }
86
87}
88
89void NGramParser::addByte(int32_t b)
90{
91 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
92 lookup(ngram);
93}
94
95int32_t NGramParser::nextByte(InputText *det)
96{
97 if (byteIndex >= det->fInputLen) {
98 return -1;
99 }
100
101 return det->fInputBytes[byteIndex++];
102}
103
57a6839d 104void NGramParser::parseCharacters(InputText *det)
73c04bcf
A
105{
106 int32_t b;
107 bool ignoreSpace = FALSE;
108
109 while ((b = nextByte(det)) >= 0) {
110 uint8_t mb = charMap[b];
111
112 // TODO: 0x20 might not be a space in all character sets...
113 if (mb != 0) {
114 if (!(mb == 0x20 && ignoreSpace)) {
115 addByte(mb);
116 }
117
118 ignoreSpace = (mb == 0x20);
119 }
120 }
57a6839d
A
121}
122
123int32_t NGramParser::parse(InputText *det)
124{
125 parseCharacters(det);
73c04bcf
A
126
127 // TODO: Is this OK? The buffer could have ended in the middle of a word...
128 addByte(0x20);
129
130 double rawPercent = (double) hitCount / (double) ngramCount;
131
132 // if (rawPercent <= 2.0) {
133 // return 0;
134 // }
135
136 // TODO - This is a bit of a hack to take care of a case
137 // were we were getting a confidence of 135...
138 if (rawPercent > 0.33) {
139 return 98;
140 }
141
142 return (int32_t) (rawPercent * 300.0);
143}
144
b331163b 145#if !UCONFIG_ONLY_HTML_CONVERSION
57a6839d
A
146static const uint8_t unshapeMap_IBM420[] = {
147/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
148/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
149/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
150/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
151/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
152/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
153/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
154/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
155/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
156/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
157/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
158/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
159/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
160/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
161/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
162/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
163/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
164};
165
166NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
167{
168 alef = 0x00;
169}
170
2ca993e8 171NGramParser_IBM420::~NGramParser_IBM420() {}
57a6839d
A
172
173int32_t NGramParser_IBM420::isLamAlef(int32_t b)
174{
175 if(b == 0xB2 || b == 0xB3){
176 return 0x47;
177 }else if(b == 0xB4 || b == 0xB5){
178 return 0x49;
179 }else if(b == 0xB8 || b == 0xB9){
180 return 0x56;
181 }else
182 return 0x00;
183}
184
185/*
186* Arabic shaping needs to be done manually. Cannot call ArabicShaping class
187* because CharsetDetector is dealing with bytes not Unicode code points. We could
188* convert the bytes to Unicode code points but that would leave us dependent
189* on CharsetICU which we try to avoid. IBM420 converter amongst different versions
190* of JDK can produce different results and therefore is also avoided.
191*/
192int32_t NGramParser_IBM420::nextByte(InputText *det)
193{
194
195 if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
196 return -1;
197 }
198 int next;
199
200 alef = isLamAlef(det->fInputBytes[byteIndex]);
201 if(alef != 0x00)
202 next = 0xB1 & 0xFF;
203 else
204 next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
205
206 byteIndex++;
207
208 return next;
209}
210
211void NGramParser_IBM420::parseCharacters(InputText *det)
212{
213 int32_t b;
214 bool ignoreSpace = FALSE;
215
216 while ((b = nextByte(det)) >= 0) {
217 uint8_t mb = charMap[b];
218
219 // TODO: 0x20 might not be a space in all character sets...
220 if (mb != 0) {
221 if (!(mb == 0x20 && ignoreSpace)) {
222 addByte(mb);
223 }
224 ignoreSpace = (mb == 0x20);
225 }
226
227 if(alef != 0x00){
228 mb = charMap[alef & 0xFF];
229
230 // TODO: 0x20 might not be a space in all character sets...
231 if (mb != 0) {
232 if (!(mb == 0x20 && ignoreSpace)) {
233 addByte(mb);
234 }
235
236 ignoreSpace = (mb == 0x20);
237 }
238
239 }
240 }
241}
b331163b 242#endif
57a6839d 243
73c04bcf 244CharsetRecog_sbcs::CharsetRecog_sbcs()
73c04bcf
A
245{
246 // nothing else to do
247}
248
249CharsetRecog_sbcs::~CharsetRecog_sbcs()
250{
251 // nothing to do
252}
253
51004dcb 254int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
73c04bcf 255{
46f4442e 256 NGramParser parser(ngrams, byteMap);
73c04bcf
A
257 int32_t result;
258
46f4442e 259 result = parser.parse(det);
73c04bcf
A
260
261 return result;
262}
263
264static const uint8_t charMap_8859_1[] = {
265 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
266 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
268 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
269 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
270 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
271 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
272 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
273 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
274 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
275 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
276 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
277 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
278 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
279 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
280 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
281 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
282 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
283 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
284 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
285 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
286 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
287 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
288 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
289 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
290 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
291 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
292 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
293 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
294 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
295 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
296 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
297};
298
299static const uint8_t charMap_8859_2[] = {
300 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
301 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
303 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
304 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
305 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
306 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
307 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
308 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
309 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
310 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
311 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
312 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
313 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
314 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
315 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
316 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
317 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
318 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
319 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
320 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
321 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
322 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
323 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
324 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
325 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
326 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
327 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
328 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
329 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
330 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
331 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
332};
333
334static const uint8_t charMap_8859_5[] = {
335 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
336 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
338 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
339 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
340 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
341 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
342 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
343 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
344 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
345 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
346 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
347 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
348 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
349 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
350 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
351 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
352 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
353 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
354 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
355 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
356 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
357 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
358 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
359 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
360 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
361 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
362 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
363 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
364 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
365 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
366 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
367};
368
369static const uint8_t charMap_8859_6[] = {
370 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
371 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
373 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
374 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
375 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
376 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
377 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
378 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
379 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
380 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
381 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
382 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
383 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
384 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
385 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
386 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
388 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
389 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
390 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
391 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
392 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
393 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
394 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
395 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
396 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
397 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
398 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
399 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
400 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
401 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
402};
403
404static const uint8_t charMap_8859_7[] = {
405 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
406 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
407 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
408 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
409 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
410 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
411 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
412 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
413 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
414 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
415 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
416 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
417 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
418 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
419 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
420 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
421 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
422 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
423 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
424 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
425 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
426 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
427 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
428 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
429 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
430 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
431 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
432 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
433 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
434 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
435 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
436 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
437};
438
439static const uint8_t charMap_8859_8[] = {
440 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
441 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
442 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
443 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
444 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
445 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
446 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
447 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
448 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
449 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
450 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
451 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
452 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
453 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
454 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
455 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
456 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
459 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
460 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
461 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
462 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
463 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
464 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
465 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
466 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
467 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
468 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
469 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
470 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
471 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
472};
473
474static const uint8_t charMap_8859_9[] = {
475 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
476 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
477 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
478 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
479 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
480 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
481 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
482 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
483 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
484 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
485 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
486 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
487 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
488 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
489 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
490 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
491 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
492 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
493 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
494 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
495 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
496 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
497 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
498 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
499 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
500 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
501 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
502 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
503 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
504 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
505 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
506 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
507};
508
509static const int32_t ngrams_windows_1251[] = {
510 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
511 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
512 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
513 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
514};
515
516static const uint8_t charMap_windows_1251[] = {
517 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
518 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
519 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
520 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
521 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
522 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
523 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
524 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
525 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
526 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
527 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
528 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
529 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
530 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
531 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
532 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
533 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
534 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
535 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
536 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
537 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
538 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
539 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
540 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
541 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
542 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
543 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
544 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
545 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
546 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
547 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
548 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
549};
550
551static const int32_t ngrams_windows_1256[] = {
552 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
553 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
554 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
555 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
556};
557
558static const uint8_t charMap_windows_1256[] = {
559 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
560 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
561 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
562 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
563 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
564 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
565 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
566 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
567 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
568 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
569 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
570 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
571 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
572 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
573 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
574 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
575 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
576 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
577 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
578 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
579 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
580 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
581 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
582 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
583 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
584 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
585 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
586 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
587 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
588 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
589 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
590 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
591};
592
593static const int32_t ngrams_KOI8_R[] = {
594 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
595 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
596 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
597 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
598};
599
600static const uint8_t charMap_KOI8_R[] = {
601 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
602 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
603 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
604 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
605 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
606 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
607 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
608 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
609 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
610 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
611 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
612 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
613 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
614 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
615 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
616 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
617 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
618 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
619 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
620 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
621 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
622 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
623 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
624 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
625 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
626 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
627 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
628 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
629 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
630 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
631 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
632 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
633};
634
b331163b 635#if !UCONFIG_ONLY_HTML_CONVERSION
729e4ab9
A
636static const int32_t ngrams_IBM424_he_rtl[] = {
637 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
638 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
639 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
640 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
641};
642
643static const int32_t ngrams_IBM424_he_ltr[] = {
644 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
645 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
646 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
647 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
648};
649
650static const uint8_t charMap_IBM424_he[] = {
651/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
652/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
653/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
654/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
655/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
656/* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
657/* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
658/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
659/* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
660/* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
661/* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
662/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
663/* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
664/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
665/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
666/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
667/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
668};
669
670static const int32_t ngrams_IBM420_ar_rtl[] = {
671 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
672 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
673 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
674 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
675};
676
677static const int32_t ngrams_IBM420_ar_ltr[] = {
678 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
679 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
680 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
681 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
682};
683
// Byte map handed to CharsetRecog_IBM420_ar::match_sbcs() as its byteMap
// argument by both IBM420 recognizers. It has 256 entries, written as one row
// per high nibble and one column per low nibble (presumably indexed by the raw
// input byte -- confirm in NGramParser_IBM420).
// Many positions collapse to 0x40, and rows C-/D-/E- repeat values from rows
// 8-/9-/A-.
// NOTE(review): this looks like EBCDIC space plus case folding -- confirm
// against the IBM420 code page.
684static const uint8_t charMap_IBM420_ar[]= {
685/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
686/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
687/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
688/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
689/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
690/* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
691/* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
692/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
693/* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
694/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
695/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
696/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
697/* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
698/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
699/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
700/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
701/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
702};
b331163b 703#endif
729e4ab9 704
51004dcb
A
705//ISO-8859-1,2,5,6,7,8,9 Ngrams
706
// Pairs one n-gram table with a language code. The multi-language recognizers
// (ISO-8859-1 and ISO-8859-2) score the input against each entry's table and
// pass the matching lang string to CharsetMatch::set().
707struct NGramsPlusLang {
// Fixed at 64 entries because NGramParser::search() assumes exactly that size.
708 const int32_t ngrams[64];
// Language code string reported alongside a match made with this table.
709 const char * lang;
710};
711
// Per-language n-gram tables tried by CharsetRecog_8859_1::match, which keeps
// the entry giving the highest confidence. Entries, in order: en, da, de, es,
// fr, it, nl, no, pt, sv. Each table is searched by NGramParser::search(), so
// it must hold exactly 64 values in ascending order.
712static const NGramsPlusLang ngrams_8859_1[] = {
713 {
714 {
73c04bcf
A
715 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
716 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
717 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
718 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
51004dcb
A
719 },
720 "en"
721 },
722 {
723 {
73c04bcf
A
724 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
725 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
726 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
727 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
51004dcb
A
728 },
729 "da"
730 },
731 {
732 {
73c04bcf
A
733 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
734 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
735 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
736 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
51004dcb
A
737 },
738 "de"
739 },
740 {
741 {
73c04bcf
A
742 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
743 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
744 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
745 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
51004dcb
A
746 },
747 "es"
748 },
749 {
750 {
73c04bcf
A
751 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
752 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
753 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
754 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
51004dcb
A
755 },
756 "fr"
757 },
758 {
759 {
73c04bcf
A
760 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
761 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
762 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
763 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
51004dcb
A
764 },
765 "it"
766 },
767 {
768 {
73c04bcf
A
769 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
770 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
771 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
772 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
51004dcb
A
773 },
774 "nl"
775 },
776 {
777 {
73c04bcf
A
778 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
779 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
780 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
781 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
51004dcb
A
782 },
783 "no"
784 },
785 {
786 {
73c04bcf
A
787 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
788 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
789 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
790 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
51004dcb
A
791 },
792 "pt"
793 },
794 {
795 {
73c04bcf
A
796 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
797 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
798 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
799 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
51004dcb
A
800 },
801 "sv"
802 }
73c04bcf
A
803};
804
51004dcb
A
805
// Per-language n-gram tables tried by CharsetRecog_8859_2::match, which keeps
// the entry giving the highest confidence. Entries, in order: cs, hu, pl, ro.
// Each table is searched by NGramParser::search(), so it must hold exactly 64
// values in ascending order.
806static const NGramsPlusLang ngrams_8859_2[] = {
807 {
808 {
73c04bcf
A
809 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
810 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
811 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
812 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
51004dcb
A
813 },
814 "cs"
815 },
816 {
817 {
73c04bcf
A
818 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
819 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
820 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
821 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
51004dcb
A
822 },
823 "hu"
824 },
825 {
826 {
73c04bcf
A
827 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
828 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
829 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
830 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
51004dcb
A
831 },
832 "pl"
833 },
834 {
835 {
73c04bcf
A
836 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
837 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
838 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
839 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
51004dcb
A
840 },
841 "ro"
842 }
73c04bcf
A
843};
844
// N-gram table used by CharsetRecog_8859_5_ru::match together with
// charMap_8859_5. NGramParser::search() binary-searches it, so it must hold
// exactly 64 values in ascending order.
845static const int32_t ngrams_8859_5_ru[] = {
846 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
847 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
848 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
849 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
850};
851
// N-gram table used by CharsetRecog_8859_6_ar::match together with
// charMap_8859_6. NGramParser::search() binary-searches it, so it must hold
// exactly 64 values in ascending order.
852static const int32_t ngrams_8859_6_ar[] = {
853 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
854 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
855 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
856 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
857};
858
// N-gram table used by CharsetRecog_8859_7_el::match together with
// charMap_8859_7. NGramParser::search() binary-searches it, so it must hold
// exactly 64 values in ascending order.
859static const int32_t ngrams_8859_7_el[] = {
860 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
861 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
862 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
863 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
864};
865
// N-gram table used by CharsetRecog_8859_8_I_he::match together with
// charMap_8859_8. NGramParser::search() binary-searches it, so it must hold
// exactly 64 values in ascending order.
866static const int32_t ngrams_8859_8_I_he[] = {
867 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
868 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
869 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
870 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
871};
872
// N-gram table used by CharsetRecog_8859_8_he::match together with
// charMap_8859_8. NGramParser::search() binary-searches it, so it must hold
// exactly 64 values in ascending order.
873static const int32_t ngrams_8859_8_he[] = {
874 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
875 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
876 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
877 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
878};
879
// N-gram table used by CharsetRecog_8859_9_tr::match together with
// charMap_8859_9. NGramParser::search() binary-searches it, so it must hold
// exactly 64 values in ascending order.
880static const int32_t ngrams_8859_9_tr[] = {
881 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
882 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
883 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
884 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
885};
886
887CharsetRecog_8859_1::~CharsetRecog_8859_1()
888{
889 // nothing to do
890}
891
51004dcb
A
892UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
893 const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
894 uint32_t i;
895 int32_t bestConfidenceSoFar = -1;
2ca993e8 896 for (i=0; i < UPRV_LENGTHOF(ngrams_8859_1) ; i++) {
51004dcb
A
897 const int32_t *ngrams = ngrams_8859_1[i].ngrams;
898 const char *lang = ngrams_8859_1[i].lang;
899 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
900 if (confidence > bestConfidenceSoFar) {
901 results->set(textIn, this, confidence, name, lang);
902 bestConfidenceSoFar = confidence;
903 }
904 }
1a147d09
A
905 if (bestConfidenceSoFar < 10 && textIn->fOnlyTypicalASCII) { // rdar://56373519
906 bestConfidenceSoFar = 15;
907 results->set(textIn, this, bestConfidenceSoFar, name);
908 }
51004dcb 909 return (bestConfidenceSoFar > 0);
73c04bcf
A
910}
911
51004dcb 912const char *CharsetRecog_8859_1::getName() const
73c04bcf 913{
51004dcb 914 return "ISO-8859-1";
73c04bcf
A
915}
916
73c04bcf
A
917
918CharsetRecog_8859_2::~CharsetRecog_8859_2()
919{
920 // nothing to do
921}
922
51004dcb
A
923UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
924 const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
925 uint32_t i;
926 int32_t bestConfidenceSoFar = -1;
2ca993e8 927 for (i=0; i < UPRV_LENGTHOF(ngrams_8859_2) ; i++) {
51004dcb
A
928 const int32_t *ngrams = ngrams_8859_2[i].ngrams;
929 const char *lang = ngrams_8859_2[i].lang;
930 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
931 if (confidence > bestConfidenceSoFar) {
932 results->set(textIn, this, confidence, name, lang);
933 bestConfidenceSoFar = confidence;
934 }
935 }
936 return (bestConfidenceSoFar > 0);
73c04bcf
A
937}
938
51004dcb 939const char *CharsetRecog_8859_2::getName() const
73c04bcf 940{
51004dcb 941 return "ISO-8859-2";
73c04bcf
A
942}
943
73c04bcf
A
944
945CharsetRecog_8859_5::~CharsetRecog_8859_5()
946{
947 // nothing to do
948}
949
950const char *CharsetRecog_8859_5::getName() const
951{
952 return "ISO-8859-5";
953}
954
955CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
956{
957 // nothing to do
958}
959
960const char *CharsetRecog_8859_5_ru::getLanguage() const
961{
962 return "ru";
963}
964
51004dcb 965UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 966{
51004dcb
A
967 int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
968 results->set(textIn, this, confidence);
969 return (confidence > 0);
73c04bcf
A
970}
971
972CharsetRecog_8859_6::~CharsetRecog_8859_6()
973{
974 // nothing to do
975}
976
977const char *CharsetRecog_8859_6::getName() const
978{
979 return "ISO-8859-6";
980}
981
982CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
983{
984 // nothing to do
985}
986
987const char *CharsetRecog_8859_6_ar::getLanguage() const
988{
989 return "ar";
990}
991
51004dcb 992UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 993{
51004dcb
A
994 int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
995 results->set(textIn, this, confidence);
996 return (confidence > 0);
73c04bcf
A
997}
998
999CharsetRecog_8859_7::~CharsetRecog_8859_7()
1000{
1001 // nothing to do
1002}
1003
1004const char *CharsetRecog_8859_7::getName() const
1005{
51004dcb 1006 return "ISO-8859-7";
73c04bcf
A
1007}
1008
1009CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1010{
1011 // nothing to do
1012}
1013
1014const char *CharsetRecog_8859_7_el::getLanguage() const
1015{
1016 return "el";
1017}
1018
51004dcb 1019UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 1020{
51004dcb
A
1021 const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
1022 int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1023 results->set(textIn, this, confidence, name, "el");
1024 return (confidence > 0);
73c04bcf
A
1025}
1026
1027CharsetRecog_8859_8::~CharsetRecog_8859_8()
1028{
1029 // nothing to do
1030}
1031
1032const char *CharsetRecog_8859_8::getName() const
1033{
51004dcb 1034 return "ISO-8859-8";
73c04bcf
A
1035}
1036
1037CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1038{
1039 // nothing to do
1040}
1041
1042const char *CharsetRecog_8859_8_I_he::getName() const
1043{
51004dcb 1044 return "ISO-8859-8-I";
73c04bcf
A
1045}
1046
1047const char *CharsetRecog_8859_8_I_he::getLanguage() const
1048{
1049 return "he";
1050}
1051
51004dcb 1052UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 1053{
51004dcb
A
1054 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
1055 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1056 results->set(textIn, this, confidence, name, "he");
1057 return (confidence > 0);
73c04bcf
A
1058}
1059
1060CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1061{
1062 // od ot gnihton
1063}
1064
1065const char *CharsetRecog_8859_8_he::getLanguage() const
1066{
1067 return "he";
1068}
1069
51004dcb 1070UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 1071{
51004dcb
A
1072 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
1073 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1074 results->set(textIn, this, confidence, name, "he");
1075 return (confidence > 0);
73c04bcf
A
1076}
1077
1078CharsetRecog_8859_9::~CharsetRecog_8859_9()
1079{
1080 // nothing to do
1081}
1082
1083const char *CharsetRecog_8859_9::getName() const
1084{
51004dcb 1085 return "ISO-8859-9";
73c04bcf
A
1086}
1087
1088CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1089{
1090 // nothing to do
1091}
1092
1093const char *CharsetRecog_8859_9_tr::getLanguage() const
1094{
1095 return "tr";
1096}
1097
51004dcb 1098UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 1099{
51004dcb
A
1100 const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
1101 int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1102 results->set(textIn, this, confidence, name, "tr");
1103 return (confidence > 0);
73c04bcf
A
1104}
1105
1106CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1107{
1108 // nothing to do
1109}
1110
1111const char *CharsetRecog_windows_1256::getName() const
1112{
1113 return "windows-1256";
1114}
1115
1116const char *CharsetRecog_windows_1256::getLanguage() const
1117{
1118 return "ar";
1119}
1120
51004dcb 1121UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 1122{
51004dcb
A
1123 int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1124 results->set(textIn, this, confidence);
1125 return (confidence > 0);
73c04bcf
A
1126}
1127
1128CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1129{
1130 // nothing to do
1131}
1132
1133const char *CharsetRecog_windows_1251::getName() const
1134{
1135 return "windows-1251";
1136}
1137
1138const char *CharsetRecog_windows_1251::getLanguage() const
1139{
1140 return "ru";
1141}
1142
51004dcb 1143UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 1144{
51004dcb
A
1145 int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1146 results->set(textIn, this, confidence);
1147 return (confidence > 0);
73c04bcf
A
1148}
1149
1150CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1151{
1152 // nothing to do
1153}
1154
1155const char *CharsetRecog_KOI8_R::getName() const
1156{
1157 return "KOI8-R";
1158}
1159
1160const char *CharsetRecog_KOI8_R::getLanguage() const
1161{
1162 return "ru";
1163}
1164
51004dcb 1165UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
73c04bcf 1166{
51004dcb
A
1167 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1168 results->set(textIn, this, confidence);
1169 return (confidence > 0);
73c04bcf
A
1170}
1171
b331163b 1172#if !UCONFIG_ONLY_HTML_CONVERSION
729e4ab9
A
1173CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1174{
1175 // nothing to do
1176}
1177
1178const char *CharsetRecog_IBM424_he::getLanguage() const
1179{
1180 return "he";
1181}
1182
1183CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1184{
1185 // nothing to do
1186}
1187
1188const char *CharsetRecog_IBM424_he_rtl::getName() const
1189{
1190 return "IBM424_rtl";
1191}
1192
51004dcb 1193UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
729e4ab9 1194{
51004dcb
A
1195 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1196 results->set(textIn, this, confidence);
1197 return (confidence > 0);
729e4ab9
A
1198}
1199
1200CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1201{
1202 // nothing to do
1203}
1204
1205const char *CharsetRecog_IBM424_he_ltr::getName() const
1206{
1207 return "IBM424_ltr";
1208}
1209
51004dcb 1210UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
729e4ab9 1211{
51004dcb
A
1212 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1213 results->set(textIn, this, confidence);
1214 return (confidence > 0);
729e4ab9
A
1215}
1216
729e4ab9
A
1217CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1218{
1219 // nothing to do
1220}
1221
1222const char *CharsetRecog_IBM420_ar::getLanguage() const
1223{
1224 return "ar";
1225}
1226
729e4ab9 1227
57a6839d
A
1228int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
1229{
1230 NGramParser_IBM420 parser(ngrams, byteMap);
1231 int32_t result;
729e4ab9 1232
57a6839d 1233 result = parser.parse(det);
729e4ab9 1234
57a6839d 1235 return result;
729e4ab9
A
1236}
1237
1238CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1239{
1240 // nothing to do
1241}
1242
1243const char *CharsetRecog_IBM420_ar_rtl::getName() const
1244{
1245 return "IBM420_rtl";
1246}
1247
51004dcb 1248UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
729e4ab9 1249{
51004dcb
A
1250 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1251 results->set(textIn, this, confidence);
1252 return (confidence > 0);
729e4ab9
A
1253}
1254
1255CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1256{
1257 // nothing to do
1258}
1259
1260const char *CharsetRecog_IBM420_ar_ltr::getName() const
1261{
1262 return "IBM420_ltr";
1263}
1264
51004dcb 1265UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
729e4ab9 1266{
51004dcb
A
1267 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1268 results->set(textIn, this, confidence);
1269 return (confidence > 0);
729e4ab9 1270}
b331163b 1271#endif
729e4ab9 1272
73c04bcf
A
1273U_NAMESPACE_END
1274#endif
1275