]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrsbcs.cpp
ICU-511.35.tar.gz
[apple/icu.git] / icuSources / i18n / csrsbcs.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "cmemory.h"
11
12 #if !UCONFIG_NO_CONVERSION
13 #include "csrsbcs.h"
14 #include "csmatch.h"
15
// N_GRAM_SIZE: number of bytes per n-gram (3, i.e. trigrams). It is not
//              referenced in the part of the file visible to this review.
// N_GRAM_MASK: keeps the low 24 bits; NGramParser::addByte() applies it so the
//              rolling n-gram holds only the three most recent bytes.
// ARRAY_SIZE:  element count of a true array (do not pass a pointer).
16 #define N_GRAM_SIZE 3
17 #define N_GRAM_MASK 0xFFFFFF
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19
20 U_NAMESPACE_BEGIN
21
22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
23 :byteIndex(0), ngram(0)
24 {
25 ngramList = theNgramList;
26 charMap = theCharMap;
27
28 ngramCount = hitCount = 0;
29 }
30
31 /*
32 * Binary search for value in table, which must have exactly 64 entries.
33 */
34
35 int32_t NGramParser::search(const int32_t *table, int32_t value)
36 {
37 int32_t index = 0;
38
39 if (table[index + 32] <= value) {
40 index += 32;
41 }
42
43 if (table[index + 16] <= value) {
44 index += 16;
45 }
46
47 if (table[index + 8] <= value) {
48 index += 8;
49 }
50
51 if (table[index + 4] <= value) {
52 index += 4;
53 }
54
55 if (table[index + 2] <= value) {
56 index += 2;
57 }
58
59 if (table[index + 1] <= value) {
60 index += 1;
61 }
62
63 if (table[index] > value) {
64 index -= 1;
65 }
66
67 if (index < 0 || table[index] != value) {
68 return -1;
69 }
70
71 return index;
72 }
73
74 void NGramParser::lookup(int32_t thisNgram)
75 {
76 ngramCount += 1;
77
78 if (search(ngramList, thisNgram) >= 0) {
79 hitCount += 1;
80 }
81
82 }
83
84 void NGramParser::addByte(int32_t b)
85 {
86 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
87 lookup(ngram);
88 }
89
90 int32_t NGramParser::nextByte(InputText *det)
91 {
92 if (byteIndex >= det->fInputLen) {
93 return -1;
94 }
95
96 return det->fInputBytes[byteIndex++];
97 }
98
99 int32_t NGramParser::parse(InputText *det)
100 {
101 int32_t b;
102 bool ignoreSpace = FALSE;
103
104 while ((b = nextByte(det)) >= 0) {
105 uint8_t mb = charMap[b];
106
107 // TODO: 0x20 might not be a space in all character sets...
108 if (mb != 0) {
109 if (!(mb == 0x20 && ignoreSpace)) {
110 addByte(mb);
111 }
112
113 ignoreSpace = (mb == 0x20);
114 }
115 }
116
117 // TODO: Is this OK? The buffer could have ended in the middle of a word...
118 addByte(0x20);
119
120 double rawPercent = (double) hitCount / (double) ngramCount;
121
122 // if (rawPercent <= 2.0) {
123 // return 0;
124 // }
125
126 // TODO - This is a bit of a hack to take care of a case
127 // were we were getting a confidence of 135...
128 if (rawPercent > 0.33) {
129 return 98;
130 }
131
132 return (int32_t) (rawPercent * 300.0);
133 }
134
// Default constructor. The body is intentionally empty: this definition sets
// up no state of its own.
135 CharsetRecog_sbcs::CharsetRecog_sbcs()
136 {
137 // nothing else to do
138 }
139
// Destructor. The body is intentionally empty: this definition releases
// nothing itself.
140 CharsetRecog_sbcs::~CharsetRecog_sbcs()
141 {
142 // nothing to do
143 }
144
145 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
146 {
147 NGramParser parser(ngrams, byteMap);
148 int32_t result;
149
150 result = parser.parse(det);
151
152 return result;
153 }
154
// Byte translation table for ISO-8859-1 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; the rest of non-letter ASCII maps to 0x20.
//  - 0x41-0x5A and 0x61-0x7A both map to 0x61-0x7A.
//  - 0xC0-0xDE map onto 0xE0-0xFE (0xD7 and 0xF7 map to 0x20); 0xAA, 0xB5,
//    0xBA, 0xDF and 0xFF map to themselves.
155 static const uint8_t charMap_8859_1[] = {
156 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
157 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
158 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
159 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
160 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
161 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
162 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
163 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
164 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
165 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
166 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
167 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
168 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
169 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
170 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
171 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
172 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
173 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
174 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
175 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
176 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
177 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
178 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
179 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
180 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
181 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
182 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
183 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
184 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
185 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
186 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
187 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
188 };
189
// Byte translation table for ISO-8859-2 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - In 0xA0-0xBF selected bytes are folded in pairs, e.g. 0xA1 and 0xB1 both
//    map to 0xB1.
//  - 0xC0-0xDE map onto 0xE0-0xFE (0xD7 and 0xF7 map to 0x20); 0xDF maps to
//    itself and 0xFF maps to 0x20.
190 static const uint8_t charMap_8859_2[] = {
191 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
192 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
193 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
194 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
195 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
196 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
197 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
198 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
199 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
200 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
201 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
202 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
203 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
204 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
205 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
206 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
207 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
208 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
209 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
210 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
211 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
212 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
213 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
214 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
215 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
216 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
217 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
218 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
219 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
220 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
221 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
222 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
223 };
224
// Byte translation table for ISO-8859-5 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - 0xA1-0xAF map onto 0xF1-0xFF and 0xB0-0xCF onto 0xD0-0xEF; 0xD0-0xEF and
//    0xF1-0xFF map to themselves. 0xA0, 0xAD, 0xF0 and 0xFD map to 0x20.
225 static const uint8_t charMap_8859_5[] = {
226 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
227 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
228 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
229 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
230 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
231 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
232 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
233 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
234 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
235 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
236 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
237 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
238 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
239 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
240 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
241 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
242 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
243 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
244 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
245 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
246 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
247 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
248 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
249 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
250 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
251 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
252 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
253 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
254 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
255 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
256 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
257 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
258 };
259
// Byte translation table for ISO-8859-6 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - In the high half only 0xC1-0xDA and 0xE0-0xEA map to themselves; every
//    other byte from 0x80 up maps to 0x20.
260 static const uint8_t charMap_8859_6[] = {
261 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
262 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
263 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
264 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
265 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
266 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
268 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
269 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
270 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
271 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
272 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
273 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
274 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
275 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
276 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
277 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
278 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
279 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
280 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
281 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
282 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
283 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
284 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
285 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
286 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
287 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
288 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
289 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
290 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
291 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
292 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
293 };
294
// Byte translation table for ISO-8859-7 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - 0xA1 and 0xA2 map to themselves.
//  - 0xB6, 0xB8-0xBA, 0xBC, 0xBE and 0xBF map onto 0xDC-0xDF and 0xFC-0xFE.
//  - 0xC1-0xDB map onto 0xE1-0xFB (0xD2 maps to 0x20); 0xC0 and 0xDC-0xFE map
//    to themselves; 0xFF maps to 0x20.
295 static const uint8_t charMap_8859_7[] = {
296 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
297 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
298 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
299 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
300 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
301 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
303 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
304 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
305 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
306 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
307 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
308 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
309 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
310 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
311 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
312 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
313 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
314 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
315 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
316 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
317 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
318 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
319 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
320 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
321 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
322 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
323 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
324 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
325 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
326 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
327 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
328 };
329
// Byte translation table for ISO-8859-8 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - In the high half only 0xB5 and 0xE0-0xFA map to themselves; every other
//    byte from 0x80 up maps to 0x20.
330 static const uint8_t charMap_8859_8[] = {
331 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
332 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
333 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
334 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
335 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
336 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
338 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
339 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
340 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
341 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
342 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
343 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
344 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
345 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
346 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
347 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
348 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
349 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
350 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
351 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
352 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
353 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
354 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
355 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
356 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
357 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
358 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
359 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
360 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
361 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
362 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
363 };
364
// Byte translation table for ISO-8859-9 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - 0xC0-0xDE map onto 0xE0-0xFE, except that 0xDD maps to 0x69 (ASCII 'i')
//    and 0xD7/0xF7 map to 0x20; 0xAA, 0xB5, 0xBA, 0xDF and 0xFF map to
//    themselves.
365 static const uint8_t charMap_8859_9[] = {
366 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
367 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
368 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
369 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
370 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
371 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
373 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
374 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
375 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
376 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
377 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
378 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
379 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
380 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
381 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
382 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
383 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
384 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
385 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
386 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
388 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
389 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
390 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
391 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
392 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
393 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
394 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
395 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
396 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
397 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
398 };
399
// Trigram table for windows-1251 (per its name): 4 rows of 16 = 64 values,
// each packing three mapped bytes into the low 24 bits (0x20 is the space).
// Presumably passed to NGramParser as its ngram list; NGramParser::search()
// binary-searches exactly 64 entries, so the values must stay in ascending
// order and the count must stay at 64.
400 static const int32_t ngrams_windows_1251[] = {
401 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
402 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
403 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
404 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
405 };
406
// Byte translation table for windows-1251 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - In 0x80-0xBF selected bytes are folded in pairs (for example 0x80 and
//    0x90 both map to 0x90); the rest map to 0x20.
//  - 0xC0-0xDF map onto 0xE0-0xFF, and 0xE0-0xFF map to themselves.
407 static const uint8_t charMap_windows_1251[] = {
408 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
409 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
410 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
411 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
412 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
413 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
414 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
415 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
416 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
417 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
418 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
419 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
420 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
421 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
422 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
423 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
424 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
425 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
426 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
427 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
428 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
429 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
430 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
431 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
432 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
433 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
434 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
435 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
436 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
437 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
438 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
439 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
440 };
441
// Trigram table for windows-1256 (per its name): 4 rows of 16 = 64 values,
// each packing three mapped bytes into the low 24 bits (0x20 is the space).
// Presumably passed to NGramParser as its ngram list; NGramParser::search()
// binary-searches exactly 64 entries, so the values must stay in ascending
// order and the count must stay at 64.
442 static const int32_t ngrams_windows_1256[] = {
443 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
444 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
445 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
446 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
447 };
448
// Byte translation table for windows-1256 (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - 0xC0-0xEF map to themselves, except 0xD7 which maps to 0x20.
//  - Elsewhere in the high half only selected bytes are kept (mostly as
//    themselves; 0x8C maps to 0x9C); the rest map to 0x20.
449 static const uint8_t charMap_windows_1256[] = {
450 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
451 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
452 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
453 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
454 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
455 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
456 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
459 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
460 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
461 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
462 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
463 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
464 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
465 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
466 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
467 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
468 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
469 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
470 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
471 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
472 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
473 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
474 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
475 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
476 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
477 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
478 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
479 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
480 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
481 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
482 };
483
// Trigram table for KOI8-R (per its name): 4 rows of 16 = 64 values, each
// packing three mapped bytes into the low 24 bits (0x20 is the space).
// Presumably passed to NGramParser as its ngram list; NGramParser::search()
// binary-searches exactly 64 entries, so the values must stay in ascending
// order and the count must stay at 64.
484 static const int32_t ngrams_KOI8_R[] = {
485 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
486 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
487 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
488 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
489 };
490
// Byte translation table for KOI8-R (per its name), 8 entries per row,
// indexed by the raw input byte. Presumably supplied as the charMap of an
// NGramParser, where 0x00 means "skip this byte" and 0x20 is the separator.
//  - 0x27 maps to 0x00; ASCII letters of either case map to 0x61-0x7A.
//  - 0xA3 and 0xB3 both map to 0xA3; the rest of 0x80-0xBF maps to 0x20.
//  - 0xC0-0xDF map to themselves and 0xE0-0xFF map onto 0xC0-0xDF.
491 static const uint8_t charMap_KOI8_R[] = {
492 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
493 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
494 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
495 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
496 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
497 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
498 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
499 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
500 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
501 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
502 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
503 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
504 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
505 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
506 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
507 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
508 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
509 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
510 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
511 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
512 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
513 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
514 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
515 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
516 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
517 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
518 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
519 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
520 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
521 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
522 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
523 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
524 };
525
// Trigram table named for IBM424 Hebrew, right-to-left variant: 4 rows of
// 16 = 64 values, each packing three mapped bytes into the low 24 bits. The
// separator byte in these values is 0x40, matching the filler value used by
// charMap_IBM424_he. NGramParser::search() binary-searches exactly 64
// entries, so the values must stay in ascending order and the count at 64.
526 static const int32_t ngrams_IBM424_he_rtl[] = {
527 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
528 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
529 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
530 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
531 };
532
// Trigram table named for IBM424 Hebrew, left-to-right variant: 4 rows of
// 16 = 64 values, each packing three mapped bytes into the low 24 bits. The
// separator byte in these values is 0x40, matching the filler value used by
// charMap_IBM424_he. NGramParser::search() binary-searches exactly 64
// entries, so the values must stay in ascending order and the count at 64.
533 static const int32_t ngrams_IBM424_he_ltr[] = {
534 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
535 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
536 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
537 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
538 };
539
// Byte translation table named for IBM424 Hebrew, 16 entries per row (row
// label = high nibble, column = low nibble), indexed by the raw input byte.
//  - The filler/separator value here is 0x40 rather than 0x20.
//  - 0x7D maps to 0x00, which NGramParser::parse() treats as "skip".
//  - 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 map onto 0x81-0x89, 0x91-0x99 and
//    0xA2-0xA9, which themselves map to identity.
// NOTE(review): NGramParser::parse() in this file only collapses runs of
// 0x20 and appends 0x20 at the end, so runs of 0x40 produced by this table
// are not collapsed by that code - confirm this is intended.
540 static const uint8_t charMap_IBM424_he[] = {
541 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
542 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
543 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
544 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
545 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
546 /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
547 /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
548 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
549 /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
550 /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
551 /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
552 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
553 /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
554 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
555 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
556 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
557 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
558 };
559
// Trigram table named for IBM420 Arabic, right-to-left variant: 4 rows of
// 16 = 64 values, each packing three mapped bytes into the low 24 bits. The
// separator byte in these values is 0x40, matching the filler value used by
// charMap_IBM420_ar. NGramParser::search() binary-searches exactly 64
// entries, so the values must stay in ascending order and the count at 64.
560 static const int32_t ngrams_IBM420_ar_rtl[] = {
561 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
562 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
563 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
564 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
565 };
566
// Trigram table named for IBM420 Arabic, left-to-right variant: 4 rows of
// 16 = 64 values, each packing three mapped bytes into the low 24 bits. The
// separator byte in these values is 0x40, matching the filler value used by
// charMap_IBM420_ar. NGramParser::search() binary-searches exactly 64
// entries, so the values must stay in ascending order and the count at 64.
567 static const int32_t ngrams_IBM420_ar_ltr[] = {
568 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
569 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
570 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
571 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
572 };
573
// Byte translation table named for IBM420 Arabic, 16 entries per row (row
// label = high nibble, column = low nibble), indexed by the raw input byte.
//  - The filler/separator value here is 0x40 rather than 0x20.
//  - No entry is 0x00, so NGramParser::parse() skips no byte with this table.
//  - 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 map onto 0x81-0x89, 0x91-0x99 and
//    0xA2-0xA9, which themselves map to identity.
// NOTE(review): NGramParser::parse() in this file only collapses runs of
// 0x20 and appends 0x20 at the end, so runs of 0x40 produced by this table
// are not collapsed by that code - confirm this is intended.
574 static const uint8_t charMap_IBM420_ar[]= {
575 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
576 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
577 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
578 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
579 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
580 /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
581 /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
582 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
583 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
584 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
585 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
586 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
587 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
588 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
589 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
590 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
591 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
592 };
593
594 //ISO-8859-1,2,5,6,7,8,9 Ngrams
595
// One language model for a charset shared by several languages.
//   ngrams - trigram table; fixed at 64 entries, the size that
//            NGramParser::search() requires.
//   lang   - language tag for the table (the entries that follow use
//            two-letter codes such as "en" and "da").
596 struct NGramsPlusLang {
597 const int32_t ngrams[64];
598 const char * lang;
599 };
600
601 static const NGramsPlusLang ngrams_8859_1[] = {
602 {
603 {
604 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
605 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
606 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
607 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
608 },
609 "en"
610 },
611 {
612 {
613 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
614 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
615 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
616 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
617 },
618 "da"
619 },
620 {
621 {
622 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
623 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
624 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
625 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
626 },
627 "de"
628 },
629 {
630 {
631 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
632 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
633 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
634 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
635 },
636 "es"
637 },
638 {
639 {
640 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
641 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
642 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
643 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
644 },
645 "fr"
646 },
647 {
648 {
649 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
650 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
651 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
652 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
653 },
654 "it"
655 },
656 {
657 {
658 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
659 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
660 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
661 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
662 },
663 "nl"
664 },
665 {
666 {
667 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
668 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
669 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
670 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
671 },
672 "no"
673 },
674 {
675 {
676 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
677 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
678 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
679 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
680 },
681 "pt"
682 },
683 {
684 {
685 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
686 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
687 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
688 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
689 },
690 "sv"
691 }
692 };
693
694
695 static const NGramsPlusLang ngrams_8859_2[] = {
696 {
697 {
698 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
699 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
700 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
701 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
702 },
703 "cs"
704 },
705 {
706 {
707 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
708 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
709 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
710 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
711 },
712 "hu"
713 },
714 {
715 {
716 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
717 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
718 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
719 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
720 },
721 "pl"
722 },
723 {
724 {
725 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
726 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
727 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
728 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
729 },
730 "ro"
731 }
732 };
733
// Trigram table handed to match_sbcs() by CharsetRecog_8859_5_ru::match().
// 64 values in ascending order, each packing three bytes into one int32_t.
static const int32_t ngrams_8859_5_ru[] = {
    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0,
    0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2,
    0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF,
    0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2,
    0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
};
740
// Trigram table handed to match_sbcs() by CharsetRecog_8859_6_ar::match().
// 64 values in ascending order, each packing three bytes into one int32_t.
static const int32_t ngrams_8859_6_ar[] = {
    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7,
    0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5,
    0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920,
    0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4,
    0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
};
747
// Trigram table handed to match_sbcs() by CharsetRecog_8859_7_el::match().
// 64 values in ascending order, each packing three bytes into one int32_t.
static const int32_t ngrams_8859_7_el[] = {
    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5,
    0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220,
    0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0,
    0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9,
    0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
};
754
// Trigram table handed to match_sbcs() by CharsetRecog_8859_8_I_he::match().
// 64 values in ascending order, each packing three bytes into one int32_t.
static const int32_t ngrams_8859_8_I_he[] = {
    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5,
    0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE,
    0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0,
    0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4,
    0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
};
761
// Trigram table handed to match_sbcs() by CharsetRecog_8859_8_he::match().
// 64 values in ascending order, each packing three bytes into one int32_t.
static const int32_t ngrams_8859_8_he[] = {
    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2,
    0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4,
    0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020,
    0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420,
    0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
};
768
// Trigram table handed to match_sbcs() by CharsetRecog_8859_9_tr::match().
// 64 values in ascending order, each packing three bytes into one int32_t.
static const int32_t ngrams_8859_9_tr[] = {
    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C,
    0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261,
    0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20,
    0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E,
    0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
};
775
776 CharsetRecog_8859_1::~CharsetRecog_8859_1()
777 {
778 // nothing to do
779 }
780
781 UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
782 const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
783 uint32_t i;
784 int32_t bestConfidenceSoFar = -1;
785 for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
786 const int32_t *ngrams = ngrams_8859_1[i].ngrams;
787 const char *lang = ngrams_8859_1[i].lang;
788 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
789 if (confidence > bestConfidenceSoFar) {
790 results->set(textIn, this, confidence, name, lang);
791 bestConfidenceSoFar = confidence;
792 }
793 }
794 return (bestConfidenceSoFar > 0);
795 }
796
797 const char *CharsetRecog_8859_1::getName() const
798 {
799 return "ISO-8859-1";
800 }
801
802
803 CharsetRecog_8859_2::~CharsetRecog_8859_2()
804 {
805 // nothing to do
806 }
807
808 UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
809 const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
810 uint32_t i;
811 int32_t bestConfidenceSoFar = -1;
812 for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
813 const int32_t *ngrams = ngrams_8859_2[i].ngrams;
814 const char *lang = ngrams_8859_2[i].lang;
815 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
816 if (confidence > bestConfidenceSoFar) {
817 results->set(textIn, this, confidence, name, lang);
818 bestConfidenceSoFar = confidence;
819 }
820 }
821 return (bestConfidenceSoFar > 0);
822 }
823
824 const char *CharsetRecog_8859_2::getName() const
825 {
826 return "ISO-8859-2";
827 }
828
829
830 CharsetRecog_8859_5::~CharsetRecog_8859_5()
831 {
832 // nothing to do
833 }
834
835 const char *CharsetRecog_8859_5::getName() const
836 {
837 return "ISO-8859-5";
838 }
839
840 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
841 {
842 // nothing to do
843 }
844
845 const char *CharsetRecog_8859_5_ru::getLanguage() const
846 {
847 return "ru";
848 }
849
850 UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
851 {
852 int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
853 results->set(textIn, this, confidence);
854 return (confidence > 0);
855 }
856
857 CharsetRecog_8859_6::~CharsetRecog_8859_6()
858 {
859 // nothing to do
860 }
861
862 const char *CharsetRecog_8859_6::getName() const
863 {
864 return "ISO-8859-6";
865 }
866
867 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
868 {
869 // nothing to do
870 }
871
872 const char *CharsetRecog_8859_6_ar::getLanguage() const
873 {
874 return "ar";
875 }
876
877 UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
878 {
879 int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
880 results->set(textIn, this, confidence);
881 return (confidence > 0);
882 }
883
884 CharsetRecog_8859_7::~CharsetRecog_8859_7()
885 {
886 // nothing to do
887 }
888
889 const char *CharsetRecog_8859_7::getName() const
890 {
891 return "ISO-8859-7";
892 }
893
894 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
895 {
896 // nothing to do
897 }
898
899 const char *CharsetRecog_8859_7_el::getLanguage() const
900 {
901 return "el";
902 }
903
904 UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
905 {
906 const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
907 int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
908 results->set(textIn, this, confidence, name, "el");
909 return (confidence > 0);
910 }
911
912 CharsetRecog_8859_8::~CharsetRecog_8859_8()
913 {
914 // nothing to do
915 }
916
917 const char *CharsetRecog_8859_8::getName() const
918 {
919 return "ISO-8859-8";
920 }
921
922 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
923 {
924 // nothing to do
925 }
926
927 const char *CharsetRecog_8859_8_I_he::getName() const
928 {
929 return "ISO-8859-8-I";
930 }
931
932 const char *CharsetRecog_8859_8_I_he::getLanguage() const
933 {
934 return "he";
935 }
936
937 UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
938 {
939 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
940 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
941 results->set(textIn, this, confidence, name, "he");
942 return (confidence > 0);
943 }
944
945 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
946 {
947 // od ot gnihton
948 }
949
950 const char *CharsetRecog_8859_8_he::getLanguage() const
951 {
952 return "he";
953 }
954
955 UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
956 {
957 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
958 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
959 results->set(textIn, this, confidence, name, "he");
960 return (confidence > 0);
961 }
962
963 CharsetRecog_8859_9::~CharsetRecog_8859_9()
964 {
965 // nothing to do
966 }
967
968 const char *CharsetRecog_8859_9::getName() const
969 {
970 return "ISO-8859-9";
971 }
972
973 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
974 {
975 // nothing to do
976 }
977
978 const char *CharsetRecog_8859_9_tr::getLanguage() const
979 {
980 return "tr";
981 }
982
983 UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
984 {
985 const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
986 int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
987 results->set(textIn, this, confidence, name, "tr");
988 return (confidence > 0);
989 }
990
991 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
992 {
993 // nothing to do
994 }
995
996 const char *CharsetRecog_windows_1256::getName() const
997 {
998 return "windows-1256";
999 }
1000
1001 const char *CharsetRecog_windows_1256::getLanguage() const
1002 {
1003 return "ar";
1004 }
1005
1006 UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1007 {
1008 int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1009 results->set(textIn, this, confidence);
1010 return (confidence > 0);
1011 }
1012
1013 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1014 {
1015 // nothing to do
1016 }
1017
1018 const char *CharsetRecog_windows_1251::getName() const
1019 {
1020 return "windows-1251";
1021 }
1022
1023 const char *CharsetRecog_windows_1251::getLanguage() const
1024 {
1025 return "ru";
1026 }
1027
1028 UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1029 {
1030 int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1031 results->set(textIn, this, confidence);
1032 return (confidence > 0);
1033 }
1034
1035 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1036 {
1037 // nothing to do
1038 }
1039
1040 const char *CharsetRecog_KOI8_R::getName() const
1041 {
1042 return "KOI8-R";
1043 }
1044
1045 const char *CharsetRecog_KOI8_R::getLanguage() const
1046 {
1047 return "ru";
1048 }
1049
1050 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1051 {
1052 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1053 results->set(textIn, this, confidence);
1054 return (confidence > 0);
1055 }
1056
1057 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1058 {
1059 // nothing to do
1060 }
1061
1062 const char *CharsetRecog_IBM424_he::getLanguage() const
1063 {
1064 return "he";
1065 }
1066
1067 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1068 {
1069 // nothing to do
1070 }
1071
1072 const char *CharsetRecog_IBM424_he_rtl::getName() const
1073 {
1074 return "IBM424_rtl";
1075 }
1076
1077 UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1078 {
1079 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1080 results->set(textIn, this, confidence);
1081 return (confidence > 0);
1082 }
1083
1084 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1085 {
1086 // nothing to do
1087 }
1088
1089 const char *CharsetRecog_IBM424_he_ltr::getName() const
1090 {
1091 return "IBM424_ltr";
1092 }
1093
1094 UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1095 {
1096 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1097 results->set(textIn, this, confidence);
1098 return (confidence > 0);
1099 }
1100
// Byte-to-byte lookup applied by CharsetRecog_IBM420_ar::unshape(): the input
// byte is the index and the entry is its replacement. Rows are the high
// nibble, columns the low nibble. Every byte below 0x40 is replaced by 0x40;
// from 0x40 upward most bytes map to themselves and a subset is folded onto
// another byte value.
static const uint8_t unshapeMap_IBM420[] = {
/*            x0    x1    x2    x3    x4    x5    x6    x7    x8    x9    xA    xB    xC    xD    xE    xF */
/* 0x */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1x */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2x */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3x */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4x */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
/* 5x */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
/* 6x */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7x */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* 8x */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
/* 9x */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
/* Ax */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
/* Bx */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
/* Cx */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
/* Dx */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
/* Ex */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* Fx */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
1120
1121 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1122 {
1123 // nothing to do
1124 }
1125
1126 const char *CharsetRecog_IBM420_ar::getLanguage() const
1127 {
1128 return "ar";
1129 }
1130
1131 void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
1132 prev_fInputBytesLength = textIn->fInputLen;
1133 prev_fInputBytes = textIn->fInputBytes;
1134
1135 int32_t length = 0;
1136 uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
1137
1138 if (bb != NULL) {
1139 textIn->fInputBytes = bb;
1140 textIn->fInputLen = length;
1141
1142 deleteBuffer = TRUE;
1143 } else {
1144 deleteBuffer = FALSE;
1145 }
1146 }
1147
1148 uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
1149 uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);
1150
1151 if (resultArray != NULL) {
1152 for (int32_t i = 0; i < inputBytesLength; i++) {
1153 resultArray[i] = unshapeMap_IBM420[resultArray[i]];
1154 }
1155 }
1156
1157 return resultArray;
1158 }
1159
1160 uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
1161 int32_t bigBufferLength = inputBytesLength * 2;
1162 uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength);
1163 uint8_t *resultBuffer = NULL;
1164
1165 if (bigBuffer != NULL) {
1166 int32_t bufferIndex;
1167 static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
1168
1169 for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) {
1170 if (isLamAlef(inputBytes[i])) {
1171 bigBuffer[bufferIndex++] = unshapedLamAlef[0];
1172 bigBuffer[bufferIndex++] = unshapedLamAlef[1];
1173 } else {
1174 bigBuffer[bufferIndex++] = inputBytes[i];
1175 }
1176 }
1177
1178 length = bufferIndex;
1179 resultBuffer = (uint8_t *)uprv_malloc(length);
1180 if (resultBuffer != NULL) {
1181 uprv_memcpy(resultBuffer, bigBuffer, length);
1182 }
1183 }
1184
1185 if (bigBuffer != NULL) {
1186 uprv_free(bigBuffer);
1187 }
1188
1189 return resultBuffer;
1190 }
1191
1192 void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
1193 if (deleteBuffer) {
1194 uprv_free(textIn->fInputBytes);
1195
1196 textIn->fInputBytes = prev_fInputBytes;
1197 textIn->fInputLen = prev_fInputBytesLength;
1198 }
1199 }
1200
1201 UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
1202 static const uint8_t shapedLamAlef[] = {
1203 0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
1204 };
1205
1206 for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
1207 if (b == shapedLamAlef[i]) {
1208 return TRUE;
1209 }
1210 }
1211
1212 return FALSE;
1213 }
1214
1215 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1216 {
1217 // nothing to do
1218 }
1219
1220 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1221 {
1222 return "IBM420_rtl";
1223 }
1224
1225 UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1226 {
1227 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1228 results->set(textIn, this, confidence);
1229 return (confidence > 0);
1230 }
1231
1232 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1233 {
1234 // nothing to do
1235 }
1236
1237 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1238 {
1239 return "IBM420_ltr";
1240 }
1241
1242 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1243 {
1244 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1245 results->set(textIn, this, confidence);
1246 return (confidence > 0);
1247 }
1248
1249 U_NAMESPACE_END
1250 #endif
1251