]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrsbcs.cpp
ICU-461.12.tar.gz
[apple/icu.git] / icuSources / i18n / csrsbcs.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "cmemory.h"
11
12 #if !UCONFIG_NO_CONVERSION
13 #include "csrsbcs.h"
14
15 #define N_GRAM_SIZE 3
16 #define N_GRAM_MASK 0xFFFFFF
17
18 U_NAMESPACE_BEGIN
19
20 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
21 :byteIndex(0), ngram(0)
22 {
23 ngramList = theNgramList;
24 charMap = theCharMap;
25
26 ngramCount = hitCount = 0;
27 }
28
29 /*
30 * Binary search for value in table, which must have exactly 64 entries.
31 */
32
33 int32_t NGramParser::search(const int32_t *table, int32_t value)
34 {
35 int32_t index = 0;
36
37 if (table[index + 32] <= value) {
38 index += 32;
39 }
40
41 if (table[index + 16] <= value) {
42 index += 16;
43 }
44
45 if (table[index + 8] <= value) {
46 index += 8;
47 }
48
49 if (table[index + 4] <= value) {
50 index += 4;
51 }
52
53 if (table[index + 2] <= value) {
54 index += 2;
55 }
56
57 if (table[index + 1] <= value) {
58 index += 1;
59 }
60
61 if (table[index] > value) {
62 index -= 1;
63 }
64
65 if (index < 0 || table[index] != value) {
66 return -1;
67 }
68
69 return index;
70 }
71
72 void NGramParser::lookup(int32_t thisNgram)
73 {
74 ngramCount += 1;
75
76 if (search(ngramList, thisNgram) >= 0) {
77 hitCount += 1;
78 }
79
80 }
81
82 void NGramParser::addByte(int32_t b)
83 {
84 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
85 lookup(ngram);
86 }
87
88 int32_t NGramParser::nextByte(InputText *det)
89 {
90 if (byteIndex >= det->fInputLen) {
91 return -1;
92 }
93
94 return det->fInputBytes[byteIndex++];
95 }
96
97 int32_t NGramParser::parse(InputText *det)
98 {
99 int32_t b;
100 bool ignoreSpace = FALSE;
101
102 while ((b = nextByte(det)) >= 0) {
103 uint8_t mb = charMap[b];
104
105 // TODO: 0x20 might not be a space in all character sets...
106 if (mb != 0) {
107 if (!(mb == 0x20 && ignoreSpace)) {
108 addByte(mb);
109 }
110
111 ignoreSpace = (mb == 0x20);
112 }
113 }
114
115 // TODO: Is this OK? The buffer could have ended in the middle of a word...
116 addByte(0x20);
117
118 double rawPercent = (double) hitCount / (double) ngramCount;
119
120 // if (rawPercent <= 2.0) {
121 // return 0;
122 // }
123
124 // TODO - This is a bit of a hack to take care of a case
125 // were we were getting a confidence of 135...
126 if (rawPercent > 0.33) {
127 return 98;
128 }
129
130 return (int32_t) (rawPercent * 300.0);
131 }
132
133 CharsetRecog_sbcs::CharsetRecog_sbcs()
134 : haveC1Bytes(FALSE)
135 {
136 // nothing else to do
137 }
138
139 CharsetRecog_sbcs::~CharsetRecog_sbcs()
140 {
141 // nothing to do
142 }
143
144 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[])
145 {
146 NGramParser parser(ngrams, byteMap);
147 int32_t result;
148
149 haveC1Bytes = det->fC1Bytes;
150 result = parser.parse(det);
151
152 return result;
153 }
154
// Byte-folding table; by its name meant for ISO-8859-1 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20.
155 static const uint8_t charMap_8859_1[] = {
156 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
157 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
158 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
159 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
160 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
161 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
162 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
163 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
164 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
165 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
166 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
167 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
168 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
169 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
170 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
171 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
172 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
173 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
174 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
175 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
176 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
177 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
178 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
179 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
180 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
181 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
182 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
183 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
184 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
185 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
186 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
187 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
188 };
189
// Byte-folding table; by its name meant for ISO-8859-2 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20.
190 static const uint8_t charMap_8859_2[] = {
191 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
192 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
193 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
194 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
195 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
196 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
197 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
198 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
199 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
200 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
201 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
202 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
203 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
204 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
205 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
206 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
207 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
208 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
209 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
210 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
211 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
212 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
213 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
214 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
215 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
216 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
217 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
218 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
219 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
220 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
221 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
222 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
223 };
224
// Byte-folding table; by its name meant for ISO-8859-5 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20.
225 static const uint8_t charMap_8859_5[] = {
226 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
227 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
228 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
229 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
230 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
231 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
232 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
233 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
234 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
235 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
236 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
237 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
238 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
239 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
240 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
241 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
242 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
243 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
244 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
245 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
246 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
247 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
248 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
249 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
250 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
251 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
252 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
253 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
254 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
255 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
256 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
257 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
258 };
259
// Byte-folding table; by its name meant for ISO-8859-6 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20.
260 static const uint8_t charMap_8859_6[] = {
261 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
262 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
263 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
264 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
265 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
266 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
268 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
269 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
270 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
271 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
272 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
273 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
274 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
275 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
276 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
277 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
278 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
279 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
280 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
281 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
282 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
283 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
284 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
285 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
286 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
287 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
288 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
289 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
290 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
291 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
292 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
293 };
294
// Byte-folding table; by its name meant for ISO-8859-7 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20.
295 static const uint8_t charMap_8859_7[] = {
296 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
297 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
298 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
299 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
300 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
301 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
303 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
304 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
305 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
306 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
307 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
308 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
309 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
310 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
311 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
312 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
313 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
314 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
315 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
316 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
317 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
318 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
319 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
320 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
321 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
322 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
323 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
324 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
325 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
326 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
327 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
328 };
329
// Byte-folding table; by its name meant for ISO-8859-8 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20.
330 static const uint8_t charMap_8859_8[] = {
331 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
332 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
333 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
334 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
335 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
336 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
338 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
339 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
340 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
341 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
342 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
343 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
344 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
345 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
346 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
347 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
348 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
349 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
350 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
351 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
352 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
353 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
354 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
355 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
356 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
357 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
358 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
359 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
360 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
361 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
362 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
363 };
364
// Byte-folding table; by its name meant for ISO-8859-9 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20. Unlike the neighbouring maps, index 0xDD yields the
// ASCII value 0x69.
365 static const uint8_t charMap_8859_9[] = {
366 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
367 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
368 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
369 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
370 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
371 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
373 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
374 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
375 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
376 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
377 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
378 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
379 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
380 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
381 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
382 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
383 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
384 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
385 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
386 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
388 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
389 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
390 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
391 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
392 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
393 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
394 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
395 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
396 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
397 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
398 };
399
// Trigram table labelled windows-1251 by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
400 static const int32_t ngrams_windows_1251[] = {
401 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
402 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
403 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
404 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
405 };
406
// Byte-folding table; by its name meant for windows-1251 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20. Indices 0xC0-0xDF yield the same values as 0xE0-0xFF.
407 static const uint8_t charMap_windows_1251[] = {
408 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
409 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
410 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
411 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
412 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
413 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
414 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
415 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
416 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
417 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
418 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
419 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
420 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
421 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
422 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
423 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
424 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
425 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
426 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
427 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
428 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
429 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
430 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
431 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
432 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
433 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
434 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
435 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
436 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
437 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
438 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
439 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
440 };
441
// Trigram table labelled windows-1256 by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
442 static const int32_t ngrams_windows_1256[] = {
443 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
444 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
445 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
446 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
447 };
448
// Byte-folding table; by its name meant for windows-1256 (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20.
449 static const uint8_t charMap_windows_1256[] = {
450 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
451 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
452 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
453 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
454 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
455 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
456 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
459 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
460 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
461 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
462 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
463 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
464 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
465 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
466 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
467 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
468 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
469 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
470 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
471 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
472 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
473 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
474 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
475 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
476 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
477 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
478 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
479 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
480 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
481 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
482 };
483
// Trigram table labelled KOI8-R by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
484 static const int32_t ngrams_KOI8_R[] = {
485 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
486 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
487 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
488 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
489 };
490
// Byte-folding table; by its name meant for KOI8-R (the code that
// passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, the way NGramParser::parse()
// indexes charMap. ASCII A-Z and a-z both yield 0x61-0x7A, index 0x27
// yields 0x00 (parse() skips zero entries) and every other byte below
// 0x80 yields 0x20. Indices 0xE0-0xFF yield the same values as 0xC0-0xDF.
491 static const uint8_t charMap_KOI8_R[] = {
492 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
493 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
494 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
495 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
496 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
497 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
498 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
499 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
500 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
501 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
502 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
503 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
504 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
505 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
506 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
507 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
508 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
509 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
510 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
511 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
512 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
513 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
514 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
515 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
516 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
517 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
518 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
519 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
520 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
521 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
522 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
523 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
524 };
525
// Trigram table labelled IBM424 / "he" / "rtl" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
526 static const int32_t ngrams_IBM424_he_rtl[] = {
527 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
528 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
529 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
530 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
531 };
532
// Trigram table labelled IBM424 / "he" / "ltr" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
533 static const int32_t ngrams_IBM424_he_ltr[] = {
534 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
535 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
536 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
537 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
538 };
539
// Byte-folding table labelled IBM424 / "he" by its identifier (the code
// that passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, laid out as 16 rows of 16 with
// the high nibble in the row label. The filler value is 0x40 and index
// 0x7D yields 0x00 (NGramParser::parse() skips zero entries). Rows C-, D-
// and E- yield the same values as rows 8-, 9- and A-, except index 0xE0.
// NOTE(review): NGramParser::parse() collapses runs of 0x20, and this map
// never yields 0x20 - confirm that runs of 0x40 are meant to be kept.
540 static const uint8_t charMap_IBM424_he[] = {
541 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
542 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
543 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
544 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
545 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
546 /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
547 /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
548 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
549 /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
550 /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
551 /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
552 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
553 /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
554 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
555 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
556 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
557 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
558 };
559
// Trigram table labelled IBM420 / "ar" / "rtl" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
560 static const int32_t ngrams_IBM420_ar_rtl[] = {
561 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
562 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
563 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
564 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
565 };
566
// Trigram table labelled IBM420 / "ar" / "ltr" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
567 static const int32_t ngrams_IBM420_ar_ltr[] = {
568 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
569 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
570 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
571 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
572 };
573
// Byte-folding table labelled IBM420 / "ar" by its identifier (the code
// that passes it to match_sbcs is outside this view - TODO confirm).
// 256 entries indexed by raw byte value, laid out as 16 rows of 16 with
// the high nibble in the row label. The filler value is 0x40 and, unlike
// the other maps in this file, no entry is 0x00.
// NOTE(review): NGramParser::parse() collapses runs of 0x20, and this map
// never yields 0x20 - confirm that runs of 0x40 are meant to be kept.
574 static const uint8_t charMap_IBM420_ar[]= {
575 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
576 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
577 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
578 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
579 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
580 /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
581 /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
582 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
583 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
584 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
585 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
586 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
587 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
588 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
589 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
590 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
591 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
592 };
593
594 // N-gram tables for the ISO-8859 family (parts 1, 2, 5, 6, 7, 8 and 9)
// Trigram table labelled ISO-8859-1 / "en" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
595 static const int32_t ngrams_8859_1_en[] = {
596 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
597 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
598 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
599 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
600 };
601
// Trigram table labelled ISO-8859-1 / "da" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
602 static const int32_t ngrams_8859_1_da[] = {
603 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
604 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
605 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
606 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
607 };
608
// Trigram table labelled ISO-8859-1 / "de" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
609 static const int32_t ngrams_8859_1_de[] = {
610 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
611 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
612 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
613 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
614 };
615
// Trigram table labelled ISO-8859-1 / "es" by its identifier.
// 64 values, each packing three mapped bytes into the low 24 bits, listed
// in ascending order - the 64-entry layout NGramParser::search() requires.
616 static const int32_t ngrams_8859_1_es[] = {
617 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
618 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
619 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
620 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
621 };
622
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_1_fr::match().
static const int32_t ngrams_8859_1_fr[] = {
    0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61,
    0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
    0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520,
    0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
    0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420,
    0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
    0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064,
    0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
};
629
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_1_it::match().
static const int32_t ngrams_8859_1_it[] = {
    0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E,
    0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
    0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064,
    0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
    0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20,
    0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
    0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572,
    0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
};
636
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_1_nl::match().
static const int32_t ngrams_8859_1_nl[] = {
    0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765,
    0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
    0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220,
    0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
    0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520,
    0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
    0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368,
    0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
};
643
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_1_no::match().
static const int32_t ngrams_8859_1_no[] = {
    0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920,
    0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
    0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574,
    0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
    0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520,
    0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
    0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465,
    0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
};
650
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_1_pt::match().
static const int32_t ngrams_8859_1_pt[] = {
    0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61,
    0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
    0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20,
    0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
    0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120,
    0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
    0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064,
    0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
};
657
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_1_sv::match().
static const int32_t ngrams_8859_1_sv[] = {
    0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E,
    0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
    0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474,
    0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
    0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564,
    0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
    0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073,
    0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
};
664
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_2_cs::match().
static const int32_t ngrams_8859_2_cs[] = {
    0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64,
    0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
    0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073,
    0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
    0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E,
    0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
    0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20,
    0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
};
671
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_2_hu::match().
static const int32_t ngrams_8859_2_hu[] = {
    0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F,
    0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
    0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073,
    0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
    0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920,
    0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
    0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74,
    0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
};
678
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_2_pl::match().
static const int32_t ngrams_8859_2_pl[] = {
    0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61,
    0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
    0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79,
    0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
    0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920,
    0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
    0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69,
    0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
};
685
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_2_ro::match().
static const int32_t ngrams_8859_2_ro[] = {
    0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469,
    0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
    0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172,
    0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
    0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063,
    0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
    0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520,
    0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
};
692
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_5_ru::match().
static const int32_t ngrams_8859_5_ru[] = {
    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0,
    0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2,
    0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF,
    0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2,
    0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
};
699
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_6_ar::match().
static const int32_t ngrams_8859_6_ar[] = {
    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7,
    0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5,
    0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920,
    0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4,
    0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
};
706
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_7_el::match().
static const int32_t ngrams_8859_7_el[] = {
    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5,
    0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220,
    0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0,
    0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9,
    0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
};
713
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_8_I_he::match().
static const int32_t ngrams_8859_8_I_he[] = {
    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5,
    0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE,
    0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0,
    0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4,
    0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
};
720
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_8_he::match().
static const int32_t ngrams_8859_8_he[] = {
    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2,
    0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4,
    0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020,
    0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420,
    0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
};
727
// 64 ascending 24-bit n-gram keys (three bytes packed per entry); passed to
// match_sbcs() by CharsetRecog_8859_9_tr::match().
static const int32_t ngrams_8859_9_tr[] = {
    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C,
    0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261,
    0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20,
    0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E,
    0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
};
734
735 CharsetRecog_8859_1::~CharsetRecog_8859_1()
736 {
737 // nothing to do
738 }
739
740 const char *CharsetRecog_8859_1::getName() const
741 {
742 return haveC1Bytes? "windows-1252" : "ISO-8859-1";
743 }
744
745 const char *CharsetRecog_8859_1_en::getLanguage() const
746 {
747 return "en";
748 }
749
750 CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()
751 {
752 // nothing to do
753 }
754
755 int32_t CharsetRecog_8859_1_en::match(InputText *textIn)
756 {
757 int32_t result = match_sbcs(textIn, ngrams_8859_1_en, charMap_8859_1);
758
759 // printf("8859_1_en: result = %d\n", result);
760 return result; //match_sbcs(textIn, ngrams, charMap);
761 }
762
763 CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()
764 {
765 // nothing to do
766 }
767
768 const char *CharsetRecog_8859_1_da::getLanguage() const
769 {
770 return "da";
771 }
772
773 int32_t CharsetRecog_8859_1_da::match(InputText *textIn)
774 {
775 return match_sbcs(textIn, ngrams_8859_1_da, charMap_8859_1);
776 }
777
778 CharsetRecog_8859_1_de::~CharsetRecog_8859_1_de() {}
779
780 const char *CharsetRecog_8859_1_de::getLanguage() const
781 {
782 return "de";
783 }
784
785 int32_t CharsetRecog_8859_1_de::match(InputText *textIn)
786 {
787 return match_sbcs(textIn, ngrams_8859_1_de, charMap_8859_1);
788 }
789
790 CharsetRecog_8859_1_es::~CharsetRecog_8859_1_es()
791 {
792 // nothing to do
793 }
794
795 const char *CharsetRecog_8859_1_es::getLanguage() const
796 {
797 return "es";
798 }
799
800 int32_t CharsetRecog_8859_1_es::match(InputText *textIn)
801 {
802 return match_sbcs(textIn, ngrams_8859_1_es, charMap_8859_1);
803 }
804
805 CharsetRecog_8859_1_fr::~CharsetRecog_8859_1_fr()
806 {
807 // nothing to do
808 }
809
810 const char *CharsetRecog_8859_1_fr::getLanguage() const
811 {
812 return "fr";
813 }
814
815 int32_t CharsetRecog_8859_1_fr::match(InputText *textIn)
816 {
817 return match_sbcs(textIn, ngrams_8859_1_fr, charMap_8859_1);
818 }
819
820 CharsetRecog_8859_1_it::~CharsetRecog_8859_1_it()
821 {
822 // nothing to do
823 }
824
825 const char *CharsetRecog_8859_1_it::getLanguage() const
826 {
827 return "it";
828 }
829
830 int32_t CharsetRecog_8859_1_it::match(InputText *textIn)
831 {
832 return match_sbcs(textIn, ngrams_8859_1_it, charMap_8859_1);
833 }
834
835 CharsetRecog_8859_1_nl::~CharsetRecog_8859_1_nl()
836 {
837 // nothing to do
838 }
839
840 const char *CharsetRecog_8859_1_nl::getLanguage() const
841 {
842 return "nl";
843 }
844
845 int32_t CharsetRecog_8859_1_nl::match(InputText *textIn)
846 {
847 return match_sbcs(textIn, ngrams_8859_1_nl, charMap_8859_1);
848 }
849
850 CharsetRecog_8859_1_no::~CharsetRecog_8859_1_no() {}
851
852 const char *CharsetRecog_8859_1_no::getLanguage() const
853 {
854 return "no";
855 }
856
857 int32_t CharsetRecog_8859_1_no::match(InputText *textIn)
858 {
859 return match_sbcs(textIn, ngrams_8859_1_no, charMap_8859_1);
860 }
861
862 CharsetRecog_8859_1_pt::~CharsetRecog_8859_1_pt()
863 {
864 // nothing to do
865 }
866
867 const char *CharsetRecog_8859_1_pt::getLanguage() const
868 {
869 return "pt";
870 }
871
872 int32_t CharsetRecog_8859_1_pt::match(InputText *textIn)
873 {
874 return match_sbcs(textIn, ngrams_8859_1_pt, charMap_8859_1);
875 }
876
877 CharsetRecog_8859_1_sv::~CharsetRecog_8859_1_sv() {}
878
879 const char *CharsetRecog_8859_1_sv::getLanguage() const
880 {
881 return "sv";
882 }
883
884 int32_t CharsetRecog_8859_1_sv::match(InputText *textIn)
885 {
886 return match_sbcs(textIn, ngrams_8859_1_sv, charMap_8859_1);
887 }
888
889 CharsetRecog_8859_2::~CharsetRecog_8859_2()
890 {
891 // nothing to do
892 }
893
894 const char *CharsetRecog_8859_2::getName() const
895 {
896 return haveC1Bytes? "windows-1250" : "ISO-8859-2";
897 }
898
899 CharsetRecog_8859_2_cs::~CharsetRecog_8859_2_cs()
900 {
901 // nothing to do
902 }
903
904 const char *CharsetRecog_8859_2_cs::getLanguage() const
905 {
906 return "cs";
907 }
908
909 int32_t CharsetRecog_8859_2_cs::match(InputText *textIn)
910 {
911 return match_sbcs(textIn, ngrams_8859_2_cs, charMap_8859_2);
912 }
913
914 CharsetRecog_8859_2_hu::~CharsetRecog_8859_2_hu()
915 {
916 // nothing to do
917 }
918
919 const char *CharsetRecog_8859_2_hu::getLanguage() const
920 {
921 return "hu";
922 }
923
924 int32_t CharsetRecog_8859_2_hu::match(InputText *textIn)
925 {
926 return match_sbcs(textIn, ngrams_8859_2_hu, charMap_8859_2);
927 }
928
929 CharsetRecog_8859_2_pl::~CharsetRecog_8859_2_pl()
930 {
931 // nothing to do
932 }
933
934 const char *CharsetRecog_8859_2_pl::getLanguage() const
935 {
936 return "pl";
937 }
938
939 int32_t CharsetRecog_8859_2_pl::match(InputText *textIn)
940 {
941 return match_sbcs(textIn, ngrams_8859_2_pl, charMap_8859_2);
942 }
943
944 CharsetRecog_8859_2_ro::~CharsetRecog_8859_2_ro()
945 {
946 // nothing to do
947 }
948
949 const char *CharsetRecog_8859_2_ro::getLanguage() const
950 {
951 return "ro";
952 }
953
954 int32_t CharsetRecog_8859_2_ro::match(InputText *textIn)
955 {
956 return match_sbcs(textIn, ngrams_8859_2_ro, charMap_8859_2);
957 }
958
959 CharsetRecog_8859_5::~CharsetRecog_8859_5()
960 {
961 // nothing to do
962 }
963
964 const char *CharsetRecog_8859_5::getName() const
965 {
966 return "ISO-8859-5";
967 }
968
969 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
970 {
971 // nothing to do
972 }
973
974 const char *CharsetRecog_8859_5_ru::getLanguage() const
975 {
976 return "ru";
977 }
978
979 int32_t CharsetRecog_8859_5_ru::match(InputText *textIn)
980 {
981 return match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
982 }
983
984 CharsetRecog_8859_6::~CharsetRecog_8859_6()
985 {
986 // nothing to do
987 }
988
989 const char *CharsetRecog_8859_6::getName() const
990 {
991 return "ISO-8859-6";
992 }
993
994 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
995 {
996 // nothing to do
997 }
998
999 const char *CharsetRecog_8859_6_ar::getLanguage() const
1000 {
1001 return "ar";
1002 }
1003
1004 int32_t CharsetRecog_8859_6_ar::match(InputText *textIn)
1005 {
1006 return match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
1007 }
1008
1009 CharsetRecog_8859_7::~CharsetRecog_8859_7()
1010 {
1011 // nothing to do
1012 }
1013
1014 const char *CharsetRecog_8859_7::getName() const
1015 {
1016 return haveC1Bytes? "windows-1253" : "ISO-8859-7";
1017 }
1018
1019 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1020 {
1021 // nothing to do
1022 }
1023
1024 const char *CharsetRecog_8859_7_el::getLanguage() const
1025 {
1026 return "el";
1027 }
1028
1029 int32_t CharsetRecog_8859_7_el::match(InputText *textIn)
1030 {
1031 return match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1032 }
1033
1034 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1035 {
1036 // nothing to do
1037 }
1038
1039 const char *CharsetRecog_8859_8::getName() const
1040 {
1041 return haveC1Bytes? "windows-1255" : "ISO-8859-8";
1042 }
1043
1044 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1045 {
1046 // nothing to do
1047 }
1048
1049 const char *CharsetRecog_8859_8_I_he::getName() const
1050 {
1051 return haveC1Bytes? "windows-1255" : "ISO-8859-8-I";
1052 }
1053
1054 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1055 {
1056 return "he";
1057 }
1058
1059 int32_t CharsetRecog_8859_8_I_he::match(InputText *textIn)
1060 {
1061 return match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1062 }
1063
1064 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1065 {
1066 // od ot gnihton
1067 }
1068
1069 const char *CharsetRecog_8859_8_he::getLanguage() const
1070 {
1071 return "he";
1072 }
1073
1074 int32_t CharsetRecog_8859_8_he::match(InputText *textIn)
1075 {
1076 return match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1077 }
1078
1079 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1080 {
1081 // nothing to do
1082 }
1083
1084 const char *CharsetRecog_8859_9::getName() const
1085 {
1086 return haveC1Bytes? "windows-1254" : "ISO-8859-9";
1087 }
1088
1089 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1090 {
1091 // nothing to do
1092 }
1093
1094 const char *CharsetRecog_8859_9_tr::getLanguage() const
1095 {
1096 return "tr";
1097 }
1098
1099 int32_t CharsetRecog_8859_9_tr::match(InputText *textIn)
1100 {
1101 return match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1102 }
1103
1104 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1105 {
1106 // nothing to do
1107 }
1108
1109 const char *CharsetRecog_windows_1256::getName() const
1110 {
1111 return "windows-1256";
1112 }
1113
1114 const char *CharsetRecog_windows_1256::getLanguage() const
1115 {
1116 return "ar";
1117 }
1118
1119 int32_t CharsetRecog_windows_1256::match(InputText *textIn)
1120 {
1121 return match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1122 }
1123
1124 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1125 {
1126 // nothing to do
1127 }
1128
1129 const char *CharsetRecog_windows_1251::getName() const
1130 {
1131 return "windows-1251";
1132 }
1133
1134 const char *CharsetRecog_windows_1251::getLanguage() const
1135 {
1136 return "ru";
1137 }
1138
1139 int32_t CharsetRecog_windows_1251::match(InputText *textIn)
1140 {
1141 return match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1142 }
1143
1144 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1145 {
1146 // nothing to do
1147 }
1148
1149 const char *CharsetRecog_KOI8_R::getName() const
1150 {
1151 return "KOI8-R";
1152 }
1153
1154 const char *CharsetRecog_KOI8_R::getLanguage() const
1155 {
1156 return "ru";
1157 }
1158
1159 int32_t CharsetRecog_KOI8_R::match(InputText *textIn)
1160 {
1161 return match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1162 }
1163
1164 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1165 {
1166 // nothing to do
1167 }
1168
1169 const char *CharsetRecog_IBM424_he::getLanguage() const
1170 {
1171 return "he";
1172 }
1173
1174 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1175 {
1176 // nothing to do
1177 }
1178
1179 const char *CharsetRecog_IBM424_he_rtl::getName() const
1180 {
1181 return "IBM424_rtl";
1182 }
1183
1184 int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn)
1185 {
1186 return match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1187 }
1188
1189 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1190 {
1191 // nothing to do
1192 }
1193
1194 const char *CharsetRecog_IBM424_he_ltr::getName() const
1195 {
1196 return "IBM424_ltr";
1197 }
1198
1199 int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn)
1200 {
1201 return match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1202 }
1203
/*
 * Byte substitution table applied by CharsetRecog_IBM420_ar::unshape():
 * entry [b] is the replacement for byte b. Each source line holds the sixteen
 * entries that share one high nibble, in low-nibble order 0 through F.
 * Bytes 0x00-0x3F all become 0x40; most other bytes map to themselves.
 * Presumably this folds shaped Arabic forms onto a base form - not verifiable
 * from this file alone.
 */
static const uint8_t unshapeMap_IBM420[] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00-0x0F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10-0x1F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x20-0x2F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x30-0x3F
    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, // 0x40-0x4F
    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, // 0x50-0x5F
    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, // 0x60-0x6F
    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, // 0x70-0x7F
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, // 0x80-0x8F
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, // 0x90-0x9F
    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, // 0xA0-0xAF
    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, // 0xB0-0xBF
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, // 0xC0-0xCF
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, // 0xD0-0xDF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // 0xE0-0xEF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, // 0xF0-0xFF
};
1223
1224 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1225 {
1226 // nothing to do
1227 }
1228
1229 const char *CharsetRecog_IBM420_ar::getLanguage() const
1230 {
1231 return "ar";
1232 }
1233
1234 void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
1235 prev_fInputBytesLength = textIn->fInputLen;
1236 prev_fInputBytes = textIn->fInputBytes;
1237
1238 int32_t length = 0;
1239 uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
1240
1241 if (bb != NULL) {
1242 textIn->fInputBytes = bb;
1243 textIn->fInputLen = length;
1244
1245 deleteBuffer = TRUE;
1246 } else {
1247 deleteBuffer = FALSE;
1248 }
1249 }
1250
1251 uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
1252 uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);
1253
1254 if (resultArray != NULL) {
1255 for (int32_t i = 0; i < inputBytesLength; i++) {
1256 resultArray[i] = unshapeMap_IBM420[resultArray[i]];
1257 }
1258 }
1259
1260 return resultArray;
1261 }
1262
1263 uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
1264 int32_t bigBufferLength = inputBytesLength * 2;
1265 uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength);
1266 uint8_t *resultBuffer = NULL;
1267
1268 if (bigBuffer != NULL) {
1269 int32_t bufferIndex;
1270 uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
1271
1272 for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) {
1273 if (isLamAlef(inputBytes[i])) {
1274 bigBuffer[bufferIndex++] = unshapedLamAlef[0];
1275 bigBuffer[bufferIndex++] = unshapedLamAlef[1];
1276 } else {
1277 bigBuffer[bufferIndex++] = inputBytes[i];
1278 }
1279 }
1280
1281 length = bufferIndex;
1282 resultBuffer = (uint8_t *)uprv_malloc(length);
1283 if (resultBuffer != NULL) {
1284 uprv_memcpy(resultBuffer, bigBuffer, length);
1285 }
1286 }
1287
1288 if (bigBuffer != NULL) {
1289 uprv_free(bigBuffer);
1290 }
1291
1292 return resultBuffer;
1293 }
1294
1295 void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
1296 if (deleteBuffer) {
1297 uprv_free(textIn->fInputBytes);
1298
1299 textIn->fInputBytes = prev_fInputBytes;
1300 textIn->fInputLen = prev_fInputBytesLength;
1301 }
1302 }
1303
1304 UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
1305 uint8_t shapedLamAlef[] = {
1306 0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
1307 };
1308
1309 for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
1310 if (b == shapedLamAlef[i]) {
1311 return TRUE;
1312 }
1313 }
1314
1315 return FALSE;
1316 }
1317
1318 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1319 {
1320 // nothing to do
1321 }
1322
1323 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1324 {
1325 return "IBM420_rtl";
1326 }
1327
1328 int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn)
1329 {
1330 return match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1331 }
1332
1333 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1334 {
1335 // nothing to do
1336 }
1337
1338 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1339 {
1340 return "IBM420_ltr";
1341 }
1342
1343 int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn)
1344 {
1345 return match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1346 }
1347
1348 U_NAMESPACE_END
1349 #endif
1350