]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ********************************************************************** | |
729e4ab9 | 3 | * Copyright (C) 2005-2009, International Business Machines |
73c04bcf A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #ifndef __CSRSBCS_H | |
9 | #define __CSRSBCS_H | |
10 | ||
11 | #include "unicode/uobject.h" | |
12 | ||
13 | #if !UCONFIG_NO_CONVERSION | |
14 | ||
15 | #include "csrecog.h" | |
16 | ||
17 | U_NAMESPACE_BEGIN | |
18 | ||
19 | class NGramParser : public UMemory | |
20 | { | |
21 | private: | |
22 | int32_t byteIndex; | |
23 | int32_t ngram; | |
24 | ||
25 | const int32_t *ngramList; | |
26 | const uint8_t *charMap; | |
27 | ||
28 | int32_t ngramCount; | |
29 | int32_t hitCount; | |
30 | ||
31 | public: | |
32 | NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); | |
33 | ||
34 | private: | |
35 | /* | |
36 | * Binary search for value in table, which must have exactly 64 entries. | |
37 | */ | |
38 | int32_t search(const int32_t *table, int32_t value); | |
39 | ||
40 | void lookup(int32_t thisNgram); | |
41 | void addByte(int32_t b); | |
42 | int32_t nextByte(InputText *det); | |
43 | ||
44 | public: | |
45 | int32_t parse(InputText *det); | |
46 | ||
47 | }; | |
48 | ||
49 | class CharsetRecog_sbcs : public CharsetRecognizer | |
50 | { | |
51 | protected: | |
52 | UBool haveC1Bytes; | |
53 | ||
54 | public: | |
55 | CharsetRecog_sbcs(); | |
56 | ||
57 | virtual ~CharsetRecog_sbcs(); | |
58 | ||
59 | virtual const char *getName() const = 0; | |
60 | ||
61 | virtual int32_t match(InputText *det) = 0; | |
62 | ||
63 | int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]); | |
64 | }; | |
65 | ||
66 | class CharsetRecog_8859_1 : public CharsetRecog_sbcs | |
67 | { | |
68 | public: | |
69 | virtual ~CharsetRecog_8859_1(); | |
70 | ||
71 | const char *getName() const; | |
72 | }; | |
73 | ||
74 | class CharsetRecog_8859_2 : public CharsetRecog_sbcs | |
75 | { | |
76 | public: | |
77 | virtual ~CharsetRecog_8859_2(); | |
78 | ||
79 | const char *getName() const; | |
80 | }; | |
81 | ||
82 | class CharsetRecog_8859_5 : public CharsetRecog_sbcs | |
83 | { | |
84 | public: | |
85 | virtual ~CharsetRecog_8859_5(); | |
86 | ||
87 | const char *getName() const; | |
88 | }; | |
89 | ||
90 | class CharsetRecog_8859_6 : public CharsetRecog_sbcs | |
91 | { | |
92 | public: | |
93 | virtual ~CharsetRecog_8859_6(); | |
94 | ||
95 | const char *getName() const; | |
96 | }; | |
97 | ||
98 | class CharsetRecog_8859_7 : public CharsetRecog_sbcs | |
99 | { | |
100 | public: | |
101 | virtual ~CharsetRecog_8859_7(); | |
102 | ||
103 | const char *getName() const; | |
104 | }; | |
105 | ||
106 | class CharsetRecog_8859_8 : public CharsetRecog_sbcs | |
107 | { | |
108 | public: | |
109 | virtual ~CharsetRecog_8859_8(); | |
110 | ||
111 | virtual const char *getName() const; | |
112 | }; | |
113 | ||
114 | class CharsetRecog_8859_9 : public CharsetRecog_sbcs | |
115 | { | |
116 | public: | |
117 | virtual ~CharsetRecog_8859_9(); | |
118 | ||
119 | const char *getName() const; | |
120 | }; | |
121 | ||
122 | class CharsetRecog_8859_1_en : public CharsetRecog_8859_1 | |
123 | { | |
124 | public: | |
125 | virtual ~CharsetRecog_8859_1_en(); | |
126 | ||
127 | const char *getLanguage() const; | |
128 | ||
129 | int32_t match(InputText *textIn); | |
130 | }; | |
131 | ||
132 | class CharsetRecog_8859_1_da : public CharsetRecog_8859_1 | |
133 | { | |
134 | public: | |
135 | virtual ~CharsetRecog_8859_1_da(); | |
136 | ||
137 | const char *getLanguage() const; | |
138 | ||
139 | int32_t match(InputText *textIn); | |
140 | }; | |
141 | ||
142 | class CharsetRecog_8859_1_de : public CharsetRecog_8859_1 | |
143 | { | |
144 | public: | |
145 | virtual ~CharsetRecog_8859_1_de(); | |
146 | ||
147 | const char *getLanguage() const; | |
148 | ||
149 | int32_t match(InputText *textIn); | |
150 | }; | |
151 | ||
152 | class CharsetRecog_8859_1_es : public CharsetRecog_8859_1 | |
153 | { | |
154 | public: | |
155 | virtual ~CharsetRecog_8859_1_es(); | |
156 | ||
157 | const char *getLanguage() const; | |
158 | ||
159 | int32_t match(InputText *textIn); | |
160 | }; | |
161 | ||
162 | class CharsetRecog_8859_1_fr : public CharsetRecog_8859_1 | |
163 | { | |
164 | public: | |
165 | virtual ~CharsetRecog_8859_1_fr(); | |
166 | ||
167 | const char *getLanguage() const; | |
168 | ||
169 | int32_t match(InputText *textIn); | |
170 | }; | |
171 | ||
172 | class CharsetRecog_8859_1_it : public CharsetRecog_8859_1 | |
173 | { | |
174 | public: | |
175 | virtual ~CharsetRecog_8859_1_it(); | |
176 | ||
177 | const char *getLanguage() const; | |
178 | ||
179 | int32_t match(InputText *textIn); | |
180 | }; | |
181 | ||
182 | class CharsetRecog_8859_1_nl : public CharsetRecog_8859_1 | |
183 | { | |
184 | public: | |
185 | virtual ~CharsetRecog_8859_1_nl(); | |
186 | ||
187 | const char *getLanguage() const; | |
188 | ||
189 | int32_t match(InputText *textIn); | |
190 | }; | |
191 | ||
192 | class CharsetRecog_8859_1_no : public CharsetRecog_8859_1 | |
193 | { | |
194 | public: | |
195 | virtual ~CharsetRecog_8859_1_no(); | |
196 | ||
197 | const char *getLanguage() const; | |
198 | ||
199 | int32_t match(InputText *textIn); | |
200 | }; | |
201 | ||
202 | class CharsetRecog_8859_1_pt : public CharsetRecog_8859_1 | |
203 | { | |
204 | public: | |
205 | virtual ~CharsetRecog_8859_1_pt(); | |
206 | ||
207 | const char *getLanguage() const; | |
208 | ||
209 | int32_t match(InputText *textIn); | |
210 | }; | |
211 | ||
212 | class CharsetRecog_8859_1_sv : public CharsetRecog_8859_1 | |
213 | { | |
214 | public: | |
215 | virtual ~CharsetRecog_8859_1_sv(); | |
216 | ||
217 | const char *getLanguage() const; | |
218 | ||
219 | int32_t match(InputText *textIn); | |
220 | }; | |
221 | ||
222 | class CharsetRecog_8859_2_cs : public CharsetRecog_8859_2 | |
223 | { | |
224 | public: | |
225 | virtual ~CharsetRecog_8859_2_cs(); | |
226 | ||
227 | const char *getLanguage() const; | |
228 | ||
229 | int32_t match(InputText *textIn); | |
230 | }; | |
231 | ||
232 | class CharsetRecog_8859_2_hu : public CharsetRecog_8859_2 | |
233 | { | |
234 | public: | |
235 | virtual ~CharsetRecog_8859_2_hu(); | |
236 | ||
237 | const char *getLanguage() const; | |
238 | ||
239 | int32_t match(InputText *textIn); | |
240 | }; | |
241 | ||
242 | class CharsetRecog_8859_2_pl : public CharsetRecog_8859_2 | |
243 | { | |
244 | public: | |
245 | virtual ~CharsetRecog_8859_2_pl(); | |
246 | ||
247 | const char *getLanguage() const; | |
248 | ||
249 | int32_t match(InputText *textIn); | |
250 | }; | |
251 | ||
252 | class CharsetRecog_8859_2_ro : public CharsetRecog_8859_2 | |
253 | { | |
254 | public: | |
255 | virtual ~CharsetRecog_8859_2_ro(); | |
256 | ||
257 | const char *getLanguage() const; | |
258 | ||
259 | int32_t match(InputText *textIn); | |
260 | }; | |
261 | ||
262 | class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 | |
263 | { | |
264 | public: | |
265 | virtual ~CharsetRecog_8859_5_ru(); | |
266 | ||
267 | const char *getLanguage() const; | |
268 | ||
269 | int32_t match(InputText *textIn); | |
270 | }; | |
271 | ||
272 | class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 | |
273 | { | |
274 | public: | |
275 | virtual ~CharsetRecog_8859_6_ar(); | |
276 | ||
277 | const char *getLanguage() const; | |
278 | ||
279 | int32_t match(InputText *textIn); | |
280 | }; | |
281 | ||
282 | class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 | |
283 | { | |
284 | public: | |
285 | virtual ~CharsetRecog_8859_7_el(); | |
286 | ||
287 | const char *getLanguage() const; | |
288 | ||
289 | int32_t match(InputText *textIn); | |
290 | }; | |
291 | ||
292 | class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 | |
293 | { | |
294 | public: | |
295 | virtual ~CharsetRecog_8859_8_I_he(); | |
296 | ||
297 | const char *getName() const; | |
298 | ||
299 | const char *getLanguage() const; | |
300 | ||
301 | int32_t match(InputText *textIn); | |
302 | }; | |
303 | ||
304 | class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 | |
305 | { | |
306 | public: | |
307 | virtual ~CharsetRecog_8859_8_he (); | |
308 | ||
309 | const char *getLanguage() const; | |
310 | ||
311 | int32_t match(InputText *textIn); | |
312 | }; | |
313 | ||
314 | class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 | |
315 | { | |
316 | public: | |
317 | virtual ~CharsetRecog_8859_9_tr (); | |
318 | ||
319 | const char *getLanguage() const; | |
320 | ||
321 | int32_t match(InputText *textIn); | |
322 | }; | |
323 | ||
324 | class CharsetRecog_windows_1256 : public CharsetRecog_sbcs | |
325 | { | |
326 | public: | |
327 | virtual ~CharsetRecog_windows_1256(); | |
328 | ||
329 | const char *getName() const; | |
330 | ||
331 | const char *getLanguage() const; | |
332 | ||
333 | int32_t match(InputText *textIn); | |
334 | }; | |
335 | ||
336 | class CharsetRecog_windows_1251 : public CharsetRecog_sbcs | |
337 | { | |
338 | public: | |
339 | virtual ~CharsetRecog_windows_1251(); | |
340 | ||
341 | const char *getName() const; | |
342 | ||
343 | const char *getLanguage() const; | |
344 | ||
345 | int32_t match(InputText *textIn); | |
346 | }; | |
347 | ||
348 | ||
349 | class CharsetRecog_KOI8_R : public CharsetRecog_sbcs | |
350 | { | |
351 | public: | |
352 | virtual ~CharsetRecog_KOI8_R(); | |
353 | ||
354 | const char *getName() const; | |
355 | ||
356 | const char *getLanguage() const; | |
357 | ||
358 | int32_t match(InputText *textIn); | |
359 | }; | |
360 | ||
729e4ab9 A |
361 | class CharsetRecog_IBM424_he : public CharsetRecog_sbcs |
362 | { | |
363 | public: | |
364 | virtual ~CharsetRecog_IBM424_he(); | |
365 | ||
366 | const char *getLanguage() const; | |
367 | }; | |
368 | ||
369 | class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { | |
370 | public: | |
371 | virtual ~CharsetRecog_IBM424_he_rtl(); | |
372 | ||
373 | const char *getName() const; | |
374 | ||
375 | int32_t match(InputText *textIn); | |
376 | }; | |
377 | ||
378 | class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { | |
379 | virtual ~CharsetRecog_IBM424_he_ltr(); | |
380 | ||
381 | const char *getName() const; | |
382 | ||
383 | int32_t match(InputText *textIn); | |
384 | }; | |
385 | ||
386 | class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs | |
387 | { | |
388 | public: | |
389 | virtual ~CharsetRecog_IBM420_ar(); | |
390 | ||
391 | const char *getLanguage() const; | |
392 | ||
393 | protected: | |
394 | void matchInit(InputText *textIn); | |
395 | void matchFinish(InputText *textIn); | |
396 | ||
397 | private: | |
398 | uint8_t *prev_fInputBytes; | |
399 | int32_t prev_fInputBytesLength; | |
400 | UBool deleteBuffer; | |
401 | ||
402 | UBool isLamAlef(uint8_t b); | |
403 | uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); | |
404 | uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); | |
405 | }; | |
406 | ||
407 | class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { | |
408 | public: | |
409 | virtual ~CharsetRecog_IBM420_ar_rtl(); | |
410 | ||
411 | const char *getName() const; | |
412 | ||
413 | int32_t match(InputText *textIn); | |
414 | }; | |
415 | ||
416 | class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { | |
417 | virtual ~CharsetRecog_IBM420_ar_ltr(); | |
418 | ||
419 | const char *getName() const; | |
420 | ||
421 | int32_t match(InputText *textIn); | |
422 | }; | |
423 | ||
73c04bcf A |
424 | U_NAMESPACE_END |
425 | ||
426 | #endif | |
427 | #endif /* __CSRSBCS_H */ |