/*
 **********************************************************************
 *   Copyright (C) 2005-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
11 #include "unicode/uobject.h"
13 #if !UCONFIG_NO_CONVERSION
// Declares NGramParser, a UMemory-derived helper. The state visible here is
// two pointers to read-only tables.
// NOTE(review): the leading integers on these lines look like line numbers
// left over from a source-listing extraction, and the class braces and
// access specifiers are not visible in this view -- confirm member
// visibility against the real header.
19 class NGramParser
: public UMemory
// Pointer to a read-only table of int32_t n-gram values; presumably
// initialised from the constructor's theNgramList -- TODO confirm.
25 const int32_t *ngramList
;
// Pointer to a read-only byte table; presumably initialised from the
// constructor's theCharMap and used to map input bytes -- TODO confirm.
26 const uint8_t *charMap
;
// Constructor taking the n-gram table and the character map table.
32 NGramParser(const int32_t *theNgramList
, const uint8_t *theCharMap
);
36 * Binary search for value in table, which must have exactly 64 entries.
// (The line above is the surviving text of the original comment on search().)
// table: the 64-entry table to search; value: the value to look for.
// The meaning of the int32_t result is not visible in this header.
38 int32_t search(const int32_t *table
, int32_t value
);
// Takes one n-gram value; presumably checks it against ngramList -- confirm
// in the implementation file.
40 void lookup(int32_t thisNgram
);
// Takes one byte value as int32_t; presumably folds it into the n-gram being
// built -- confirm in the implementation file.
41 void addByte(int32_t b
);
// Takes the InputText being examined; the meaning of the returned int32_t
// (including any end-of-input sentinel) is not visible here.
42 int32_t nextByte(InputText
*det
);
// Takes the InputText to process and returns an int32_t whose scale is not
// visible in this header -- TODO confirm against the implementation.
45 int32_t parse(InputText
*det
);
// Abstract base class for the recognizers declared below, derived from
// CharsetRecognizer. getName() and match() are pure virtual here, so every
// concrete subclass must supply them.
// NOTE(review): braces, access specifiers and possibly other members are
// missing from this view (the leading integers look like extracted line
// numbers) -- confirm against the real header.
49 class CharsetRecog_sbcs
: public CharsetRecognizer
57 virtual ~CharsetRecog_sbcs();
// Pure virtual: returns the charset name as a C string.
59 virtual const char *getName() const = 0;
// Pure virtual: takes the InputText to examine; the scale of the int32_t
// result is not visible in this header.
61 virtual int32_t match(InputText
*det
) = 0;
// Non-virtual helper taking the InputText plus an n-gram table and a
// character map table. Presumably the shared matching routine the
// subclasses' match() implementations call -- TODO confirm.
63 int32_t match_sbcs(InputText
*det
, const int32_t ngrams
[], const uint8_t charMap
[]);
// Intermediate recognizer class derived from CharsetRecog_sbcs. In this view
// it declares only a virtual destructor and getName(); match() is declared
// by its language-specific subclasses (e.g. CharsetRecog_8859_1_en).
66 class CharsetRecog_8859_1
: public CharsetRecog_sbcs
69 virtual ~CharsetRecog_8859_1();
// Returns the charset name; the actual string is defined in the
// implementation file, not in this header.
71 const char *getName() const;
// Intermediate recognizer class derived from CharsetRecog_sbcs. In this view
// it declares only a virtual destructor and getName(); match() is declared
// by its language-specific subclasses (e.g. CharsetRecog_8859_2_cs).
74 class CharsetRecog_8859_2
: public CharsetRecog_sbcs
77 virtual ~CharsetRecog_8859_2();
// Returns the charset name; the actual string is defined in the
// implementation file, not in this header.
79 const char *getName() const;
// Intermediate recognizer class derived from CharsetRecog_sbcs. In this view
// it declares only a virtual destructor and getName(); match() is declared
// by its subclass CharsetRecog_8859_5_ru.
82 class CharsetRecog_8859_5
: public CharsetRecog_sbcs
85 virtual ~CharsetRecog_8859_5();
// Returns the charset name; the actual string is defined in the
// implementation file, not in this header.
87 const char *getName() const;
// Intermediate recognizer class derived from CharsetRecog_sbcs. In this view
// it declares only a virtual destructor and getName(); match() is declared
// by its subclass CharsetRecog_8859_6_ar.
90 class CharsetRecog_8859_6
: public CharsetRecog_sbcs
93 virtual ~CharsetRecog_8859_6();
// Returns the charset name; the actual string is defined in the
// implementation file, not in this header.
95 const char *getName() const;
// Intermediate recognizer class derived from CharsetRecog_sbcs. In this view
// it declares only a virtual destructor and getName(); match() is declared
// by its subclass CharsetRecog_8859_7_el.
98 class CharsetRecog_8859_7
: public CharsetRecog_sbcs
101 virtual ~CharsetRecog_8859_7();
// Returns the charset name; the actual string is defined in the
// implementation file, not in this header.
103 const char *getName() const;
// Intermediate recognizer class derived from CharsetRecog_sbcs. In this view
// it declares only a virtual destructor and getName(); match() is declared
// by its subclasses CharsetRecog_8859_8_I_he and CharsetRecog_8859_8_he.
106 class CharsetRecog_8859_8
: public CharsetRecog_sbcs
109 virtual ~CharsetRecog_8859_8();
// Returns the charset name. Unlike the sibling 8859_x classes this
// declaration repeats `virtual`; CharsetRecog_8859_8_I_he declares its own
// getName() as well.
111 virtual const char *getName() const;
// Intermediate recognizer class derived from CharsetRecog_sbcs. In this view
// it declares only a virtual destructor and getName(); match() is declared
// by its subclass CharsetRecog_8859_9_tr.
114 class CharsetRecog_8859_9
: public CharsetRecog_sbcs
117 virtual ~CharsetRecog_8859_9();
// Returns the charset name; the actual string is defined in the
// implementation file, not in this header.
119 const char *getName() const;
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_en" suffix presumably denotes English -- confirm
// against the getLanguage() definition.
122 class CharsetRecog_8859_1_en
: public CharsetRecog_8859_1
125 virtual ~CharsetRecog_8859_1_en();
// Returns a language identifier string defined in the implementation file.
127 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
129 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_da" suffix presumably denotes Danish -- confirm
// against the getLanguage() definition.
132 class CharsetRecog_8859_1_da
: public CharsetRecog_8859_1
135 virtual ~CharsetRecog_8859_1_da();
// Returns a language identifier string defined in the implementation file.
137 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
139 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_de" suffix presumably denotes German -- confirm
// against the getLanguage() definition.
142 class CharsetRecog_8859_1_de
: public CharsetRecog_8859_1
145 virtual ~CharsetRecog_8859_1_de();
// Returns a language identifier string defined in the implementation file.
147 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
149 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_es" suffix presumably denotes Spanish -- confirm
// against the getLanguage() definition.
152 class CharsetRecog_8859_1_es
: public CharsetRecog_8859_1
155 virtual ~CharsetRecog_8859_1_es();
// Returns a language identifier string defined in the implementation file.
157 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
159 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_fr" suffix presumably denotes French -- confirm
// against the getLanguage() definition.
162 class CharsetRecog_8859_1_fr
: public CharsetRecog_8859_1
165 virtual ~CharsetRecog_8859_1_fr();
// Returns a language identifier string defined in the implementation file.
167 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
169 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_it" suffix presumably denotes Italian -- confirm
// against the getLanguage() definition.
172 class CharsetRecog_8859_1_it
: public CharsetRecog_8859_1
175 virtual ~CharsetRecog_8859_1_it();
// Returns a language identifier string defined in the implementation file.
177 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
179 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_nl" suffix presumably denotes Dutch -- confirm
// against the getLanguage() definition.
182 class CharsetRecog_8859_1_nl
: public CharsetRecog_8859_1
185 virtual ~CharsetRecog_8859_1_nl();
// Returns a language identifier string defined in the implementation file.
187 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
189 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_no" suffix presumably denotes Norwegian -- confirm
// against the getLanguage() definition.
192 class CharsetRecog_8859_1_no
: public CharsetRecog_8859_1
195 virtual ~CharsetRecog_8859_1_no();
// Returns a language identifier string defined in the implementation file.
197 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
199 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_pt" suffix presumably denotes Portuguese -- confirm
// against the getLanguage() definition.
202 class CharsetRecog_8859_1_pt
: public CharsetRecog_8859_1
205 virtual ~CharsetRecog_8859_1_pt();
// Returns a language identifier string defined in the implementation file.
207 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
209 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_1; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_sv" suffix presumably denotes Swedish -- confirm
// against the getLanguage() definition.
212 class CharsetRecog_8859_1_sv
: public CharsetRecog_8859_1
215 virtual ~CharsetRecog_8859_1_sv();
// Returns a language identifier string defined in the implementation file.
217 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
219 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_2; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_cs" suffix presumably denotes Czech -- confirm
// against the getLanguage() definition.
222 class CharsetRecog_8859_2_cs
: public CharsetRecog_8859_2
225 virtual ~CharsetRecog_8859_2_cs();
// Returns a language identifier string defined in the implementation file.
227 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
229 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_2; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_hu" suffix presumably denotes Hungarian -- confirm
// against the getLanguage() definition.
232 class CharsetRecog_8859_2_hu
: public CharsetRecog_8859_2
235 virtual ~CharsetRecog_8859_2_hu();
// Returns a language identifier string defined in the implementation file.
237 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
239 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_2; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_pl" suffix presumably denotes Polish -- confirm
// against the getLanguage() definition.
242 class CharsetRecog_8859_2_pl
: public CharsetRecog_8859_2
245 virtual ~CharsetRecog_8859_2_pl();
// Returns a language identifier string defined in the implementation file.
247 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
249 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_2; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_ro" suffix presumably denotes Romanian -- confirm
// against the getLanguage() definition.
252 class CharsetRecog_8859_2_ro
: public CharsetRecog_8859_2
255 virtual ~CharsetRecog_8859_2_ro();
// Returns a language identifier string defined in the implementation file.
257 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
259 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_5; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_ru" suffix presumably denotes Russian -- confirm
// against the getLanguage() definition.
262 class CharsetRecog_8859_5_ru
: public CharsetRecog_8859_5
265 virtual ~CharsetRecog_8859_5_ru();
// Returns a language identifier string defined in the implementation file.
267 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
269 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_6; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_ar" suffix presumably denotes Arabic -- confirm
// against the getLanguage() definition.
272 class CharsetRecog_8859_6_ar
: public CharsetRecog_8859_6
275 virtual ~CharsetRecog_8859_6_ar();
// Returns a language identifier string defined in the implementation file.
277 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
279 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_7; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_el" suffix presumably denotes Greek -- confirm
// against the getLanguage() definition.
282 class CharsetRecog_8859_7_el
: public CharsetRecog_8859_7
285 virtual ~CharsetRecog_8859_7_el();
// Returns a language identifier string defined in the implementation file.
287 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
289 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_8. Unlike its sibling
// CharsetRecog_8859_8_he, it declares its own getName() in addition to
// getLanguage() and match().
// NOTE(review): the "_I_he" suffix presumably denotes the ISO-8859-8-I
// variant for Hebrew -- confirm against the getName()/getLanguage()
// definitions.
292 class CharsetRecog_8859_8_I_he
: public CharsetRecog_8859_8
295 virtual ~CharsetRecog_8859_8_I_he();
// Replaces the parent's virtual getName(); the string returned is defined in
// the implementation file.
297 const char *getName() const;
// Returns a language identifier string defined in the implementation file.
299 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
301 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_8; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_he" suffix presumably denotes Hebrew -- confirm
// against the getLanguage() definition.
304 class CharsetRecog_8859_8_he
: public CharsetRecog_8859_8
307 virtual ~CharsetRecog_8859_8_he ();
// Returns a language identifier string defined in the implementation file.
309 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
311 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_8859_9; getName() is not redeclared
// here, so the parent's is used.
// NOTE(review): the "_tr" suffix presumably denotes Turkish -- confirm
// against the getLanguage() definition.
314 class CharsetRecog_8859_9_tr
: public CharsetRecog_8859_9
317 virtual ~CharsetRecog_8859_9_tr ();
// Returns a language identifier string defined in the implementation file.
319 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
321 int32_t match(InputText
*textIn
);
// Recognizer derived directly from CharsetRecog_sbcs. It declares both of
// the base's pure virtuals (getName, match) plus getLanguage().
// NOTE(review): the name suggests the windows-1256 code page -- confirm
// against the getName() definition.
324 class CharsetRecog_windows_1256
: public CharsetRecog_sbcs
327 virtual ~CharsetRecog_windows_1256();
// Returns the charset name; the string is defined in the implementation file.
329 const char *getName() const;
// Returns a language identifier string defined in the implementation file.
331 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
333 int32_t match(InputText
*textIn
);
// Recognizer derived directly from CharsetRecog_sbcs. It declares both of
// the base's pure virtuals (getName, match) plus getLanguage().
// NOTE(review): the name suggests the windows-1251 code page -- confirm
// against the getName() definition.
336 class CharsetRecog_windows_1251
: public CharsetRecog_sbcs
339 virtual ~CharsetRecog_windows_1251();
// Returns the charset name; the string is defined in the implementation file.
341 const char *getName() const;
// Returns a language identifier string defined in the implementation file.
343 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
345 int32_t match(InputText
*textIn
);
// Recognizer derived directly from CharsetRecog_sbcs. It declares both of
// the base's pure virtuals (getName, match) plus getLanguage().
// NOTE(review): the name suggests the KOI8-R encoding -- confirm against
// the getName() definition.
349 class CharsetRecog_KOI8_R
: public CharsetRecog_sbcs
352 virtual ~CharsetRecog_KOI8_R();
// Returns the charset name; the string is defined in the implementation file.
354 const char *getName() const;
// Returns a language identifier string defined in the implementation file.
356 const char *getLanguage() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
358 int32_t match(InputText
*textIn
);
// Shared parent of CharsetRecog_IBM424_he_rtl and CharsetRecog_IBM424_he_ltr,
// derived from CharsetRecog_sbcs. In this view it declares only a virtual
// destructor and getLanguage(); getName() and match() are declared by the
// two subclasses.
// NOTE(review): the name suggests the IBM424 code page with Hebrew text --
// confirm against the implementation file.
361 class CharsetRecog_IBM424_he
: public CharsetRecog_sbcs
364 virtual ~CharsetRecog_IBM424_he();
// Returns a language identifier string defined in the implementation file.
366 const char *getLanguage() const;
// Recognizer derived from CharsetRecog_IBM424_he. It declares getName() and
// match(); getLanguage() is not redeclared here, so the parent's is used.
// NOTE(review): "_rtl" presumably means right-to-left byte order -- confirm
// against the implementation file.
369 class CharsetRecog_IBM424_he_rtl
: public CharsetRecog_IBM424_he
{
371 virtual ~CharsetRecog_IBM424_he_rtl();
// Returns the charset name; the string is defined in the implementation file.
373 const char *getName() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
375 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_IBM424_he. It declares getName() and
// match(); getLanguage() is not redeclared here, so the parent's is used.
// NOTE(review): "_ltr" presumably means left-to-right byte order -- confirm
// against the implementation file.
378 class CharsetRecog_IBM424_he_ltr
: public CharsetRecog_IBM424_he
{
379 virtual ~CharsetRecog_IBM424_he_ltr();
// Returns the charset name; the string is defined in the implementation file.
381 const char *getName() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
383 int32_t match(InputText
*textIn
);
// Shared parent of CharsetRecog_IBM420_ar_rtl and CharsetRecog_IBM420_ar_ltr,
// derived from CharsetRecog_sbcs. Besides getLanguage() it declares
// matchInit/matchFinish, two "prev_" fields and three byte-buffer helpers;
// getName() and match() are declared by the two subclasses.
// NOTE(review): braces and access specifiers are not visible in this view
// (the leading integers look like extracted line numbers) -- confirm member
// visibility against the real header.
386 class CharsetRecog_IBM420_ar
: public CharsetRecog_sbcs
389 virtual ~CharsetRecog_IBM420_ar();
// Returns a language identifier string defined in the implementation file.
391 const char *getLanguage() const;
// Both take the InputText under examination. Presumably matchInit() swaps in
// preprocessed bytes before matching and matchFinish() restores the
// originals afterwards -- TODO confirm.
394 void matchInit(InputText
*textIn
);
395 void matchFinish(InputText
*textIn
);
// Mutable byte pointer and a length. Presumably the InputText's original
// buffer and size, saved between matchInit() and matchFinish() -- TODO
// confirm; ownership is not visible here.
398 uint8_t *prev_fInputBytes
;
399 int32_t prev_fInputBytesLength
;
// Predicate on a single byte value; which byte values count as "Lam-Alef"
// is defined in the implementation file.
402 UBool
isLamAlef(uint8_t b
);
// Takes a read-only input buffer and its length, and returns a mutable
// uint8_t buffer. `length` is a non-const reference, so the callee can
// write to it (presumably the size of the returned buffer -- TODO confirm).
// Who frees the returned buffer is not visible in this header.
403 uint8_t *unshapeLamAlef(const uint8_t *inputBytes
, int32_t inputBytesLength
, int32_t &length
);
// Same parameter and return shape as unshapeLamAlef(); how the two relate
// is not visible in this header.
404 uint8_t *unshape(const uint8_t *inputBytes
, int32_t inputBytesLength
, int32_t &length
);
// Recognizer derived from CharsetRecog_IBM420_ar. It declares getName() and
// match(); getLanguage() is not redeclared here, so the parent's is used.
// NOTE(review): "_rtl" presumably means right-to-left byte order -- confirm
// against the implementation file.
407 class CharsetRecog_IBM420_ar_rtl
: public CharsetRecog_IBM420_ar
{
409 virtual ~CharsetRecog_IBM420_ar_rtl();
// Returns the charset name; the string is defined in the implementation file.
411 const char *getName() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
413 int32_t match(InputText
*textIn
);
// Recognizer derived from CharsetRecog_IBM420_ar. It declares getName() and
// match(); getLanguage() is not redeclared here, so the parent's is used.
// NOTE(review): "_ltr" presumably means left-to-right byte order -- confirm
// against the implementation file.
416 class CharsetRecog_IBM420_ar_ltr
: public CharsetRecog_IBM420_ar
{
417 virtual ~CharsetRecog_IBM420_ar_ltr();
// Returns the charset name; the string is defined in the implementation file.
419 const char *getName() const;
// Same signature as the pure-virtual CharsetRecog_sbcs::match(InputText*).
// The scale of the int32_t result is not visible in this header.
421 int32_t match(InputText
*textIn
);
427 #endif /* __CSRSBCS_H */