1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2005-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
13 #include "unicode/uobject.h"
15 #if !UCONFIG_NO_CONVERSION
// NGramParser: helper class derived from UMemory. Only declarations are
// visible in this header; behavior lives in the implementation file.
21 class NGramParser
: public UMemory
// Pointer to a read-only table of int32_t n-gram values (presumably the
// table handed to the constructor as theNgramList -- confirm in the .cpp).
25 const int32_t *ngramList
;
// Pointer to a read-only byte table (presumably the map handed to the
// constructor as theCharMap -- confirm in the .cpp).
32 const uint8_t *charMap
;
// Takes a single int32_t value b; judging by the name it feeds one byte into
// the parser state -- TODO confirm against the definition.
34 void addByte(int32_t b
);
// Constructs a parser from an n-gram table and a character map, both passed
// as pointers to const data.
37 NGramParser(const int32_t *theNgramList
, const uint8_t *theCharMap
);
// Virtual destructor: the class is meant to be subclassed (see the virtual
// nextByte()/parseCharacters() declarations below).
38 virtual ~NGramParser();
42 * Binary search for value in table, which must have exactly 64 entries.
44 int32_t search(const int32_t *table
, int32_t value
);
// Takes one n-gram value; presumably checks it against ngramList -- confirm
// in the implementation.
46 void lookup(int32_t thisNgram
);
// Virtual hooks that receive the InputText being examined. Subclasses may
// override them to change how input is consumed.
48 virtual int32_t nextByte(InputText
*det
);
49 virtual void parseCharacters(InputText
*det
);
// Non-virtual entry point taking the InputText; returns an int32_t whose
// meaning is not visible in this header.
52 int32_t parse(InputText
*det
);
56 #if !UCONFIG_ONLY_HTML_CONVERSION
// NGramParser_IBM420: NGramParser subclass. It redeclares nextByte() and
// parseCharacters() with the same signatures as the virtual functions in
// NGramParser, so these declarations override the base versions.
57 class NGramParser_IBM420
: public NGramParser
// Same parameter list as the NGramParser constructor.
60 NGramParser_IBM420(const int32_t *theNgramList
, const uint8_t *theCharMap
);
61 ~NGramParser_IBM420();
// Takes one int32_t value b and returns an int32_t; judging by the name it
// tests for a lam-alef byte -- TODO confirm the return convention in the .cpp.
65 int32_t isLamAlef(int32_t b
);
// Overrides of the NGramParser virtual hooks.
66 int32_t nextByte(InputText
*det
);
67 void parseCharacters(InputText
*det
);
// CharsetRecog_sbcs: abstract base (derived from CharsetRecognizer) for the
// recognizer classes declared in the rest of this header. getName() and
// match() are pure virtual here, so subclasses must supply them.
72 class CharsetRecog_sbcs
: public CharsetRecognizer
76 virtual ~CharsetRecog_sbcs();
// Pure virtual: returns the recognizer's name as a C string.
77 virtual const char *getName() const = 0;
// Pure virtual: examines the input text `det` and reports through `results`;
// returns a UBool (exact success semantics are defined by the implementers).
78 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const = 0;
// Virtual helper taking the input text plus an n-gram table and a character
// map as arrays; returns an int32_t (presumably a confidence score -- confirm
// in the implementation).
79 virtual int32_t match_sbcs(InputText
*det
, const int32_t ngrams
[], const uint8_t charMap
[]) const;
// CharsetRecog_8859_1: recognizer derived from CharsetRecog_sbcs. Declares
// getName() and match(), the two members that are pure virtual in the base.
// Presumably targets ISO-8859-1 -- confirm against the implementation.
82 class CharsetRecog_8859_1
: public CharsetRecog_sbcs
85 virtual ~CharsetRecog_8859_1();
86 const char *getName() const;
87 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_8859_2: recognizer derived from CharsetRecog_sbcs. Declares
// getName() and match(), the two members that are pure virtual in the base.
// Presumably targets ISO-8859-2 -- confirm against the implementation.
90 class CharsetRecog_8859_2
: public CharsetRecog_sbcs
93 virtual ~CharsetRecog_8859_2();
94 const char *getName() const;
95 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_8859_5: intermediate class derived from CharsetRecog_sbcs.
// Only a destructor and getName() are declared in the visible lines; match()
// is not redeclared here (CharsetRecog_8859_5_ru, which derives from this
// class, declares it).
98 class CharsetRecog_8859_5
: public CharsetRecog_sbcs
101 virtual ~CharsetRecog_8859_5();
102 const char *getName() const;
// CharsetRecog_8859_6: intermediate class derived from CharsetRecog_sbcs.
// Only a destructor and getName() are declared in the visible lines; match()
// is not redeclared here (CharsetRecog_8859_6_ar, which derives from this
// class, declares it).
105 class CharsetRecog_8859_6
: public CharsetRecog_sbcs
108 virtual ~CharsetRecog_8859_6();
110 const char *getName() const;
// CharsetRecog_8859_7: intermediate class derived from CharsetRecog_sbcs.
// Only a destructor and getName() are declared in the visible lines; match()
// is not redeclared here (CharsetRecog_8859_7_el, which derives from this
// class, declares it).
113 class CharsetRecog_8859_7
: public CharsetRecog_sbcs
116 virtual ~CharsetRecog_8859_7();
118 const char *getName() const;
// CharsetRecog_8859_8: intermediate class derived from CharsetRecog_sbcs.
// Only a destructor and getName() are declared in the visible lines; match()
// is not redeclared here. Two classes in this header derive from it:
// CharsetRecog_8859_8_I_he (which redeclares getName()) and
// CharsetRecog_8859_8_he.
121 class CharsetRecog_8859_8
: public CharsetRecog_sbcs
124 virtual ~CharsetRecog_8859_8();
126 virtual const char *getName() const;
// CharsetRecog_8859_9: intermediate class derived from CharsetRecog_sbcs.
// Only a destructor and getName() are declared in the visible lines; match()
// is not redeclared here (CharsetRecog_8859_9_tr, which derives from this
// class, declares it).
129 class CharsetRecog_8859_9
: public CharsetRecog_sbcs
132 virtual ~CharsetRecog_8859_9();
134 const char *getName() const;
// CharsetRecog_8859_5_ru: derives from CharsetRecog_8859_5 and adds
// getLanguage() and match(). The "_ru" suffix suggests a Russian-language
// recognizer -- confirm the returned language string in the implementation.
139 class CharsetRecog_8859_5_ru
: public CharsetRecog_8859_5
142 virtual ~CharsetRecog_8859_5_ru();
144 const char *getLanguage() const;
146 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_8859_6_ar: derives from CharsetRecog_8859_6 and adds
// getLanguage() and match(). The "_ar" suffix suggests an Arabic-language
// recognizer -- confirm the returned language string in the implementation.
149 class CharsetRecog_8859_6_ar
: public CharsetRecog_8859_6
152 virtual ~CharsetRecog_8859_6_ar();
154 const char *getLanguage() const;
156 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_8859_7_el: derives from CharsetRecog_8859_7 and adds
// getLanguage() and match(). The "_el" suffix suggests a Greek-language
// recognizer -- confirm the returned language string in the implementation.
159 class CharsetRecog_8859_7_el
: public CharsetRecog_8859_7
162 virtual ~CharsetRecog_8859_7_el();
164 const char *getLanguage() const;
166 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_8859_8_I_he: derives from CharsetRecog_8859_8. Unlike the
// sibling CharsetRecog_8859_8_he it also redeclares getName(), overriding the
// virtual getName() of its base, in addition to getLanguage() and match().
// The "_he" suffix suggests Hebrew -- confirm in the implementation.
169 class CharsetRecog_8859_8_I_he
: public CharsetRecog_8859_8
172 virtual ~CharsetRecog_8859_8_I_he();
174 const char *getName() const;
176 const char *getLanguage() const;
178 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_8859_8_he: derives from CharsetRecog_8859_8 and adds
// getLanguage() and match(); getName() is not redeclared here. The "_he"
// suffix suggests Hebrew -- confirm in the implementation.
181 class CharsetRecog_8859_8_he
: public CharsetRecog_8859_8
184 virtual ~CharsetRecog_8859_8_he ();
186 const char *getLanguage() const;
188 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_8859_9_tr: derives from CharsetRecog_8859_9 and adds
// getLanguage() and match(). The "_tr" suffix suggests a Turkish-language
// recognizer -- confirm the returned language string in the implementation.
191 class CharsetRecog_8859_9_tr
: public CharsetRecog_8859_9
194 virtual ~CharsetRecog_8859_9_tr ();
196 const char *getLanguage() const;
198 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_windows_1256: derives directly from CharsetRecog_sbcs and
// declares getName(), getLanguage() and match(). Presumably targets the
// windows-1256 code page -- confirm name and language in the implementation.
201 class CharsetRecog_windows_1256
: public CharsetRecog_sbcs
204 virtual ~CharsetRecog_windows_1256();
206 const char *getName() const;
208 const char *getLanguage() const;
210 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_windows_1251: derives directly from CharsetRecog_sbcs and
// declares getName(), getLanguage() and match(). Presumably targets the
// windows-1251 code page -- confirm name and language in the implementation.
213 class CharsetRecog_windows_1251
: public CharsetRecog_sbcs
216 virtual ~CharsetRecog_windows_1251();
218 const char *getName() const;
220 const char *getLanguage() const;
222 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_KOI8_R: derives directly from CharsetRecog_sbcs and declares
// getName(), getLanguage() and match(). Presumably targets the KOI8-R
// encoding -- confirm name and language in the implementation.
226 class CharsetRecog_KOI8_R
: public CharsetRecog_sbcs
229 virtual ~CharsetRecog_KOI8_R();
231 const char *getName() const;
233 const char *getLanguage() const;
235 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
238 #if !UCONFIG_ONLY_HTML_CONVERSION
// CharsetRecog_IBM424_he: intermediate class derived from CharsetRecog_sbcs,
// declared only when UCONFIG_ONLY_HTML_CONVERSION is false. It declares a
// destructor and getLanguage(); getName() and match() are declared by the
// _rtl and _ltr subclasses that follow in this header.
239 class CharsetRecog_IBM424_he
: public CharsetRecog_sbcs
242 virtual ~CharsetRecog_IBM424_he();
244 const char *getLanguage() const;
// CharsetRecog_IBM424_he_rtl: derives from CharsetRecog_IBM424_he and
// declares getName() and match(). The "_rtl" suffix suggests the
// right-to-left variant -- confirm against the implementation.
247 class CharsetRecog_IBM424_he_rtl
: public CharsetRecog_IBM424_he
{
249 virtual ~CharsetRecog_IBM424_he_rtl();
251 const char *getName() const;
253 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_IBM424_he_ltr: derives from CharsetRecog_IBM424_he and
// declares getName() and match(). The "_ltr" suffix suggests the
// left-to-right variant -- confirm against the implementation.
256 class CharsetRecog_IBM424_he_ltr
: public CharsetRecog_IBM424_he
{
257 virtual ~CharsetRecog_IBM424_he_ltr();
259 const char *getName() const;
261 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_IBM420_ar: intermediate class derived from CharsetRecog_sbcs.
// Besides getLanguage() it redeclares match_sbcs() with the same parameter
// list and const qualifier as the virtual CharsetRecog_sbcs::match_sbcs(), so
// this declaration overrides the base version even without the `virtual`
// keyword. getName() and match() are declared by the _rtl/_ltr subclasses.
264 class CharsetRecog_IBM420_ar
: public CharsetRecog_sbcs
267 virtual ~CharsetRecog_IBM420_ar();
269 const char *getLanguage() const;
// Override of CharsetRecog_sbcs::match_sbcs(); presumably uses
// NGramParser_IBM420 instead of the plain parser -- confirm in the .cpp.
270 int32_t match_sbcs(InputText
*det
, const int32_t ngrams
[], const uint8_t charMap
[]) const;
// CharsetRecog_IBM420_ar_rtl: derives from CharsetRecog_IBM420_ar and
// declares getName() and match(). The "_rtl" suffix suggests the
// right-to-left variant -- confirm against the implementation.
274 class CharsetRecog_IBM420_ar_rtl
: public CharsetRecog_IBM420_ar
{
276 virtual ~CharsetRecog_IBM420_ar_rtl();
278 const char *getName() const;
280 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
// CharsetRecog_IBM420_ar_ltr: derives from CharsetRecog_IBM420_ar and
// declares getName() and match(). The "_ltr" suffix suggests the
// left-to-right variant -- confirm against the implementation.
283 class CharsetRecog_IBM420_ar_ltr
: public CharsetRecog_IBM420_ar
{
284 virtual ~CharsetRecog_IBM420_ar_ltr();
286 const char *getName() const;
288 virtual UBool
match(InputText
*det
, CharsetMatch
*results
) const;
294 #endif /* !UCONFIG_NO_CONVERSION */
295 #endif /* __CSRSBCS_H */