]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ********************************************************************** | |
b331163b | 3 | * Copyright (C) 2005-2015, International Business Machines |
73c04bcf A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #ifndef __CSRSBCS_H | |
9 | #define __CSRSBCS_H | |
10 | ||
11 | #include "unicode/uobject.h" | |
12 | ||
13 | #if !UCONFIG_NO_CONVERSION | |
14 | ||
15 | #include "csrecog.h" | |
16 | ||
17 | U_NAMESPACE_BEGIN | |
18 | ||
19 | class NGramParser : public UMemory | |
20 | { | |
21 | private: | |
73c04bcf | 22 | int32_t ngram; |
57a6839d | 23 | const int32_t *ngramList; |
73c04bcf A |
24 | |
25 | int32_t ngramCount; | |
26 | int32_t hitCount; | |
27 | ||
57a6839d A |
28 | protected: |
29 | int32_t byteIndex; | |
30 | const uint8_t *charMap; | |
31 | ||
32 | void addByte(int32_t b); | |
33 | ||
73c04bcf A |
34 | public: |
35 | NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); | |
b331163b | 36 | virtual ~NGramParser(); |
73c04bcf A |
37 | |
38 | private: | |
39 | /* | |
40 | * Binary search for value in table, which must have exactly 64 entries. | |
41 | */ | |
42 | int32_t search(const int32_t *table, int32_t value); | |
43 | ||
44 | void lookup(int32_t thisNgram); | |
57a6839d A |
45 | |
46 | virtual int32_t nextByte(InputText *det); | |
47 | virtual void parseCharacters(InputText *det); | |
73c04bcf A |
48 | |
49 | public: | |
50 | int32_t parse(InputText *det); | |
51 | ||
52 | }; | |
53 | ||
b331163b | 54 | #if !UCONFIG_ONLY_HTML_CONVERSION |
57a6839d A |
55 | class NGramParser_IBM420 : public NGramParser |
56 | { | |
57 | private: | |
58 | int32_t alef; | |
59 | int32_t isLamAlef(int32_t b); | |
60 | int32_t nextByte(InputText *det); | |
61 | void parseCharacters(InputText *det); | |
62 | ||
63 | public: | |
64 | NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); | |
65 | }; | |
b331163b | 66 | #endif |
57a6839d | 67 | |
51004dcb | 68 | |
73c04bcf A |
69 | class CharsetRecog_sbcs : public CharsetRecognizer |
70 | { | |
73c04bcf A |
71 | public: |
72 | CharsetRecog_sbcs(); | |
73c04bcf | 73 | virtual ~CharsetRecog_sbcs(); |
73c04bcf | 74 | virtual const char *getName() const = 0; |
51004dcb A |
75 | virtual UBool match(InputText *det, CharsetMatch *results) const = 0; |
76 | virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; | |
73c04bcf A |
77 | }; |
78 | ||
79 | class CharsetRecog_8859_1 : public CharsetRecog_sbcs | |
80 | { | |
81 | public: | |
82 | virtual ~CharsetRecog_8859_1(); | |
73c04bcf | 83 | const char *getName() const; |
51004dcb | 84 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
85 | }; |
86 | ||
87 | class CharsetRecog_8859_2 : public CharsetRecog_sbcs | |
88 | { | |
89 | public: | |
90 | virtual ~CharsetRecog_8859_2(); | |
73c04bcf | 91 | const char *getName() const; |
51004dcb | 92 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
93 | }; |
94 | ||
95 | class CharsetRecog_8859_5 : public CharsetRecog_sbcs | |
96 | { | |
97 | public: | |
98 | virtual ~CharsetRecog_8859_5(); | |
73c04bcf A |
99 | const char *getName() const; |
100 | }; | |
101 | ||
102 | class CharsetRecog_8859_6 : public CharsetRecog_sbcs | |
103 | { | |
104 | public: | |
105 | virtual ~CharsetRecog_8859_6(); | |
106 | ||
107 | const char *getName() const; | |
108 | }; | |
109 | ||
110 | class CharsetRecog_8859_7 : public CharsetRecog_sbcs | |
111 | { | |
112 | public: | |
113 | virtual ~CharsetRecog_8859_7(); | |
114 | ||
115 | const char *getName() const; | |
116 | }; | |
117 | ||
118 | class CharsetRecog_8859_8 : public CharsetRecog_sbcs | |
119 | { | |
120 | public: | |
121 | virtual ~CharsetRecog_8859_8(); | |
122 | ||
123 | virtual const char *getName() const; | |
124 | }; | |
125 | ||
126 | class CharsetRecog_8859_9 : public CharsetRecog_sbcs | |
127 | { | |
128 | public: | |
129 | virtual ~CharsetRecog_8859_9(); | |
130 | ||
131 | const char *getName() const; | |
132 | }; | |
133 | ||
73c04bcf | 134 | |
73c04bcf A |
135 | |
136 | class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 | |
137 | { | |
138 | public: | |
139 | virtual ~CharsetRecog_8859_5_ru(); | |
140 | ||
141 | const char *getLanguage() const; | |
142 | ||
51004dcb | 143 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
144 | }; |
145 | ||
146 | class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 | |
147 | { | |
148 | public: | |
149 | virtual ~CharsetRecog_8859_6_ar(); | |
150 | ||
151 | const char *getLanguage() const; | |
152 | ||
51004dcb | 153 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
154 | }; |
155 | ||
156 | class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 | |
157 | { | |
158 | public: | |
159 | virtual ~CharsetRecog_8859_7_el(); | |
160 | ||
161 | const char *getLanguage() const; | |
162 | ||
51004dcb | 163 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
164 | }; |
165 | ||
166 | class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 | |
167 | { | |
168 | public: | |
169 | virtual ~CharsetRecog_8859_8_I_he(); | |
170 | ||
171 | const char *getName() const; | |
172 | ||
173 | const char *getLanguage() const; | |
174 | ||
51004dcb | 175 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
176 | }; |
177 | ||
178 | class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 | |
179 | { | |
180 | public: | |
181 | virtual ~CharsetRecog_8859_8_he (); | |
182 | ||
183 | const char *getLanguage() const; | |
184 | ||
51004dcb | 185 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
186 | }; |
187 | ||
188 | class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 | |
189 | { | |
190 | public: | |
191 | virtual ~CharsetRecog_8859_9_tr (); | |
192 | ||
193 | const char *getLanguage() const; | |
194 | ||
51004dcb | 195 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
196 | }; |
197 | ||
198 | class CharsetRecog_windows_1256 : public CharsetRecog_sbcs | |
199 | { | |
200 | public: | |
201 | virtual ~CharsetRecog_windows_1256(); | |
202 | ||
203 | const char *getName() const; | |
204 | ||
205 | const char *getLanguage() const; | |
206 | ||
51004dcb | 207 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
208 | }; |
209 | ||
210 | class CharsetRecog_windows_1251 : public CharsetRecog_sbcs | |
211 | { | |
212 | public: | |
213 | virtual ~CharsetRecog_windows_1251(); | |
214 | ||
215 | const char *getName() const; | |
216 | ||
217 | const char *getLanguage() const; | |
218 | ||
51004dcb | 219 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
220 | }; |
221 | ||
222 | ||
223 | class CharsetRecog_KOI8_R : public CharsetRecog_sbcs | |
224 | { | |
225 | public: | |
226 | virtual ~CharsetRecog_KOI8_R(); | |
227 | ||
228 | const char *getName() const; | |
229 | ||
230 | const char *getLanguage() const; | |
231 | ||
51004dcb | 232 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
73c04bcf A |
233 | }; |
234 | ||
b331163b | 235 | #if !UCONFIG_ONLY_HTML_CONVERSION |
729e4ab9 A |
236 | class CharsetRecog_IBM424_he : public CharsetRecog_sbcs |
237 | { | |
238 | public: | |
239 | virtual ~CharsetRecog_IBM424_he(); | |
240 | ||
241 | const char *getLanguage() const; | |
242 | }; | |
243 | ||
244 | class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { | |
245 | public: | |
246 | virtual ~CharsetRecog_IBM424_he_rtl(); | |
247 | ||
248 | const char *getName() const; | |
249 | ||
51004dcb | 250 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
729e4ab9 A |
251 | }; |
252 | ||
253 | class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { | |
254 | virtual ~CharsetRecog_IBM424_he_ltr(); | |
255 | ||
256 | const char *getName() const; | |
257 | ||
51004dcb | 258 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
729e4ab9 A |
259 | }; |
260 | ||
261 | class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs | |
262 | { | |
263 | public: | |
264 | virtual ~CharsetRecog_IBM420_ar(); | |
265 | ||
266 | const char *getLanguage() const; | |
57a6839d | 267 | int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; |
729e4ab9 | 268 | |
729e4ab9 A |
269 | }; |
270 | ||
271 | class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { | |
272 | public: | |
273 | virtual ~CharsetRecog_IBM420_ar_rtl(); | |
274 | ||
275 | const char *getName() const; | |
276 | ||
51004dcb | 277 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
729e4ab9 A |
278 | }; |
279 | ||
280 | class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { | |
281 | virtual ~CharsetRecog_IBM420_ar_ltr(); | |
282 | ||
283 | const char *getName() const; | |
284 | ||
51004dcb | 285 | virtual UBool match(InputText *det, CharsetMatch *results) const; |
729e4ab9 | 286 | }; |
b331163b | 287 | #endif |
729e4ab9 | 288 | |
73c04bcf A |
289 | U_NAMESPACE_END |
290 | ||
51004dcb | 291 | #endif /* !UCONFIG_NO_CONVERSION */ |
73c04bcf | 292 | #endif /* __CSRSBCS_H */ |