]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ********************************************************************** | |
3 | * Copyright (C) 2005-2008, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #ifndef __CSRMBCS_H | |
9 | #define __CSRMBCS_H | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_CONVERSION | |
14 | ||
15 | #include "csrecog.h" | |
16 | ||
17 | U_NAMESPACE_BEGIN | |
18 | ||
19 | // "Character" iterated character class. | |
20 | // Recognizers for specific mbcs encodings make their "characters" available | |
21 | // by providing a nextChar() function that fills in an instance of IteratedChar | |
22 | // with the next char from the input. | |
23 | // The returned characters are not converted to Unicode, but remain as the raw | |
24 | // bytes (concatenated into an int) from the codepage data. | |
25 | // | |
26 | // For Asian charsets, use the raw input rather than the input that has been | |
27 | // stripped of markup. Detection only considers multi-byte chars, effectively | |
28 | // stripping markup anyway, and double byte chars do occur in markup too. | |
29 | // | |
30 | class IteratedChar : public UMemory | |
31 | { | |
32 | public: | |
33 | uint32_t charValue; // 1-4 bytes from the raw input data | |
34 | int32_t index; | |
35 | int32_t nextIndex; | |
36 | UBool error; | |
37 | UBool done; | |
38 | ||
39 | public: | |
40 | IteratedChar(); | |
41 | //void reset(); | |
42 | int32_t nextByte(InputText* det); | |
43 | }; | |
44 | ||
45 | ||
46 | class CharsetRecog_mbcs : public CharsetRecognizer { | |
47 | ||
48 | protected: | |
49 | /** | |
50 | * Test the match of this charset with the input text data | |
51 | * which is obtained via the CharsetDetector object. | |
52 | * | |
53 | * @param det The CharsetDetector, which contains the input text | |
54 | * to be checked for being in this charset. | |
55 | * @return Two values packed into one int (Damn java, anyhow) | |
56 | * <br/> | |
57 | * bits 0-7: the match confidence, ranging from 0-100 | |
58 | * <br/> | |
59 | * bits 8-15: The match reason, an enum-like value. | |
60 | */ | |
61 | int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen); | |
62 | ||
63 | public: | |
64 | ||
65 | virtual ~CharsetRecog_mbcs(); | |
66 | ||
67 | /** | |
68 | * Get the IANA name of this charset. | |
69 | * @return the charset name. | |
70 | */ | |
71 | ||
72 | const char *getName() const = 0; | |
73 | const char *getLanguage() const = 0; | |
74 | int32_t match(InputText* det) = 0; | |
75 | ||
76 | /** | |
77 | * Get the next character (however many bytes it is) from the input data | |
78 | * Subclasses for specific charset encodings must implement this function | |
79 | * to get characters according to the rules of their encoding scheme. | |
80 | * | |
81 | * This function is not a method of class IteratedChar only because | |
82 | * that would require a lot of extra derived classes, which is awkward. | |
83 | * @param it The IteratedChar "struct" into which the returned char is placed. | |
84 | * @param det The charset detector, which is needed to get at the input byte data | |
85 | * being iterated over. | |
86 | * @return True if a character was returned, false at end of input. | |
87 | */ | |
88 | virtual UBool nextChar(IteratedChar *it, InputText *textIn) = 0; | |
89 | ||
90 | }; | |
91 | ||
92 | ||
93 | /** | |
94 | * Shift-JIS charset recognizer. | |
95 | * | |
96 | */ | |
97 | class CharsetRecog_sjis : public CharsetRecog_mbcs { | |
98 | public: | |
99 | virtual ~CharsetRecog_sjis(); | |
100 | ||
101 | UBool nextChar(IteratedChar *it, InputText *det); | |
102 | ||
103 | int32_t match(InputText *det); | |
104 | ||
105 | const char *getName() const; | |
106 | const char *getLanguage() const; | |
107 | ||
108 | }; | |
109 | ||
110 | ||
111 | /** | |
112 | * EUC charset recognizers. One abstract class that provides the common function | |
113 | * for getting the next character according to the EUC encoding scheme, | |
114 | * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. | |
115 | * | |
116 | */ | |
117 | class CharsetRecog_euc : public CharsetRecog_mbcs | |
118 | { | |
119 | public: | |
120 | virtual ~CharsetRecog_euc(); | |
121 | ||
122 | const char *getName() const = 0; | |
123 | const char *getLanguage() const = 0; | |
124 | ||
125 | int32_t match(InputText* det) = 0; | |
126 | /* | |
127 | * (non-Javadoc) | |
128 | * Get the next character value for EUC based encodings. | |
129 | * Character "value" is simply the raw bytes that make up the character | |
130 | * packed into an int. | |
131 | */ | |
132 | UBool nextChar(IteratedChar *it, InputText *det); | |
133 | }; | |
134 | ||
135 | /** | |
136 | * The charset recognize for EUC-JP. A singleton instance of this class | |
137 | * is created and kept by the public CharsetDetector class | |
138 | */ | |
139 | class CharsetRecog_euc_jp : public CharsetRecog_euc | |
140 | { | |
141 | public: | |
142 | virtual ~CharsetRecog_euc_jp(); | |
143 | ||
144 | const char *getName() const; | |
145 | const char *getLanguage() const; | |
146 | ||
147 | int32_t match(InputText *det); | |
148 | }; | |
149 | ||
150 | /** | |
151 | * The charset recognize for EUC-KR. A singleton instance of this class | |
152 | * is created and kept by the public CharsetDetector class | |
153 | */ | |
154 | class CharsetRecog_euc_kr : public CharsetRecog_euc | |
155 | { | |
156 | public: | |
157 | virtual ~CharsetRecog_euc_kr(); | |
158 | ||
159 | const char *getName() const; | |
160 | const char *getLanguage() const; | |
161 | ||
162 | int32_t match(InputText *det); | |
163 | }; | |
164 | ||
165 | /** | |
166 | * | |
167 | * Big5 charset recognizer. | |
168 | * | |
169 | */ | |
170 | class CharsetRecog_big5 : public CharsetRecog_mbcs | |
171 | { | |
172 | public: | |
173 | virtual ~CharsetRecog_big5(); | |
174 | ||
175 | UBool nextChar(IteratedChar* it, InputText* det); | |
176 | ||
177 | const char *getName() const; | |
178 | const char *getLanguage() const; | |
179 | ||
180 | int32_t match(InputText *det); | |
181 | }; | |
182 | ||
183 | ||
184 | /** | |
185 | * | |
186 | * GB-18030 recognizer. Uses simplified Chinese statistics. | |
187 | * | |
188 | */ | |
189 | class CharsetRecog_gb_18030 : public CharsetRecog_mbcs | |
190 | { | |
191 | public: | |
192 | virtual ~CharsetRecog_gb_18030(); | |
193 | ||
194 | UBool nextChar(IteratedChar* it, InputText* det); | |
195 | ||
196 | const char *getName() const; | |
197 | const char *getLanguage() const; | |
198 | ||
199 | int32_t match(InputText *det); | |
200 | }; | |
201 | ||
202 | U_NAMESPACE_END | |
203 | ||
204 | #endif | |
205 | #endif /* __CSRMBCS_H */ |