]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
d5d484b0 | 3 | * Copyright (C) 2000-2006,2008 International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * file name: ucnv2022.c | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2000feb03 | |
12 | * created by: Markus W. Scherer | |
13 | * | |
14 | * Change history: | |
15 | * | |
16 | * 06/29/2000 helena Major rewrite of the callback APIs. | |
17 | * 08/08/2000 Ram Included support for ISO-2022-JP-2 | |
18 | * Changed implementation of toUnicode | |
19 | * function | |
20 | * 08/21/2000 Ram Added support for ISO-2022-KR | |
21 | * 08/29/2000 Ram Separated implementation of EBCDIC to | |
22 | * ucnvebdc.c | |
23 | * 09/20/2000 Ram Added support for ISO-2022-CN | |
24 | * Added implementations for getNextUChar() | |
25 | * for specific 2022 country variants. | |
26 | * 10/31/2000 Ram Implemented offsets logic functions | |
27 | */ | |
28 | ||
29 | #include "unicode/utypes.h" | |
30 | ||
374ca955 | 31 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
b75a7d8f A |
32 | |
33 | #include "unicode/ucnv.h" | |
34 | #include "unicode/uset.h" | |
35 | #include "unicode/ucnv_err.h" | |
36 | #include "unicode/ucnv_cb.h" | |
374ca955 | 37 | #include "ucnv_imp.h" |
b75a7d8f A |
38 | #include "ucnv_bld.h" |
39 | #include "ucnv_cnv.h" | |
40 | #include "ucnvmbcs.h" | |
41 | #include "cstring.h" | |
42 | #include "cmemory.h" | |
43 | ||
374ca955 A |
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression wherever it is used (e.g. as an operand of sizeof or of
 * a postfix operator).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45 | ||
46 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
47 | /* | |
48 | * I am disabling the generic ISO-2022 converter after proposing to do so on | |
49 | * the icu mailing list two days ago. | |
50 | * | |
51 | * Reasons: | |
52 | * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of | |
53 | * its designation sequences, single shifts with return to the previous state, | |
54 | * switch-with-no-return to UTF-16BE or similar, etc. | |
55 | * This is unlike the language-specific variants like ISO-2022-JP which | |
56 | * require a much smaller repertoire of ISO-2022 features. | |
57 | * These variants continue to be supported. | |
58 | * 2. I believe that no one is really using the generic ISO-2022 converter | |
59 | * but rather always one of the language-specific variants. | |
60 | * Note that ICU's generic ISO-2022 converter has always output one escape | |
61 | * sequence followed by UTF-8 for the whole stream. | |
62 | * 3. Switching between subcharsets is extremely slow, because each time | |
63 | * the previous converter is closed and a new one opened, | |
64 | * without any kind of caching, least-recently-used list, etc. | |
65 | * 4. The code is currently buggy, and given the above it does not seem | |
66 | * reasonable to spend the time on maintenance. | |
67 | * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. | |
68 | * This means, for example, that when ISO-8859-7 is designated, the following | |
69 | * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. | |
70 | * The ICU ISO-2022 converter does not handle this - and has no information | |
71 | * about which subconverter would have to be shifted vs. which is designed | |
72 | * for 7-bit ISO-2022. | |
73 | * | |
74 | * Markus Scherer 2003-dec-03 | |
75 | */ | |
76 | #endif | |
77 | ||
78 | static const char SHIFT_IN_STR[] = "\x0F"; | |
79 | static const char SHIFT_OUT_STR[] = "\x0E"; | |
b75a7d8f A |
80 | |
81 | #define CR 0x0D | |
82 | #define LF 0x0A | |
83 | #define H_TAB 0x09 | |
84 | #define V_TAB 0x0B | |
85 | #define SPACE 0x20 | |
86 | ||
73c04bcf A |
87 | /* |
88 | * ISO 2022 control codes must not be converted from Unicode | |
89 | * because they would mess up the byte stream. | |
90 | * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b | |
91 | * corresponding to SO, SI, and ESC. | |
92 | */ | |
/* Non-zero iff c is SO (0x0e), SI (0x0f) or ESC (0x1b): test bit c of the mask. Evaluates c twice. */
#define IS_2022_CONTROL(c) (((c)<0x20) && ((((uint32_t)0x0800c000)>>(c))&(uint32_t)1)!=0)
94 | ||
374ca955 | 95 | /* for ISO-2022-JP and -CN implementations */ |
/*
 * Charset / state identifiers used by the ISO-2022-JP and ISO-2022-CN code.
 * The JP group (ISO8859_1..HWKANA_7BIT) and the CN group (GB2312_1..CNS_11643)
 * deliberately reuse the small values 1..3: a given converter only ever uses
 * one of the two groups. SS3_STATE is implicitly 0x11 and
 * CNS_11643_1..CNS_11643_7 are implicitly 0x21..0x27.
 */
b75a7d8f | 96 | typedef enum {
374ca955 A |
97 | /* shared values */ |
98 | INVALID_STATE=-1, | |
b75a7d8f | 99 | ASCII = 0,
374ca955 A |
100 | |
101 | SS2_STATE=0x10, | |
102 | SS3_STATE, | |
103 | ||
104 | /* JP */ | |
b75a7d8f A |
105 | ISO8859_1 = 1 , |
106 | ISO8859_7 = 2 , | |
107 | JISX201 = 3, | |
108 | JISX208 = 4, | |
109 | JISX212 = 5, | |
110 | GB2312 =6, | |
111 | KSC5601 =7, | |
112 | HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */ | |
b75a7d8f | 113 | |
374ca955 A |
114 | /* CN */ |
115 | /* the first few enum constants must keep their values because they correspond to myConverterArray[] */ | |
116 | GB2312_1=1, | |
117 | ISO_IR_165=2, | |
118 | CNS_11643=3, | |
119 | ||
120 | /* | |
121 | * these are used in StateEnum and ISO2022State variables, | |
122 | * but CNS_11643 must be used to index into myConverterArray[] | |
123 | */ | |
124 | CNS_11643_0=0x20, | |
125 | CNS_11643_1, | |
126 | CNS_11643_2, | |
127 | CNS_11643_3, | |
128 | CNS_11643_4, | |
129 | CNS_11643_5, | |
130 | CNS_11643_6, | |
131 | CNS_11643_7 | |
b75a7d8f A |
132 | } StateEnum; |
133 | ||
374ca955 A |
134 | /* is the StateEnum charset value for a DBCS charset? */ |
/* IS_JP_DBCS: true for the StateEnum values JISX208..KSC5601 (4..7); evaluates cs twice. */
135 | #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) | |
136 | ||
/* CSM ("charset mask"): the single bit 1<<cs, used to build the entries of jpCharsetMasks[]. */
137 | #define CSM(cs) ((uint16_t)1<<(cs)) | |
b75a7d8f | 138 | |
374ca955 A |
139 | /* |
140 | * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence | |
141 | * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x | |
142 | * | |
143 | * Note: The converter uses some leniency: | |
144 | * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in | |
145 | * all versions, not just JIS7 and JIS8. | |
146 | * - ICU does not distinguish between different versions of JIS X 0208. | |
147 | */ | |
/*
 * Indexed by the ISO-2022-JP converter version (0..4); each entry is an OR of
 * CSM() bits. Entries 2, 3 and 4 are identical.
 * NOTE(review): the array has exactly 5 entries, so any index derived from
 * caller-supplied options must be range-checked before it is used here.
 */
148 | static const uint16_t jpCharsetMasks[5]={ | |
149 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), | |
150 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), | |
151 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), | |
152 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), | |
153 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) | |
154 | }; | |
b75a7d8f A |
155 | |
/*
 * Type tag stored in UConverterDataISO2022.currentType; the open function
 * initializes it to ASCII1. Values run 0..5 in declaration order.
 */
156 | typedef enum { | |
157 | ASCII1=0, | |
158 | LATIN1, | |
159 | SBCS, | |
160 | DBCS, | |
374ca955 A |
161 | MBCS, |
162 | HWKANA | |
b75a7d8f A |
163 | }Cnv2022Type; |
164 | ||
374ca955 A |
/*
 * Designation/shift state for one conversion direction. The converter data
 * holds two of these (toU2022State and fromU2022State); the reset function
 * clears each one to all zeros.
 */
165 | typedef struct ISO2022State { | |
166 | int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */ | |
167 | int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */ | |
168 | int8_t prevG; /* g before single shift (SS2 or SS3) */ | |
169 | } ISO2022State; | |
170 | ||
b75a7d8f A |
171 | #define UCNV_OPTIONS_VERSION_MASK 0xf |
172 | #define UCNV_2022_MAX_CONVERTERS 10 | |
173 | ||
/*
 * Per-converter state, allocated by _ISO2022Open() and stored in
 * UConverter.extraInfo.
 *
 *   myConverterArray  shared data of the sub-charsets, indexed by StateEnum
 *                     values; filled in _ISO2022Open(), released in _ISO2022Close()
 *   currentConverter  sub-converter opened with ucnv_open() for the "ko"
 *                     variant; closed in _ISO2022Close()
 *   currentType       initialized to ASCII1 on open
 *   toU2022State,
 *   fromU2022State    shift/designation state per direction, zeroed on reset
 *   key               cleared on a to-Unicode reset
 *   version           options & UCNV_OPTIONS_VERSION_MASK (forced to 0 for
 *                     unknown "ko"/"zh" versions)
 *   isFirstBuffer     only present with U_ENABLE_GENERIC_ISO_2022
 *   isEmptySegment    set to FALSE on a to-Unicode reset
 *   name              string returned by _ISO2022getName(); the longest value
 *                     written here is "ISO_2022,locale=ja,version=" plus one digit
 *   locale            "ja", "ko" or "cn"; left empty for the generic converter
 */
174 | typedef struct{ | |
73c04bcf | 175 | UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
b75a7d8f | 176 | UConverter *currentConverter;
b75a7d8f | 177 | Cnv2022Type currentType;
374ca955 | 178 | ISO2022State toU2022State, fromU2022State;
b75a7d8f A |
179 | uint32_t key; |
180 | uint32_t version; | |
73c04bcf A |
181 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
182 | UBool isFirstBuffer; | |
183 | #endif | |
d5d484b0 | 184 | UBool isEmptySegment;
b75a7d8f | 185 | char name[30];
73c04bcf | 186 | char locale[3];
b75a7d8f A |
187 | }UConverterDataISO2022; |
188 | ||
374ca955 | 189 | /* Protos */ |
b75a7d8f A |
190 | /* ISO-2022 ----------------------------------------------------------------- */ |
191 | ||
192 | /*Forward declaration */ | |
193 | U_CFUNC void | |
374ca955 A |
194 | ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
195 | UErrorCode * err); | |
b75a7d8f | 196 | U_CFUNC void |
374ca955 A |
197 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
198 | UErrorCode * err); | |
b75a7d8f A |
199 | |
200 | #define ESC_2022 0x1B /*ESC*/ | |
201 | ||
/*
 * Classification of a (partial) escape sequence. This is the element type of
 * escSeqStateTable_Value_2022[] and the return type of getKey_2022().
 */
202 | typedef enum | |
203 | { | |
204 | INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/ | |
205 | VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/ | |
206 | VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/ | |
374ca955 | 207 | VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
b75a7d8f A |
208 | } UCNV_TableStates_2022; |
209 | ||
210 | /* | |
211 | * The way these state transition arrays work is: | |
212 | * ex : ESC$B is the sequence for JISX208 | |
213 | * a) First Iteration: char is ESC | |
214 | * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index | |
215 | * int x = normalize_esq_chars_2022[27] which is equal to 1 | |
216 | * ii) Search for this value in escSeqStateTable_Key_2022[] | |
217 | * value of x is stored at escSeqStateTable_Key_2022[0] | |
218 | * iii) Save this index as offset | |
219 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] | |
220 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 | |
221 | * b) Switch on this state and continue to next char | |
222 | * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index | |
223 | * which is normalize_esq_chars_2022[36] == 4 | |
224 | * ii) x is currently 1(from above) | |
225 | * x<<=5 -- x is now 32 | |
226 | * x+=normalize_esq_chars_2022[36] | |
227 | * now x is 36 | |
228 | * iii) Search for this value in escSeqStateTable_Key_2022[] | |
229 | * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 | |
230 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] | |
231 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 | |
232 | * c) Switch on this state and continue to next char | |
233 | * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index | |
234 | * ii) x is currently 36 (from above) | |
235 | * x<<=5 -- x is now 1152 | |
236 | * x+=normalize_esq_chars_2022[66] | |
237 | * now x is 1161 | |
238 | * iii) Search for this value in escSeqStateTable_Key_2022[] | |
239 | * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 | |
240 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] | |
241 | * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 | |
242 | * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208 | |
243 | */ | |
244 | ||
245 | ||
246 | /*Below are the 3 arrays depicting a state transition table*/ | |
/*
 * Maps a byte value (0..255) to a small code in 1..29, or to 0 when the byte
 * can never occur in a recognized escape sequence (getKey_2022() returns
 * INVALID_2022 for code 0). Keys for the state tables are built by shifting
 * the running key left by 5 bits and adding this code, so every code must
 * stay below 32. The rows below hold ten entries each; ESC (27) maps to 1.
 */
247 | static const int8_t normalize_esq_chars_2022[256] = { | |
248 | /* 0 1 2 3 4 5 6 7 8 9 */ | |
249 | ||
250 | 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
251 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
252 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 | |
253 | ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 | |
254 | ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 | |
255 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
256 | ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 | |
257 | ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 | |
258 | ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
259 | ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
260 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
261 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
262 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
263 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
264 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
265 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
266 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
267 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
268 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
269 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
270 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
271 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
272 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
273 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
274 | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 | |
275 | ,0 ,0 ,0 ,0 ,0 ,0 | |
276 | }; | |
277 | ||
374ca955 A |
278 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
279 | /* | |
280 | * When the generic ISO-2022 converter is completely removed, not just disabled | |
281 | * per #ifdef, then the following state table and the associated tables that are | |
282 | * dimensioned with MAX_STATES_2022 should be trimmed. | |
283 | * | |
284 | * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of | |
285 | * the associated escape sequences starting with ESC ( B should be removed. | |
286 | * This includes the ones with key values 1097 and all of the ones above 1000000. | |
287 | * | |
288 | * For the latter, the tables can simply be truncated. | |
289 | * For the former, since the tables must be kept parallel, it is probably best | |
290 | * to simply duplicate an adjacent table cell, parallel in all tables. | |
291 | * | |
292 | * It may make sense to restructure the tables, especially by using small search | |
293 | * tables for the variants instead of indexing them parallel to the table here. | |
294 | */ | |
295 | #endif | |
296 | ||
b75a7d8f A |
/*
 * MAX_STATES_2022 is the common dimension of all escape-sequence tables.
 * escSeqStateTable_Key_2022[] holds the accumulated keys of every recognized
 * (partial) escape sequence in ascending order; getKey_2022() binary-searches
 * it, so the order must be preserved. The index found here is the offset
 * into the parallel Value/Result/nextState tables.
 */
297 | #define MAX_STATES_2022 74 |
298 | static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { | |
299 | /* 0 1 2 3 4 5 6 7 8 9 */ | |
300 | ||
301 | 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 | |
302 | ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 | |
303 | ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 | |
304 | ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 | |
305 | ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 | |
306 | ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 | |
307 | ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 | |
308 | ,35947631 ,35947635 ,35947636 ,35947638 | |
309 | }; | |
310 | ||
374ca955 | 311 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
b75a7d8f A |
312 | |
/*
 * Converter name for each entry of escSeqStateTable_Key_2022[] (same index).
 * Only compiled for the generic converter (U_ENABLE_GENERIC_ISO_2022).
 * The NULL entries are exactly the indices that escSeqStateTable_Value_2022[]
 * marks as VALID_NON_TERMINAL_2022.
 */
313 | static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { | |
314 | /* 0 1 2 3 4 5 6 7 8 9 */ | |
315 | ||
316 | NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" | |
374ca955 | 317 | ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
b75a7d8f A |
318 | ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" |
319 | ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" | |
320 | ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" | |
321 | ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" | |
322 | ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" | |
323 | ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" | |
324 | }; | |
325 | ||
374ca955 A |
326 | #endif |
327 | ||
b75a7d8f A |
/*
 * Parse state for each entry of escSeqStateTable_Key_2022[] (same index):
 * whether the sequence accumulated so far is complete, still needs more
 * bytes, or (index 10, key 1097 only) is complete but may also be the prefix
 * of a longer sequence.
 */
328 | static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = { |
329 | /* 0 1 2 3 4 5 6 7 8 9 */ | |
374ca955 | 330 | VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
b75a7d8f A |
331 | ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
332 | ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
333 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
334 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
335 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
336 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
337 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
338 | }; | |
339 | ||
340 | ||
b75a7d8f A |
341 | /* Type def for refactoring changeState_2022 code*/ |
/*
 * Identifies the ISO-2022 variant. The generic ISO_2022 value only exists
 * when U_ENABLE_GENERIC_ISO_2022 is defined; the explicit numbering keeps the
 * language-specific values 1..3 the same either way.
 */
342 | typedef enum{ | |
374ca955 | 343 | #ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f | 344 | ISO_2022=0,
374ca955 | 345 | #endif
b75a7d8f A |
346 | ISO_2022_JP=1, |
347 | ISO_2022_KR=2, | |
348 | ISO_2022_CN=3 | |
349 | } Variant2022; | |
350 | ||
b75a7d8f A |
351 | /*********** ISO 2022 Converter Protos ***********/ |
352 | static void | |
353 | _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode); | |
354 | ||
355 | static void | |
356 | _ISO2022Close(UConverter *converter); | |
357 | ||
358 | static void | |
359 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); | |
360 | ||
361 | static const char* | |
362 | _ISO2022getName(const UConverter* cnv); | |
363 | ||
364 | static void | |
365 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); | |
366 | ||
367 | static UConverter * | |
368 | _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); | |
369 | ||
374ca955 | 370 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
b75a7d8f | 371 | static void |
374ca955 A |
372 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); |
373 | #endif | |
b75a7d8f | 374 | |
374ca955 A |
375 | /*const UConverterSharedData _ISO2022Data;*/ |
376 | static const UConverterSharedData _ISO2022JPData; | |
377 | static const UConverterSharedData _ISO2022KRData; | |
378 | static const UConverterSharedData _ISO2022CNData; | |
b75a7d8f | 379 | |
374ca955 | 380 | /*************** Converter implementations ******************/ |
b75a7d8f | 381 | |
73c04bcf A |
382 | /* The purpose of this function is to get around gcc compiler warnings. */ |
383 | static U_INLINE void | |
384 | fromUWriteUInt8(UConverter *cnv, | |
385 | const char *bytes, int32_t length, | |
386 | uint8_t **target, const char *targetLimit, | |
387 | int32_t **offsets, | |
388 | int32_t sourceIndex, | |
389 | UErrorCode *pErrorCode) | |
390 | { | |
391 | char *targetChars = (char *)*target; | |
392 | ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, | |
393 | offsets, sourceIndex, pErrorCode); | |
394 | *target = (uint8_t*)targetChars; | |
395 | ||
396 | } | |
397 | ||
398 | static U_INLINE void | |
374ca955 A |
399 | setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){ |
400 | if(myConverterData->version == 1) { | |
401 | UConverter *cnv = myConverterData->currentConverter; | |
b75a7d8f | 402 | |
374ca955 A |
403 | cnv->toUnicodeStatus=0; /* offset */ |
404 | cnv->mode=0; /* state */ | |
405 | cnv->toULength=0; /* byteIndex */ | |
406 | } | |
407 | } | |
b75a7d8f | 408 | |
73c04bcf | 409 | static U_INLINE void |
374ca955 A |
410 | setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ |
411 | /* in ISO-2022-KR the designator sequence appears only once | |
412 | * in a file so we append it only once | |
413 | */ | |
414 | if( converter->charErrorBufferLength==0){ | |
b75a7d8f | 415 | |
374ca955 A |
416 | converter->charErrorBufferLength = 4; |
417 | converter->charErrorBuffer[0] = 0x1b; | |
418 | converter->charErrorBuffer[1] = 0x24; | |
419 | converter->charErrorBuffer[2] = 0x29; | |
420 | converter->charErrorBuffer[3] = 0x43; | |
421 | } | |
422 | if(myConverterData->version == 1) { | |
423 | UConverter *cnv = myConverterData->currentConverter; | |
b75a7d8f | 424 | |
374ca955 A |
425 | cnv->fromUChar32=0; |
426 | cnv->fromUnicodeStatus=1; /* prevLength */ | |
427 | } | |
428 | } | |
b75a7d8f | 429 | |
374ca955 A |
430 | static void |
431 | _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){ | |
b75a7d8f | 432 | |
374ca955 | 433 | char myLocale[6]={' ',' ',' ',' ',' ',' '}; |
b75a7d8f | 434 | |
374ca955 A |
435 | cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
436 | if(cnv->extraInfo != NULL) { | |
437 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; | |
438 | uint32_t version; | |
b75a7d8f | 439 | |
374ca955 | 440 | uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
374ca955 | 441 | myConverterData->currentType = ASCII1; |
374ca955 A |
442 | cnv->fromUnicodeStatus =FALSE; |
443 | if(locale){ | |
444 | uprv_strncpy(myLocale, locale, sizeof(myLocale)); | |
445 | } | |
374ca955 | 446 | version = options & UCNV_OPTIONS_VERSION_MASK; |
73c04bcf | 447 | myConverterData->version = version; |
374ca955 | 448 | if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
73c04bcf A |
449 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
450 | { | |
451 | size_t len=0; | |
374ca955 A |
452 | /* open the required converters and cache them */ |
453 | if(jpCharsetMasks[version]&CSM(ISO8859_7)) { | |
454 | myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); | |
455 | } | |
456 | myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode); | |
457 | myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode); | |
458 | if(jpCharsetMasks[version]&CSM(JISX212)) { | |
459 | myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode); | |
460 | } | |
461 | if(jpCharsetMasks[version]&CSM(GB2312)) { | |
462 | myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */ | |
463 | } | |
464 | if(jpCharsetMasks[version]&CSM(KSC5601)) { | |
465 | myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode); | |
466 | } | |
b75a7d8f | 467 | |
374ca955 A |
468 | /* set the function pointers to appropriate funtions */ |
469 | cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); | |
470 | uprv_strcpy(myConverterData->locale,"ja"); | |
b75a7d8f | 471 | |
374ca955 A |
472 | uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); |
473 | len = uprv_strlen(myConverterData->name); | |
474 | myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); | |
475 | myConverterData->name[len+1]='\0'; | |
476 | } | |
477 | else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && | |
73c04bcf A |
478 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
479 | { | |
480 | if (version==1){ | |
481 | myConverterData->currentConverter= | |
482 | ucnv_open("icu-internal-25546",errorCode); | |
b75a7d8f | 483 | |
73c04bcf A |
484 | if (U_FAILURE(*errorCode)) { |
485 | _ISO2022Close(cnv); | |
486 | return; | |
487 | } | |
b75a7d8f | 488 | |
73c04bcf A |
489 | uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); |
490 | uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); | |
491 | cnv->subCharLen = myConverterData->currentConverter->subCharLen; | |
374ca955 | 492 | }else{ |
73c04bcf | 493 | myConverterData->currentConverter=ucnv_open("ibm-949",errorCode); |
b75a7d8f | 494 | |
73c04bcf A |
495 | if (U_FAILURE(*errorCode)) { |
496 | _ISO2022Close(cnv); | |
497 | return; | |
498 | } | |
b75a7d8f | 499 | |
73c04bcf A |
500 | myConverterData->version = 0; |
501 | uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); | |
374ca955 | 502 | } |
b75a7d8f | 503 | |
374ca955 A |
504 | /* initialize the state variables */ |
505 | setInitialStateToUnicodeKR(cnv, myConverterData); | |
73c04bcf | 506 | setInitialStateFromUnicodeKR(cnv, myConverterData); |
b75a7d8f A |
507 | |
508 | /* set the function pointers to appropriate funtions */ | |
509 | cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; | |
b75a7d8f A |
510 | uprv_strcpy(myConverterData->locale,"ko"); |
511 | } | |
512 | else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& | |
73c04bcf A |
513 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
514 | { | |
b75a7d8f A |
515 | |
516 | /* open the required converters and cache them */ | |
374ca955 A |
517 | myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); |
518 | if(version==1) { | |
519 | myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode); | |
520 | } | |
521 | myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode); | |
b75a7d8f | 522 | |
b75a7d8f A |
523 | |
524 | /* set the function pointers to appropriate funtions */ | |
525 | cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; | |
526 | uprv_strcpy(myConverterData->locale,"cn"); | |
527 | ||
73c04bcf | 528 | if (version==1){ |
b75a7d8f A |
529 | uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); |
530 | }else{ | |
b75a7d8f | 531 | myConverterData->version = 0; |
73c04bcf | 532 | uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); |
b75a7d8f A |
533 | } |
534 | } | |
535 | else{ | |
374ca955 | 536 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
73c04bcf A |
537 | myConverterData->isFirstBuffer = TRUE; |
538 | ||
b75a7d8f A |
539 | /* append the UTF-8 escape sequence */ |
540 | cnv->charErrorBufferLength = 3; | |
541 | cnv->charErrorBuffer[0] = 0x1b; | |
542 | cnv->charErrorBuffer[1] = 0x25; | |
543 | cnv->charErrorBuffer[2] = 0x42; | |
544 | ||
545 | cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; | |
546 | /* initialize the state variables */ | |
b75a7d8f | 547 | uprv_strcpy(myConverterData->name,"ISO_2022"); |
374ca955 A |
548 | #else |
549 | *errorCode = U_UNSUPPORTED_ERROR; | |
550 | return; | |
551 | #endif | |
b75a7d8f A |
552 | } |
553 | ||
374ca955 A |
554 | cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
555 | ||
556 | if(U_FAILURE(*errorCode)) { | |
557 | _ISO2022Close(cnv); | |
558 | } | |
b75a7d8f A |
559 | } else { |
560 | *errorCode = U_MEMORY_ALLOCATION_ERROR; | |
561 | } | |
b75a7d8f A |
562 | } |
563 | ||
564 | ||
565 | static void | |
566 | _ISO2022Close(UConverter *converter) { | |
374ca955 A |
567 | UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); |
568 | UConverterSharedData **array = myData->myConverterArray; | |
569 | int32_t i; | |
b75a7d8f A |
570 | |
571 | if (converter->extraInfo != NULL) { | |
572 | /*close the array of converter pointers and free the memory*/ | |
374ca955 A |
573 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
574 | if(array[i]!=NULL) { | |
575 | ucnv_unloadSharedDataIfReady(array[i]); | |
b75a7d8f | 576 | } |
b75a7d8f A |
577 | } |
578 | ||
374ca955 | 579 | ucnv_close(myData->currentConverter); |
b75a7d8f A |
580 | |
581 | if(!converter->isExtraLocal){ | |
582 | uprv_free (converter->extraInfo); | |
374ca955 | 583 | converter->extraInfo = NULL; |
b75a7d8f A |
584 | } |
585 | } | |
586 | } | |
587 | ||
588 | static void | |
589 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { | |
590 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); | |
374ca955 A |
591 | if(choice<=UCNV_RESET_TO_UNICODE) { |
592 | uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); | |
593 | myConverterData->key = 0; | |
d5d484b0 | 594 | myConverterData->isEmptySegment = FALSE; |
374ca955 A |
595 | } |
596 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
597 | uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); | |
598 | } | |
599 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
600 | if(myConverterData->locale[0] == 0){ | |
b75a7d8f A |
601 | if(choice<=UCNV_RESET_TO_UNICODE) { |
602 | myConverterData->isFirstBuffer = TRUE; | |
374ca955 | 603 | myConverterData->key = 0; |
b75a7d8f A |
604 | if (converter->mode == UCNV_SO){ |
605 | ucnv_close (myConverterData->currentConverter); | |
606 | myConverterData->currentConverter=NULL; | |
607 | } | |
608 | converter->mode = UCNV_SI; | |
609 | } | |
610 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
611 | /* re-append UTF-8 escape sequence */ | |
612 | converter->charErrorBufferLength = 3; | |
613 | converter->charErrorBuffer[0] = 0x1b; | |
614 | converter->charErrorBuffer[1] = 0x28; | |
615 | converter->charErrorBuffer[2] = 0x42; | |
616 | } | |
617 | } | |
374ca955 A |
618 | else |
619 | #endif | |
620 | { | |
b75a7d8f | 621 | /* reset the state variables */ |
374ca955 | 622 | if(myConverterData->locale[0] == 'k'){ |
b75a7d8f A |
623 | if(choice<=UCNV_RESET_TO_UNICODE) { |
624 | setInitialStateToUnicodeKR(converter, myConverterData); | |
625 | } | |
626 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
627 | setInitialStateFromUnicodeKR(converter, myConverterData); | |
628 | } | |
629 | } | |
630 | } | |
631 | } | |
632 | ||
633 | static const char* | |
634 | _ISO2022getName(const UConverter* cnv){ | |
635 | if(cnv->extraInfo){ | |
636 | UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; | |
637 | return myData->name; | |
638 | } | |
639 | return NULL; | |
640 | } | |
641 | ||
b75a7d8f | 642 | |
374ca955 A |
643 | /*************** to unicode *******************/ |
644 | /**************************************************************************** | |
645 | * Recognized escape sequences are | |
646 | * <ESC>(B ASCII | |
647 | * <ESC>.A ISO-8859-1 | |
648 | * <ESC>.F ISO-8859-7 | |
649 | * <ESC>(J JISX-201 | |
650 | * <ESC>(I JISX-201 | |
651 | * <ESC>$B JISX-208 | |
652 | * <ESC>$@ JISX-208 | |
653 | * <ESC>$(D JISX-212 | |
654 | * <ESC>$A GB2312 | |
655 | * <ESC>$(C KSC5601 | |
656 | */ | |
/*
 * ISO-2022-JP: charset/state selected by each entry of
 * escSeqStateTable_Key_2022[] (same index). INVALID_STATE marks escape
 * sequences that this variant does not accept. Both ESC $ @ (index 22) and
 * ESC $ B (index 24) select JISX208.
 */
657 | static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= { | |
658 | /* 0 1 2 3 4 5 6 7 8 9 */ | |
659 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
660 | ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE | |
661 | ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
662 | ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE | |
663 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
664 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
665 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
666 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
667 | }; | |
b75a7d8f | 668 | |
374ca955 A |
669 | /*************** to unicode *******************/ |
670 | static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= { | |
671 | /* 0 1 2 3 4 5 6 7 8 9 */ | |
672 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
673 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
674 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
675 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
676 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 | |
677 | ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
678 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
679 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
680 | }; | |
b75a7d8f | 681 | |
b75a7d8f | 682 | |
374ca955 A |
683 | static UCNV_TableStates_2022 |
684 | getKey_2022(char c,int32_t* key,int32_t* offset){ | |
685 | int32_t togo; | |
686 | int32_t low = 0; | |
687 | int32_t hi = MAX_STATES_2022; | |
688 | int32_t oldmid=0; | |
b75a7d8f | 689 | |
374ca955 A |
690 | togo = normalize_esq_chars_2022[(uint8_t)c]; |
691 | if(togo == 0) { | |
692 | /* not a valid character anywhere in an escape sequence */ | |
693 | *key = 0; | |
694 | *offset = 0; | |
695 | return INVALID_2022; | |
696 | } | |
697 | togo = (*key << 5) + togo; | |
b75a7d8f | 698 | |
374ca955 | 699 | while (hi != low) /*binary search*/{ |
b75a7d8f | 700 | |
374ca955 A |
701 | register int32_t mid = (hi+low) >> 1; /*Finds median*/ |
702 | ||
703 | if (mid == oldmid) | |
704 | break; | |
705 | ||
706 | if (escSeqStateTable_Key_2022[mid] > togo){ | |
707 | hi = mid; | |
708 | } | |
709 | else if (escSeqStateTable_Key_2022[mid] < togo){ | |
710 | low = mid; | |
711 | } | |
712 | else /*we found it*/{ | |
713 | *key = togo; | |
714 | *offset = mid; | |
715 | return escSeqStateTable_Value_2022[mid]; | |
716 | } | |
717 | oldmid = mid; | |
b75a7d8f | 718 | |
b75a7d8f | 719 | } |
b75a7d8f | 720 | |
374ca955 A |
721 | *key = 0; |
722 | *offset = 0; | |
723 | return INVALID_2022; | |
b75a7d8f A |
724 | } |
725 | ||
374ca955 A |
/*
 * Runs through a state machine to determine the escape sequence - codepage
 * correspondence.
 *
 * Bytes are consumed from *source (which is advanced) up to sourceLimit. Each
 * byte is appended to _this->toUBytes[] and fed to getKey_2022(); the running
 * match key lives in the converter's UConverterDataISO2022 so that a sequence
 * cut off by the end of the input can be continued by a later call.
 *
 * _this       - the ISO-2022 converter; its extraInfo is the UConverterDataISO2022
 * source      - in/out: current read position
 * sourceLimit - end of the readable input
 * var         - which ISO-2022 variant is interpreting the sequence
 * err         - out: set to U_ILLEGAL_ESCAPE_SEQUENCE or
 *               U_UNSUPPORTED_ESCAPE_SEQUENCE on failure
 *
 * Outcomes:
 * - input exhausted while the match is still non-terminal: the key is saved
 *   and the function returns without touching *err or toULength;
 * - complete and accepted sequence: the toU2022State designation/shift state
 *   is updated for the variant and toULength is reset to 0;
 * - illegal sequence: everything after the first byte is backed out so that
 *   only that byte remains in toUBytes[].
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of bytes already in toUBytes[] on entry; needed for the back-out logic at the end */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            /* getKey_2022() has already reset key to 0 */
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* persist the match key so that an incomplete sequence can be resumed */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                return;
            }

            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                /* map the matched sequence to the charset state it selects */
                StateEnum tempState=nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the current G0/G1 selection in prevG before switching to G2 */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* jpCharsetMasks[] tells which charsets this converter version accepts */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_CN:
            {
                /* map the matched sequence to the charset state it selects */
                StateEnum tempState=nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected by converter version 0 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    /*fall through*/
                case GB2312_1:
                    /*fall through*/
                case CNS_11643_1:
                    /* these three are recorded as the G1 charset */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* recorded as the G2 charset (used with SS2 above) */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* recorded as the G3 charset (used with SS3 above) */
                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely; nothing is left pending */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            _this->toULength=1;
        }
    }
}
952 | ||
953 | /*Checks the characters of the buffer against valid 2022 escape sequences | |
954 | *if the match we return a pointer to the initial start of the sequence otherwise | |
955 | *we return sourceLimit | |
956 | */ | |
957 | /*for 2022 looks ahead in the stream | |
958 | *to determine the longest possible convertible | |
959 | *data stream | |
960 | */ | |
961 | static U_INLINE const char* | |
962 | getEndOfBuffer_2022(const char** source, | |
963 | const char* sourceLimit, | |
964 | UBool flush){ | |
965 | ||
966 | const char* mySource = *source; | |
967 | ||
968 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
969 | if (*source >= sourceLimit) | |
970 | return sourceLimit; | |
971 | ||
972 | do{ | |
973 | ||
974 | if (*mySource == ESC_2022){ | |
975 | int8_t i; | |
976 | int32_t key = 0; | |
977 | int32_t offset; | |
978 | UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; | |
979 | ||
980 | /* Kludge: I could not | |
981 | * figure out the reason for validating an escape sequence | |
982 | * twice - once here and once in changeState_2022(). | |
983 | * is it possible to have an ESC character in a ISO2022 | |
984 | * byte stream which is valid in a code page? Is it legal? | |
985 | */ | |
986 | for (i=0; | |
987 | (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); | |
988 | i++) { | |
989 | value = getKey_2022(*(mySource+i), &key, &offset); | |
990 | } | |
991 | if (value > 0 || *mySource==ESC_2022) | |
992 | return mySource; | |
993 | ||
994 | if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) | |
995 | return sourceLimit; | |
996 | } | |
997 | }while (++mySource < sourceLimit); | |
998 | ||
999 | return sourceLimit; | |
1000 | #else | |
1001 | while(mySource < sourceLimit && *mySource != ESC_2022) { | |
1002 | ++mySource; | |
1003 | } | |
1004 | return mySource; | |
1005 | #endif | |
1006 | } | |
1007 | ||
1008 | ||
1009 | /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c | |
1010 | * any future change in _MBCSFromUChar32() function should be reflected in | |
1011 | * this macro | |
1012 | */ | |
1013 | static U_INLINE void | |
1014 | MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, | |
1015 | UChar32 c, | |
1016 | uint32_t* value, | |
1017 | UBool useFallback, | |
1018 | int32_t *length, | |
1019 | int outputType) | |
1020 | { | |
1021 | const int32_t *cx; | |
1022 | const uint16_t *table; | |
1023 | uint32_t stage2Entry; | |
1024 | uint32_t myValue; | |
1025 | const uint8_t *p; | |
1026 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ | |
1027 | if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { | |
1028 | table=sharedData->mbcs.fromUnicodeTable; | |
1029 | stage2Entry=MBCS_STAGE_2_FROM_U(table, c); | |
1030 | /* get the bytes and the length for the output */ | |
1031 | if(outputType==MBCS_OUTPUT_2){ | |
1032 | myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); | |
1033 | if(myValue<=0xff) { | |
1034 | *length=1; | |
1035 | } else { | |
1036 | *length=2; | |
1037 | } | |
1038 | } else /* outputType==MBCS_OUTPUT_3 */ { | |
1039 | p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); | |
1040 | myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; | |
1041 | if(myValue<=0xff) { | |
1042 | *length=1; | |
1043 | } else if(myValue<=0xffff) { | |
1044 | *length=2; | |
1045 | } else { | |
1046 | *length=3; | |
b75a7d8f A |
1047 | } |
1048 | } | |
1049 | /* is this code point assigned, or do we use fallbacks? */ | |
1050 | if( (stage2Entry&(1<<(16+(c&0xf))))!=0 || | |
374ca955 | 1051 | (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) |
b75a7d8f A |
1052 | ) { |
1053 | /* | |
374ca955 | 1054 | * We allow a 0 byte output if the "assigned" bit is set for this entry. |
b75a7d8f | 1055 | * There is no way with this data structure for fallback output |
374ca955 | 1056 | * to be a zero byte. |
b75a7d8f A |
1057 | */ |
1058 | /* assigned */ | |
1059 | *value=myValue; | |
374ca955 | 1060 | return; |
b75a7d8f | 1061 | } |
b75a7d8f | 1062 | } |
374ca955 A |
1063 | |
1064 | cx=sharedData->mbcs.extIndexes; | |
1065 | if(cx!=NULL) { | |
1066 | *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback); | |
1067 | return; | |
1068 | } | |
1069 | ||
1070 | /* unassigned */ | |
1071 | *length=0; | |
b75a7d8f A |
1072 | } |
1073 | ||
1074 | /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c | |
1075 | * any future change in _MBCSSingleFromUChar32() function should be reflected in | |
1076 | * this macro | |
1077 | */ | |
1078 | static U_INLINE void | |
1079 | MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, | |
1080 | UChar32 c, | |
1081 | uint32_t* retval, | |
1082 | UBool useFallback) | |
1083 | { | |
1084 | const uint16_t *table; | |
1085 | int32_t value; | |
1086 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ | |
374ca955 A |
1087 | if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
1088 | *retval=(uint16_t)-1; | |
1089 | return; | |
b75a7d8f A |
1090 | } |
1091 | /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ | |
374ca955 | 1092 | table=sharedData->mbcs.fromUnicodeTable; |
b75a7d8f | 1093 | /* get the byte for the output */ |
374ca955 | 1094 | value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); |
b75a7d8f A |
1095 | /* is this code point assigned, or do we use fallbacks? */ |
1096 | if(useFallback ? value>=0x800 : value>=0xc00) { | |
1097 | value &=0xff; | |
1098 | } else { | |
1099 | value= -1; | |
1100 | } | |
1101 | *retval=(uint16_t) value; | |
1102 | } | |
1103 | ||
374ca955 A |
1104 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
1105 | ||
b75a7d8f A |
1106 | /********************************************************************************** |
1107 | * ISO-2022 Converter | |
1108 | * | |
1109 | * | |
1110 | */ | |
1111 | ||
b75a7d8f A |
/*
 * toUnicode implementation of the generic ISO-2022 converter (only compiled
 * with U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two activities until the input is used up or a return
 * condition is hit:
 * - while no escape sequence is pending (myData->key == 0), the run of bytes
 *   up to the position reported by getEndOfBuffer_2022() is handed to
 *   myData->currentConverter via ucnv_toUnicode(); that converter is opened
 *   as "ASCII" on first use;
 * - otherwise, or once that run is finished, changeState_2022() interprets
 *   the escape sequence.
 *
 * Overflow output, invalid-character bytes and partial-character bytes are
 * copied from the sub-converter into the outer converter before returning.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* key==0: we are NOT in the middle of an escape sequence */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    /* no charset selected yet: open an ASCII converter on first use */
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter in args; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /*
                 * NOTE(review): in the third operand below, && binds tighter
                 * than ||, so "args->offsets != NULL" only guards the
                 * target/source-moved test; the toULength clause is OR'ed in
                 * independently of it.
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a sequence left incomplete by an earlier call */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1215 | ||
374ca955 | 1216 | #endif |
b75a7d8f A |
1217 | |
1218 | /* | |
1219 | * To Unicode Callback helper function | |
1220 | */ | |
1221 | static void | |
374ca955 A |
1222 | toUnicodeCallback(UConverter *cnv, |
1223 | const uint32_t sourceChar, const uint32_t targetUniChar, | |
1224 | UErrorCode* err){ | |
b75a7d8f | 1225 | if(sourceChar>0xff){ |
374ca955 A |
1226 | cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
1227 | cnv->toUBytes[1] = (uint8_t)sourceChar; | |
1228 | cnv->toULength = 2; | |
b75a7d8f A |
1229 | } |
1230 | else{ | |
374ca955 | 1231 | cnv->toUBytes[0] =(char) sourceChar; |
fd0068a8 | 1232 | cnv->toULength = 1; |
b75a7d8f A |
1233 | } |
1234 | ||
1235 | if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ | |
b75a7d8f A |
1236 | *err = U_INVALID_CHAR_FOUND; |
1237 | } | |
1238 | else{ | |
b75a7d8f A |
1239 | *err = U_ILLEGAL_CHAR_FOUND; |
1240 | } | |
b75a7d8f A |
1241 | } |
1242 | ||
1243 | /**************************************ISO-2022-JP*************************************************/ | |
1244 | ||
1245 | /************************************** IMPORTANT ************************************************** | |
1246 | * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and | |
1247 | * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). | |
1248 | * The converter iterates over each Unicode codepoint | |
1249 | * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is | |
1250 | * processed one char at a time it would make sense to reduce the extra processing a canned converter | |
1251 | * would do as far as possible. | |
1252 | * | |
1253 | * If the implementation of these macros or structure of sharedData struct change in the future, make | |
1254 | * sure that ISO-2022 is also changed. | |
1255 | *************************************************************************************************** | |
1256 | */ | |
1257 | ||
1258 | /*************************************************************************************************** | |
1259 | * Rules for ISO-2022-jp encoding | |
1260 | * (i) Escape sequences must be fully contained within a line they should not | |
1261 | * span new lines or CRs | |
1262 | * (ii) If the last character on a line is represented by two bytes then an ASCII or | |
1263 | * JIS-Roman character escape sequence should follow before the line terminates | |
1264 | * (iii) If the first character on the line is represented by two bytes then a two | |
1265 | * byte character escape sequence should precede it | |
1266 | * (iv) If no escape sequence is encountered then the characters are ASCII | |
1267 | * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, | |
1268 | * and invoked with SS2 (ESC N). | |
1269 | * (vi) If there is any G0 designation in text, there must be a switch to | |
1270 | * ASCII or to JIS X 0201-Roman before a space character (but not | |
1271 | * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control | |
1272 | * characters such as tab or CRLF. | |
1273 | * (vii) Supported encodings: | |
1274 | * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 | |
1275 | * | |
1276 | * source : RFC-1554 | |
1277 | * | |
1278 | * JISX201, JISX208,JISX212 : new .cnv data files created | |
1279 | * KSC5601 : alias to ibm-949 mapping table | |
1280 | * GB2312 : alias to ibm-1386 mapping table | |
1281 | * ISO-8859-1 : Algorithmic implemented as LATIN1 case | |
1282 | * ISO-8859-7 : alias to ibm-9409 mapping table | |
1283 | */ | |
b75a7d8f | 1284 | |
374ca955 A |
1285 | /* preference order of JP charsets */ |
1286 | static const StateEnum jpCharsetPref[]={ | |
1287 | ASCII, | |
1288 | JISX201, | |
1289 | ISO8859_1, | |
1290 | ISO8859_7, | |
1291 | JISX208, | |
1292 | JISX212, | |
1293 | GB2312, | |
1294 | KSC5601, | |
1295 | HWKANA_7BIT | |
b75a7d8f A |
1296 | }; |
1297 | ||
73c04bcf A |
/*
 * Designator escape sequence for each JP charset.
 * The rows must follow the order of the enum constants (e.g. JISX201 = 3),
 * NOT the order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    /* ASCII       <ESC>(B  */ "\x1B\x28\x42",
    /* ISO-8859-1  <ESC>.A  */ "\x1B\x2E\x41",
    /* ISO-8859-7  <ESC>.F  */ "\x1B\x2E\x46",
    /* JISX-201    <ESC>(J  */ "\x1B\x28\x4A",
    /* JISX-208    <ESC>$B  */ "\x1B\x24\x42",
    /* JISX-212    <ESC>$(D */ "\x1B\x24\x28\x44",
    /* GB2312      <ESC>$A  */ "\x1B\x24\x41",
    /* KSC5601     <ESC>$(C */ "\x1B\x24\x28\x43",
    /* HWKANA_7BIT <ESC>(I  */ "\x1B\x28\x49"
};
374ca955 A |
/*
 * Byte length of each escape sequence in escSeqChars[], in the same order.
 */
static const int32_t escSeqCharsLen[] ={
    /* ASCII       <ESC>(B  */ 3,
    /* ISO-8859-1  <ESC>.A  */ 3,
    /* ISO-8859-7  <ESC>.F  */ 3,
    /* JISX-201    <ESC>(J  */ 3,
    /* JISX-208    <ESC>$B  */ 3,
    /* JISX-212    <ESC>$(D */ 4,
    /* GB2312      <ESC>$A  */ 3,
    /* KSC5601     <ESC>$(C */ 4,
    /* HWKANA_7BIT <ESC>(I  */ 3
};
1325 | ||
1326 | /* | |
1327 | * The iteration over various code pages works this way: | |
1328 | * i) Get the currentState from myConverterData->currentState | |
1329 | * ii) Check if the character is mapped to a valid character in the currentState | |
1330 | * Yes -> a) set the initIterState to currentState | |
1331 | * b) remain in this state until an invalid character is found | |
1332 | * No -> a) go to the next code page and find the character | |
1333 | * iii) Before changing the state increment the current state check if the current state | |
1334 | * is equal to the initIterState state | |
1335 | * Yes -> A character that cannot be represented in any of the supported encodings | |
1336 | * break and return a U_INVALID_CHARACTER error | |
1337 | * No -> Continue and find the character in next code page | |
1338 | * | |
1339 | * | |
1340 | * TODO: Implement a priority technique where the users are allowed to set the priority of code pages | |
1341 | */ | |
1342 | ||
1343 | static void | |
374ca955 | 1344 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { |
b75a7d8f | 1345 | UConverterDataISO2022 *converterData; |
374ca955 A |
1346 | ISO2022State *pFromU2022State; |
1347 | uint8_t *target = (uint8_t *) args->target; | |
1348 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; | |
b75a7d8f A |
1349 | const UChar* source = args->source; |
1350 | const UChar* sourceLimit = args->sourceLimit; | |
1351 | int32_t* offsets = args->offsets; | |
374ca955 A |
1352 | UChar32 sourceChar; |
1353 | char buffer[8]; | |
1354 | int32_t len, outLen; | |
1355 | int8_t choices[10]; | |
1356 | int32_t choiceCount; | |
73c04bcf | 1357 | uint32_t targetValue = 0; |
374ca955 A |
1358 | UBool useFallback; |
1359 | ||
1360 | int32_t i; | |
1361 | int8_t cs, g; | |
1362 | ||
1363 | /* set up the state */ | |
1364 | converterData = (UConverterDataISO2022*)args->converter->extraInfo; | |
1365 | pFromU2022State = &converterData->fromU2022State; | |
1366 | useFallback = args->converter->useFallback; | |
1367 | ||
1368 | choiceCount = 0; | |
b75a7d8f | 1369 | |
b75a7d8f | 1370 | /* check if the last codepoint of previous buffer was a lead surrogate*/ |
374ca955 | 1371 | if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { |
b75a7d8f A |
1372 | goto getTrail; |
1373 | } | |
b75a7d8f | 1374 | |
374ca955 A |
1375 | while(source < sourceLimit) { |
1376 | if(target < targetLimit) { | |
b75a7d8f | 1377 | |
b75a7d8f | 1378 | sourceChar = *(source++); |
374ca955 | 1379 | /*check if the char is a First surrogate*/ |
73c04bcf | 1380 | if(UTF_IS_SURROGATE(sourceChar)) { |
374ca955 A |
1381 | if(UTF_IS_SURROGATE_FIRST(sourceChar)) { |
1382 | getTrail: | |
1383 | /*look ahead to find the trail surrogate*/ | |
1384 | if(source < sourceLimit) { | |
1385 | /* test the following code unit */ | |
1386 | UChar trail=(UChar) *source; | |
1387 | if(UTF_IS_SECOND_SURROGATE(trail)) { | |
1388 | source++; | |
1389 | sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); | |
1390 | args->converter->fromUChar32=0x00; | |
1391 | /* convert this supplementary code point */ | |
1392 | /* exit this condition tree */ | |
1393 | } else { | |
1394 | /* this is an unmatched lead code unit (1st surrogate) */ | |
1395 | /* callback(illegal) */ | |
1396 | *err=U_ILLEGAL_CHAR_FOUND; | |
1397 | args->converter->fromUChar32=sourceChar; | |
1398 | break; | |
b75a7d8f | 1399 | } |
374ca955 A |
1400 | } else { |
1401 | /* no more input */ | |
1402 | args->converter->fromUChar32=sourceChar; | |
b75a7d8f A |
1403 | break; |
1404 | } | |
374ca955 A |
1405 | } else { |
1406 | /* this is an unmatched trail code unit (2nd surrogate) */ | |
1407 | /* callback(illegal) */ | |
1408 | *err=U_ILLEGAL_CHAR_FOUND; | |
1409 | args->converter->fromUChar32=sourceChar; | |
1410 | break; | |
1411 | } | |
b75a7d8f A |
1412 | } |
1413 | ||
73c04bcf A |
1414 | /* do not convert SO/SI/ESC */ |
1415 | if(IS_2022_CONTROL(sourceChar)) { | |
1416 | /* callback(illegal) */ | |
1417 | *err=U_ILLEGAL_CHAR_FOUND; | |
1418 | args->converter->fromUChar32=sourceChar; | |
1419 | break; | |
1420 | } | |
1421 | ||
374ca955 | 1422 | /* do the conversion */ |
b75a7d8f | 1423 | |
374ca955 A |
1424 | if(choiceCount == 0) { |
1425 | uint16_t csm; | |
b75a7d8f | 1426 | |
374ca955 A |
1427 | /* |
1428 | * The csm variable keeps track of which charsets are allowed | |
1429 | * and not used yet while building the choices[]. | |
1430 | */ | |
1431 | csm = jpCharsetMasks[converterData->version]; | |
1432 | choiceCount = 0; | |
1433 | ||
1434 | /* JIS7/8: try single-byte half-width Katakana before JISX208 */ | |
1435 | if(converterData->version == 3 || converterData->version == 4) { | |
1436 | choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT; | |
1437 | csm &= ~CSM(cs); | |
1438 | } | |
b75a7d8f | 1439 | |
374ca955 A |
1440 | /* try the current G0 charset */ |
1441 | choices[choiceCount++] = cs = pFromU2022State->cs[0]; | |
1442 | csm &= ~CSM(cs); | |
b75a7d8f | 1443 | |
374ca955 A |
1444 | /* try the current G2 charset */ |
1445 | if((cs = pFromU2022State->cs[2]) != 0) { | |
1446 | choices[choiceCount++] = cs; | |
1447 | csm &= ~CSM(cs); | |
1448 | } | |
1449 | ||
1450 | /* try all the other possible charsets */ | |
1451 | for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { | |
1452 | cs = (int8_t)jpCharsetPref[i]; | |
1453 | if(CSM(cs) & csm) { | |
1454 | choices[choiceCount++] = cs; | |
1455 | csm &= ~CSM(cs); | |
b75a7d8f A |
1456 | } |
1457 | } | |
374ca955 | 1458 | } |
b75a7d8f | 1459 | |
374ca955 A |
1460 | cs = g = 0; |
1461 | len = 0; | |
1462 | ||
1463 | for(i = 0; i < choiceCount && len == 0; ++i) { | |
1464 | cs = choices[i]; | |
1465 | switch(cs) { | |
1466 | case ASCII: | |
1467 | if(sourceChar <= 0x7f) { | |
1468 | targetValue = (uint32_t)sourceChar; | |
1469 | len = 1; | |
b75a7d8f | 1470 | } |
374ca955 A |
1471 | break; |
1472 | case ISO8859_1: | |
1473 | if(0x80 <= sourceChar && sourceChar <= 0xff) { | |
1474 | targetValue = (uint32_t)sourceChar - 0x80; | |
1475 | len = 1; | |
1476 | g = 2; | |
1477 | } | |
1478 | break; | |
1479 | case HWKANA_7BIT: | |
1480 | if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) { | |
1481 | targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21)); | |
1482 | len = 1; | |
1483 | ||
1484 | if(converterData->version==3) { | |
1485 | /* JIS7: use G1 (SO) */ | |
1486 | pFromU2022State->cs[1] = cs; /* do not output an escape sequence */ | |
1487 | g = 1; | |
1488 | } else if(converterData->version==4) { | |
1489 | /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ | |
1490 | int8_t cs0; | |
1491 | ||
1492 | targetValue += 0x80; | |
1493 | ||
1494 | cs0 = pFromU2022State->cs[0]; | |
1495 | if(IS_JP_DBCS(cs0)) { | |
1496 | /* switch from a DBCS charset to JISX201 */ | |
1497 | cs = (int8_t)JISX201; | |
1498 | } else { | |
1499 | /* stay in the current G0 charset */ | |
1500 | cs = cs0; | |
b75a7d8f | 1501 | } |
b75a7d8f | 1502 | } |
b75a7d8f | 1503 | } |
374ca955 A |
1504 | break; |
1505 | case JISX201: | |
1506 | /* G0 SBCS */ | |
1507 | MBCS_SINGLE_FROM_UCHAR32( | |
1508 | converterData->myConverterArray[cs], | |
1509 | sourceChar, &targetValue, | |
1510 | useFallback); | |
1511 | if(targetValue <= 0x7f) { | |
1512 | len = 1; | |
1513 | } | |
1514 | break; | |
1515 | case ISO8859_7: | |
1516 | /* G0 SBCS forced to 7-bit output */ | |
1517 | MBCS_SINGLE_FROM_UCHAR32( | |
1518 | converterData->myConverterArray[cs], | |
1519 | sourceChar, &targetValue, | |
1520 | useFallback); | |
1521 | if(0x80 <= targetValue && targetValue <= 0xff) { | |
1522 | targetValue -= 0x80; | |
1523 | len = 1; | |
1524 | g = 2; | |
1525 | } | |
1526 | break; | |
1527 | default: | |
1528 | /* G0 DBCS */ | |
1529 | MBCS_FROM_UCHAR32_ISO2022( | |
1530 | converterData->myConverterArray[cs], | |
1531 | sourceChar, &targetValue, | |
1532 | useFallback, &len, MBCS_OUTPUT_2); | |
1533 | if(len != 2) { | |
1534 | len = 0; | |
1535 | } | |
1536 | break; | |
b75a7d8f A |
1537 | } |
1538 | } | |
b75a7d8f | 1539 | |
374ca955 A |
1540 | if(len > 0) { |
1541 | outLen = 0; /* count output bytes */ | |
1542 | ||
1543 | /* write SI if necessary (only for JIS7) */ | |
1544 | if(pFromU2022State->g == 1 && g == 0) { | |
1545 | buffer[outLen++] = UCNV_SI; | |
1546 | pFromU2022State->g = 0; | |
1547 | } | |
1548 | ||
1549 | /* write the designation sequence if necessary */ | |
1550 | if(cs != pFromU2022State->cs[g]) { | |
1551 | int32_t escLen = escSeqCharsLen[cs]; | |
1552 | uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); | |
1553 | outLen += escLen; | |
1554 | pFromU2022State->cs[g] = cs; | |
1555 | ||
1556 | /* invalidate the choices[] */ | |
1557 | choiceCount = 0; | |
1558 | } | |
1559 | ||
1560 | /* write the shift sequence if necessary */ | |
1561 | if(g != pFromU2022State->g) { | |
1562 | switch(g) { | |
1563 | /* case 0 handled before writing escapes */ | |
1564 | case 1: | |
1565 | buffer[outLen++] = UCNV_SO; | |
1566 | pFromU2022State->g = 1; | |
1567 | break; | |
1568 | default: /* case 2 */ | |
1569 | buffer[outLen++] = 0x1b; | |
1570 | buffer[outLen++] = 0x4e; | |
1571 | break; | |
1572 | /* no case 3: no SS3 in ISO-2022-JP-x */ | |
1573 | } | |
1574 | } | |
1575 | ||
1576 | /* write the output bytes */ | |
1577 | if(len == 1) { | |
1578 | buffer[outLen++] = (char)targetValue; | |
1579 | } else /* len == 2 */ { | |
1580 | buffer[outLen++] = (char)(targetValue >> 8); | |
1581 | buffer[outLen++] = (char)targetValue; | |
1582 | } | |
1583 | } else { | |
1584 | /* | |
1585 | * if we cannot find the character after checking all codepages | |
b75a7d8f A |
1586 | * then this is an error |
1587 | */ | |
b75a7d8f | 1588 | *err = U_INVALID_CHAR_FOUND; |
374ca955 A |
1589 | args->converter->fromUChar32=sourceChar; |
1590 | break; | |
1591 | } | |
1592 | ||
1593 | if(sourceChar == CR || sourceChar == LF) { | |
1594 | /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ | |
1595 | pFromU2022State->cs[2] = 0; | |
1596 | choiceCount = 0; | |
1597 | } | |
1598 | ||
1599 | /* output outLen>0 bytes in buffer[] */ | |
1600 | if(outLen == 1) { | |
1601 | *target++ = buffer[0]; | |
1602 | if(offsets) { | |
73c04bcf | 1603 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
b75a7d8f | 1604 | } |
374ca955 A |
1605 | } else if(outLen == 2 && (target + 2) <= targetLimit) { |
1606 | *target++ = buffer[0]; | |
1607 | *target++ = buffer[1]; | |
1608 | if(offsets) { | |
1609 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); | |
1610 | *offsets++ = sourceIndex; | |
1611 | *offsets++ = sourceIndex; | |
1612 | } | |
1613 | } else { | |
73c04bcf | 1614 | fromUWriteUInt8( |
374ca955 A |
1615 | args->converter, |
1616 | buffer, outLen, | |
73c04bcf | 1617 | &target, (const char *)targetLimit, |
374ca955 A |
1618 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
1619 | err); | |
1620 | if(U_FAILURE(*err)) { | |
b75a7d8f A |
1621 | break; |
1622 | } | |
1623 | } | |
1624 | } /* end if(myTargetIndex<myTargetLength) */ | |
1625 | else{ | |
1626 | *err =U_BUFFER_OVERFLOW_ERROR; | |
1627 | break; | |
1628 | } | |
1629 | ||
1630 | }/* end while(mySourceIndex<mySourceLength) */ | |
1631 | ||
374ca955 A |
1632 | /* |
1633 | * the end of the input stream and detection of truncated input | |
1634 | * are handled by the framework, but for ISO-2022-JP conversion | |
1635 | * we need to be in ASCII mode at the very end | |
1636 | * | |
1637 | * conditions: | |
1638 | * successful | |
1639 | * in SO mode or not in ASCII mode | |
1640 | * end of input and no truncated input | |
b75a7d8f | 1641 | */ |
374ca955 A |
1642 | if( U_SUCCESS(*err) && |
1643 | (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && | |
1644 | args->flush && source>=sourceLimit && args->converter->fromUChar32==0 | |
1645 | ) { | |
1646 | int32_t sourceIndex; | |
1647 | ||
1648 | outLen = 0; | |
1649 | ||
1650 | if(pFromU2022State->g != 0) { | |
1651 | buffer[outLen++] = UCNV_SI; | |
1652 | pFromU2022State->g = 0; | |
1653 | } | |
1654 | ||
1655 | if(pFromU2022State->cs[0] != ASCII) { | |
1656 | int32_t escLen = escSeqCharsLen[ASCII]; | |
1657 | uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); | |
1658 | outLen += escLen; | |
1659 | pFromU2022State->cs[0] = (int8_t)ASCII; | |
1660 | } | |
1661 | ||
1662 | /* get the source index of the last input character */ | |
1663 | /* | |
1664 | * TODO this would be simpler and more reliable if we used a pair | |
1665 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c | |
1666 | * so that we could simply use the prevSourceIndex here; | |
1667 | * this code gives an incorrect result for the rare case of an unmatched | |
1668 | * trail surrogate that is alone in the last buffer of the text stream | |
1669 | */ | |
1670 | sourceIndex=(int32_t)(source-args->source); | |
1671 | if(sourceIndex>0) { | |
1672 | --sourceIndex; | |
1673 | if( U16_IS_TRAIL(args->source[sourceIndex]) && | |
1674 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) | |
1675 | ) { | |
1676 | --sourceIndex; | |
1677 | } | |
1678 | } else { | |
1679 | sourceIndex=-1; | |
1680 | } | |
1681 | ||
73c04bcf | 1682 | fromUWriteUInt8( |
374ca955 A |
1683 | args->converter, |
1684 | buffer, outLen, | |
73c04bcf | 1685 | &target, (const char *)targetLimit, |
374ca955 A |
1686 | &offsets, sourceIndex, |
1687 | err); | |
b75a7d8f A |
1688 | } |
1689 | ||
1690 | /*save the state and return */ | |
1691 | args->source = source; | |
1692 | args->target = (char*)target; | |
1693 | } | |
1694 | ||
1695 | /*************** to unicode *******************/ | |
1696 | ||
b75a7d8f A |
1697 | static void |
1698 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, | |
374ca955 A |
1699 | UErrorCode* err){ |
1700 | char tempBuf[3]; | |
1701 | const char *mySource = (char *) args->source; | |
b75a7d8f A |
1702 | UChar *myTarget = args->target; |
1703 | const char *mySourceLimit = args->sourceLimit; | |
1704 | uint32_t targetUniChar = 0x0000; | |
1705 | uint32_t mySourceChar = 0x0000; | |
1706 | UConverterDataISO2022* myData; | |
374ca955 A |
1707 | ISO2022State *pToU2022State; |
1708 | StateEnum cs; | |
b75a7d8f | 1709 | |
b75a7d8f | 1710 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
374ca955 | 1711 | pToU2022State = &myData->toU2022State; |
b75a7d8f | 1712 | |
374ca955 A |
1713 | if(myData->key != 0) { |
1714 | /* continue with a partial escape sequence */ | |
1715 | goto escape; | |
1716 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { | |
1717 | /* continue with a partial double-byte character */ | |
1718 | mySourceChar = args->converter->toUBytes[0]; | |
1719 | args->converter->toULength = 0; | |
1720 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; | |
fd0068a8 | 1721 | targetUniChar = missingCharMarker; |
374ca955 A |
1722 | goto getTrailByte; |
1723 | } | |
1724 | ||
1725 | while(mySource < mySourceLimit){ | |
1726 | ||
1727 | targetUniChar =missingCharMarker; | |
b75a7d8f A |
1728 | |
1729 | if(myTarget < args->targetLimit){ | |
1730 | ||
1731 | mySourceChar= (unsigned char) *mySource++; | |
374ca955 A |
1732 | |
1733 | switch(mySourceChar) { | |
1734 | case UCNV_SI: | |
1735 | if(myData->version==3) { | |
1736 | pToU2022State->g=0; | |
b75a7d8f | 1737 | continue; |
374ca955 A |
1738 | } else { |
1739 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ | |
d5d484b0 | 1740 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
374ca955 | 1741 | break; |
b75a7d8f | 1742 | } |
b75a7d8f | 1743 | |
374ca955 A |
1744 | case UCNV_SO: |
1745 | if(myData->version==3) { | |
1746 | /* JIS7: switch to G1 half-width Katakana */ | |
1747 | pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; | |
1748 | pToU2022State->g=1; | |
b75a7d8f | 1749 | continue; |
374ca955 A |
1750 | } else { |
1751 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ | |
d5d484b0 | 1752 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
374ca955 | 1753 | break; |
b75a7d8f | 1754 | } |
b75a7d8f | 1755 | |
374ca955 A |
1756 | case ESC_2022: |
1757 | mySource--; | |
1758 | escape: | |
d5d484b0 A |
1759 | { |
1760 | const char * mySourceBefore = mySource; | |
1761 | int8_t toULengthBefore = args->converter->toULength; | |
1762 | ||
1763 | changeState_2022(args->converter,&(mySource), | |
1764 | mySourceLimit, ISO_2022_JP,err); | |
1765 | ||
1766 | /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ | |
1767 | if ( myData->version == 0 && myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) { | |
1768 | *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */ | |
1769 | args->converter->toULength = toULengthBefore + (mySource - mySourceBefore); | |
1770 | } | |
b75a7d8f | 1771 | |
d5d484b0 | 1772 | } |
374ca955 A |
1773 | /* invalid or illegal escape sequence */ |
1774 | if(U_FAILURE(*err)){ | |
1775 | args->target = myTarget; | |
1776 | args->source = mySource; | |
d5d484b0 | 1777 | myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
374ca955 | 1778 | return; |
b75a7d8f | 1779 | } |
d5d484b0 A |
1780 | /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ |
1781 | if (myData->key == 0) { | |
1782 | myData->isEmptySegment = TRUE; | |
1783 | } | |
374ca955 | 1784 | continue; |
b75a7d8f | 1785 | |
374ca955 | 1786 | /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
b75a7d8f | 1787 | |
374ca955 A |
1788 | case CR: |
1789 | /*falls through*/ | |
1790 | case LF: | |
1791 | /* automatically reset to single-byte mode */ | |
1792 | if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { | |
1793 | pToU2022State->cs[0] = (int8_t)ASCII; | |
b75a7d8f | 1794 | } |
374ca955 A |
1795 | pToU2022State->cs[2] = 0; |
1796 | pToU2022State->g = 0; | |
1797 | /* falls through */ | |
b75a7d8f | 1798 | default: |
374ca955 | 1799 | /* convert one or two bytes */ |
d5d484b0 | 1800 | myData->isEmptySegment = FALSE; |
374ca955 A |
1801 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
1802 | if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && | |
1803 | !IS_JP_DBCS(cs) | |
1804 | ) { | |
1805 | /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ | |
1806 | targetUniChar = mySourceChar + (0xff61 - 0xa1); | |
1807 | ||
1808 | /* return from a single-shift state to the previous one */ | |
1809 | if(pToU2022State->g >= 2) { | |
1810 | pToU2022State->g=pToU2022State->prevG; | |
1811 | } | |
1812 | } else switch(cs) { | |
1813 | case ASCII: | |
1814 | if(mySourceChar <= 0x7f) { | |
1815 | targetUniChar = mySourceChar; | |
1816 | } | |
1817 | break; | |
1818 | case ISO8859_1: | |
1819 | if(mySourceChar <= 0x7f) { | |
1820 | targetUniChar = mySourceChar + 0x80; | |
1821 | } | |
1822 | /* return from a single-shift state to the previous one */ | |
1823 | pToU2022State->g=pToU2022State->prevG; | |
1824 | break; | |
1825 | case ISO8859_7: | |
1826 | if(mySourceChar <= 0x7f) { | |
1827 | /* convert mySourceChar+0x80 to use a normal 8-bit table */ | |
1828 | targetUniChar = | |
1829 | _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( | |
1830 | myData->myConverterArray[cs], | |
1831 | mySourceChar + 0x80); | |
1832 | } | |
1833 | /* return from a single-shift state to the previous one */ | |
1834 | pToU2022State->g=pToU2022State->prevG; | |
1835 | break; | |
1836 | case JISX201: | |
1837 | if(mySourceChar <= 0x7f) { | |
1838 | targetUniChar = | |
1839 | _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( | |
1840 | myData->myConverterArray[cs], | |
1841 | mySourceChar); | |
1842 | } | |
1843 | break; | |
1844 | case HWKANA_7BIT: | |
1845 | if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { | |
1846 | /* 7-bit halfwidth Katakana */ | |
1847 | targetUniChar = mySourceChar + (0xff61 - 0x21); | |
1848 | } | |
1849 | break; | |
1850 | default: | |
1851 | /* G0 DBCS */ | |
1852 | if(mySource < mySourceLimit) { | |
fd0068a8 A |
1853 | int leadIsOk, trailIsOk; |
1854 | uint8_t trailByte; | |
374ca955 | 1855 | getTrailByte: |
fd0068a8 A |
1856 | trailByte = (uint8_t)*mySource; |
1857 | /* old | |
374ca955 A |
1858 | tempBuf[0] = (char) (mySourceChar); |
1859 | tempBuf[1] = trailByte = *mySource++; | |
1860 | mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); | |
1861 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); | |
fd0068a8 A |
1862 | */ |
1863 | /* | |
1864 | * Ticket 5691: consistent illegal sequences: | |
1865 | * - We include at least the first byte in the illegal sequence. | |
1866 | * - If any of the non-initial bytes could be the start of a character, | |
1867 |