]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
b331163b | 3 | * Copyright (C) 2000-2015, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
4388f060 | 6 | * file name: ucnv2022.cpp |
b75a7d8f A |
7 | * encoding: US-ASCII |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2000feb03 | |
12 | * created by: Markus W. Scherer | |
13 | * | |
14 | * Change history: | |
15 | * | |
16 | * 06/29/2000 helena Major rewrite of the callback APIs. | |
17 | * 08/08/2000 Ram Included support for ISO-2022-JP-2 | |
18 | * Changed implementation of toUnicode | |
19 | * function | |
20 | * 08/21/2000 Ram Added support for ISO-2022-KR | |
21 | * 08/29/2000 Ram Separated implementation of EBCDIC to | |
22 | * ucnvebdc.c | |
23 | * 09/20/2000 Ram Added support for ISO-2022-CN | |
24 | * Added implementations for getNextUChar() | |
25 | * for specific 2022 country variants. | |
26 | * 10/31/2000 Ram Implemented offsets logic functions | |
27 | */ | |
28 | ||
29 | #include "unicode/utypes.h" | |
30 | ||
374ca955 | 31 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
b75a7d8f A |
32 | |
33 | #include "unicode/ucnv.h" | |
34 | #include "unicode/uset.h" | |
35 | #include "unicode/ucnv_err.h" | |
36 | #include "unicode/ucnv_cb.h" | |
4388f060 | 37 | #include "unicode/utf16.h" |
374ca955 | 38 | #include "ucnv_imp.h" |
b75a7d8f A |
39 | #include "ucnv_bld.h" |
40 | #include "ucnv_cnv.h" | |
41 | #include "ucnvmbcs.h" | |
42 | #include "cstring.h" | |
43 | #include "cmemory.h" | |
4388f060 | 44 | #include "uassert.h" |
b75a7d8f | 45 | |
374ca955 A |
46 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
47 | /* | |
48 | * I am disabling the generic ISO-2022 converter after proposing to do so on | |
49 | * the icu mailing list two days ago. | |
50 | * | |
51 | * Reasons: | |
52 | * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of | |
53 | * its designation sequences, single shifts with return to the previous state, | |
54 | * switch-with-no-return to UTF-16BE or similar, etc. | |
55 | * This is unlike the language-specific variants like ISO-2022-JP which | |
56 | * require a much smaller repertoire of ISO-2022 features. | |
57 | * These variants continue to be supported. | |
58 | * 2. I believe that no one is really using the generic ISO-2022 converter | |
59 | * but rather always one of the language-specific variants. | |
60 | * Note that ICU's generic ISO-2022 converter has always output one escape | |
61 | * sequence followed by UTF-8 for the whole stream. | |
62 | * 3. Switching between subcharsets is extremely slow, because each time | |
63 | * the previous converter is closed and a new one opened, | |
64 | * without any kind of caching, least-recently-used list, etc. | |
65 | * 4. The code is currently buggy, and given the above it does not seem | |
66 | * reasonable to spend the time on maintenance. | |
67 | * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. | |
68 | * This means, for example, that when ISO-8859-7 is designated, the following | |
69 | * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. | |
70 | * The ICU ISO-2022 converter does not handle this - and has no information | |
71 | * about which subconverter would have to be shifted vs. which is designed | |
72 | * for 7-bit ISO-2022. | |
73 | * | |
74 | * Markus Scherer 2003-dec-03 | |
75 | */ | |
76 | #endif | |
77 | ||
#if !UCONFIG_ONLY_HTML_CONVERSION
/* The SI (Shift In) control byte as a NUL-terminated string. */
static const char SHIFT_IN_STR[]  = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
#endif

/* ASCII control and whitespace code points. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/* First and last code point of the HWKANA range. */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};

/*
 * A 94-character set has native byte values A1..FE and a 96-character set
 * has native byte values A0..FF. ISO 2022 encodes them as 21..7E and 20..7F
 * respectively, i.e. the native byte minus 0x80.
 * C1 control codes (native bytes 80..9F) must not be encoded as the
 * C0 control codes 00..1F.
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_START = 0xa0,
    GR96_END   = 0xff
};

/*
 * SO, SI and ESC must never be produced when converting from Unicode
 * because they would corrupt the ISO 2022 byte stream.
 * 0x0800c000 has exactly the bits 0xe (SO), 0xf (SI) and 0x1b (ESC) set,
 * so shifting 1 by a value below 0x20 and masking tests for those three codes.
 * Note that the argument is evaluated twice.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
116 | ||
/*
 * Charset/state identifiers for the ISO-2022-JP and ISO-2022-CN
 * implementations. The JP and CN groups deliberately reuse the same small
 * integers; which meaning applies depends on the variant in use.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* ISO-2022-JP charsets */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /*
     * ISO-2022-CN charsets.
     * These first constants must keep their values because they are used
     * as indexes into myConverterArray[].
     */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * Per-plane values stored in StateEnum and ISO2022State variables.
     * To index into myConverterArray[] use CNS_11643 instead.
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
155 | ||
/*
 * Tests whether a StateEnum charset value denotes a DBCS charset.
 * In an HTML-only build only JISX208 qualifies; otherwise the whole
 * JISX208..KSC5601 range does.
 */
#if !UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#else
#define IS_JP_DBCS(cs) (JISX208==(cs))
#endif

/* Charset mask: a value with only the bit at position cs set. */
#define CSM(cs) ((uint16_t)1<<(cs))
b75a7d8f | 164 | |
374ca955 A |
165 | /* |
166 | * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence | |
167 | * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x | |
168 | * | |
169 | * Note: The converter uses some leniency: | |
170 | * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in | |
171 | * all versions, not just JIS7 and JIS8. | |
172 | * - ICU does not distinguish between different versions of JIS X 0208. | |
173 | */ | |
b331163b A |
174 | #if UCONFIG_ONLY_HTML_CONVERSION |
175 | enum { MAX_JA_VERSION=0 }; | |
176 | #else | |
729e4ab9 | 177 | enum { MAX_JA_VERSION=4 }; |
b331163b | 178 | #endif |
729e4ab9 | 179 | static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ |
374ca955 | 180 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), |
b331163b | 181 | #if !UCONFIG_ONLY_HTML_CONVERSION |
374ca955 A |
182 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), |
183 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), | |
184 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), | |
185 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) | |
b331163b | 186 | #endif |
374ca955 | 187 | }; |
b75a7d8f A |
188 | |
/* Kind of charset/converter currently selected by an ISO-2022 converter. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
197 | ||
374ca955 A |
/* Designation and shift state for one conversion direction. */
struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
};
typedef struct ISO2022State ISO2022State;
203 | ||
b75a7d8f A |
204 | #define UCNV_OPTIONS_VERSION_MASK 0xf |
205 | #define UCNV_2022_MAX_CONVERTERS 10 | |
206 | ||
207 | typedef struct{ | |
73c04bcf | 208 | UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; |
b75a7d8f | 209 | UConverter *currentConverter; |
b75a7d8f | 210 | Cnv2022Type currentType; |
374ca955 | 211 | ISO2022State toU2022State, fromU2022State; |
b75a7d8f A |
212 | uint32_t key; |
213 | uint32_t version; | |
73c04bcf A |
214 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
215 | UBool isFirstBuffer; | |
216 | #endif | |
d5d484b0 | 217 | UBool isEmptySegment; |
b75a7d8f | 218 | char name[30]; |
73c04bcf | 219 | char locale[3]; |
b75a7d8f A |
220 | }UConverterDataISO2022; |
221 | ||
374ca955 | 222 | /* Protos */ |
b75a7d8f A |
223 | /* ISO-2022 ----------------------------------------------------------------- */ |
224 | ||
225 | /*Forward declaration */ | |
46f4442e | 226 | U_CFUNC void |
374ca955 A |
227 | ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
228 | UErrorCode * err); | |
46f4442e | 229 | U_CFUNC void |
374ca955 A |
230 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
231 | UErrorCode * err); | |
b75a7d8f A |
232 | |
#define ESC_2022 0x1B /*ESC*/

/* Classification of the bytes read so far against the ISO 2022 escape sequence tables. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022              = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022   = 0,
    /* corresponds to a valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022       = 1,
    /*
     * so far matches one ISO 2022 escape sequence, but adding more
     * characters might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
242 | ||
243 | /* | |
244 | * The way these state transition arrays work is: | |
245 | * ex : ESC$B is the sequence for JISX208 | |
246 | * a) First Iteration: char is ESC | |
247 | * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index | |
248 | * int x = normalize_esq_chars_2022[27] which is equal to 1 | |
249 | * ii) Search for this value in escSeqStateTable_Key_2022[] | |
250 | * value of x is stored at escSeqStateTable_Key_2022[0] | |
251 | * iii) Save this index as offset | |
252 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] | |
253 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 | |
254 | * b) Switch on this state and continue to next char | |
255 | * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index | |
256 | * which is normalize_esq_chars_2022[36] == 4 | |
257 | * ii) x is currently 1(from above) | |
258 | * x<<=5 -- x is now 32 | |
259 | * x+=normalize_esq_chars_2022[36] | |
260 | * now x is 36 | |
261 | * iii) Search for this value in escSeqStateTable_Key_2022[] | |
262 | * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 | |
263 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] | |
264 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 | |
265 | * c) Switch on this state and continue to next char | |
266 | * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index | |
267 | * ii) x is currently 36 (from above) | |
268 | * x<<=5 -- x is now 1152 | |
269 | * x+=normalize_esq_chars_2022[66] | |
270 | * now x is 1161 | |
271 | * iii) Search for this value in escSeqStateTable_Key_2022[] | |
272 | * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 | |
273 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] | |
274 | * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 | |
275 | * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208 | |
276 | */ | |
277 | ||
278 | ||
279 | /*Below are the 3 arrays depicting a state transition table*/ | |
/*
 * Maps a byte of an escape sequence to a small code in 1..29, or 0 for
 * bytes that never occur in a recognized escape sequence. The codes are
 * combined with key=(key<<5)+code to form escSeqStateTable_Key_2022 keys.
 * Laid out as 16 entries per row, so the row comment is the high nibble.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*        x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
/* 0x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  /* ESC */
/* 2x */   0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,  /* $ % & ( ) * + - . / */
/* 3x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */   5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,  /* @ A..O */
/* 5x */   0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,  /* R Z */
/* 6x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 7x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 8x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 9x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ax */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Bx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Cx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Dx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ex */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Fx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
310 | ||
374ca955 A |
311 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
312 | /* | |
313 | * When the generic ISO-2022 converter is completely removed, not just disabled | |
314 | * per #ifdef, then the following state table and the associated tables that are | |
315 | * dimensioned with MAX_STATES_2022 should be trimmed. | |
316 | * | |
317 | * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of | |
318 | * the associated escape sequences starting with ESC ( B should be removed. | |
319 | * This includes the ones with key values 1097 and all of the ones above 1000000. | |
320 | * | |
321 | * For the latter, the tables can simply be truncated. | |
322 | * For the former, since the tables must be kept parallel, it is probably best | |
323 | * to simply duplicate an adjacent table cell, parallel in all tables. | |
324 | * | |
325 | * It may make sense to restructure the tables, especially by using small search | |
326 | * tables for the variants instead of indexing them parallel to the table here. | |
327 | */ | |
328 | #endif | |
329 | ||
b75a7d8f A |
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape sequence prefixes, in ascending order.
 * A key is built from the normalized byte codes as key=(key<<5)+code.
 * The entry index is shared with the parallel Value and Result tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,
    /*  5 */ 57,       60,       61,       1093,     1096,
    /* 10 */ 1097,     1098,     1099,     1100,     1101,
    /* 15 */ 1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,     1154,     1157,     1160,     1161,
    /* 25 */ 1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,     1773,     1957,     35105,    36933,
    /* 35 */ 36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,
    /* 45 */ 36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,    37711,    37744,    37745,    37746,
    /* 55 */ 37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,    40140,    40141,    1123363,  35947624,
    /* 65 */ 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
343 | ||
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each entry of escSeqStateTable_Key_2022 (same index);
 * NULL where no name is associated with the entry.
 * Only compiled for the generic ISO-2022 converter.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL,                   NULL,                   NULL,                   NULL,                   NULL,
    /*  5 */ NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
    /* 10 */ "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
    /* 15 */ "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
    /* 20 */ "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
    /* 25 */ NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
    /* 30 */ "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
    /* 35 */ "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
    /* 40 */ "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
    /* 45 */ "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
    /* 65 */ "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
    /* 70 */ "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
360 | ||
46f4442e | 361 | static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { |
b75a7d8f | 362 | /* 0 1 2 3 4 5 6 7 8 9 */ |
374ca955 | 363 | VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
b75a7d8f A |
364 | ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
365 | ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
366 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
367 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
368 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
369 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
370 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 | |
371 | }; | |
372 | ||
b75a7d8f A |
/* ISO-2022 variant selector; type def for refactoring changeState_2022 code. */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR,        /* 2 */
    ISO_2022_CN         /* 3 */
#endif
} Variant2022;
384 | ||
b75a7d8f | 385 | /*********** ISO 2022 Converter Protos ***********/ |
46f4442e | 386 | static void |
729e4ab9 | 387 | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); |
b75a7d8f A |
388 | |
389 | static void | |
390 | _ISO2022Close(UConverter *converter); | |
391 | ||
46f4442e | 392 | static void |
b75a7d8f A |
393 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); |
394 | ||
46f4442e | 395 | static const char* |
b75a7d8f A |
396 | _ISO2022getName(const UConverter* cnv); |
397 | ||
46f4442e | 398 | static void |
b75a7d8f A |
399 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); |
400 | ||
46f4442e | 401 | static UConverter * |
b75a7d8f A |
402 | _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); |
403 | ||
374ca955 | 404 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
46f4442e | 405 | static void |
374ca955 A |
406 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); |
407 | #endif | |
b75a7d8f | 408 | |
4388f060 A |
409 | namespace { |
410 | ||
374ca955 | 411 | /*const UConverterSharedData _ISO2022Data;*/ |
4388f060 | 412 | extern const UConverterSharedData _ISO2022JPData; |
b331163b A |
413 | |
414 | #if !UCONFIG_ONLY_HTML_CONVERSION | |
4388f060 A |
415 | extern const UConverterSharedData _ISO2022KRData; |
416 | extern const UConverterSharedData _ISO2022CNData; | |
b331163b | 417 | #endif |
4388f060 A |
418 | |
419 | } // namespace | |
b75a7d8f | 420 | |
374ca955 | 421 | /*************** Converter implementations ******************/ |
b75a7d8f | 422 | |
73c04bcf | 423 | /* The purpose of this function is to get around gcc compiler warnings. */ |
4388f060 | 424 | static inline void |
73c04bcf A |
425 | fromUWriteUInt8(UConverter *cnv, |
426 | const char *bytes, int32_t length, | |
427 | uint8_t **target, const char *targetLimit, | |
428 | int32_t **offsets, | |
429 | int32_t sourceIndex, | |
430 | UErrorCode *pErrorCode) | |
431 | { | |
432 | char *targetChars = (char *)*target; | |
433 | ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, | |
434 | offsets, sourceIndex, pErrorCode); | |
435 | *target = (uint8_t*)targetChars; | |
436 | ||
437 | } | |
438 | ||
4388f060 A |
439 | static inline void |
440 | setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ | |
374ca955 A |
441 | if(myConverterData->version == 1) { |
442 | UConverter *cnv = myConverterData->currentConverter; | |
b75a7d8f | 443 | |
374ca955 A |
444 | cnv->toUnicodeStatus=0; /* offset */ |
445 | cnv->mode=0; /* state */ | |
446 | cnv->toULength=0; /* byteIndex */ | |
447 | } | |
448 | } | |
b75a7d8f | 449 | |
4388f060 | 450 | static inline void |
374ca955 A |
451 | setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ |
452 | /* in ISO-2022-KR the designator sequence appears only once | |
453 | * in a file so we append it only once | |
454 | */ | |
455 | if( converter->charErrorBufferLength==0){ | |
b75a7d8f | 456 | |
374ca955 A |
457 | converter->charErrorBufferLength = 4; |
458 | converter->charErrorBuffer[0] = 0x1b; | |
459 | converter->charErrorBuffer[1] = 0x24; | |
460 | converter->charErrorBuffer[2] = 0x29; | |
461 | converter->charErrorBuffer[3] = 0x43; | |
462 | } | |
463 | if(myConverterData->version == 1) { | |
464 | UConverter *cnv = myConverterData->currentConverter; | |
b75a7d8f | 465 | |
374ca955 A |
466 | cnv->fromUChar32=0; |
467 | cnv->fromUnicodeStatus=1; /* prevLength */ | |
468 | } | |
469 | } | |
b75a7d8f | 470 | |
46f4442e | 471 | static void |
729e4ab9 | 472 | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
b75a7d8f | 473 | |
374ca955 | 474 | char myLocale[6]={' ',' ',' ',' ',' ',' '}; |
b75a7d8f | 475 | |
374ca955 A |
476 | cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
477 | if(cnv->extraInfo != NULL) { | |
729e4ab9 | 478 | UConverterNamePieces stackPieces; |
4388f060 | 479 | UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; |
374ca955 A |
480 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
481 | uint32_t version; | |
b75a7d8f | 482 | |
729e4ab9 A |
483 | stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; |
484 | ||
374ca955 | 485 | uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
374ca955 | 486 | myConverterData->currentType = ASCII1; |
374ca955 | 487 | cnv->fromUnicodeStatus =FALSE; |
729e4ab9 A |
488 | if(pArgs->locale){ |
489 | uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); | |
374ca955 | 490 | } |
729e4ab9 | 491 | version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; |
73c04bcf | 492 | myConverterData->version = version; |
46f4442e | 493 | if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
73c04bcf A |
494 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
495 | { | |
374ca955 | 496 | /* open the required converters and cache them */ |
729e4ab9 | 497 | if(version>MAX_JA_VERSION) { |
b331163b A |
498 | // ICU 55 fails to open a converter for an unsupported version. |
499 | // Previously, it fell back to version 0, but that would yield | |
500 | // unexpected behavior. | |
501 | *errorCode = U_MISSING_RESOURCE_ERROR; | |
502 | return; | |
729e4ab9 | 503 | } |
374ca955 | 504 | if(jpCharsetMasks[version]&CSM(ISO8859_7)) { |
729e4ab9 A |
505 | myConverterData->myConverterArray[ISO8859_7] = |
506 | ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); | |
374ca955 | 507 | } |
729e4ab9 A |
508 | myConverterData->myConverterArray[JISX208] = |
509 | ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); | |
374ca955 | 510 | if(jpCharsetMasks[version]&CSM(JISX212)) { |
729e4ab9 A |
511 | myConverterData->myConverterArray[JISX212] = |
512 | ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); | |
374ca955 A |
513 | } |
514 | if(jpCharsetMasks[version]&CSM(GB2312)) { | |
729e4ab9 A |
515 | myConverterData->myConverterArray[GB2312] = |
516 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ | |
374ca955 A |
517 | } |
518 | if(jpCharsetMasks[version]&CSM(KSC5601)) { | |
729e4ab9 A |
519 | myConverterData->myConverterArray[KSC5601] = |
520 | ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); | |
374ca955 | 521 | } |
b75a7d8f | 522 | |
374ca955 A |
523 | /* set the function pointers to appropriate funtions */ |
524 | cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); | |
525 | uprv_strcpy(myConverterData->locale,"ja"); | |
b75a7d8f | 526 | |
46f4442e | 527 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); |
b331163b | 528 | size_t len = uprv_strlen(myConverterData->name); |
374ca955 A |
529 | myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); |
530 | myConverterData->name[len+1]='\0'; | |
531 | } | |
b331163b | 532 | #if !UCONFIG_ONLY_HTML_CONVERSION |
46f4442e | 533 | else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && |
73c04bcf A |
534 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
535 | { | |
b331163b A |
536 | if(version>1) { |
537 | // ICU 55 fails to open a converter for an unsupported version. | |
538 | // Previously, it fell back to version 0, but that would yield | |
539 | // unexpected behavior. | |
540 | *errorCode = U_MISSING_RESOURCE_ERROR; | |
541 | return; | |
542 | } | |
729e4ab9 A |
543 | const char *cnvName; |
544 | if(version==1) { | |
545 | cnvName="icu-internal-25546"; | |
546 | } else { | |
547 | cnvName="ibm-949"; | |
548 | myConverterData->version=version=0; | |
549 | } | |
550 | if(pArgs->onlyTestIsLoadable) { | |
551 | ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ | |
552 | uprv_free(cnv->extraInfo); | |
553 | cnv->extraInfo=NULL; | |
554 | return; | |
555 | } else { | |
556 | myConverterData->currentConverter=ucnv_open(cnvName, errorCode); | |
73c04bcf A |
557 | if (U_FAILURE(*errorCode)) { |
558 | _ISO2022Close(cnv); | |
559 | return; | |
560 | } | |
b75a7d8f | 561 | |
729e4ab9 A |
562 | if(version==1) { |
563 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); | |
564 | uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); | |
565 | cnv->subCharLen = myConverterData->currentConverter->subCharLen; | |
566 | }else{ | |
567 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); | |
73c04bcf | 568 | } |
b75a7d8f | 569 | |
729e4ab9 A |
570 | /* initialize the state variables */ |
571 | setInitialStateToUnicodeKR(cnv, myConverterData); | |
572 | setInitialStateFromUnicodeKR(cnv, myConverterData); | |
b75a7d8f | 573 | |
729e4ab9 A |
574 | /* set the function pointers to appropriate funtions */ |
575 | cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; | |
576 | uprv_strcpy(myConverterData->locale,"ko"); | |
577 | } | |
b75a7d8f | 578 | } |
46f4442e | 579 | else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& |
73c04bcf A |
580 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
581 | { | |
b331163b A |
582 | if(version>2) { |
583 | // ICU 55 fails to open a converter for an unsupported version. | |
584 | // Previously, it fell back to version 0, but that would yield | |
585 | // unexpected behavior. | |
586 | *errorCode = U_MISSING_RESOURCE_ERROR; | |
587 | return; | |
588 | } | |
b75a7d8f A |
589 | |
590 | /* open the required converters and cache them */ | |
729e4ab9 A |
591 | myConverterData->myConverterArray[GB2312_1] = |
592 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); | |
374ca955 | 593 | if(version==1) { |
729e4ab9 A |
594 | myConverterData->myConverterArray[ISO_IR_165] = |
595 | ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); | |
374ca955 | 596 | } |
729e4ab9 A |
597 | myConverterData->myConverterArray[CNS_11643] = |
598 | ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); | |
b75a7d8f | 599 | |
b75a7d8f A |
600 | |
601 | /* set the function pointers to appropriate funtions */ | |
602 | cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; | |
603 | uprv_strcpy(myConverterData->locale,"cn"); | |
604 | ||
729e4ab9 | 605 | if (version==0){ |
b75a7d8f | 606 | myConverterData->version = 0; |
46f4442e | 607 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); |
729e4ab9 A |
608 | }else if (version==1){ |
609 | myConverterData->version = 1; | |
610 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); | |
611 | }else { | |
612 | myConverterData->version = 2; | |
613 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); | |
b75a7d8f A |
614 | } |
615 | } | |
b331163b | 616 | #endif // !UCONFIG_ONLY_HTML_CONVERSION |
b75a7d8f | 617 | else{ |
374ca955 | 618 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
73c04bcf A |
619 | myConverterData->isFirstBuffer = TRUE; |
620 | ||
b75a7d8f A |
621 | /* append the UTF-8 escape sequence */ |
622 | cnv->charErrorBufferLength = 3; | |
623 | cnv->charErrorBuffer[0] = 0x1b; | |
624 | cnv->charErrorBuffer[1] = 0x25; | |
625 | cnv->charErrorBuffer[2] = 0x42; | |
626 | ||
627 | cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; | |
628 | /* initialize the state variables */ | |
b75a7d8f | 629 | uprv_strcpy(myConverterData->name,"ISO_2022"); |
374ca955 | 630 | #else |
b331163b A |
631 | *errorCode = U_MISSING_RESOURCE_ERROR; |
632 | // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard | |
633 | // data loading error code. | |
374ca955 A |
634 | return; |
635 | #endif | |
b75a7d8f A |
636 | } |
637 | ||
374ca955 A |
638 | cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
639 | ||
729e4ab9 | 640 | if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { |
374ca955 A |
641 | _ISO2022Close(cnv); |
642 | } | |
b75a7d8f A |
643 | } else { |
644 | *errorCode = U_MEMORY_ALLOCATION_ERROR; | |
645 | } | |
b75a7d8f A |
646 | } |
647 | ||
648 | ||
649 | static void | |
650 | _ISO2022Close(UConverter *converter) { | |
374ca955 A |
651 | UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); |
652 | UConverterSharedData **array = myData->myConverterArray; | |
653 | int32_t i; | |
b75a7d8f A |
654 | |
655 | if (converter->extraInfo != NULL) { | |
656 | /*close the array of converter pointers and free the memory*/ | |
374ca955 A |
657 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
658 | if(array[i]!=NULL) { | |
659 | ucnv_unloadSharedDataIfReady(array[i]); | |
b75a7d8f | 660 | } |
b75a7d8f A |
661 | } |
662 | ||
374ca955 | 663 | ucnv_close(myData->currentConverter); |
b75a7d8f A |
664 | |
665 | if(!converter->isExtraLocal){ | |
666 | uprv_free (converter->extraInfo); | |
374ca955 | 667 | converter->extraInfo = NULL; |
b75a7d8f A |
668 | } |
669 | } | |
670 | } | |
671 | ||
672 | static void | |
673 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { | |
674 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); | |
374ca955 A |
675 | if(choice<=UCNV_RESET_TO_UNICODE) { |
676 | uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); | |
677 | myConverterData->key = 0; | |
d5d484b0 | 678 | myConverterData->isEmptySegment = FALSE; |
374ca955 A |
679 | } |
680 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
681 | uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); | |
682 | } | |
683 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
684 | if(myConverterData->locale[0] == 0){ | |
b75a7d8f A |
685 | if(choice<=UCNV_RESET_TO_UNICODE) { |
686 | myConverterData->isFirstBuffer = TRUE; | |
374ca955 | 687 | myConverterData->key = 0; |
b75a7d8f A |
688 | if (converter->mode == UCNV_SO){ |
689 | ucnv_close (myConverterData->currentConverter); | |
690 | myConverterData->currentConverter=NULL; | |
691 | } | |
46f4442e | 692 | converter->mode = UCNV_SI; |
b75a7d8f A |
693 | } |
694 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
695 | /* re-append UTF-8 escape sequence */ | |
696 | converter->charErrorBufferLength = 3; | |
697 | converter->charErrorBuffer[0] = 0x1b; | |
698 | converter->charErrorBuffer[1] = 0x28; | |
699 | converter->charErrorBuffer[2] = 0x42; | |
700 | } | |
701 | } | |
374ca955 A |
702 | else |
703 | #endif | |
704 | { | |
b75a7d8f | 705 | /* reset the state variables */ |
374ca955 | 706 | if(myConverterData->locale[0] == 'k'){ |
b75a7d8f A |
707 | if(choice<=UCNV_RESET_TO_UNICODE) { |
708 | setInitialStateToUnicodeKR(converter, myConverterData); | |
709 | } | |
710 | if(choice!=UCNV_RESET_TO_UNICODE) { | |
711 | setInitialStateFromUnicodeKR(converter, myConverterData); | |
712 | } | |
713 | } | |
714 | } | |
715 | } | |
716 | ||
46f4442e | 717 | static const char* |
b75a7d8f A |
718 | _ISO2022getName(const UConverter* cnv){ |
719 | if(cnv->extraInfo){ | |
720 | UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; | |
721 | return myData->name; | |
722 | } | |
723 | return NULL; | |
724 | } | |
725 | ||
b75a7d8f | 726 | |
374ca955 A |
727 | /*************** to unicode *******************/ |
728 | /**************************************************************************** | |
729 | * Recognized escape sequences are | |
730 | * <ESC>(B ASCII | |
731 | * <ESC>.A ISO-8859-1 | |
732 | * <ESC>.F ISO-8859-7 | |
733 | * <ESC>(J JISX-201 | |
734 | * <ESC>(I JISX-201 | |
735 | * <ESC>$B JISX-208 | |
736 | * <ESC>$@ JISX-208 | |
737 | * <ESC>$(D JISX-212 | |
738 | * <ESC>$A GB2312 | |
739 | * <ESC>$(C KSC5601 | |
740 | */ | |
46f4442e | 741 | static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { |
374ca955 A |
742 | /* 0 1 2 3 4 5 6 7 8 9 */ |
743 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
744 | ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE | |
745 | ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
746 | ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE | |
747 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
748 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
749 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
750 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
751 | }; | |
b75a7d8f | 752 | |
b331163b | 753 | #if !UCONFIG_ONLY_HTML_CONVERSION |
374ca955 | 754 | /*************** to unicode *******************/ |
46f4442e | 755 | static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { |
374ca955 A |
756 | /* 0 1 2 3 4 5 6 7 8 9 */ |
757 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
758 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
759 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
760 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
761 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 | |
762 | ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
763 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
764 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE | |
765 | }; | |
b331163b | 766 | #endif |
b75a7d8f | 767 | |
b75a7d8f | 768 | |
46f4442e | 769 | static UCNV_TableStates_2022 |
374ca955 A |
770 | getKey_2022(char c,int32_t* key,int32_t* offset){ |
771 | int32_t togo; | |
772 | int32_t low = 0; | |
773 | int32_t hi = MAX_STATES_2022; | |
774 | int32_t oldmid=0; | |
b75a7d8f | 775 | |
374ca955 A |
776 | togo = normalize_esq_chars_2022[(uint8_t)c]; |
777 | if(togo == 0) { | |
778 | /* not a valid character anywhere in an escape sequence */ | |
779 | *key = 0; | |
780 | *offset = 0; | |
781 | return INVALID_2022; | |
782 | } | |
783 | togo = (*key << 5) + togo; | |
b75a7d8f | 784 | |
374ca955 | 785 | while (hi != low) /*binary search*/{ |
b75a7d8f | 786 | |
57a6839d | 787 | int32_t mid = (hi+low) >> 1; /*Finds median*/ |
374ca955 | 788 | |
46f4442e | 789 | if (mid == oldmid) |
374ca955 A |
790 | break; |
791 | ||
792 | if (escSeqStateTable_Key_2022[mid] > togo){ | |
793 | hi = mid; | |
794 | } | |
795 | else if (escSeqStateTable_Key_2022[mid] < togo){ | |
796 | low = mid; | |
797 | } | |
798 | else /*we found it*/{ | |
799 | *key = togo; | |
800 | *offset = mid; | |
46f4442e | 801 | return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; |
374ca955 A |
802 | } |
803 | oldmid = mid; | |
b75a7d8f | 804 | |
b75a7d8f | 805 | } |
b75a7d8f | 806 | |
374ca955 A |
807 | *key = 0; |
808 | *offset = 0; | |
809 | return INVALID_2022; | |
b75a7d8f A |
810 | } |
811 | ||
374ca955 A |
812 | /*runs through a state machine to determine the escape sequence - codepage correspondance |
813 | */ | |
46f4442e | 814 | static void |
374ca955 | 815 | changeState_2022(UConverter* _this, |
46f4442e | 816 | const char** source, |
374ca955 A |
817 | const char* sourceLimit, |
818 | Variant2022 var, | |
819 | UErrorCode* err){ | |
820 | UCNV_TableStates_2022 value; | |
821 | UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); | |
822 | uint32_t key = myData2022->key; | |
73c04bcf | 823 | int32_t offset = 0; |
fd0068a8 | 824 | int8_t initialToULength = _this->toULength; |
374ca955 A |
825 | char c; |
826 | ||
827 | value = VALID_NON_TERMINAL_2022; | |
828 | while (*source < sourceLimit) { | |
829 | c = *(*source)++; | |
830 | _this->toUBytes[_this->toULength++]=(uint8_t)c; | |
831 | value = getKey_2022(c,(int32_t *) &key, &offset); | |
46f4442e | 832 | |
374ca955 | 833 | switch (value){ |
b75a7d8f | 834 | |
374ca955 A |
835 | case VALID_NON_TERMINAL_2022 : |
836 | /* continue with the loop */ | |
837 | break; | |
b75a7d8f | 838 | |
374ca955 A |
839 | case VALID_TERMINAL_2022: |
840 | key = 0; | |
841 | goto DONE; | |
b75a7d8f | 842 | |
374ca955 A |
843 | case INVALID_2022: |
844 | goto DONE; | |
b75a7d8f | 845 | |
374ca955 A |
846 | case VALID_MAYBE_TERMINAL_2022: |
847 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
848 | /* ESC ( B is ambiguous only for ISO_2022 itself */ | |
849 | if(var == ISO_2022) { | |
850 | /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ | |
851 | _this->toULength = 0; | |
b75a7d8f | 852 | |
374ca955 A |
853 | /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ |
854 | ||
855 | /* continue with the loop */ | |
856 | value = VALID_NON_TERMINAL_2022; | |
857 | break; | |
858 | } else | |
859 | #endif | |
860 | { | |
861 | /* not ISO_2022 itself, finish here */ | |
862 | value = VALID_TERMINAL_2022; | |
863 | key = 0; | |
864 | goto DONE; | |
b75a7d8f A |
865 | } |
866 | } | |
b75a7d8f | 867 | } |
b75a7d8f | 868 | |
374ca955 A |
869 | DONE: |
870 | myData2022->key = key; | |
b75a7d8f | 871 | |
374ca955 A |
872 | if (value == VALID_NON_TERMINAL_2022) { |
873 | /* indicate that the escape sequence is incomplete: key!=0 */ | |
874 | return; | |
875 | } else if (value == INVALID_2022 ) { | |
876 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; | |
374ca955 A |
877 | } else /* value == VALID_TERMINAL_2022 */ { |
878 | switch(var){ | |
879 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
880 | case ISO_2022: | |
881 | { | |
882 | const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; | |
883 | if(chosenConverterName == NULL) { | |
884 | /* SS2 or SS3 */ | |
885 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
46f4442e | 886 | _this->toUCallbackReason = UCNV_UNASSIGNED; |
374ca955 | 887 | return; |
b75a7d8f | 888 | } |
374ca955 A |
889 | |
890 | _this->mode = UCNV_SI; | |
891 | ucnv_close(myData2022->currentConverter); | |
892 | myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); | |
893 | if(U_SUCCESS(*err)) { | |
894 | myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; | |
895 | _this->mode = UCNV_SO; | |
896 | } | |
897 | break; | |
898 | } | |
899 | #endif | |
900 | case ISO_2022_JP: | |
901 | { | |
46f4442e | 902 | StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; |
374ca955 A |
903 | switch(tempState) { |
904 | case INVALID_STATE: | |
905 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
906 | break; | |
907 | case SS2_STATE: | |
908 | if(myData2022->toU2022State.cs[2]!=0) { | |
909 | if(myData2022->toU2022State.g<2) { | |
910 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; | |
911 | } | |
912 | myData2022->toU2022State.g=2; | |
913 | } else { | |
914 | /* illegal to have SS2 before a matching designator */ | |
915 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; | |
916 | } | |
917 | break; | |
918 | /* case SS3_STATE: not used in ISO-2022-JP-x */ | |
919 | case ISO8859_1: | |
920 | case ISO8859_7: | |
921 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { | |
922 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
923 | } else { | |
924 | /* G2 charset for SS2 */ | |
925 | myData2022->toU2022State.cs[2]=(int8_t)tempState; | |
926 | } | |
927 | break; | |
928 | default: | |
929 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { | |
930 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
931 | } else { | |
932 | /* G0 charset */ | |
933 | myData2022->toU2022State.cs[0]=(int8_t)tempState; | |
934 | } | |
935 | break; | |
936 | } | |
937 | } | |
938 | break; | |
b331163b | 939 | #if !UCONFIG_ONLY_HTML_CONVERSION |
374ca955 A |
940 | case ISO_2022_CN: |
941 | { | |
46f4442e | 942 | StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; |
374ca955 A |
943 | switch(tempState) { |
944 | case INVALID_STATE: | |
945 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
946 | break; | |
947 | case SS2_STATE: | |
948 | if(myData2022->toU2022State.cs[2]!=0) { | |
949 | if(myData2022->toU2022State.g<2) { | |
950 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; | |
951 | } | |
952 | myData2022->toU2022State.g=2; | |
953 | } else { | |
954 | /* illegal to have SS2 before a matching designator */ | |
955 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; | |
956 | } | |
957 | break; | |
958 | case SS3_STATE: | |
959 | if(myData2022->toU2022State.cs[3]!=0) { | |
960 | if(myData2022->toU2022State.g<2) { | |
961 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; | |
962 | } | |
963 | myData2022->toU2022State.g=3; | |
964 | } else { | |
965 | /* illegal to have SS3 before a matching designator */ | |
966 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; | |
967 | } | |
968 | break; | |
969 | case ISO_IR_165: | |
970 | if(myData2022->version==0) { | |
971 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
972 | break; | |
973 | } | |
73c04bcf | 974 | /*fall through*/ |
374ca955 | 975 | case GB2312_1: |
73c04bcf | 976 | /*fall through*/ |
374ca955 A |
977 | case CNS_11643_1: |
978 | myData2022->toU2022State.cs[1]=(int8_t)tempState; | |
979 | break; | |
980 | case CNS_11643_2: | |
981 | myData2022->toU2022State.cs[2]=(int8_t)tempState; | |
982 | break; | |
983 | default: | |
984 | /* other CNS 11643 planes */ | |
985 | if(myData2022->version==0) { | |
986 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
987 | } else { | |
988 | myData2022->toU2022State.cs[3]=(int8_t)tempState; | |
989 | } | |
990 | break; | |
991 | } | |
992 | } | |
993 | break; | |
994 | case ISO_2022_KR: | |
995 | if(offset==0x30){ | |
996 | /* nothing to be done, just accept this one escape sequence */ | |
997 | } else { | |
998 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; | |
999 | } | |
1000 | break; | |
b331163b | 1001 | #endif // !UCONFIG_ONLY_HTML_CONVERSION |
374ca955 A |
1002 | |
1003 | default: | |
1004 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; | |
1005 | break; | |
1006 | } | |
1007 | } | |
1008 | if(U_SUCCESS(*err)) { | |
1009 | _this->toULength = 0; | |
fd0068a8 A |
1010 | } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
1011 | if(_this->toULength>1) { | |
1012 | /* | |
1013 | * Ticket 5691: consistent illegal sequences: | |
1014 | * - We include at least the first byte (ESC) in the illegal sequence. | |
1015 | * - If any of the non-initial bytes could be the start of a character, | |
1016 | * we stop the illegal sequence before the first one of those. | |
1017 | * In escape sequences, all following bytes are "printable", that is, | |
1018 | * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), | |
1019 | * they are valid single/lead bytes. | |
1020 | * For simplicity, we always only report the initial ESC byte as the | |
1021 | * illegal sequence and back out all other bytes we looked at. | |
1022 | */ | |
1023 | /* Back out some bytes. */ | |
1024 | int8_t backOutDistance=_this->toULength-1; | |
1025 | int8_t bytesFromThisBuffer=_this->toULength-initialToULength; | |
1026 | if(backOutDistance<=bytesFromThisBuffer) { | |
1027 | /* same as initialToULength<=1 */ | |
1028 | *source-=backOutDistance; | |
1029 | } else { | |
1030 | /* Back out bytes from the previous buffer: Need to replay them. */ | |
1031 | _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); | |
1032 | /* same as -(initialToULength-1) */ | |
1033 | /* preToULength is negative! */ | |
1034 | uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); | |
1035 | *source-=bytesFromThisBuffer; | |
1036 | } | |
1037 | _this->toULength=1; | |
1038 | } | |
46f4442e A |
1039 | } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
1040 | _this->toUCallbackReason = UCNV_UNASSIGNED; | |
374ca955 A |
1041 | } |
1042 | } | |
1043 | ||
b331163b | 1044 | #if !UCONFIG_ONLY_HTML_CONVERSION |
374ca955 A |
1045 | /*Checks the characters of the buffer against valid 2022 escape sequences |
1046 | *if the match we return a pointer to the initial start of the sequence otherwise | |
1047 | *we return sourceLimit | |
1048 | */ | |
1049 | /*for 2022 looks ahead in the stream | |
1050 | *to determine the longest possible convertible | |
1051 | *data stream | |
1052 | */ | |
4388f060 | 1053 | static inline const char* |
374ca955 A |
1054 | getEndOfBuffer_2022(const char** source, |
1055 | const char* sourceLimit, | |
4388f060 | 1056 | UBool /*flush*/){ |
374ca955 A |
1057 | |
1058 | const char* mySource = *source; | |
1059 | ||
1060 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
46f4442e | 1061 | if (*source >= sourceLimit) |
374ca955 A |
1062 | return sourceLimit; |
1063 | ||
1064 | do{ | |
1065 | ||
1066 | if (*mySource == ESC_2022){ | |
1067 | int8_t i; | |
1068 | int32_t key = 0; | |
1069 | int32_t offset; | |
1070 | UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; | |
1071 | ||
1072 | /* Kludge: I could not | |
1073 | * figure out the reason for validating an escape sequence | |
1074 | * twice - once here and once in changeState_2022(). | |
1075 | * is it possible to have an ESC character in a ISO2022 | |
1076 | * byte stream which is valid in a code page? Is it legal? | |
1077 | */ | |
46f4442e | 1078 | for (i=0; |
374ca955 A |
1079 | (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); |
1080 | i++) { | |
1081 | value = getKey_2022(*(mySource+i), &key, &offset); | |
1082 | } | |
46f4442e | 1083 | if (value > 0 || *mySource==ESC_2022) |
374ca955 A |
1084 | return mySource; |
1085 | ||
46f4442e | 1086 | if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) |
374ca955 A |
1087 | return sourceLimit; |
1088 | } | |
1089 | }while (++mySource < sourceLimit); | |
1090 | ||
1091 | return sourceLimit; | |
1092 | #else | |
1093 | while(mySource < sourceLimit && *mySource != ESC_2022) { | |
1094 | ++mySource; | |
1095 | } | |
1096 | return mySource; | |
1097 | #endif | |
1098 | } | |
b331163b | 1099 | #endif |
374ca955 A |
1100 | |
1101 | /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c | |
46f4442e A |
1102 | * any future change in _MBCSFromUChar32() function should be reflected here. |
1103 | * @return number of bytes in *value; negative number if fallback; 0 if no mapping | |
374ca955 | 1104 | */ |
4388f060 | 1105 | static inline int32_t |
374ca955 | 1106 | MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, |
46f4442e A |
1107 | UChar32 c, |
1108 | uint32_t* value, | |
1109 | UBool useFallback, | |
374ca955 A |
1110 | int outputType) |
1111 | { | |
1112 | const int32_t *cx; | |
1113 | const uint16_t *table; | |
1114 | uint32_t stage2Entry; | |
1115 | uint32_t myValue; | |
46f4442e | 1116 | int32_t length; |
374ca955 | 1117 | const uint8_t *p; |
46f4442e A |
1118 | /* |
1119 | * TODO(markus): Use and require new, faster MBCS conversion table structures. | |
1120 | * Use internal version of ucnv_open() that verifies that the new structures are available, | |
1121 | * else U_INTERNAL_PROGRAM_ERROR. | |
1122 | */ | |
374ca955 A |
1123 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
1124 | if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { | |
1125 | table=sharedData->mbcs.fromUnicodeTable; | |
1126 | stage2Entry=MBCS_STAGE_2_FROM_U(table, c); | |
1127 | /* get the bytes and the length for the output */ | |
1128 | if(outputType==MBCS_OUTPUT_2){ | |
1129 | myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); | |
1130 | if(myValue<=0xff) { | |
46f4442e | 1131 | length=1; |
374ca955 | 1132 | } else { |
46f4442e | 1133 | length=2; |
374ca955 A |
1134 | } |
1135 | } else /* outputType==MBCS_OUTPUT_3 */ { | |
1136 | p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); | |
1137 | myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; | |
1138 | if(myValue<=0xff) { | |
46f4442e | 1139 | length=1; |
374ca955 | 1140 | } else if(myValue<=0xffff) { |
46f4442e | 1141 | length=2; |
374ca955 | 1142 | } else { |
46f4442e | 1143 | length=3; |
b75a7d8f A |
1144 | } |
1145 | } | |
1146 | /* is this code point assigned, or do we use fallbacks? */ | |
46f4442e A |
1147 | if((stage2Entry&(1<<(16+(c&0xf))))!=0) { |
1148 | /* assigned */ | |
1149 | *value=myValue; | |
1150 | return length; | |
1151 | } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { | |
b75a7d8f | 1152 | /* |
374ca955 | 1153 | * We allow a 0 byte output if the "assigned" bit is set for this entry. |
b75a7d8f | 1154 | * There is no way with this data structure for fallback output |
374ca955 | 1155 | * to be a zero byte. |
b75a7d8f | 1156 | */ |
b75a7d8f | 1157 | *value=myValue; |
46f4442e | 1158 | return -length; |
b75a7d8f | 1159 | } |
b75a7d8f | 1160 | } |
374ca955 A |
1161 | |
1162 | cx=sharedData->mbcs.extIndexes; | |
1163 | if(cx!=NULL) { | |
46f4442e | 1164 | return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); |
374ca955 A |
1165 | } |
1166 | ||
1167 | /* unassigned */ | |
46f4442e | 1168 | return 0; |
b75a7d8f A |
1169 | } |
1170 | ||
1171 | /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c | |
46f4442e A |
1172 | * any future change in _MBCSSingleFromUChar32() function should be reflected here. |
1173 | * @param retval pointer to output byte | |
1174 | * @return 1 roundtrip byte 0 no mapping -1 fallback byte | |
b75a7d8f | 1175 | */ |
4388f060 | 1176 | static inline int32_t |
b75a7d8f | 1177 | MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, |
46f4442e A |
1178 | UChar32 c, |
1179 | uint32_t* retval, | |
b75a7d8f A |
1180 | UBool useFallback) |
1181 | { | |
46f4442e | 1182 | const uint16_t *table; |
b75a7d8f A |
1183 | int32_t value; |
1184 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ | |
374ca955 | 1185 | if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
46f4442e | 1186 | return 0; |
b75a7d8f A |
1187 | } |
1188 | /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ | |
374ca955 | 1189 | table=sharedData->mbcs.fromUnicodeTable; |
b75a7d8f | 1190 | /* get the byte for the output */ |
374ca955 | 1191 | value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); |
b75a7d8f | 1192 | /* is this code point assigned, or do we use fallbacks? */ |
46f4442e A |
1193 | *retval=(uint32_t)(value&0xff); |
1194 | if(value>=0xf00) { | |
1195 | return 1; /* roundtrip */ | |
1196 | } else if(useFallback ? value>=0x800 : value>=0xc00) { | |
1197 | return -1; /* fallback taken */ | |
b75a7d8f | 1198 | } else { |
46f4442e | 1199 | return 0; /* no mapping */ |
b75a7d8f | 1200 | } |
b75a7d8f A |
1201 | } |
1202 | ||
46f4442e A |
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * The range check is done on the full 32-bit value, so anything above 0xFFFF
 * (more than two bytes) is rejected instead of being judged by its low 16 bits.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* unsigned wrap-around makes value<0xa1a1 compare as a huge number */
    if( (value - 0xa1a1u) <= (0xfefeu - 0xa1a1u) &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1219 | ||
1220 | #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ | |
/*
 * This method does the reverse of _2022FromGR94DBCS(): it adds 0x80 to each of
 * the two bytes of the 2022 code point. If the result has both bytes in the
 * range A1..FE it is returned; otherwise the 2022 code point is returned
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const int pairInRange = (uint16_t)(shifted - 0xa1a1) <= (0xfefe - 0xa1a1);
    const int trailInRange = (uint8_t)(shifted - 0xa1) <= (0xfe - 0xa1);

    return (pairInRange && trailInRange) ? shifted : value;
}
1236 | #endif | |
1237 | ||
374ca955 A |
1238 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
1239 | ||
b75a7d8f A |
1240 | /********************************************************************************** |
1241 | * ISO-2022 Converter | |
1242 | * | |
1243 | * | |
1244 | */ | |
1245 | ||
46f4442e | 1246 | static void |
b75a7d8f A |
1247 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, |
1248 | UErrorCode* err){ | |
374ca955 A |
1249 | const char* mySourceLimit, *realSourceLimit; |
1250 | const char* sourceStart; | |
1251 | const UChar* myTargetStart; | |
b75a7d8f | 1252 | UConverter* saveThis; |
b75a7d8f | 1253 | UConverterDataISO2022* myData; |
374ca955 A |
1254 | int8_t length; |
1255 | ||
1256 | saveThis = args->converter; | |
1257 | myData=((UConverterDataISO2022*)(saveThis->extraInfo)); | |
1258 | ||
1259 | realSourceLimit = args->sourceLimit; | |
1260 | while (args->source < realSourceLimit) { | |
1261 | if(myData->key == 0) { /* are we in the middle of an escape sequence? */ | |
1262 | /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ | |
1263 | mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); | |
1264 | ||
1265 | if(args->source < mySourceLimit) { | |
1266 | if(myData->currentConverter==NULL) { | |
1267 | myData->currentConverter = ucnv_open("ASCII",err); | |
1268 | if(U_FAILURE(*err)){ | |
1269 | return; | |
1270 | } | |
b75a7d8f | 1271 | |
374ca955 A |
1272 | myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
1273 | saveThis->mode = UCNV_SO; | |
b75a7d8f | 1274 | } |
b75a7d8f | 1275 | |
374ca955 A |
1276 | /* convert to before the ESC or until the end of the buffer */ |
1277 | myData->isFirstBuffer=FALSE; | |
1278 | sourceStart = args->source; | |
1279 | myTargetStart = args->target; | |
1280 | args->converter = myData->currentConverter; | |
1281 | ucnv_toUnicode(args->converter, | |
1282 | &args->target, | |
1283 | args->targetLimit, | |
1284 | &args->source, | |
1285 | mySourceLimit, | |
1286 | args->offsets, | |
1287 | (UBool)(args->flush && mySourceLimit == realSourceLimit), | |
1288 | err); | |
1289 | args->converter = saveThis; | |
1290 | ||
1291 | if (*err == U_BUFFER_OVERFLOW_ERROR) { | |
1292 | /* move the overflow buffer */ | |
1293 | length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; | |
1294 | myData->currentConverter->UCharErrorBufferLength = 0; | |
1295 | if(length > 0) { | |
1296 | uprv_memcpy(saveThis->UCharErrorBuffer, | |
1297 | myData->currentConverter->UCharErrorBuffer, | |
1298 | length*U_SIZEOF_UCHAR); | |
1299 | } | |
1300 | return; | |
1301 | } | |
b75a7d8f | 1302 | |
374ca955 A |
1303 | /* |
1304 | * At least one of: | |
1305 | * -Error while converting | |
1306 | * -Done with entire buffer | |
1307 | * -Need to write offsets or update the current offset | |
1308 | * (leave that up to the code in ucnv.c) | |
1309 | * | |
1310 | * or else we just stopped at an ESC byte and continue with changeState_2022() | |
1311 | */ | |
1312 | if (U_FAILURE(*err) || | |
1313 | (args->source == realSourceLimit) || | |
1314 | (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || | |
1315 | (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) | |
1316 | ) { | |
1317 | /* copy partial or error input for truncated detection and error handling */ | |
1318 | if(U_FAILURE(*err)) { | |
1319 | length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; | |
1320 | if(length > 0) { | |
1321 | uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); | |
1322 | } | |
1323 | } else { | |
1324 | length = saveThis->toULength = myData->currentConverter->toULength; | |
1325 | if(length > 0) { | |
1326 | uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); | |
1327 | if(args->source < mySourceLimit) { | |
1328 | *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ | |
1329 | } | |
1330 | } | |
1331 | } | |
1332 | return; | |
b75a7d8f | 1333 | } |
b75a7d8f A |
1334 | } |
1335 | } | |
b75a7d8f A |
1336 | |
1337 | sourceStart = args->source; | |
1338 | changeState_2022(args->converter, | |
46f4442e | 1339 | &(args->source), |
374ca955 | 1340 | realSourceLimit, |
b75a7d8f | 1341 | ISO_2022, |
b75a7d8f | 1342 | err); |
374ca955 A |
1343 | if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { |
1344 | /* let the ucnv.c code update its current offset */ | |
1345 | return; | |
b75a7d8f | 1346 | } |
b75a7d8f | 1347 | } |
b75a7d8f A |
1348 | } |
1349 | ||
374ca955 | 1350 | #endif |
b75a7d8f A |
1351 | |
1352 | /* | |
1353 | * To Unicode Callback helper function | |
1354 | */ | |
46f4442e | 1355 | static void |
374ca955 A |
1356 | toUnicodeCallback(UConverter *cnv, |
1357 | const uint32_t sourceChar, const uint32_t targetUniChar, | |
1358 | UErrorCode* err){ | |
b75a7d8f | 1359 | if(sourceChar>0xff){ |
374ca955 A |
1360 | cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
1361 | cnv->toUBytes[1] = (uint8_t)sourceChar; | |
1362 | cnv->toULength = 2; | |
b75a7d8f A |
1363 | } |
1364 | else{ | |
374ca955 | 1365 | cnv->toUBytes[0] =(char) sourceChar; |
fd0068a8 | 1366 | cnv->toULength = 1; |
b75a7d8f A |
1367 | } |
1368 | ||
1369 | if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ | |
b75a7d8f A |
1370 | *err = U_INVALID_CHAR_FOUND; |
1371 | } | |
1372 | else{ | |
b75a7d8f A |
1373 | *err = U_ILLEGAL_CHAR_FOUND; |
1374 | } | |
b75a7d8f A |
1375 | } |
1376 | ||
1377 | /**************************************ISO-2022-JP*************************************************/ | |
1378 | ||
1379 | /************************************** IMPORTANT ************************************************** | |
1380 | * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and | |
1381 | * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). | |
46f4442e A |
1382 | * The converter iterates over each Unicode codepoint |
1383 | * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is | |
1384 | * processed one char at a time it would make sense to reduce the extra processing a canned converter | |
b75a7d8f A |
1385 | * would do as far as possible. |
1386 | * | |
46f4442e A |
1387 | * If the implementation of these macros or structure of sharedData struct change in the future, make |
1388 | * sure that ISO-2022 is also changed. | |
b75a7d8f A |
1389 | *************************************************************************************************** |
1390 | */ | |
1391 | ||
1392 | /*************************************************************************************************** | |
1393 | * Rules for ISO-2022-jp encoding | |
46f4442e | 1394 | * (i) Escape sequences must be fully contained within a line they should not |
b75a7d8f A |
1395 | * span new lines or CRs |
1396 | * (ii) If the last character on a line is represented by two bytes then an ASCII or | |
1397 | * JIS-Roman character escape sequence should follow before the line terminates | |
46f4442e A |
1398 | * (iii) If the first character on the line is represented by two bytes then a two |
1399 | * byte character escape sequence should precede it | |
b75a7d8f A |
1400 | * (iv) If no escape sequence is encountered then the characters are ASCII |
1401 | * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, | |
1402 | * and invoked with SS2 (ESC N). | |
1403 | * (vi) If there is any G0 designation in text, there must be a switch to | |
1404 | * ASCII or to JIS X 0201-Roman before a space character (but not | |
1405 | * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control | |
1406 | * characters such as tab or CRLF. | |
1407 | * (vii) Supported encodings: |
1408 | * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 | |
1409 | * | |
1410 | * source : RFC-1554 | |
1411 | * | |
1412 | * JISX201, JISX208,JISX212 : new .cnv data files created | |
1413 | * KSC5601 : alias to ibm-949 mapping table | |
1414 | * GB2312 : alias to ibm-1386 mapping table | |
1415 | * ISO-8859-1 : Algorithmic implemented as LATIN1 case | |
1416 | * ISO-8859-7 : alias to ibm-9409 mapping table | |
1417 | */ | |
b75a7d8f | 1418 | |
374ca955 A |
1419 | /* preference order of JP charsets */ |
1420 | static const StateEnum jpCharsetPref[]={ | |
1421 | ASCII, | |
1422 | JISX201, | |
1423 | ISO8859_1, | |
1424 | ISO8859_7, | |
1425 | JISX208, | |
1426 | JISX212, | |
1427 | GB2312, | |
1428 | KSC5601, | |
1429 | HWKANA_7BIT | |
b75a7d8f A |
1430 | }; |
1431 | ||
/*
 * Designation escape sequences, one per charset.
 *
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * Each entry is NUL-terminated and at most 4 bytes long, which fits the
 * 6-byte row size.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};

/*
 * Byte lengths of the sequences in escSeqChars[], in the same order.
 * Entry i must equal the string length of escSeqChars[i].
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1459 | ||
1460 | /* | |
1461 | * The iteration over various code pages works this way: | |
1462 | * i) Get the currentState from myConverterData->currentState | |
1463 | * ii) Check if the character is mapped to a valid character in the currentState | |
1464 | * Yes -> a) set the initIterState to currentState | |
1465 | * b) remain in this state until an invalid character is found | |
1466 | * No -> a) go to the next code page and find the character | |
46f4442e | 1467 | * iii) Before changing the state increment the current state check if the current state |
b75a7d8f A |
1468 | * is equal to the initIterState |
1469 | * Yes -> A character that cannot be represented in any of the supported encodings | |
1470 | * break and return a U_INVALID_CHARACTER error | |
1471 | * No -> Continue and find the character in next code page | |
1472 | * | |
1473 | * | |
46f4442e | 1474 | * TODO: Implement a priority technique where the users are allowed to set the priority of code pages |
b75a7d8f A |
1475 | */ |
1476 | ||
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 *
 * Every value maps to itself except for two:
 *   0x5c -> U+00A5
 *   0x7e -> U+203E
 * The function does not range-check its argument; a value above 0x7f is
 * returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    if(value < 0x5c) {
        return value;
    } else if(value == 0x5c) {
        return 0xa5;
    } else if(value == 0x7e) {
        return 0x203e;
    } else /* value <= 0x7f */ {
        return value;
    }
}
1490 | ||
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 *
 * U+0000..U+007F map to themselves, except U+005C and U+007E, which have no
 * mapping. U+00A5 maps to 0x5c and U+203E maps to 0x7e.
 *
 * Returns 0xfffe (U+FFFE) if the code point is unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
        /* 0x5c and 0x7e fall through to the unmappable result */
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
1505 | ||
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 *
 * value holds the lead byte in bits 15..8 and the trail byte in bits 7..0;
 * the result uses the same layout.
 * Return 0 if the byte pair is out of range (above 0xEFFC).
 * The input is assumed to be a valid Shift-JIS pair; nothing else is validated.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    value &= 0xff00; /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    value <<= 1;

    if(trail <= 0x9e) {
        /* lower half of the trail range: the first (odd) row of the pair */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        /* upper half of the trail range: the second (even) row of the pair */
        value |= trail - 0x7e;
    }
    return value;
}
1541 | ||
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * c1, c2: first and second JIS X 0208 byte.
 * bytes:  receives the Shift-JIS lead byte in bytes[0] and the trail byte
 *         in bytes[1]; a byte that was detected as invalid is stored as 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: lower half of the Shift-JIS trail range */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0; /* invalid */
        }
    } else {
        /* even row: upper half of the Shift-JIS trail range */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0; /* invalid */
        }
    }
    /* two JIS X 0208 rows share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0; /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1578 | ||
1579 | /* | |
1580 | * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) | |
1581 | * Katakana. | |
1582 | * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks | |
1583 | * because Shift-JIS roundtrips half-width Katakana to single bytes. | |
1584 | * These were the only fallbacks in ICU's jisx-208.ucm file. | |
1585 | */ | |
1586 | static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { | |
1587 | 0x2123, /* U+FF61 */ | |
1588 | 0x2156, | |
1589 | 0x2157, | |
1590 | 0x2122, | |
1591 | 0x2126, | |
1592 | 0x2572, | |
1593 | 0x2521, | |
1594 | 0x2523, | |
1595 | 0x2525, | |
1596 | 0x2527, | |
1597 | 0x2529, | |
1598 | 0x2563, | |
1599 | 0x2565, | |
1600 | 0x2567, | |
1601 | 0x2543, | |
1602 | 0x213C, /* U+FF70 */ | |
1603 | 0x2522, | |
1604 | 0x2524, | |
1605 | 0x2526, | |
1606 | 0x2528, | |
1607 | 0x252A, | |
1608 | 0x252B, | |
1609 | 0x252D, | |
1610 | 0x252F, | |
1611 | 0x2531, | |
1612 | 0x2533, | |
1613 | 0x2535, | |
1614 | 0x2537, | |
1615 | 0x2539, | |
1616 | 0x253B, | |
1617 | 0x253D, | |
1618 | 0x253F, /* U+FF80 */ | |
1619 | 0x2541, | |
1620 | 0x2544, | |
1621 | 0x2546, | |
1622 | 0x2548, | |
1623 | 0x254A, | |
1624 | 0x254B, | |
1625 | 0x254C, | |
1626 | 0x254D, | |
1627 | 0x254E, | |
1628 | 0x254F, | |
1629 | 0x2552, | |
1630 | 0x2555, | |
1631 | 0x2558, | |
1632 | 0x255B, | |
1633 | 0x255E, | |
1634 | 0x255F, /* U+FF90 */ | |
1635 | 0x2560, | |
1636 | 0x2561, | |
1637 | 0x2562, | |
1638 | 0x2564, | |
1639 | 0x2566, | |
1640 | 0x2568, | |
1641 | 0x2569, | |
1642 | 0x256A, | |
1643 | 0x256B, | |
1644 | 0x256C, | |
1645 | 0x256D, | |
1646 | 0x256F, | |
1647 | 0x2573, | |
1648 | 0x212B, | |
1649 | 0x212C /* U+FF9F */ | |
1650 | }; | |
1651 | ||
1652 | static void | |
374ca955 | 1653 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { |
46f4442e | 1654 | UConverter *cnv = args->converter; |
b75a7d8f | 1655 | UConverterDataISO2022 *converterData; |
374ca955 A |
1656 | ISO2022State *pFromU2022State; |
1657 | uint8_t *target = (uint8_t *) args->target; | |
1658 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; | |
b75a7d8f A |
1659 | const UChar* source = args->source; |
1660 | const UChar* sourceLimit = args->sourceLimit; | |
1661 | int32_t* offsets = args->offsets; | |
374ca955 A |
1662 | UChar32 sourceChar; |
1663 | char buffer[8]; | |
1664 | int32_t len, outLen; | |
1665 | int8_t choices[10]; | |
1666 | int32_t choiceCount; | |
73c04bcf | 1667 | uint32_t targetValue = 0; |
374ca955 A |
1668 | UBool useFallback; |
1669 | ||
1670 | int32_t i; | |
1671 | int8_t cs, g; | |
1672 | ||
1673 | /* set up the state */ | |
46f4442e | 1674 | converterData = (UConverterDataISO2022*)cnv->extraInfo; |
374ca955 | 1675 | pFromU2022State = &converterData->fromU2022State; |
374ca955 A |
1676 | |
1677 | choiceCount = 0; | |
b75a7d8f | 1678 | |
b75a7d8f | 1679 | /* check if the last codepoint of previous buffer was a lead surrogate*/ |
46f4442e | 1680 | if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
b75a7d8f A |
1681 | goto getTrail; |
1682 | } | |
b75a7d8f | 1683 | |
374ca955 A |
1684 | while(source < sourceLimit) { |
1685 | if(target < targetLimit) { | |
b75a7d8f | 1686 | |
b75a7d8f | 1687 | sourceChar = *(source++); |
374ca955 | 1688 | /*check if the char is a First surrogate*/ |
4388f060 A |
1689 | if(U16_IS_SURROGATE(sourceChar)) { |
1690 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { | |
374ca955 A |
1691 | getTrail: |
1692 | /*look ahead to find the trail surrogate*/ | |
1693 | if(source < sourceLimit) { | |
1694 | /* test the following code unit */ | |
1695 | UChar trail=(UChar) *source; | |
4388f060 | 1696 | if(U16_IS_TRAIL(trail)) { |
374ca955 | 1697 | source++; |
4388f060 | 1698 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
46f4442e | 1699 | cnv->fromUChar32=0x00; |
374ca955 A |
1700 | /* convert this supplementary code point */ |
1701 | /* exit this condition tree */ | |
1702 | } else { | |
1703 | /* this is an unmatched lead code unit (1st surrogate) */ | |
1704 | /* callback(illegal) */ | |
1705 | *err=U_ILLEGAL_CHAR_FOUND; | |
46f4442e | 1706 | cnv->fromUChar32=sourceChar; |
374ca955 | 1707 | break; |
b75a7d8f | 1708 | } |
374ca955 A |
1709 | } else { |
1710 | /* no more input */ | |
46f4442e | 1711 | cnv->fromUChar32=sourceChar; |
b75a7d8f A |
1712 | break; |
1713 | } | |
374ca955 A |
1714 | } else { |
1715 | /* this is an unmatched trail code unit (2nd surrogate) */ | |
1716 | /* callback(illegal) */ | |
1717 | *err=U_ILLEGAL_CHAR_FOUND; | |
46f4442e | 1718 | cnv->fromUChar32=sourceChar; |
374ca955 A |
1719 | break; |
1720 | } | |
b75a7d8f A |
1721 | } |
1722 | ||
73c04bcf A |
1723 | /* do not convert SO/SI/ESC */ |
1724 | if(IS_2022_CONTROL(sourceChar)) { | |
1725 | /* callback(illegal) */ | |
1726 | *err=U_ILLEGAL_CHAR_FOUND; | |
46f4442e | 1727 | cnv->fromUChar32=sourceChar; |
73c04bcf A |
1728 | break; |
1729 | } | |
1730 | ||
374ca955 | 1731 | /* do the conversion */ |
b75a7d8f | 1732 | |
374ca955 A |
1733 | if(choiceCount == 0) { |
1734 | uint16_t csm; | |
b75a7d8f | 1735 | |
374ca955 A |
1736 | /* |
1737 | * The csm variable keeps track of which charsets are allowed | |
1738 | * and not used yet while building the choices[]. | |
1739 | */ | |
1740 | csm = jpCharsetMasks[converterData->version]; | |
1741 | choiceCount = 0; | |
1742 | ||
1743 | /* JIS7/8: try single-byte half-width Katakana before JISX208 */ | |
1744 | if(converterData->version == 3 || converterData->version == 4) { | |
46f4442e | 1745 | choices[choiceCount++] = (int8_t)HWKANA_7BIT; |
374ca955 | 1746 | } |
46f4442e A |
1747 | /* Do not try single-byte half-width Katakana for other versions. */ |
1748 | csm &= ~CSM(HWKANA_7BIT); | |
b75a7d8f | 1749 | |
374ca955 A |
1750 | /* try the current G0 charset */ |
1751 | choices[choiceCount++] = cs = pFromU2022State->cs[0]; | |
1752 | csm &= ~CSM(cs); | |
b75a7d8f | 1753 | |
374ca955 A |
1754 | /* try the current G2 charset */ |
1755 | if((cs = pFromU2022State->cs[2]) != 0) { | |
1756 | choices[choiceCount++] = cs; | |
1757 | csm &= ~CSM(cs); | |
1758 | } | |
1759 | ||
1760 | /* try all the other possible charsets */ | |
b331163b | 1761 | for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) { |
374ca955 A |
1762 | cs = (int8_t)jpCharsetPref[i]; |
1763 | if(CSM(cs) & csm) { | |
1764 | choices[choiceCount++] = cs; | |
1765 | csm &= ~CSM(cs); | |
b75a7d8f A |
1766 | } |
1767 | } | |
374ca955 | 1768 | } |
b75a7d8f | 1769 | |
374ca955 | 1770 | cs = g = 0; |
46f4442e A |
1771 | /* |
1772 | * len==0: no mapping found yet | |
1773 | * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks | |
1774 | * len>0: found a roundtrip result, done | |
1775 | */ | |
374ca955 | 1776 | len = 0; |
46f4442e A |
1777 | /* |
1778 | * We will turn off useFallback after finding a fallback, | |
1779 | * but we still get fallbacks from PUA code points as usual. | |
1780 | * Therefore, we will also need to check that we don't overwrite | |
1781 | * an early fallback with a later one. | |
1782 | */ | |
1783 | useFallback = cnv->useFallback; | |
374ca955 | 1784 | |
46f4442e A |
1785 | for(i = 0; i < choiceCount && len <= 0; ++i) { |
1786 | uint32_t value; | |
1787 | int32_t len2; | |
1788 | int8_t cs0 = choices[i]; | |
1789 | switch(cs0) { | |
374ca955 A |
1790 | case ASCII: |
1791 | if(sourceChar <= 0x7f) { | |
1792 | targetValue = (uint32_t)sourceChar; | |
1793 | len = 1; | |
46f4442e A |
1794 | cs = cs0; |
1795 | g = 0; | |
b75a7d8f | 1796 | } |
374ca955 A |
1797 | break; |
1798 | case ISO8859_1: | |
46f4442e | 1799 | if(GR96_START <= sourceChar && sourceChar <= GR96_END) { |
374ca955 A |
1800 | targetValue = (uint32_t)sourceChar - 0x80; |
1801 | len = 1; | |
46f4442e | 1802 | cs = cs0; |
374ca955 A |
1803 | g = 2; |
1804 | } | |
1805 | break; | |
1806 | case HWKANA_7BIT: | |
46f4442e | 1807 | if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
374ca955 A |
1808 | if(converterData->version==3) { |
1809 | /* JIS7: use G1 (SO) */ | |
46f4442e A |
1810 | /* Shift U+FF61..U+FF9F to bytes 21..5F. */ |
1811 | targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); | |
1812 | len = 1; | |
1813 | pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ | |
374ca955 A |
1814 | g = 1; |
1815 | } else if(converterData->version==4) { | |
1816 | /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ | |
46f4442e A |
1817 | /* Shift U+FF61..U+FF9F to bytes A1..DF. */ |
1818 | targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); | |
1819 | len = 1; | |
374ca955 | 1820 | |
46f4442e A |
1821 | cs = pFromU2022State->cs[0]; |
1822 | if(IS_JP_DBCS(cs)) { | |
374ca955 A |
1823 | /* switch from a DBCS charset to JISX201 */ |
1824 | cs = (int8_t)JISX201; | |
b75a7d8f | 1825 | } |
46f4442e A |
1826 | /* else stay in the current G0 charset */ |
1827 | g = 0; | |
b75a7d8f | 1828 | } |
46f4442e | 1829 | /* else do not use HWKANA_7BIT with other versions */ |
b75a7d8f | 1830 | } |
374ca955 A |
1831 | break; |
1832 | case JISX201: | |
1833 | /* G0 SBCS */ | |
46f4442e A |
1834 | value = jisx201FromU(sourceChar); |
1835 | if(value <= 0x7f) { | |
1836 | targetValue = value; | |
374ca955 | 1837 | len = 1; |
46f4442e A |
1838 | cs = cs0; |
1839 | g = 0; | |
1840 | useFallback = FALSE; | |
1841 | } | |
1842 | break; | |
1843 | case JISX208: | |
1844 | /* G0 DBCS from Shift-JIS table */ | |
1845 | len2 = MBCS_FROM_UCHAR32_ISO2022( | |
1846 | converterData->myConverterArray[cs0], | |
1847 | sourceChar, &value, | |
1848 | useFallback, MBCS_OUTPUT_2); | |
1849 | if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ | |
1850 | value = _2022FromSJIS(value); | |
1851 | if(value != 0) { | |
1852 | targetValue = value; | |
1853 | len = len2; | |
1854 | cs = cs0; | |
1855 | g = 0; | |
1856 | useFallback = FALSE; | |
1857 | } | |
1858 | } else if(len == 0 && useFallback && | |
1859 | (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { | |
1860 | targetValue = hwkana_fb[sourceChar - HWKANA_START]; | |
1861 | len = -2; | |
1862 | cs = cs0; | |
1863 | g = 0; | |
1864 | useFallback = FALSE; | |
374ca955 A |
1865 | } |
1866 | break; | |
1867 | case ISO8859_7: | |
1868 | /* G0 SBCS forced to 7-bit output */ | |
46f4442e A |
1869 | len2 = MBCS_SINGLE_FROM_UCHAR32( |
1870 | converterData->myConverterArray[cs0], | |
1871 | sourceChar, &value, | |
1872 | useFallback); | |
1873 | if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { | |
1874 | targetValue = value - 0x80; | |
1875 | len = len2; | |
1876 | cs = cs0; | |
374ca955 | 1877 | g = 2; |
46f4442e | 1878 | useFallback = FALSE; |
374ca955 A |
1879 | } |
1880 | break; | |
1881 | default: | |
1882 | /* G0 DBCS */ | |
46f4442e A |
1883 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
1884 | converterData->myConverterArray[cs0], | |
1885 | sourceChar, &value, | |
1886 | useFallback, MBCS_OUTPUT_2); | |
1887 | if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ | |
1888 | if(cs0 == KSC5601) { | |
1889 | /* | |
1890 | * Check for valid bytes for the encoding scheme. | |
1891 | * This is necessary because the sub-converter (windows-949) | |
1892 | * has a broader encoding scheme than is valid for 2022. | |
1893 | */ | |
1894 | value = _2022FromGR94DBCS(value); | |
1895 | if(value == 0) { | |
1896 | break; | |
1897 | } | |
1898 | } | |
1899 | targetValue = value; | |
1900 | len = len2; | |
1901 | cs = cs0; | |
1902 | g = 0; | |
1903 | useFallback = FALSE; | |
374ca955 A |
1904 | } |
1905 | break; | |
b75a7d8f A |
1906 | } |
1907 | } | |
b75a7d8f | 1908 | |
46f4442e A |
1909 | if(len != 0) { |
1910 | if(len < 0) { | |
1911 | len = -len; /* fallback */ | |
1912 | } | |
374ca955 A |
1913 | outLen = 0; /* count output bytes */ |
1914 | ||
1915 | /* write SI if necessary (only for JIS7) */ | |
1916 | if(pFromU2022State->g == 1 && g == 0) { | |
1917 | buffer[outLen++] = UCNV_SI; | |
1918 | pFromU2022State->g = 0; | |
1919 | } | |
1920 | ||
1921 | /* write the designation sequence if necessary */ | |
1922 | if(cs != pFromU2022State->cs[g]) { | |
1923 | int32_t escLen = escSeqCharsLen[cs]; | |
1924 | uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); | |
1925 | outLen += escLen; | |
1926 | pFromU2022State->cs[g] = cs; | |
1927 | ||
1928 | /* invalidate the choices[] */ | |
1929 | choiceCount = 0; | |
1930 | } | |
1931 | ||
1932 | /* write the shift sequence if necessary */ | |
1933 | if(g != pFromU2022State->g) { | |
1934 | switch(g) { | |
1935 | /* case 0 handled before writing escapes */ | |
1936 | case 1: | |
1937 | buffer[outLen++] = UCNV_SO; | |
1938 | pFromU2022State->g = 1; | |
1939 | break; | |
1940 | default: /* case 2 */ | |
1941 | buffer[outLen++] = 0x1b; | |
1942 | buffer[outLen++] = 0x4e; | |
1943 | break; | |
1944 | /* no case 3: no SS3 in ISO-2022-JP-x */ | |
1945 | } | |
1946 | } | |
1947 | ||
1948 | /* write the output bytes */ | |
1949 | if(len == 1) { | |
1950 | buffer[outLen++] = (char)targetValue; | |
1951 | } else /* len == 2 */ { | |
1952 | buffer[outLen++] = (char)(targetValue >> 8); | |
1953 | buffer[outLen++] = (char)targetValue; | |
1954 | } | |
1955 | } else { | |
1956 | /* | |
46f4442e | 1957 | * if we cannot find the character after checking all codepages |
b75a7d8f A |
1958 | * then this is an error |
1959 | */ | |
b75a7d8f | 1960 | *err = U_INVALID_CHAR_FOUND; |
46f4442e | 1961 | cnv->fromUChar32=sourceChar; |
374ca955 A |
1962 | break; |
1963 | } | |
1964 | ||
1965 | if(sourceChar == CR || sourceChar == LF) { | |
1966 | /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ | |
1967 | pFromU2022State->cs[2] = 0; | |
1968 | choiceCount = 0; | |
1969 | } | |
1970 | ||
1971 | /* output outLen>0 bytes in buffer[] */ | |
1972 | if(outLen == 1) { | |
1973 | *target++ = buffer[0]; | |
1974 | if(offsets) { | |
73c04bcf | 1975 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
b75a7d8f | 1976 | } |
374ca955 A |
1977 | } else if(outLen == 2 && (target + 2) <= targetLimit) { |
1978 | *target++ = buffer[0]; | |
1979 | *target++ = buffer[1]; | |
1980 | if(offsets) { | |
1981 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); | |
1982 | *offsets++ = sourceIndex; | |
1983 | *offsets++ = sourceIndex; | |
1984 | } | |
1985 | } else { | |
73c04bcf | 1986 | fromUWriteUInt8( |
46f4442e | 1987 | cnv, |
374ca955 | 1988 | buffer, outLen, |
73c04bcf | 1989 | &target, (const char *)targetLimit, |
374ca955 A |
1990 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
1991 | err); | |
1992 | if(U_FAILURE(*err)) { | |
b75a7d8f A |
1993 | break; |
1994 | } | |
1995 | } | |
1996 | } /* end if(myTargetIndex<myTargetLength) */ | |
1997 | else{ | |
1998 | *err =U_BUFFER_OVERFLOW_ERROR; | |
1999 | break; | |
2000 | } | |
2001 | ||
2002 | }/* end while(mySourceIndex<mySourceLength) */ | |
2003 | ||
374ca955 A |
2004 | /* |
2005 | * the end of the input stream and detection of truncated input | |
2006 | * are handled by the framework, but for ISO-2022-JP conversion | |
2007 | * we need to be in ASCII mode at the very end | |
2008 | * | |
2009 | * conditions: | |
2010 | * successful | |
2011 | * in SO mode or not in ASCII mode | |
2012 | * end of input and no truncated input | |
b75a7d8f | 2013 | */ |
374ca955 A |
2014 | if( U_SUCCESS(*err) && |
2015 | (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && | |
46f4442e | 2016 | args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
374ca955 A |
2017 | ) { |
2018 | int32_t sourceIndex; | |
2019 | ||
2020 | outLen = 0; | |
2021 | ||
2022 | if(pFromU2022State->g != 0) { | |
2023 | buffer[outLen++] = UCNV_SI; | |
2024 | pFromU2022State->g = 0; | |
2025 | } | |
2026 | ||
2027 | if(pFromU2022State->cs[0] != ASCII) { | |
2028 | int32_t escLen = escSeqCharsLen[ASCII]; | |
2029 | uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); | |
2030 | outLen += escLen; | |
2031 | pFromU2022State->cs[0] = (int8_t)ASCII; | |
2032 | } | |
2033 | ||
2034 | /* get the source index of the last input character */ | |
2035 | /* | |
2036 | * TODO this would be simpler and more reliable if we used a pair | |
2037 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c | |
2038 | * so that we could simply use the prevSourceIndex here; | |
2039 | * this code gives an incorrect result for the rare case of an unmatched | |
2040 | * trail surrogate that is alone in the last buffer of the text stream | |
2041 | */ | |
2042 | sourceIndex=(int32_t)(source-args->source); | |
2043 | if(sourceIndex>0) { | |
2044 | --sourceIndex; | |
2045 | if( U16_IS_TRAIL(args->source[sourceIndex]) && | |
2046 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) | |
2047 | ) { | |
2048 | --sourceIndex; | |
2049 | } | |
2050 | } else { | |
2051 | sourceIndex=-1; | |
2052 | } | |
2053 | ||
73c04bcf | 2054 | fromUWriteUInt8( |
46f4442e | 2055 | cnv, |
374ca955 | 2056 | buffer, outLen, |
73c04bcf | 2057 | &target, (const char *)targetLimit, |
374ca955 A |
2058 | &offsets, sourceIndex, |
2059 | err); | |
b75a7d8f A |
2060 | } |
2061 | ||
2062 | /*save the state and return */ | |
2063 | args->source = source; | |
2064 | args->target = (char*)target; | |
2065 | } | |
2066 | ||
2067 | /*************** to unicode *******************/ | |
2068 | ||
46f4442e | 2069 | static void |
b75a7d8f | 2070 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
374ca955 | 2071 | UErrorCode* err){ |
46f4442e | 2072 | char tempBuf[2]; |
374ca955 | 2073 | const char *mySource = (char *) args->source; |
b75a7d8f A |
2074 | UChar *myTarget = args->target; |
2075 | const char *mySourceLimit = args->sourceLimit; | |
2076 | uint32_t targetUniChar = 0x0000; | |
2077 | uint32_t mySourceChar = 0x0000; | |
46f4442e | 2078 | uint32_t tmpSourceChar = 0x0000; |
b75a7d8f | 2079 | UConverterDataISO2022* myData; |
374ca955 A |
2080 | ISO2022State *pToU2022State; |
2081 | StateEnum cs; | |
b75a7d8f | 2082 | |
b75a7d8f | 2083 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
374ca955 | 2084 | pToU2022State = &myData->toU2022State; |
b75a7d8f | 2085 | |
374ca955 A |
2086 | if(myData->key != 0) { |
2087 | /* continue with a partial escape sequence */ | |
2088 | goto escape; | |
2089 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { | |
2090 | /* continue with a partial double-byte character */ | |
2091 | mySourceChar = args->converter->toUBytes[0]; | |
2092 | args->converter->toULength = 0; | |
2093 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; | |
fd0068a8 | 2094 | targetUniChar = missingCharMarker; |
374ca955 A |
2095 | goto getTrailByte; |
2096 | } | |
2097 | ||
2098 | while(mySource < mySourceLimit){ | |
2099 | ||
2100 | targetUniChar =missingCharMarker; | |
b75a7d8f A |
2101 | |
2102 | if(myTarget < args->targetLimit){ | |
2103 | ||
2104 | mySourceChar= (unsigned char) *mySource++; | |
374ca955 A |
2105 | |
2106 | switch(mySourceChar) { | |
2107 | case UCNV_SI: | |
2108 | if(myData->version==3) { | |
2109 | pToU2022State->g=0; | |
b75a7d8f | 2110 | continue; |
374ca955 A |
2111 | } else { |
2112 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ | |
d5d484b0 | 2113 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
374ca955 | 2114 | break; |
b75a7d8f | 2115 | } |
b75a7d8f | 2116 | |
374ca955 A |
2117 | case UCNV_SO: |
2118 | if(myData->version==3) { | |
2119 | /* JIS7: switch to G1 half-width Katakana */ | |
2120 | pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; | |
2121 | pToU2022State->g=1; | |
b75a7d8f | 2122 | continue; |
374ca955 A |
2123 | } else { |
2124 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ | |
d5d484b0 | 2125 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
374ca955 | 2126 | break; |
b75a7d8f | 2127 | } |
b75a7d8f | 2128 | |
374ca955 A |
2129 | case ESC_2022: |
2130 | mySource--; | |
2131 | escape: | |
d5d484b0 A |
2132 | { |
2133 | const char * mySourceBefore = mySource; | |
2134 | int8_t toULengthBefore = args->converter->toULength; | |
2135 | ||
46f4442e | 2136 | changeState_2022(args->converter,&(mySource), |
d5d484b0 A |
2137 | mySourceLimit, ISO_2022_JP,err); |
2138 | ||
2139 | /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */ | |
46f4442e A |
2140 | if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
2141 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; | |
2142 | args->converter->toUCallbackReason = UCNV_IRREGULAR; | |
729e4ab9 | 2143 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
d5d484b0 | 2144 | } |
d5d484b0 | 2145 | } |
46f4442e | 2146 | |
374ca955 A |
2147 | /* invalid or illegal escape sequence */ |
2148 | if(U_FAILURE(*err)){ | |
2149 | args->target = myTarget; | |
2150 | args->source = mySource; | |
d5d484b0 | 2151 | myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
374ca955 | 2152 | return; |
b75a7d8f | 2153 | } |
d5d484b0 | 2154 | /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ |
46f4442e | 2155 | if(myData->key==0) { |
d5d484b0 A |
2156 | myData->isEmptySegment = TRUE; |
2157 | } | |
374ca955 | 2158 | continue; |
b75a7d8f | 2159 | |
374ca955 | 2160 | /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
b75a7d8f | 2161 | |
374ca955 A |
2162 | case CR: |
2163 | /*falls through*/ | |
2164 | case LF: | |
2165 | /* automatically reset to single-byte mode */ | |
2166 | if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { | |
2167 | pToU2022State->cs[0] = (int8_t)ASCII; | |
b75a7d8f | 2168 | } |
374ca955 A |
2169 | pToU2022State->cs[2] = 0; |
2170 | pToU2022State->g = 0; | |
2171 | /* falls through */ | |
b75a7d8f | 2172 | default: |
374ca955 | 2173 | /* convert one or two bytes */ |
d5d484b0 | 2174 | myData->isEmptySegment = FALSE; |
374ca955 A |
2175 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
2176 | if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && | |
2177 | !IS_JP_DBCS(cs) | |
2178 | ) { | |
2179 | /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ | |
46f4442e | 2180 | targetUniChar = mySourceChar + (HWKANA_START - 0xa1); |
374ca955 A |
2181 | |
2182 | /* return from a single-shift state to the previous one */ | |
2183 | if(pToU2022State->g >= 2) { | |
2184 | pToU2022State->g=pToU2022State->prevG; | |
2185 | } | |
2186 | } else switch(cs) { | |
2187 | case ASCII: | |
2188 | if(mySourceChar <= 0x7f) { | |
2189 | targetUniChar = mySourceChar; | |
2190 | } | |
2191 | break; | |
2192 | case ISO8859_1: | |
2193 | if(mySourceChar <= 0x7f) { | |
2194 | targetUniChar = mySourceChar + 0x80; | |
2195 | } | |
2196 | /* return from a single-shift state to the previous one */ | |
2197 | pToU2022State->g=pToU2022State->prevG; | |
2198 | break; | |
2199 | case ISO8859_7: | |
2200 | if(mySourceChar <= 0x7f) { | |
2201 | /* convert mySourceChar+0x80 to use a normal 8-bit table */ | |
2202 | targetUniChar = | |
2203 | _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( | |
2204 | myData->myConverterArray[cs], | |
2205 | mySourceChar + 0x80); | |
2206 | } | |
2207 | /* return from a single-shift state to the previous one */ | |
2208 | pToU2022State->g=pToU2022State->prevG; | |
2209 | break; | |
2210 | case JISX201: | |
2211 | if(mySourceChar <= 0x7f) { | |
46f4442e | 2212 | targetUniChar = jisx201ToU(mySourceChar); |
374ca955 A |
2213 | } |
2214 | break; | |
2215 | case HWKANA_7BIT: | |
2216 | if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { | |
2217 | /* 7-bit halfwidth Katakana */ | |
46f4442e | 2218 | targetUniChar = mySourceChar + (HWKANA_START - 0x21); |
374ca955 A |
2219 | } |
2220 | break; | |
2221 | default: | |
2222 | /* G0 DBCS */ | |
2223 | if(mySource < mySourceLimit) { | |
fd0068a8 A |
2224 | int leadIsOk, trailIsOk; |
2225 | uint8_t trailByte; | |
374ca955 | 2226 | getTrailByte: |
fd0068a8 | 2227 | trailByte = (uint8_t)*mySource; |
fd0068a8 A |
2228 | /* |
2229 | * Ticket 5691: consistent illegal sequences: | |
2230 | * - We include at least the first byte in the illegal sequence. | |
2231 | * - If any of the non-initial bytes could be the start of a character, | |
46f4442e | 2232 | * we stop the illegal sequence before the first one of those. |
fd0068a8 A |
2233 | * |
2234 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is | |
2235 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. | |
2236 | * Otherwise we convert or report the pair of bytes. | |
2237 | */ | |
2238 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); | |
2239 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); | |
2240 | if (leadIsOk && trailIsOk) { | |
2241 | ++mySource; | |
46f4442e A |
2242 | tmpSourceChar = (mySourceChar << 8) | trailByte; |
2243 | if(cs == JISX208) { | |
2244 | _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); | |
2245 | mySourceChar = tmpSourceChar; | |
2246 | } else { | |
2247 | /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ | |
2248 | mySourceChar = tmpSourceChar; | |
2249 | if (cs == KSC5601) { | |
2250 | tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ | |
2251 | } | |
2252 | tempBuf[0] = (char)(tmpSourceChar >> 8); | |
2253 | tempBuf[1] = (char)(tmpSourceChar); | |
2254 | } | |
fd0068a8 A |
2255 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); |
2256 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { | |
2257 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ | |
2258 | ++mySource; | |
2259 | /* add another bit so that the code below writes 2 bytes in case of error */ | |
2260 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; | |
2261 | } | |
374ca955 A |
2262 | } else { |
2263 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; | |
2264 | args->converter->toULength = 1; | |
2265 | goto endloop; | |
2266 | } | |
46f4442e | 2267 | } /* End of inner switch */ |
b75a7d8f | 2268 | break; |
46f4442e | 2269 | } /* End of outer switch */ |
b75a7d8f A |
2270 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
2271 | if(args->offsets){ | |
73c04bcf | 2272 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
b75a7d8f A |
2273 | } |
2274 | *(myTarget++)=(UChar)targetUniChar; | |
b75a7d8f | 2275 | } |
374ca955 A |
2276 | else if(targetUniChar > missingCharMarker){ |
2277 | /* disassemble the surrogate pair and write to output*/ | |
2278 | targetUniChar-=0x0010000; | |
2279 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); | |
2280 | if(args->offsets){ | |
73c04bcf | 2281 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
374ca955 A |
2282 | } |
2283 | ++myTarget; | |
46f4442e | 2284 | if(myTarget< args->targetLimit){ |
374ca955 A |
2285 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
2286 | if(args->offsets){ | |
73c04bcf | 2287 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
374ca955 A |
2288 | } |
2289 | ++myTarget; | |
2290 | }else{ | |
2291 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= | |
2292 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); | |
2293 | } | |
b75a7d8f | 2294 | |
374ca955 A |
2295 | } |
2296 | else{ | |
b75a7d8f | 2297 | /* Call the callback function*/ |
374ca955 A |
2298 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
2299 | break; | |
b75a7d8f A |
2300 | } |
2301 | } | |
46f4442e | 2302 | else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ |
b75a7d8f A |
2303 | *err =U_BUFFER_OVERFLOW_ERROR; |
2304 | break; | |
2305 | } | |
2306 | } | |
374ca955 | 2307 | endloop: |
b75a7d8f A |
2308 | args->target = myTarget; |
2309 | args->source = mySource; | |
2310 | } | |
2311 | ||
2312 | ||
b331163b | 2313 | #if !UCONFIG_ONLY_HTML_CONVERSION |
b75a7d8f A |
2314 | /*************************************************************** |
2315 | * Rules for ISO-2022-KR encoding | |
46f4442e | 2316 | * i) The KSC5601 designator sequence should appear only once in a file, |
b75a7d8f A |
2317 | * at the beginning of a line before any KSC5601 characters. This usually |
2318 | * means that it appears by itself on the first line of the file | |
2319 | * ii) There are only 2 shifting sequences SO to shift into double byte mode | |
2320 | * and SI to shift into single byte mode | |
2321 | */ | |
46f4442e | 2322 | static void |
b75a7d8f A |
2323 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
2324 | ||
374ca955 A |
2325 | UConverter* saveConv = args->converter; |
2326 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; | |
2327 | args->converter=myConverterData->currentConverter; | |
2328 | ||
2329 | myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; | |
2330 | ucnv_MBCSFromUnicodeWithOffsets(args,err); | |
2331 | saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; | |
2332 | ||
2333 | if(*err == U_BUFFER_OVERFLOW_ERROR) { | |
2334 | if(myConverterData->currentConverter->charErrorBufferLength > 0) { | |
2335 | uprv_memcpy( | |
2336 | saveConv->charErrorBuffer, | |
2337 | myConverterData->currentConverter->charErrorBuffer, | |
2338 | myConverterData->currentConverter->charErrorBufferLength); | |
2339 | } | |
2340 | saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; | |
2341 | myConverterData->currentConverter->charErrorBufferLength = 0; | |
2342 | } | |
2343 | args->converter=saveConv; | |
b75a7d8f A |
2344 | } |
2345 | ||
46f4442e | 2346 | static void |
b75a7d8f A |
2347 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
2348 | ||
2349 | const UChar *source = args->source; | |
2350 | const UChar *sourceLimit = args->sourceLimit; | |
2351 | unsigned char *target = (unsigned char *) args->target; | |
2352 | unsigned char *targetLimit = (unsigned char *) args->targetLimit; | |
2353 | int32_t* offsets = args->offsets; | |
2354 | uint32_t targetByteUnit = 0x0000; | |
2355 | UChar32 sourceChar = 0x0000; | |
2356 | UBool isTargetByteDBCS; | |
2357 | UBool oldIsTargetByteDBCS; | |
2358 | UConverterDataISO2022 *converterData; | |
b75a7d8f A |
2359 | UConverterSharedData* sharedData; |
2360 | UBool useFallback; | |
2361 | int32_t length =0; | |
2362 | ||
b75a7d8f | 2363 | converterData=(UConverterDataISO2022*)args->converter->extraInfo; |
46f4442e A |
2364 | /* if the version is 1 then the user is requesting |
2365 | * conversion with ibm-25546 pass the arguments to | |
b75a7d8f A |
2366 | * MBCS converter and return |
2367 | */ | |
2368 | if(converterData->version==1){ | |
2369 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); | |
2370 | return; | |
2371 | } | |
374ca955 A |
2372 | |
2373 | /* initialize data */ | |
2374 | sharedData = converterData->currentConverter->sharedData; | |
2375 | useFallback = args->converter->useFallback; | |
2376 | isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; | |
2377 | oldIsTargetByteDBCS = isTargetByteDBCS; | |
46f4442e | 2378 | |
b75a7d8f | 2379 | isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; |
374ca955 | 2380 | if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { |
b75a7d8f A |
2381 | goto getTrail; |
2382 | } | |
2383 | while(source < sourceLimit){ | |
46f4442e | 2384 | |
b75a7d8f A |
2385 | targetByteUnit = missingCharMarker; |
2386 | ||
2387 | if(target < (unsigned char*) args->targetLimit){ | |
2388 | sourceChar = *source++; | |
73c04bcf A |
2389 | |
2390 | /* do not convert SO/SI/ESC */ | |
2391 | if(IS_2022_CONTROL(sourceChar)) { | |
2392 | /* callback(illegal) */ | |
2393 | *err=U_ILLEGAL_CHAR_FOUND; | |
2394 | args->converter->fromUChar32=sourceChar; | |
2395 | break; | |
2396 | } | |
2397 | ||
46f4442e A |
2398 | length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); |
2399 | if(length < 0) { | |
2400 | length = -length; /* fallback */ | |
2401 | } | |
b75a7d8f | 2402 | /* only DBCS or SBCS characters are expected*/ |
374ca955 | 2403 | /* DB characters with high bit set to 1 are expected */ |
fd0068a8 A |
2404 | if( length > 2 || length==0 || |
2405 | (length == 1 && targetByteUnit > 0x7f) || | |
2406 | (length == 2 && | |
2407 | ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || | |
2408 | (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) | |
2409 | ) { | |
b75a7d8f A |
2410 | targetByteUnit=missingCharMarker; |
2411 | } | |
2412 | if (targetByteUnit != missingCharMarker){ | |
2413 | ||
2414 | oldIsTargetByteDBCS = isTargetByteDBCS; | |
2415 | isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); | |
2416 | /* append the shift sequence */ | |
2417 | if (oldIsTargetByteDBCS != isTargetByteDBCS ){ | |
46f4442e A |
2418 | |
2419 | if (isTargetByteDBCS) | |
b75a7d8f | 2420 | *target++ = UCNV_SO; |
46f4442e | 2421 | else |
b75a7d8f A |
2422 | *target++ = UCNV_SI; |
2423 | if(offsets) | |
73c04bcf | 2424 | *(offsets++) = (int32_t)(source - args->source-1); |
b75a7d8f A |
2425 | } |
2426 | /* write the targetUniChar to target */ | |
2427 | if(targetByteUnit <= 0x00FF){ | |
2428 | if( target < targetLimit){ | |
2429 | *(target++) = (unsigned char) targetByteUnit; | |
2430 | if(offsets){ | |
73c04bcf | 2431 | *(offsets++) = (int32_t)(source - args->source-1); |
b75a7d8f A |
2432 | } |
2433 | ||
2434 | }else{ | |
2435 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); | |
2436 | *err = U_BUFFER_OVERFLOW_ERROR; | |
2437 | } | |
2438 | }else{ | |
2439 | if(target < targetLimit){ | |
2440 | *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); | |
2441 | if(offsets){ | |
73c04bcf | 2442 | *(offsets++) = (int32_t)(source - args->source-1); |
b75a7d8f A |
2443 | } |
2444 | if(target < targetLimit){ | |
2445 | *(target++) =(unsigned char) (targetByteUnit -0x80); | |
2446 | if(offsets){ | |
73c04bcf | 2447 | *(offsets++) = (int32_t)(source - args->source-1); |
b75a7d8f A |
2448 | } |
2449 | }else{ | |
2450 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); | |
2451 | *err = U_BUFFER_OVERFLOW_ERROR; | |
2452 | } | |
2453 | }else{ | |
2454 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); | |
2455 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); | |
2456 | *err = U_BUFFER_OVERFLOW_ERROR; | |
2457 | } | |
2458 | } | |
2459 | ||
2460 | } | |
2461 | else{ | |
2462 | /* oops.. the code point is unassingned | |
2463 | * set the error and reason | |
2464 | */ | |
b75a7d8f A |
2465 | |
2466 | /*check if the char is a First surrogate*/ | |
4388f060 A |
2467 | if(U16_IS_SURROGATE(sourceChar)) { |
2468 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { | |
b75a7d8f A |
2469 | getTrail: |
2470 | /*look ahead to find the trail surrogate*/ | |
2471 | if(source < sourceLimit) { | |
2472 | /* test the following code unit */ | |
2473 | UChar trail=(UChar) *source; | |
4388f060 | 2474 | if(U16_IS_TRAIL(trail)) { |
b75a7d8f | 2475 | source++; |
4388f060 | 2476 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
b75a7d8f | 2477 | *err = U_INVALID_CHAR_FOUND; |
b75a7d8f A |
2478 | /* convert this surrogate code point */ |
2479 | /* exit this condition tree */ | |
2480 | } else { | |
2481 | /* this is an unmatched lead code unit (1st surrogate) */ | |
2482 | /* callback(illegal) */ | |
b75a7d8f A |
2483 | *err=U_ILLEGAL_CHAR_FOUND; |
2484 | } | |
2485 | } else { | |
2486 | /* no more input */ | |
2487 | *err = U_ZERO_ERROR; | |
b75a7d8f A |
2488 | } |
2489 | } else { | |
2490 | /* this is an unmatched trail code unit (2nd surrogate) */ | |
2491 | /* callback(illegal) */ | |
b75a7d8f A |
2492 | *err=U_ILLEGAL_CHAR_FOUND; |
2493 | } | |
374ca955 A |
2494 | } else { |
2495 | /* callback(unassigned) for a BMP code point */ | |
2496 | *err = U_INVALID_CHAR_FOUND; | |
b75a7d8f | 2497 | } |
b75a7d8f | 2498 | |
374ca955 | 2499 | args->converter->fromUChar32=sourceChar; |
374ca955 | 2500 | break; |
b75a7d8f A |
2501 | } |
2502 | } /* end if(myTargetIndex<myTargetLength) */ | |
2503 | else{ | |
2504 | *err =U_BUFFER_OVERFLOW_ERROR; | |
2505 | break; | |
2506 | } | |
2507 | ||
2508 | }/* end while(mySourceIndex<mySourceLength) */ | |
2509 | ||
374ca955 A |
2510 | /* |
2511 | * the end of the input stream and detection of truncated input | |
2512 | * are handled by the framework, but for ISO-2022-KR conversion | |
2513 | * we need to be in ASCII mode at the very end | |
2514 | * | |
2515 | * conditions: | |
2516 | * successful | |
2517 | * not in ASCII mode | |
2518 | * end of input and no truncated input | |
b75a7d8f | 2519 | */ |
374ca955 A |
2520 | if( U_SUCCESS(*err) && |
2521 | isTargetByteDBCS && | |
2522 | args->flush && source>=sourceLimit && args->converter->fromUChar32==0 | |
2523 | ) { | |
2524 | int32_t sourceIndex; | |
2525 | ||
2526 | /* we are switching to ASCII */ | |
2527 | isTargetByteDBCS=FALSE; | |
2528 | ||
2529 | /* get the source index of the last input character */ | |
2530 | /* | |
2531 | * TODO this would be simpler and more reliable if we used a pair | |
2532 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c | |
2533 | * so that we could simply use the prevSourceIndex here; | |
2534 | * this code gives an incorrect result for the rare case of an unmatched | |
2535 | * trail surrogate that is alone in the last buffer of the text stream | |
2536 | */ | |
2537 | sourceIndex=(int32_t)(source-args->source); | |
2538 | if(sourceIndex>0) { | |
2539 | --sourceIndex; | |
2540 | if( U16_IS_TRAIL(args->source[sourceIndex]) && | |
2541 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) | |
2542 | ) { | |
2543 | --sourceIndex; | |
2544 | } | |
2545 | } else { | |
2546 | sourceIndex=-1; | |
2547 | } | |
2548 | ||
73c04bcf | 2549 | fromUWriteUInt8( |
374ca955 A |
2550 | args->converter, |
2551 | SHIFT_IN_STR, 1, | |
73c04bcf | 2552 | &target, (const char *)targetLimit, |
374ca955 A |
2553 | &offsets, sourceIndex, |
2554 | err); | |
b75a7d8f A |
2555 | } |
2556 | ||
2557 | /*save the state and return */ | |
2558 | args->source = source; | |
2559 | args->target = (char*)target; | |
2560 | args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; | |
2561 | } | |
2562 | ||
2563 | /************************ To Unicode ***************************************/ | |
2564 | ||
46f4442e | 2565 | static void |
b75a7d8f A |
2566 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, |
2567 | UErrorCode* err){ | |
b75a7d8f | 2568 | char const* sourceStart; |
b75a7d8f | 2569 | UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
b75a7d8f | 2570 | |
374ca955 A |
2571 | UConverterToUnicodeArgs subArgs; |
2572 | int32_t minArgsSize; | |
2573 | ||
2574 | /* set up the subconverter arguments */ | |
2575 | if(args->size<sizeof(UConverterToUnicodeArgs)) { | |
2576 | minArgsSize = args->size; | |
2577 | } else { | |
2578 | minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); | |
2579 | } | |
2580 | ||
2581 | uprv_memcpy(&subArgs, args, minArgsSize); | |
2582 | subArgs.size = (uint16_t)minArgsSize; | |
2583 | subArgs.converter = myData->currentConverter; | |
2584 | ||
2585 | /* remember the original start of the input for offsets */ | |
2586 | sourceStart = args->source; | |
2587 | ||
2588 | if(myData->key != 0) { | |
2589 | /* continue with a partial escape sequence */ | |
2590 | goto escape; | |
2591 | } | |
2592 | ||
2593 | while(U_SUCCESS(*err) && args->source < args->sourceLimit) { | |
b75a7d8f | 2594 | /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
374ca955 A |
2595 | subArgs.source = args->source; |
2596 | subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); | |
2597 | if(subArgs.source != subArgs.sourceLimit) { | |
2598 | /* | |
2599 | * get the current partial byte sequence | |
2600 | * | |
2601 | * it needs to be moved between the public and the subconverter | |
2602 | * so that the conversion framework, which only sees the public | |
2603 | * converter, can handle truncated and illegal input etc. | |
2604 | */ | |
2605 | if(args->converter->toULength > 0) { | |
2606 | uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); | |
2607 | } | |
2608 | subArgs.converter->toULength = args->converter->toULength; | |
2609 | ||
2610 | /* | |
2611 | * Convert up to the end of the input, or to before the next escape character. | |
2612 | * Does not handle conversion extensions because the preToU[] state etc. | |
2613 | * is not copied. | |
2614 | */ | |
2615 | ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); | |
2616 | ||
2617 | if(args->offsets != NULL && sourceStart != args->source) { | |
2618 | /* update offsets to base them on the actual start of the input */ | |
2619 | int32_t *offsets = args->offsets; | |
2620 | UChar *target = args->target; | |
2621 | int32_t delta = (int32_t)(args->source - sourceStart); | |
2622 | while(target < subArgs.target) { | |
2623 | if(*offsets >= 0) { | |
2624 | *offsets += delta; | |
2625 | } | |
2626 | ++offsets; | |
2627 | ++target; | |
2628 | } | |
2629 | } | |
2630 | args->source = subArgs.source; | |
2631 | args->target = subArgs.target; | |
2632 | args->offsets = subArgs.offsets; | |
2633 | ||
2634 | /* copy input/error/overflow buffers */ | |
2635 | if(subArgs.converter->toULength > 0) { | |
2636 | uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); | |
2637 | } | |
2638 | args->converter->toULength = subArgs.converter->toULength; | |
2639 | ||
2640 | if(*err == U_BUFFER_OVERFLOW_ERROR) { | |
2641 | if(subArgs.converter->UCharErrorBufferLength > 0) { | |
2642 | uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, | |
2643 | subArgs.converter->UCharErrorBufferLength); | |
2644 | } | |
2645 | args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; | |
2646 | subArgs.converter->UCharErrorBufferLength = 0; | |
b75a7d8f | 2647 | } |
b75a7d8f A |
2648 | } |
2649 | ||
374ca955 | 2650 | if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { |
b75a7d8f | 2651 | return; |
374ca955 | 2652 | } |
b75a7d8f | 2653 | |
374ca955 | 2654 | escape: |
b75a7d8f | 2655 | changeState_2022(args->converter, |
46f4442e | 2656 | &(args->source), |
b75a7d8f | 2657 | args->sourceLimit, |
b75a7d8f | 2658 | ISO_2022_KR, |
b75a7d8f | 2659 | err); |
374ca955 | 2660 | } |
b75a7d8f A |
2661 | } |
2662 | ||
46f4442e | 2663 | static void |
b75a7d8f A |
2664 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
2665 | UErrorCode* err){ | |
374ca955 | 2666 | char tempBuf[2]; |
b75a7d8f A |
2667 | const char *mySource = ( char *) args->source; |
2668 | UChar *myTarget = args->target; | |
2669 | const char *mySourceLimit = args->sourceLimit; | |
2670 | UChar32 targetUniChar = 0x0000; | |
2671 | UChar mySourceChar = 0x0000; | |
2672 | UConverterDataISO2022* myData; | |
b75a7d8f A |
2673 | UConverterSharedData* sharedData ; |
2674 | UBool useFallback; | |
2675 | ||
374ca955 A |
2676 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
2677 | if(myData->version==1){ | |
2678 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); | |
b75a7d8f A |
2679 | return; |
2680 | } | |
374ca955 | 2681 | |
b75a7d8f | 2682 | /* initialize state */ |
374ca955 | 2683 | sharedData = myData->currentConverter->sharedData; |
b75a7d8f | 2684 | useFallback = args->converter->useFallback; |
46f4442e | 2685 | |
374ca955 A |
2686 | if(myData->key != 0) { |
2687 | /* continue with a partial escape sequence */ | |
2688 | goto escape; | |
2689 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { | |
2690 | /* continue with a partial double-byte character */ | |
2691 | mySourceChar = args->converter->toUBytes[0]; | |
2692 | args->converter->toULength = 0; | |
2693 | goto getTrailByte; | |
b75a7d8f | 2694 | } |
b75a7d8f | 2695 | |
374ca955 | 2696 | while(mySource< mySourceLimit){ |
b75a7d8f A |
2697 | |
2698 | if(myTarget < args->targetLimit){ | |
2699 | ||
2700 | mySourceChar= (unsigned char) *mySource++; | |
2701 | ||
2702 | if(mySourceChar==UCNV_SI){ | |
374ca955 | 2703 | myData->toU2022State.g = 0; |
d5d484b0 A |
2704 | if (myData->isEmptySegment) { |
2705 | myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ | |
46f4442e A |
2706 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
2707 | args->converter->toUCallbackReason = UCNV_IRREGULAR; | |
2708 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; | |
d5d484b0 A |
2709 | args->converter->toULength = 1; |
2710 | args->target = myTarget; | |
2711 | args->source = mySource; | |
2712 | return; | |
2713 | } | |
b75a7d8f A |
2714 | /*consume the source */ |
2715 | continue; | |
2716 | }else if(mySourceChar==UCNV_SO){ | |
374ca955 | 2717 | myData->toU2022State.g = 1; |
d5d484b0 | 2718 | myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
b75a7d8f A |
2719 | /*consume the source */ |
2720 | continue; | |
374ca955 A |
2721 | }else if(mySourceChar==ESC_2022){ |
2722 | mySource--; | |
2723 | escape: | |
d5d484b0 | 2724 | myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ |
46f4442e | 2725 | changeState_2022(args->converter,&(mySource), |
374ca955 | 2726 | mySourceLimit, ISO_2022_KR, err); |
b75a7d8f A |
2727 | if(U_FAILURE(*err)){ |
2728 | args->target = myTarget; | |
2729 | args->source = mySource; | |
2730 | return; | |
2731 | } | |
2732 | continue; | |
46f4442e | 2733 | } |
b75a7d8f | 2734 | |
d5d484b0 | 2735 | myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ |
374ca955 A |
2736 | if(myData->toU2022State.g == 1) { |
2737 | if(mySource < mySourceLimit) { | |
fd0068a8 A |
2738 | int leadIsOk, trailIsOk; |
2739 | uint8_t trailByte; | |
374ca955 | 2740 | getTrailByte: |
fd0068a8 A |
2741 | targetUniChar = missingCharMarker; |
2742 | trailByte = (uint8_t)*mySource; | |
2743 | /* | |
2744 | * Ticket 5691: consistent illegal sequences: | |
2745 | * - We include at least the first byte in the illegal sequence. | |
2746 | * - If any of the non-initial bytes could be the start of a character, | |
2747 | * we stop the illegal sequence before the first one of those. | |
2748 | * | |
2749 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is | |
2750 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. | |
2751 | * Otherwise we convert or report the pair of bytes. | |
2752 | */ | |
2753 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); | |
2754 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); | |
2755 | if (leadIsOk && trailIsOk) { | |
2756 | ++mySource; | |
2757 | tempBuf[0] = (char)(mySourceChar + 0x80); | |
2758 | tempBuf[1] = (char)(trailByte + 0x80); | |
2759 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); | |
2760 | mySourceChar = (mySourceChar << 8) | trailByte; | |
2761 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { | |
2762 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ | |
2763 | ++mySource; | |
2764 | /* add another bit so that the code below writes 2 bytes in case of error */ | |
2765 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; | |
374ca955 A |
2766 | } |
2767 | } else { | |
2768 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; | |
2769 | args->converter->toULength = 1; | |
2770 | break; | |
b75a7d8f A |
2771 | } |
2772 | } | |
fd0068a8 | 2773 | else if(mySourceChar <= 0x7f) { |
374ca955 | 2774 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); |
fd0068a8 A |
2775 | } else { |
2776 | targetUniChar = 0xffff; | |
b75a7d8f | 2777 | } |
374ca955 A |
2778 | if(targetUniChar < 0xfffe){ |
2779 | if(args->offsets) { | |
73c04bcf | 2780 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
374ca955 | 2781 | } |
b75a7d8f A |
2782 | *(myTarget++)=(UChar)targetUniChar; |
2783 | } | |
2784 | else { | |
b75a7d8f | 2785 | /* Call the callback function*/ |
374ca955 A |
2786 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
2787 | break; | |
b75a7d8f A |
2788 | } |
2789 | } | |
2790 | else{ | |
2791 | *err =U_BUFFER_OVERFLOW_ERROR; | |
2792 | break; | |
2793 | } | |
2794 | } | |
b75a7d8f A |
2795 | args->target = myTarget; |
2796 | args->source = mySource; | |
2797 | } | |
2798 | ||
2799 | /*************************** END ISO2022-KR *********************************/ | |
2800 | ||
2801 | /*************************** ISO-2022-CN ********************************* | |
2802 | * | |
2803 | * Rules for ISO-2022-CN Encoding: | |
374ca955 | 2804 | * i) The designator sequence must appear once on a line before any instance |
b75a7d8f A |
2805 | * of character set it designates. |
2806 | * ii) If two lines contain characters from the same character set, both lines | |
2807 | * must include the designator sequence. | |
374ca955 | 2808 | * iii) Once the designator sequence is known, a shifting sequence has to be found |
b75a7d8f A |
2809 | * to invoke the shifting |
2810 | * iv) All lines start in ASCII and end in ASCII. | |
2811 | * v) Four shifting sequences are employed for this purpose: | |
2812 | * | |
2813 | * Sequence ASCII Eq Charsets |
2814 | * ---------- ------- --------- | |
374ca955 A |
2815 | * SI <SI> US-ASCII |
2816 | * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 | |
2817 | * SS2 <ESC>N CNS-11643-1992 Plane 2 | |
2818 | * SS3 <ESC>O CNS-11643-1992 Planes 3-7 | |
b75a7d8f A |
2819 | * |
2820 | * vi) | |
2821 | * SOdesignator : ESC "$" ")" finalchar_for_SO | |
2822 | * SS2designator : ESC "$" "*" finalchar_for_SS2 | |
2823 | * SS3designator : ESC "$" "+" finalchar_for_SS3 | |
2824 | * | |
2825 | * ESC $ ) A Indicates the bytes following SO are Chinese | |
2826 | * characters as defined in GB 2312-80, until | |
2827 | * another SOdesignation appears | |
2828 | * | |
2829 | * | |
2830 | * ESC $ ) E Indicates the bytes following SO are as defined | |
2831 | * in ISO-IR-165 (for details, see section 2.1), | |
2832 | * until another SOdesignation appears | |
2833 | * | |
2834 | * ESC $ ) G Indicates the bytes following SO are as defined | |
2835 | * in CNS 11643-plane-1, until another | |
2836 | * SOdesignation appears | |
2837 | * | |
2838 | * ESC $ * H Indicates the two bytes immediately following | |
2839 | * SS2 is a Chinese character as defined in CNS | |
2840 | * 11643-plane-2, until another SS2designation | |
2841 | * appears | |
46f4442e | 2842 | * (Meaning <ESC>N must precede every 2 byte |
b75a7d8f A |
2843 | * sequence.) |
2844 | * | |
2845 | * ESC $ + I Indicates the immediate two bytes following SS3 | |
2846 | * is a Chinese character as defined in CNS | |
2847 | * 11643-plane-3, until another SS3designation | |
2848 | * appears | |
46f4442e | 2849 | * (Meaning <ESC>O must precede every 2 byte |
b75a7d8f A |
2850 | * sequence.) |
2851 | * | |
2852 | * ESC $ + J Indicates the immediate two bytes following SS3 | |
2853 | * is a Chinese character as defined in CNS | |
2854 | * 11643-plane-4, until another SS3designation | |
2855 | * appears | |
46f4442e | 2856 | * (In English: <ESC>O must precede every 2 byte |
b75a7d8f A |
2857 | * sequence.) |
2858 | * | |
2859 | * ESC $ + K Indicates the immediate two bytes following SS3 | |
2860 | * is a Chinese character as defined in CNS | |
2861 | * 11643-plane-5, until another SS3designation | |
2862 | * appears | |
2863 | * | |
2864 | * ESC $ + L Indicates the immediate two bytes following SS3 | |
2865 | * is a Chinese character as defined in CNS | |
2866 | * 11643-plane-6, until another SS3designation | |
2867 | * appears | |
2868 | * | |
2869 | * ESC $ + M Indicates the immediate two bytes following SS3 | |
2870 | * is a Chinese character as defined in CNS | |
2871 | * 11643-plane-7, until another SS3designation | |
2872 | * appears | |
2873 | * | |
2874 | * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and | |
2875 | * has its own designation information before any Chinese characters | |
2876 | * appear | |
2877 | * | |
2878 | */ | |
2879 | ||
4388f060 | 2880 | /* The following are defined this way to make the strings truly readonly */ |
b75a7d8f A |
2881 | static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; |
2882 | static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; | |
2883 | static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; | |
2884 | static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; | |
2885 | static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; | |
2886 | static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; | |
2887 | static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; | |
2888 | static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; | |
2889 | static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; | |
2890 | ||
2891 | /********************** ISO2022-CN Data **************************/ | |
2892 | static const char* const escSeqCharsCN[10] ={ | |
4388f060 A |
2893 | SHIFT_IN_STR, /* 0 ASCII */ |
2894 | GB_2312_80_STR, /* 1 GB2312_1 */ | |
2895 | ISO_IR_165_STR, /* 2 ISO_IR_165 */ | |
b75a7d8f A |
2896 | CNS_11643_1992_Plane_1_STR, |
2897 | CNS_11643_1992_Plane_2_STR, | |
2898 | CNS_11643_1992_Plane_3_STR, | |
2899 | CNS_11643_1992_Plane_4_STR, | |
2900 | CNS_11643_1992_Plane_5_STR, | |
2901 | CNS_11643_1992_Plane_6_STR, | |
2902 | CNS_11643_1992_Plane_7_STR | |
2903 | }; | |
b75a7d8f | 2904 | |
46f4442e | 2905 | static void |
b75a7d8f | 2906 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
46f4442e | 2907 | UConverter *cnv = args->converter; |
b75a7d8f | 2908 | UConverterDataISO2022 *converterData; |
374ca955 A |
2909 | ISO2022State *pFromU2022State; |
2910 | uint8_t *target = (uint8_t *) args->target; | |
2911 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; | |
b75a7d8f A |
2912 | const UChar* source = args->source; |
2913 | const UChar* sourceLimit = args->sourceLimit; | |
2914 | int32_t* offsets = args->offsets; | |
374ca955 A |
2915 | UChar32 sourceChar; |
2916 | char buffer[8]; | |
2917 | int32_t len; | |
2918 | int8_t choices[3]; | |
2919 | int32_t choiceCount; | |
73c04bcf | 2920 | uint32_t targetValue = 0; |
b75a7d8f A |
2921 | UBool useFallback; |
2922 | ||
b75a7d8f | 2923 | /* set up the state */ |
46f4442e | 2924 | converterData = (UConverterDataISO2022*)cnv->extraInfo; |
374ca955 | 2925 | pFromU2022State = &converterData->fromU2022State; |
374ca955 A |
2926 | |
2927 | choiceCount = 0; | |
b75a7d8f A |
2928 | |
2929 | /* check if the last codepoint of previous buffer was a lead surrogate*/ | |
46f4442e | 2930 | if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
b75a7d8f A |
2931 | goto getTrail; |
2932 | } | |
2933 | ||
b75a7d8f | 2934 | while( source < sourceLimit){ |
b75a7d8f A |
2935 | if(target < targetLimit){ |
2936 | ||
2937 | sourceChar = *(source++); | |
2938 | /*check if the char is a First surrogate*/ | |
4388f060 A |
2939 | if(U16_IS_SURROGATE(sourceChar)) { |
2940 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { | |
b75a7d8f A |
2941 | getTrail: |
2942 | /*look ahead to find the trail surrogate*/ | |
2943 | if(source < sourceLimit) { | |
2944 | /* test the following code unit */ | |
2945 | UChar trail=(UChar) *source; | |
4388f060 | 2946 | if(U16_IS_TRAIL(trail)) { |
b75a7d8f | 2947 | source++; |
4388f060 | 2948 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
46f4442e | 2949 | cnv->fromUChar32=0x00; |
374ca955 | 2950 | /* convert this supplementary code point */ |
b75a7d8f A |
2951 | /* exit this condition tree */ |
2952 | } else { | |
2953 | /* this is an unmatched lead code unit (1st surrogate) */ | |
2954 | /* callback(illegal) */ | |
b75a7d8f | 2955 | *err=U_ILLEGAL_CHAR_FOUND; |
46f4442e | 2956 | cnv->fromUChar32=sourceChar; |
374ca955 | 2957 | break; |
b75a7d8f A |
2958 | } |
2959 | } else { | |
2960 | /* no more input */ | |
46f4442e | 2961 | cnv->fromUChar32=sourceChar; |
b75a7d8f A |
2962 | break; |
2963 | } | |
2964 | } else { | |
2965 | /* this is an unmatched trail code unit (2nd surrogate) */ | |
2966 | /* callback(illegal) */ | |
b75a7d8f | 2967 | *err=U_ILLEGAL_CHAR_FOUND; |
46f4442e | 2968 | cnv->fromUChar32=sourceChar; |
374ca955 | 2969 | break; |
b75a7d8f A |
2970 | } |
2971 | } | |
2972 | ||
2973 | /* do the conversion */ | |
374ca955 | 2974 | if(sourceChar <= 0x007f ){ |
73c04bcf A |
2975 | /* do not convert SO/SI/ESC */ |
2976 | if(IS_2022_CONTROL(sourceChar)) { | |
2977 | /* callback(illegal) */ | |
2978 | *err=U_ILLEGAL_CHAR_FOUND; | |
46f4442e | 2979 | cnv->fromUChar32=sourceChar; |
73c04bcf A |
2980 | break; |
2981 | } | |
2982 | ||
374ca955 A |
2983 | /* US-ASCII */ |
2984 | if(pFromU2022State->g == 0) { | |
2985 | buffer[0] = (char)sourceChar; | |
2986 | len = 1; | |
2987 | } else { | |
2988 | buffer[0] = UCNV_SI; | |
2989 | buffer[1] = (char)sourceChar; | |
2990 | len = 2; | |
2991 | pFromU2022State->g = 0; | |
2992 | choiceCount = 0; | |
2993 | } | |
2994 | if(sourceChar == CR || sourceChar == LF) { | |
2995 | /* reset the state at the end of a line */ | |
2996 | uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); | |
2997 | choiceCount = 0; | |
b75a7d8f | 2998 | } |
b75a7d8f A |
2999 | } |
3000 | else{ | |
374ca955 | 3001 | /* convert U+0080..U+10ffff */ |
374ca955 A |
3002 | int32_t i; |
3003 | int8_t cs, g; | |
3004 | ||
3005 | if(choiceCount == 0) { | |
3006 | /* try the current SO/G1 converter first */ | |
3007 | choices[0] = pFromU2022State->cs[1]; | |
3008 | ||
3009 | /* default to GB2312_1 if none is designated yet */ | |
3010 | if(choices[0] == 0) { | |
3011 | choices[0] = GB2312_1; | |
3012 | } | |
b75a7d8f | 3013 | |
374ca955 A |
3014 | if(converterData->version == 0) { |
3015 | /* ISO-2022-CN */ | |
3016 | ||
3017 | /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ | |
3018 | if(choices[0] == GB2312_1) { | |
3019 | choices[1] = (int8_t)CNS_11643_1; | |
3020 | } else { | |
3021 | choices[1] = (int8_t)GB2312_1; | |
b75a7d8f | 3022 | } |
374ca955 A |
3023 | |
3024 | choiceCount = 2; | |
729e4ab9 | 3025 | } else if (converterData->version == 1) { |
374ca955 A |
3026 | /* ISO-2022-CN-EXT */ |
3027 | ||
3028 | /* try one of the other converters */ | |
3029 | switch(choices[0]) { | |
3030 | case GB2312_1: | |
3031 | choices[1] = (int8_t)CNS_11643_1; | |
3032 | choices[2] = (int8_t)ISO_IR_165; | |
3033 | break; | |
3034 | case ISO_IR_165: | |
3035 | choices[1] = (int8_t)GB2312_1; | |
3036 | choices[2] = (int8_t)CNS_11643_1; | |
3037 | break; | |
3038 | default: /* CNS_11643_x */ | |
3039 | choices[1] = (int8_t)GB2312_1; | |
3040 | choices[2] = (int8_t)ISO_IR_165; | |
3041 | break; | |
b75a7d8f | 3042 | } |
b75a7d8f | 3043 | |
374ca955 | 3044 | choiceCount = 3; |
729e4ab9 A |
3045 | } else { |
3046 | choices[0] = (int8_t)CNS_11643_1; | |
3047 | choices[1] = (int8_t)GB2312_1; | |
374ca955 | 3048 | } |
b75a7d8f A |
3049 | } |
3050 | ||
374ca955 | 3051 | cs = g = 0; |
46f4442e A |
3052 | /* |
3053 | * len==0: no mapping found yet | |
3054 | * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks | |
3055 | * len>0: found a roundtrip result, done | |
3056 | */ | |
374ca955 | 3057 | len = 0; |
46f4442e A |
3058 | /* |
3059 | * We will turn off useFallback after finding a fallback, | |
3060 | * but we still get fallbacks from PUA code points as usual. | |
3061 | * Therefore, we will also need to check that we don't overwrite | |
3062 | * an early fallback with a later one. | |
3063 | */ | |
3064 | useFallback = cnv->useFallback; | |
3065 | ||
3066 | for(i = 0; i < choiceCount && len <= 0; ++i) { | |
3067 | int8_t cs0 = choices[i]; | |
3068 | if(cs0 > 0) { | |
3069 | uint32_t value; | |
3070 | int32_t len2; | |
3071 | if(cs0 >= CNS_11643_0) { | |
3072 | len2 = MBCS_FROM_UCHAR32_ISO2022( | |
3073 | converterData->myConverterArray[CNS_11643], | |
3074 | sourceChar, | |
3075 | &value, | |
3076 | useFallback, | |
3077 | MBCS_OUTPUT_3); | |
3078 | if(len2 == 3 || (len2 == -3 && len == 0)) { | |
3079 | targetValue = value; | |
3080 | cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); | |
3081 | if(len2 >= 0) { | |
3082 | len = 2; | |
3083 | } else { | |
3084 | len = -2; | |
3085 | useFallback = FALSE; | |
3086 | } | |
374ca955 A |
3087 | if(cs == CNS_11643_1) { |
3088 | g = 1; | |
3089 | } else if(cs == CNS_11643_2) { | |
3090 | g = 2; | |
3091 | } else /* plane 3..7 */ if(converterData->version == 1) { | |
3092 | g = 3; | |
3093 | } else { | |
3094 | /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ | |
3095 | len = 0; | |
3096 | } | |
3097 | } | |
3098 | } else { | |
3099 | /* GB2312_1 or ISO-IR-165 */ | |
4388f060 | 3100 | U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); |
46f4442e A |
3101 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
3102 | converterData->myConverterArray[cs0], | |
3103 | sourceChar, | |
3104 | &value, | |
3105 | useFallback, | |
3106 | MBCS_OUTPUT_2); | |
3107 | if(len2 == 2 || (len2 == -2 && len == 0)) { | |
3108 | targetValue = value; | |
3109 | len = len2; | |
3110 | cs = cs0; | |
3111 | g = 1; | |
3112 | useFallback = FALSE; | |
3113 | } | |
374ca955 | 3114 | } |
b75a7d8f | 3115 | } |
b75a7d8f A |
3116 | } |
3117 | ||
46f4442e A |
3118 | if(len != 0) { |
3119 | len = 0; /* count output bytes; it must have been abs(len) == 2 */ | |
b75a7d8f | 3120 | |
374ca955 A |
3121 | /* write the designation sequence if necessary */ |
3122 | if(cs != pFromU2022State->cs[g]) { | |
3123 | if(cs < CNS_11643) { | |
3124 | uprv_memcpy(buffer, escSeqCharsCN[cs], 4); | |
3125 | } else { | |
4388f060 | 3126 | U_ASSERT(cs >= CNS_11643_1); |
374ca955 | 3127 | uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); |
b75a7d8f | 3128 | } |
374ca955 A |
3129 | len = 4; |
3130 | pFromU2022State->cs[g] = cs; | |
3131 | if(g == 1) { | |
3132 | /* changing the SO/G1 charset invalidates the choices[] */ | |
3133 | choiceCount = 0; | |
b75a7d8f | 3134 | } |
374ca955 A |
3135 | } |
3136 | ||
3137 | /* write the shift sequence if necessary */ | |
3138 | if(g != pFromU2022State->g) { | |
3139 | switch(g) { | |
3140 | case 1: | |
3141 | buffer[len++] = UCNV_SO; | |
3142 | ||
3143 | /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ | |
3144 | pFromU2022State->g = 1; | |
3145 | break; | |
3146 | case 2: | |
3147 | buffer[len++] = 0x1b; | |
3148 | buffer[len++] = 0x4e; | |
3149 | break; | |
3150 | default: /* case 3 */ | |
3151 | buffer[len++] = 0x1b; | |
3152 | buffer[len++] = 0x4f; | |
3153 | break; | |
b75a7d8f | 3154 | } |
b75a7d8f | 3155 | } |
b75a7d8f | 3156 | |
374ca955 A |
3157 | /* write the two output bytes */ |
3158 | buffer[len++] = (char)(targetValue >> 8); | |
3159 | buffer[len++] = (char)targetValue; | |
3160 | } else { | |
46f4442e | 3161 | /* if we cannot find the character after checking all codepages |
374ca955 A |
3162 | * then this is an error |
3163 | */ | |
3164 | *err = U_INVALID_CHAR_FOUND; | |
46f4442e | 3165 | cnv->fromUChar32=sourceChar; |
374ca955 A |
3166 | break; |
3167 | } | |
b75a7d8f | 3168 | } |
b75a7d8f | 3169 | |
374ca955 A |
3170 | /* output len>0 bytes in buffer[] */ |
3171 | if(len == 1) { | |
3172 | *target++ = buffer[0]; | |
3173 | if(offsets) { | |
73c04bcf | 3174 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
374ca955 A |
3175 | } |
3176 | } else if(len == 2 && (target + 2) <= targetLimit) { | |
3177 | *target++ = buffer[0]; | |
3178 | *target++ = buffer[1]; | |
3179 | if(offsets) { | |
3180 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); | |
3181 | *offsets++ = sourceIndex; | |
3182 | *offsets++ = sourceIndex; | |
3183 | } | |
3184 | } else { | |
73c04bcf | 3185 | fromUWriteUInt8( |
46f4442e | 3186 | cnv, |
374ca955 | 3187 | buffer, len, |
73c04bcf | 3188 | &target, (const char *)targetLimit, |
374ca955 A |
3189 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
3190 | err); | |
3191 | if(U_FAILURE(*err)) { | |
b75a7d8f A |
3192 | break; |
3193 | } | |
3194 | } | |
3195 | } /* end if(myTargetIndex<myTargetLength) */ | |
3196 | else{ | |
3197 | *err =U_BUFFER_OVERFLOW_ERROR; | |
3198 | break; | |
3199 | } | |
3200 | ||
3201 | }/* end while(mySourceIndex<mySourceLength) */ | |
3202 | ||
374ca955 A |
3203 | /* |
3204 | * the end of the input stream and detection of truncated input | |
3205 | * are handled by the framework, but for ISO-2022-CN conversion | |
3206 | * we need to be in ASCII mode at the very end | |
3207 | * | |
3208 | * conditions: | |
3209 | * successful | |
3210 | * not in ASCII mode | |
3211 | * end of input and no truncated input | |
b75a7d8f | 3212 | */ |
374ca955 A |
3213 | if( U_SUCCESS(*err) && |
3214 | pFromU2022State->g!=0 && | |
46f4442e | 3215 | args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
374ca955 A |
3216 | ) { |
3217 | int32_t sourceIndex; | |
3218 | ||
3219 | /* we are switching to ASCII */ | |
3220 | pFromU2022State->g=0; | |
3221 | ||
3222 | /* get the source index of the last input character */ | |
3223 | /* | |
3224 | * TODO this would be simpler and more reliable if we used a pair | |
3225 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c | |
3226 | * so that we could simply use the prevSourceIndex here; | |
3227 | * this code gives an incorrect result for the rare case of an unmatched | |
3228 | * trail surrogate that is alone in the last buffer of the text stream | |
3229 | */ | |
3230 | sourceIndex=(int32_t)(source-args->source); | |
3231 | if(sourceIndex>0) { | |
3232 | --sourceIndex; | |
3233 | if( U16_IS_TRAIL(args->source[sourceIndex]) && | |
3234 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) | |
3235 | ) { | |
3236 | --sourceIndex; | |
b75a7d8f | 3237 | } |
374ca955 A |
3238 | } else { |
3239 | sourceIndex=-1; | |
b75a7d8f | 3240 | } |
b75a7d8f | 3241 | |
73c04bcf | 3242 | fromUWriteUInt8( |
46f4442e | 3243 | cnv, |
374ca955 | 3244 | SHIFT_IN_STR, 1, |
73c04bcf | 3245 | &target, (const char *)targetLimit, |
374ca955 A |
3246 | &offsets, sourceIndex, |
3247 | err); | |
b75a7d8f | 3248 | } |
b75a7d8f | 3249 | |
374ca955 A |
3250 | /*save the state and return */ |
3251 | args->source = source; | |
3252 | args->target = (char*)target; | |
b75a7d8f A |
3253 | } |
3254 | ||
3255 | ||
46f4442e | 3256 | /* Converts ISO-2022-CN bytes from args->source into UChars at args->target, tracking SI/SO/ESC state in myData->toU2022State and filling args->offsets when it is non-NULL; the advanced source and target pointers are stored back into args before every return. */ static void |
b75a7d8f A |
3257 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
3258 | UErrorCode* err){ | |
3259 | char tempBuf[3]; /* lookup key: two data bytes, preceded by a plane byte for CNS 11643 */ | |
374ca955 | 3260 | const char *mySource = (char *) args->source; |
b75a7d8f | 3261 | UChar *myTarget = args->target; |
b75a7d8f A |
3262 | const char *mySourceLimit = args->sourceLimit; |
3263 | uint32_t targetUniChar = 0x0000; | |
3264 | uint32_t mySourceChar = 0x0000; | |
3265 | UConverterDataISO2022* myData; | |
374ca955 | 3266 | ISO2022State *pToU2022State; |
b75a7d8f | 3267 | |
374ca955 A |
3268 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
3269 | pToU2022State = &myData->toU2022State; | |
3270 | | |
3271 | if(myData->key != 0) { | |
3272 | /* continue with a partial escape sequence */ | |
3273 | goto escape; | |
3274 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { | |
3275 | /* continue with a partial double-byte character */ | |
3276 | mySourceChar = args->converter->toUBytes[0]; /* lead byte stashed by the previous call */ | |
3277 | args->converter->toULength = 0; | |
fd0068a8 | 3278 | targetUniChar = missingCharMarker; /* same default as at the top of the loop, which this jump bypasses */ |
374ca955 | 3279 | goto getTrailByte; |
b75a7d8f | 3280 | } |
374ca955 A |
3281 | |
3282 | while(mySource < mySourceLimit){ | |
b75a7d8f A |
3283 | |
3284 | targetUniChar =missingCharMarker; | |
3285 | | |
3286 | if(myTarget < args->targetLimit){ | |
3287 | | |
3288 | mySourceChar= (unsigned char) *mySource++; | |
3289 | | |
b75a7d8f A |
3290 | switch(mySourceChar){ |
3291 | case UCNV_SI: | |
374ca955 | 3292 | pToU2022State->g=0; /* shift in: back to G0 */ |
d5d484b0 A |
3293 | if (myData->isEmptySegment) { /* SI directly after SO: nothing was converted in between */ |
3294 | myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ | |
46f4442e A |
3295 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
3296 | args->converter->toUCallbackReason = UCNV_IRREGULAR; | |
d5d484b0 A |
3297 | args->converter->toUBytes[0] = mySourceChar; |
3298 | args->converter->toULength = 1; | |
3299 | args->target = myTarget; | |
3300 | args->source = mySource; | |
3301 | return; | |
3302 | } | |
b75a7d8f A |
3303 | continue; |
3304 | | |
3305 | case UCNV_SO: | |
374ca955 A |
3306 | if(pToU2022State->cs[1] != 0) { |
3307 | pToU2022State->g=1; /* shift out: use the charset designated for G1 */ | |
d5d484b0 | 3308 | myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
374ca955 A |
3309 | continue; |
3310 | } else { | |
3311 | /* illegal to have SO before a matching designator */ | |
d5d484b0 | 3312 | myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ |
b75a7d8f A |
3313 | break; |
3314 | } | |
3315 | | |
b75a7d8f | 3316 | case ESC_2022: |
b75a7d8f | 3317 | mySource--; /* un-read the ESC; presumably changeState_2022() expects to start at it - confirm */ |
374ca955 | 3318 | escape: |
d5d484b0 A |
3319 | { |
3320 | const char * mySourceBefore = mySource; | |
3321 | int8_t toULengthBefore = args->converter->toULength; | |
3322 | | |
46f4442e | 3323 | changeState_2022(args->converter,&(mySource), |
d5d484b0 A |
3324 | mySourceLimit, ISO_2022_CN,err); |
3325 | | |
3326 | /* After SO there must be at least one character before a designator (designator error handled separately) */ | |
46f4442e A |
3327 | if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
3328 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; | |
3329 | args->converter->toUCallbackReason = UCNV_IRREGULAR; | |
729e4ab9 | 3330 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); /* previous length plus the bytes consumed by this call */ |
d5d484b0 A |
3331 | } |
3332 | } | |
b75a7d8f A |
3333 | |
3334 | /* invalid or illegal escape sequence */ | |
3335 | if(U_FAILURE(*err)){ | |
3336 | args->target = myTarget; | |
3337 | args->source = mySource; | |
d5d484b0 | 3338 | myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
b75a7d8f A |
3339 | return; |
3340 | } | |
3341 | continue; | |
3342 | | |
374ca955 A |
3343 | /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ |
3344 | | |
3345 | case CR: | |
3346 | /*falls through*/ | |
3347 | case LF: | |
3348 | uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); /* end of line: clear all designations and the shift state */ | |
3349 | /* falls through */ | |
3350 | default: | |
3351 | /* convert one or two bytes */ | |
d5d484b0 | 3352 | myData->isEmptySegment = FALSE; |
374ca955 A |
3353 | if(pToU2022State->g != 0) { |
3354 | if(mySource < mySourceLimit) { | |
3355 | UConverterSharedData *cnv; | |
3356 | StateEnum tempState; | |
3357 | int32_t tempBufLen; | |
fd0068a8 A |
3358 | int leadIsOk, trailIsOk; |
3359 | uint8_t trailByte; | |
374ca955 | 3360 | getTrailByte: |
fd0068a8 A |
3361 | trailByte = (uint8_t)*mySource; /* peek only; consumed below once the pair is accepted or rejected as a pair */ |
3362 | /* | |
3363 | * Ticket 5691: consistent illegal sequences: | |
3364 | * - We include at least the first byte in the illegal sequence. | |
3365 | * - If any of the non-initial bytes could be the start of a character, | |
3366 | * we stop the illegal sequence before the first one of those. | |
3367 | * | |
3368 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is | |
3369 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. | |
3370 | * Otherwise we convert or report the pair of bytes. | |
3371 | */ | |
3372 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); /* unsigned wrap-around turns this into one range test for 0x21..0x7e */ | |
3373 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); | |
3374 | if (leadIsOk && trailIsOk) { | |
3375 | ++mySource; | |
3376 | tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; | |
3377 | if(tempState >= CNS_11643_0) { | |
3378 | cnv = myData->myConverterArray[CNS_11643]; | |
3379 | tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); /* plane number encoded as the first lookup byte */ | |
3380 | tempBuf[1] = (char) (mySourceChar); | |
3381 | tempBuf[2] = (char) trailByte; | |
3382 | tempBufLen = 3; | |
3383 | | |
3384 | }else{ | |
4388f060 | 3385 | U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); |
fd0068a8 A |
3386 | cnv = myData->myConverterArray[tempState]; |
3387 | tempBuf[0] = (char) (mySourceChar); | |
3388 | tempBuf[1] = (char) trailByte; | |
3389 | tempBufLen = 2; | |
3390 | } | |
3391 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); | |
3392 | mySourceChar = (mySourceChar << 8) | trailByte; /* keep both bytes: used below for offsets and error reporting */ | |
3393 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { | |
3394 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ | |
3395 | ++mySource; | |
3396 | /* add another bit so that the code below writes 2 bytes in case of error */ | |
3397 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; | |
374ca955 | 3398 | } |
374ca955 A |
3399 | if(pToU2022State->g>=2) { |
3400 | /* return from a single-shift state to the previous one */ | |
3401 | pToU2022State->g=pToU2022State->prevG; | |
3402 | } | |
374ca955 A |
3403 | } else { |
3404 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; /* lead byte at end of input: stash it; the next call resumes at getTrailByte */ | |
3405 | args->converter->toULength = 1; | |
3406 | goto endloop; | |
3407 | } | |
3408 | } | |
3409 | else{ | |
3410 | if(mySourceChar <= 0x7f) { /* G0: 7-bit bytes map to themselves; larger bytes stay missingCharMarker */ | |
3411 | targetUniChar = (UChar) mySourceChar; | |
3412 | } | |
3413 | } | |
3414 | break; | |
b75a7d8f A |
3415 | } |
3416 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ /* result fits in a single UChar */ |
3417 | if(args->offsets){ | |
73c04bcf | 3418 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
b75a7d8f A |
3419 | } |
3420 | *(myTarget++)=(UChar)targetUniChar; | |
3421 | } | |
3422 | else if(targetUniChar > missingCharMarker){ | |
3423 | /* disassemble the surrogate pair and write to output*/ | |
3424 | targetUniChar-=0x0010000; | |
374ca955 | 3425 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
b75a7d8f | 3426 | if(args->offsets){ |
73c04bcf | 3427 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
b75a7d8f | 3428 | } |
374ca955 | 3429 | ++myTarget; |
46f4442e | 3430 | if(myTarget< args->targetLimit){ |
374ca955 | 3431 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
b75a7d8f | 3432 | if(args->offsets){ |
73c04bcf | 3433 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
b75a7d8f | 3434 | } |
374ca955 | 3435 | ++myTarget; |
b75a7d8f A |
3436 | }else{ |
3437 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= /* no room for the trail surrogate: park it in the converter's overflow buffer */ | |
3438 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); | |
3439 | } | |
3440 | | |
3441 | } | |
3442 | else{ | |
3443 | /* Call the callback function*/ | |
374ca955 A |
3444 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
3445 | break; | |
b75a7d8f A |
3446 | } |
3447 | } | |
3448 | else{ | |
3449 | *err =U_BUFFER_OVERFLOW_ERROR; | |
3450 | break; | |
3451 | } | |
3452 | } | |
374ca955 | 3453 | endloop: |
b75a7d8f A |
3454 | args->target = myTarget; |
3455 | args->source = mySource; | |
3456 | } | |
b331163b | 3457 | #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ |
b75a7d8f A |
3458 | |
3459 | static void | |
3460 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { | |
3461 | UConverter *cnv = args->converter; | |
3462 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; | |
374ca955 A |
3463 | ISO2022State *pFromU2022State=&myConverterData->fromU2022State; |
3464 | char *p, *subchar; | |
3465 | char buffer[8]; | |
3466 | int32_t length; | |
3467 | ||
73c04bcf | 3468 | subchar=(char *)cnv->subChars; |
374ca955 | 3469 | length=cnv->subCharLen; /* assume length==1 for most variants */ |
b75a7d8f A |
3470 | |
3471 | p = buffer; | |
3472 | switch(myConverterData->locale[0]){ | |
3473 | case 'j': | |
374ca955 A |
3474 | { |
3475 | int8_t cs; | |
3476 | ||
3477 | if(pFromU2022State->g == 1) { | |
3478 | /* JIS7: switch from G1 to G0 */ | |
3479 | pFromU2022State->g = 0; | |
3480 | *p++ = UCNV_SI; | |
3481 | } | |
3482 | ||
3483 | cs = pFromU2022State->cs[0]; | |
3484 | if(cs != ASCII && cs != JISX201) { | |
3485 | /* not in ASCII or JIS X 0201: switch to ASCII */ | |
3486 | pFromU2022State->cs[0] = (int8_t)ASCII; | |
b75a7d8f A |
3487 | *p++ = '\x1b'; |
3488 | *p++ = '\x28'; | |
3489 | *p++ = '\x42'; | |
b75a7d8f | 3490 | } |
374ca955 A |
3491 | |
3492 | *p++ = subchar[0]; | |
b75a7d8f | 3493 | break; |
374ca955 | 3494 | } |
b75a7d8f | 3495 | case 'c': |
374ca955 A |
3496 | if(pFromU2022State->g != 0) { |
3497 | /* not in ASCII mode: switch to ASCII */ | |
3498 | pFromU2022State->g = 0; | |
3499 | *p++ = UCNV_SI; | |
3500 | } | |
3501 | *p++ = subchar[0]; | |
b75a7d8f A |
3502 | break; |
3503 | case 'k': | |
374ca955 A |
3504 | if(myConverterData->version == 0) { |
3505 | if(length == 1) { | |
3506 | if((UBool)args->converter->fromUnicodeStatus) { | |
3507 | /* in DBCS mode: switch to SBCS */ | |
3508 | args->converter->fromUnicodeStatus = 0; | |
3509 | *p++ = UCNV_SI; | |
3510 | } | |
3511 | *p++ = subchar[0]; | |
3512 | } else /* length == 2*/ { | |
3513 | if(!(UBool)args->converter->fromUnicodeStatus) { | |
3514 | /* in SBCS mode: switch to DBCS */ | |
3515 | args->converter->fromUnicodeStatus = 1; | |
3516 | *p++ = UCNV_SO; | |
3517 | } | |
3518 | *p++ = subchar[0]; | |
3519 | *p++ = subchar[1]; | |
3520 | } | |
3521 | break; | |
3522 | } else { | |
73c04bcf A |
3523 | /* save the subconverter's substitution string */ |
3524 | uint8_t *currentSubChars = myConverterData->currentConverter->subChars; | |
3525 | int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; | |
3526 | ||
3527 | /* set our substitution string into the subconverter */ | |
3528 | myConverterData->currentConverter->subChars = (uint8_t *)subchar; | |
374ca955 A |
3529 | myConverterData->currentConverter->subCharLen = (int8_t)length; |
3530 | ||
73c04bcf A |
3531 | /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ |
3532 | args->converter = myConverterData->currentConverter; | |
374ca955 A |
3533 | myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; |
3534 | ucnv_cbFromUWriteSub(args, 0, err); | |
3535 | cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; | |
73c04bcf A |
3536 | args->converter = cnv; |
3537 | ||
3538 | /* restore the subconverter's substitution string */ | |
3539 | myConverterData->currentConverter->subChars = currentSubChars; | |
3540 | myConverterData->currentConverter->subCharLen = currentSubCharLen; | |
374ca955 A |
3541 | |
3542 | if(*err == U_BUFFER_OVERFLOW_ERROR) { | |
3543 | if(myConverterData->currentConverter->charErrorBufferLength > 0) { | |
3544 | uprv_memcpy( | |
3545 | cnv->charErrorBuffer, | |
3546 | myConverterData->currentConverter->charErrorBuffer, | |
3547 | myConverterData->currentConverter->charErrorBufferLength); | |
3548 | } | |
3549 | cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; | |
3550 | myConverterData->currentConverter->charErrorBufferLength = 0; | |
3551 | } | |
374ca955 | 3552 | return; |
b75a7d8f | 3553 | } |
b75a7d8f A |
3554 | default: |
3555 | /* not expected */ | |
3556 | break; | |
3557 | } | |
3558 | ucnv_cbFromUWriteBytes(args, | |
3559 | buffer, (int32_t)(p - buffer), | |
3560 | offsetIndex, err); | |
3561 | } | |
3562 | ||
73c04bcf A |
3563 | /* |
3564 | * Structure for cloning an ISO 2022 converter into a single memory block. | |
3565 | * ucnv_safeClone() of the converter will align the entire cloneStruct, | |
3566 | * and then ucnv_safeClone() of the sub-converter may additionally align | |
3567 | * currentConverter inside the cloneStruct, for which we need the deadSpace | |
3568 | * after currentConverter. | |
3569 | * This is because UAlignedMemory may be larger than the actually | |
3570 | * necessary alignment size for the platform. | |
3571 | * The other cloneStruct fields will not be moved around, | |
3572 | * and are aligned properly with cloneStruct's alignment. | |
3573 | */ | |
b75a7d8f A |
3574 | struct cloneStruct |
3575 | { | |
3576 | UConverter cnv; /* the clone itself; _ISO_2022_SafeClone() returns the address of this member */ | |
374ca955 | 3577 | UConverter currentConverter; /* storage handed to ucnv_safeClone() for the sub-converter clone */ |
73c04bcf A |
3578 | UAlignedMemory deadSpace; /* slack counted into the size offered for the sub-converter clone */ |
3579 | UConverterDataISO2022 mydata; /* copy of the source converter's extraInfo; cnv.extraInfo points here */ | |
b75a7d8f A |
3580 | }; |
3581 | ||
3582 | ||
46f4442e | 3583 | static UConverter * |
b75a7d8f | 3584 | _ISO_2022_SafeClone( |
46f4442e A |
3585 | const UConverter *cnv, |
3586 | void *stackBuffer, | |
3587 | int32_t *pBufferSize, | |
b75a7d8f A |
3588 | UErrorCode *status) |
3589 | { | |
3590 | struct cloneStruct * localClone; | |
374ca955 A |
3591 | UConverterDataISO2022 *cnvData; |
3592 | int32_t i, size; | |
b75a7d8f A |
3593 | |
3594 | if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ | |
374ca955 A |
3595 | *pBufferSize = (int32_t)sizeof(struct cloneStruct); |
3596 | return NULL; | |
b75a7d8f A |
3597 | } |
3598 | ||
374ca955 | 3599 | cnvData = (UConverterDataISO2022 *)cnv->extraInfo; |
b75a7d8f | 3600 | localClone = (struct cloneStruct *)stackBuffer; |
b75a7d8f | 3601 | |
374ca955 | 3602 | /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
b75a7d8f | 3603 | |
374ca955 | 3604 | uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); |
73c04bcf A |
3605 | localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ |
3606 | localClone->cnv.isExtraLocal = TRUE; | |
b75a7d8f | 3607 | |
374ca955 | 3608 | /* share the subconverters */ |
b75a7d8f | 3609 | |
374ca955 | 3610 | if(cnvData->currentConverter != NULL) { |
73c04bcf | 3611 | size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */ |
374ca955 A |
3612 | localClone->mydata.currentConverter = |
3613 | ucnv_safeClone(cnvData->currentConverter, | |
3614 | &localClone->currentConverter, | |
3615 | &size, status); | |
3616 | if(U_FAILURE(*status)) { | |
3617 | return NULL; | |
b75a7d8f | 3618 | } |
b75a7d8f A |
3619 | } |
3620 | ||
374ca955 A |
3621 | for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { |
3622 | if(cnvData->myConverterArray[i] != NULL) { | |
3623 | ucnv_incrementRefCount(cnvData->myConverterArray[i]); | |
3624 | } | |
b75a7d8f A |
3625 | } |
3626 | ||
b75a7d8f A |
3627 | return &localClone->cnv; |
3628 | } | |
3629 | ||
3630 | static void | |
3631 | _ISO_2022_GetUnicodeSet(const UConverter *cnv, | |
73c04bcf | 3632 | const USetAdder *sa, |
b75a7d8f A |
3633 | UConverterUnicodeSet which, |
3634 | UErrorCode *pErrorCode) | |
3635 | { | |
3636 | int32_t i; | |
b75a7d8f A |
3637 | UConverterDataISO2022* cnvData; |
3638 | ||
3639 | if (U_FAILURE(*pErrorCode)) { | |
3640 | return; | |
3641 | } | |
374ca955 | 3642 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
b75a7d8f A |
3643 | if (cnv->sharedData == &_ISO2022Data) { |
3644 | /* We use UTF-8 in this case */ | |
374ca955 A |
3645 | sa->addRange(sa->set, 0, 0xd7FF); |
3646 | sa->addRange(sa->set, 0xE000, 0x10FFFF); | |
b75a7d8f A |
3647 | return; |
3648 | } | |
374ca955 | 3649 | #endif |
b75a7d8f A |
3650 | |
3651 | cnvData = (UConverterDataISO2022*)cnv->extraInfo; | |
b75a7d8f | 3652 | |
374ca955 A |
3653 | /* open a set and initialize it with code points that are algorithmically round-tripped */ |
3654 | switch(cnvData->locale[0]){ | |
3655 | case 'j': | |
46f4442e A |
3656 | /* include JIS X 0201 which is hardcoded */ |
3657 | sa->add(sa->set, 0xa5); | |
3658 | sa->add(sa->set, 0x203e); | |
374ca955 A |
3659 | if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { |
3660 | /* include Latin-1 for some variants of JP */ | |
3661 | sa->addRange(sa->set, 0, 0xff); | |
3662 | } else { | |
3663 | /* include ASCII for JP */ | |
3664 | sa->addRange(sa->set, 0, 0x7f); | |
3665 | } | |
46f4442e A |
3666 | if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { |
3667 | /* | |
3668 | * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 | |
3669 | * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) | |
3670 | * use half-width Katakana. | |
3671 | * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) | |
3672 | * half-width Katakana via the ESC ( I sequence. | |
3673 | * However, we only emit (fromUnicode) half-width Katakana according to the | |
3674 | * definition of each variant. | |
3675 | * | |
3676 | * When including fallbacks, | |
3677 | * we need to include half-width Katakana Unicode code points for all JP variants because | |
3678 | * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). | |
3679 | */ | |
374ca955 | 3680 | /* include half-width Katakana for JP */ |
46f4442e | 3681 | sa->addRange(sa->set, HWKANA_START, HWKANA_END); |
374ca955 A |
3682 | } |
3683 | break; | |
b331163b | 3684 | #if !UCONFIG_ONLY_HTML_CONVERSION |
374ca955 A |
3685 | case 'c': |
3686 | case 'z': | |
3687 | /* include ASCII for CN */ | |
3688 | sa->addRange(sa->set, 0, 0x7f); | |
3689 | break; | |
3690 | case 'k': | |
3691 | /* there is only one converter for KR, and it is not in the myConverterArray[] */ | |
3692 | cnvData->currentConverter->sharedData->impl->getUnicodeSet( | |
3693 | cnvData->currentConverter, sa, which, pErrorCode); | |
73c04bcf A |
3694 | /* the loop over myConverterArray[] will simply not find another converter */ |
3695 | break; | |
b331163b | 3696 | #endif |
374ca955 A |
3697 | default: |
3698 | break; | |
b75a7d8f A |
3699 | } |
3700 | ||
46f4442e | 3701 | #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ |
374ca955 A |
3702 | if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && |
3703 | cnvData->version==0 && i==CNS_11643 | |
3704 | ) { | |
3705 | /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ | |
3706 | ucnv_MBCSGetUnicodeSetForBytes( | |
3707 | cnvData->myConverterArray[i], | |
3708 | sa, UCNV_ROUNDTRIP_SET, | |
3709 | 0, 0x81, 0x82, | |
3710 | pErrorCode); | |
46f4442e A |
3711 | } |
3712 | #endif | |
3713 | ||
3714 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { | |
3715 | UConverterSetFilter filter; | |
3716 | if(cnvData->myConverterArray[i]!=NULL) { | |
b331163b A |
3717 | if(cnvData->locale[0]=='j' && i==JISX208) { |
3718 | /* | |
3719 | * Only add code points that map to Shift-JIS codes | |
3720 | * corresponding to JIS X 0208. | |
3721 | */ | |
3722 | filter=UCNV_SET_FILTER_SJIS; | |
3723 | #if !UCONFIG_ONLY_HTML_CONVERSION | |
3724 | } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && | |
3725 | cnvData->version==0 && i==CNS_11643) { | |
46f4442e A |
3726 | /* |
3727 | * Version-specific for CN: | |
3728 | * CN version 0 does not map CNS planes 3..7 although | |
3729 | * they are all available in the CNS conversion table; | |
3730 | * CN version 1 (-EXT) does map them all. | |
3731 | * The two versions create different Unicode sets. | |
3732 | */ | |
3733 | filter=UCNV_SET_FILTER_2022_CN; | |
46f4442e A |
3734 | } else if(i==KSC5601) { |
3735 | /* | |
3736 | * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) | |
3737 | * are broader than GR94. | |
3738 | */ | |
3739 | filter=UCNV_SET_FILTER_GR94DBCS; | |
b331163b | 3740 | #endif |
374ca955 | 3741 | } else { |
46f4442e | 3742 | filter=UCNV_SET_FILTER_NONE; |
374ca955 | 3743 | } |
46f4442e | 3744 | ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); |
374ca955 | 3745 | } |
b75a7d8f | 3746 | } |
73c04bcf A |
3747 | |
3748 | /* | |
3749 | * ISO 2022 converters must not convert SO/SI/ESC despite what | |
3750 | * sub-converters do by themselves. | |
3751 | * Remove these characters from the set. | |
3752 | */ | |
3753 | sa->remove(sa->set, 0x0e); | |
3754 | sa->remove(sa->set, 0x0f); | |
3755 | sa->remove(sa->set, 0x1b); | |
46f4442e A |
3756 | |
3757 | /* ISO 2022 converters do not convert C1 controls either */ | |
3758 | sa->removeRange(sa->set, 0x80, 0x9f); | |
b75a7d8f A |
3759 | } |
3760 | ||
374ca955 A |
3761 | /* Function table for the generic ISO-2022 converter; the conversion functions are present only when U_ENABLE_GENERIC_ISO_2022 is defined. */ static const UConverterImpl _ISO2022Impl={ |
3762 | UCNV_ISO_2022, | |
3763 | | |
3764 | NULL, | |
3765 | NULL, | |
3766 | | |
3767 | _ISO2022Open, | |
3768 | _ISO2022Close, | |
3769 | _ISO2022Reset, | |
3770 | | |
3771 | #ifdef U_ENABLE_GENERIC_ISO_2022 | |
3772 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, | |
3773 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, | |
3774 | ucnv_fromUnicode_UTF8, | |
3775 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, | |
3776 | #else /* generic converter disabled: the same four slots are left empty */ | |
3777 | NULL, | |
3778 | NULL, | |
3779 | NULL, | |
3780 | NULL, | |
3781 | #endif | |
3782 | NULL, | |
3783 | | |
3784 | NULL, | |
3785 | _ISO2022getName, | |
3786 | _ISO_2022_WriteSub, | |
3787 | _ISO_2022_SafeClone, | |
4388f060 A |
3788 | _ISO_2022_GetUnicodeSet, |
3789 | | |
3790 | NULL, | |
3791 | NULL | |
374ca955 A |
3792 | }; |
3793 | /* Static description of the generic ISO-2022 converter, referenced from _ISO2022Data. */ static const UConverterStaticData _ISO2022StaticData={ | |
3794 | sizeof(UConverterStaticData), | |
3795 | "ISO_2022", /* converter name */ | |
3796 | 2022, | |
3797 | UCNV_IBM, | |
3798 | UCNV_ISO_2022, | |
3799 | 1, /* presumably the minimum bytes per character, given the maximum on the next line - confirm against UConverterStaticData */ | |
3800 | 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ | |
3801 | { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes, with their length on the next line - confirm */ | |
3802 | 1, | |
3803 | FALSE, | |
3804 | FALSE, | |
3805 | 0, | |
3806 | 0, | |
3807 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
3808 | }; | |
3809 | const UConverterSharedData _ISO2022Data={ /* shared-data record tying _ISO2022StaticData to _ISO2022Impl; positional initializer. NOTE(review): field order must match UConverterSharedData, declared outside this view - confirm */ | |
3810 | sizeof(UConverterSharedData), | |
3811 | ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference-counter sentinel for statically allocated data - confirm */ | |
3812 | NULL, | |
3813 | NULL, | |
3814 | &_ISO2022StaticData, | |
3815 | FALSE, | |
3816 | &_ISO2022Impl, | |
4388f060 | 3817 | 0, UCNV_MBCS_TABLE_INITIALIZER |
374ca955 A |
3818 | }; |
3819 | ||
3820 | /*************JP****************/ | |
3821 | static const UConverterImpl _ISO2022JPImpl={ /* function table for the ISO_2022_JP converter; entries are positional. NOTE(review): slot meanings come from UConverterImpl, which is declared outside this view - confirm there */ | |
3822 | UCNV_ISO_2022, | |
3823 | |
3824 | NULL, | |
3825 | NULL, | |
3826 | /* open / close / reset callbacks, shared with the other ISO-2022 tables in this file */ |
3827 | _ISO2022Open, | |
3828 | _ISO2022Close, | |
3829 | _ISO2022Reset, | |
3830 | /* JP-specific conversion entry points; the same *_OFFSETS_LOGIC function is listed twice for each direction */ |
3831 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, | |
3832 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, | |
3833 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, | |
3834 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, | |
3835 | NULL, | |
3836 | /* remaining slots; the ones left unset are NULL */ |
3837 | NULL, | |
3838 | _ISO2022getName, | |
3839 | _ISO_2022_WriteSub, | |
3840 | _ISO_2022_SafeClone, | |
4388f060 A |
3841 | _ISO_2022_GetUnicodeSet, |
3842 | |
3843 | NULL, | |
3844 | NULL | |
374ca955 A |
3845 | }; |
3846 | static const UConverterStaticData _ISO2022JPStaticData={ /* static descriptor for "ISO_2022_JP"; positional initializer. NOTE(review): field order must match UConverterStaticData (declared outside this view); the annotations added below are presumed from that layout - confirm */ | |
3847 | sizeof(UConverterStaticData), | |
3848 | "ISO_2022_JP", /* presumably the converter name */ | |
3849 | 0, | |
3850 | UCNV_IBM, | |
3851 | UCNV_ISO_2022, | |
3852 | 1, /* presumably min bytes per UChar (the next entry is documented as the max) */ | |
3853 | 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ | |
3854 | { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - confirm */ | |
3855 | 1, /* presumably the length of the substitution sequence above - confirm */ | |
3856 | FALSE, | |
3857 | FALSE, | |
3858 | 0, | |
3859 | 0, | |
3860 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
3861 | }; | |
4388f060 A |
3862 | |
3863 | namespace { /* unnamed namespace: the definition inside has internal linkage */ | |
3864 | |
3865 | const UConverterSharedData _ISO2022JPData={ /* shared-data record tying _ISO2022JPStaticData to _ISO2022JPImpl; positional initializer. NOTE(review): field order must match UConverterSharedData, declared outside this view - confirm */ | |
374ca955 A |
3866 | sizeof(UConverterSharedData), |
3867 | ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference-counter sentinel for statically allocated data - confirm */ | |
3868 | NULL, | |
3869 | NULL, | |
3870 | &_ISO2022JPStaticData, | |
3871 | FALSE, | |
3872 | &_ISO2022JPImpl, | |
4388f060 | 3873 | 0, UCNV_MBCS_TABLE_INITIALIZER |
374ca955 A |
3874 | }; |
3875 | |
4388f060 A |
3876 | } // namespace |
3877 | ||
b331163b | 3878 | #if !UCONFIG_ONLY_HTML_CONVERSION |
374ca955 A |
3879 | /************* KR ***************/ |
3880 | static const UConverterImpl _ISO2022KRImpl={ /* function table for the ISO_2022_KR converter; entries are positional. NOTE(review): slot meanings come from UConverterImpl, which is declared outside this view - confirm there */ | |
3881 | UCNV_ISO_2022, | |
3882 | |
3883 | NULL, | |
3884 | NULL, | |
3885 | /* open / close / reset callbacks, shared with the other ISO-2022 tables in this file */ |
3886 | _ISO2022Open, | |
3887 | _ISO2022Close, | |
3888 | _ISO2022Reset, | |
3889 | /* KR-specific conversion entry points; the same *_OFFSETS_LOGIC function is listed twice for each direction */ |
3890 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, | |
3891 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, | |
3892 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, | |
3893 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, | |
3894 | NULL, | |
3895 | /* remaining slots; the ones left unset are NULL */ |
3896 | NULL, | |
3897 | _ISO2022getName, | |
3898 | _ISO_2022_WriteSub, | |
3899 | _ISO_2022_SafeClone, | |
4388f060 A |
3900 | _ISO_2022_GetUnicodeSet, |
3901 | |
3902 | NULL, | |
3903 | NULL | |
374ca955 A |
3904 | }; |
3905 | static const UConverterStaticData _ISO2022KRStaticData={ /* static descriptor for "ISO_2022_KR"; positional initializer. NOTE(review): field order must match UConverterStaticData (declared outside this view); the annotations added below are presumed from that layout - confirm */ | |
3906 | sizeof(UConverterStaticData), | |
3907 | "ISO_2022_KR", /* presumably the converter name */ | |
3908 | 0, | |
3909 | UCNV_IBM, | |
3910 | UCNV_ISO_2022, | |
3911 | 1, /* presumably min bytes per UChar (the next entry is documented as the max) */ | |
3912 | 3, /* max 3 bytes per UChar: SO+DBCS */ | |
3913 | { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - confirm */ | |
3914 | 1, /* presumably the length of the substitution sequence above - confirm */ | |
3915 | FALSE, | |
3916 | FALSE, | |
3917 | 0, | |
3918 | 0, | |
3919 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
3920 | }; | |
4388f060 A |
3921 | |
3922 | namespace { /* unnamed namespace: the definition inside has internal linkage */ | |
3923 | |
3924 | const UConverterSharedData _ISO2022KRData={ /* shared-data record tying _ISO2022KRStaticData to _ISO2022KRImpl; positional initializer. NOTE(review): field order must match UConverterSharedData, declared outside this view - confirm */ | |
374ca955 A |
3925 | sizeof(UConverterSharedData), |
3926 | ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference-counter sentinel for statically allocated data - confirm */ | |
3927 | NULL, | |
3928 | NULL, | |
3929 | &_ISO2022KRStaticData, | |
3930 | FALSE, | |
3931 | &_ISO2022KRImpl, | |
4388f060 | 3932 | 0, UCNV_MBCS_TABLE_INITIALIZER |
374ca955 A |
3933 | }; |
3934 | |
4388f060 A |
3935 | } // namespace |
3936 | ||
374ca955 A |
3937 | /*************** CN ***************/ |
3938 | static const UConverterImpl _ISO2022CNImpl={ /* function table for the ISO_2022_CN converter; entries are positional. NOTE(review): slot meanings come from UConverterImpl, which is declared outside this view - confirm there */ | |
3939 | |
3940 | UCNV_ISO_2022, | |
3941 | |
3942 | NULL, | |
3943 | NULL, | |
3944 | /* open / close / reset callbacks, shared with the other ISO-2022 tables in this file */ |
3945 | _ISO2022Open, | |
3946 | _ISO2022Close, | |
3947 | _ISO2022Reset, | |
3948 | /* CN-specific conversion entry points; the same *_OFFSETS_LOGIC function is listed twice for each direction */ |
3949 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, | |
3950 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, | |
3951 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, | |
3952 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, | |
3953 | NULL, | |
3954 | /* remaining slots; the ones left unset are NULL */ |
3955 | NULL, | |
3956 | _ISO2022getName, | |
3957 | _ISO_2022_WriteSub, | |
3958 | _ISO_2022_SafeClone, | |
4388f060 A |
3959 | _ISO_2022_GetUnicodeSet, |
3960 | |
3961 | NULL, | |
3962 | NULL | |
374ca955 A |
3963 | }; |
3964 | static const UConverterStaticData _ISO2022CNStaticData={ /* static descriptor for "ISO_2022_CN"; positional initializer. NOTE(review): field order must match UConverterStaticData (declared outside this view); the annotations added below are presumed from that layout - confirm */ | |
3965 | sizeof(UConverterStaticData), | |
3966 | "ISO_2022_CN", /* presumably the converter name */ | |
3967 | 0, | |
3968 | UCNV_IBM, | |
3969 | UCNV_ISO_2022, | |
73c04bcf | 3970 | 1, /* presumably min bytes per UChar (the next entry is documented as the max) */ |
374ca955 A |
3971 | 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ |
3972 | { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - confirm */ | |
3973 | 1, /* presumably the length of the substitution sequence above - confirm */ | |
3974 | FALSE, | |
3975 | FALSE, | |
3976 | 0, | |
3977 | 0, | |
3978 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ | |
3979 | }; | |
4388f060 A |
3980 | |
3981 | namespace { /* unnamed namespace: the definition inside has internal linkage */ | |
3982 | |
3983 | const UConverterSharedData _ISO2022CNData={ /* shared-data record tying _ISO2022CNStaticData to _ISO2022CNImpl; positional initializer. NOTE(review): field order must match UConverterSharedData, declared outside this view - confirm */ | |
374ca955 A |
3984 | sizeof(UConverterSharedData), |
3985 | ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference-counter sentinel for statically allocated data - confirm */ | |
3986 | NULL, | |
3987 | NULL, | |
3988 | &_ISO2022CNStaticData, | |
3989 | FALSE, | |
3990 | &_ISO2022CNImpl, | |
4388f060 | 3991 | 0, UCNV_MBCS_TABLE_INITIALIZER |
374ca955 A |
3992 | }; |
3993 | |
4388f060 | 3994 | } // namespace |
b331163b | 3995 | #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ |
374ca955 | 3996 | |
b75a7d8f | 3997 | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |