]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv2022.cpp
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2000-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv2022.cpp
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2000feb03
14 * created by: Markus W. Scherer
15 *
16 * Change history:
17 *
18 * 06/29/2000 helena Major rewrite of the callback APIs.
19 * 08/08/2000 Ram Included support for ISO-2022-JP-2
20 * Changed implementation of toUnicode
21 * function
22 * 08/21/2000 Ram Added support for ISO-2022-KR
23 * 08/29/2000 Ram Separated implementation of EBCDIC to
24 * ucnvebdc.c
25 * 09/20/2000 Ram Added support for ISO-2022-CN
26 * Added implementations for getNextUChar()
27 * for specific 2022 country variants.
28 * 10/31/2000 Ram Implemented offsets logic functions
29 */
30
31 #include "unicode/utypes.h"
32
33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35 #include "unicode/ucnv.h"
36 #include "unicode/uset.h"
37 #include "unicode/ucnv_err.h"
38 #include "unicode/ucnv_cb.h"
39 #include "unicode/utf16.h"
40 #include "ucnv_imp.h"
41 #include "ucnv_bld.h"
42 #include "ucnv_cnv.h"
43 #include "ucnvmbcs.h"
44 #include "cstring.h"
45 #include "cmemory.h"
46 #include "uassert.h"
47
48 #ifdef U_ENABLE_GENERIC_ISO_2022
49 /*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
74 * for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78 #endif
79
/* The Shift-In control byte (0x0F) as a one-character string; only defined when more than the HTML-required converters are built. */
80 #if !UCONFIG_ONLY_HTML_CONVERSION
81 static const char SHIFT_IN_STR[] = "\x0F";
82 // static const char SHIFT_OUT_STR[] = "\x0E";
83 #endif
84
85 #define CR 0x0D /* carriage return */
86 #define LF 0x0A /* line feed */
87 #define H_TAB 0x09 /* horizontal tab */
88 #define V_TAB 0x0B /* vertical tab */
89 #define SPACE 0x20 /* space */
90
/* First and last code point of the halfwidth Katakana range. */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
95
96 /*
97 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98 * as bytes 21..7E. (Subtract 0x80.)
99 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100 * as bytes 20..7F. (Subtract 0x80.)
101 * Do not encode C1 control codes with native bytes 80..9F
102 * as bytes 00..1F (C0 control codes).
103 */
enum {
    /* native byte range of a 94-character set */
    GR94_START = 0xA1,
    GR94_END   = 0xFE,

    /* native byte range of a 96-character set */
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
110
111 /*
112 * ISO 2022 control codes must not be converted from Unicode
113 * because they would mess up the byte stream.
114 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115 * corresponding to SO, SI, and ESC.
116 */
117 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0) /* note: the argument c is evaluated twice */
118
/*
 * Charset and state identifiers used by the ISO-2022-JP and ISO-2022-CN
 * implementations. The JP group and the CN group reuse the same small
 * integers, so a value is only meaningful together with its variant.
 */
typedef enum {
    /* values common to both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* ISO-2022-JP */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* halfwidth Katakana, 7-bit */

    /*
     * ISO-2022-CN
     * These three values are fixed: they are indexes into myConverterArray[].
     */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * The following values appear in StateEnum and ISO2022State variables;
     * myConverterArray[] must still be indexed with CNS_11643.
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
157
158 /* is the StateEnum charset value for a DBCS charset? */
159 #if UCONFIG_ONLY_HTML_CONVERSION
160 #define IS_JP_DBCS(cs) (JISX208==(cs)) /* HTML-only build: JISX208 is the only DBCS charset tested for */
161 #else
162 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) /* StateEnum values 4..7: JISX208, JISX212, GB2312, KSC5601; cs is evaluated twice */
163 #endif
164
165 #define CSM(cs) ((uint16_t)1<<(cs)) /* charset mask: a single bit at bit position cs */
166
167 /*
168 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170 *
171 * Note: The converter uses some leniency:
172 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173 * all versions, not just JIS7 and JIS8.
174 * - ICU does not distinguish between different versions of JIS X 0208.
175 */
/* MAX_JA_VERSION is the highest ISO_2022,locale=ja version accepted; jpCharsetMasks[] has one entry per version. */
176 #if UCONFIG_ONLY_HTML_CONVERSION
177 enum { MAX_JA_VERSION=0 };
178 #else
179 enum { MAX_JA_VERSION=4 };
180 #endif
181 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ /* indexed by version; each entry is an OR of CSM() bits */
182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), /* version 0 */
183 #if !UCONFIG_ONLY_HTML_CONVERSION
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), /* version 1 */
185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), /* version 2 */
186 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), /* version 3: same set as version 2 */
187 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) /* version 4: same set as version 2 */
188 #endif
189 };
190
/* Converter type tag; stored in UConverterDataISO2022::currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
199
/* Designation and shift state; UConverterDataISO2022 keeps one instance per conversion direction. */
200 typedef struct ISO2022State {
201 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
202 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
203 int8_t prevG; /* g before single shift (SS2 or SS3) */
204 } ISO2022State;
205
206 #define UCNV_OPTIONS_VERSION_MASK 0xf /* _ISO2022Open takes the version number from these low bits of the options */
207 #define UCNV_2022_MAX_CONVERTERS 10 /* number of slots in UConverterDataISO2022::myConverterArray[] */
208
/* Per-converter state for the ISO-2022 family; allocated by _ISO2022Open and stored in UConverter::extraInfo. */
209 typedef struct{
210 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; /* loaded sub-converter data, indexed by StateEnum charset values; unused slots stay NULL */
211 UConverter *currentConverter; /* opened by _ISO2022Open for locale ko; closed in _ISO2022Close */
212 Cnv2022Type currentType; /* set to ASCII1 when the converter is opened */
213 ISO2022State toU2022State, fromU2022State; /* separate shift state for each direction */
214 uint32_t key; /* escape-sequence lookup key carried between calls; 0 when no sequence is pending */
215 uint32_t version; /* options & UCNV_OPTIONS_VERSION_MASK */
216 #ifdef U_ENABLE_GENERIC_ISO_2022
217 UBool isFirstBuffer;
218 #endif
219 UBool isEmptySegment; /* cleared by _ISO2022Reset for the to-Unicode direction */
220 char name[30]; /* e.g. "ISO_2022,locale=ja,version=0"; returned by _ISO2022getName */
221 char locale[3]; /* "ja", "ko" or "cn"; left empty for the generic converter */
222 }UConverterDataISO2022;
223
224 /* Protos */
225 /* ISO-2022 ----------------------------------------------------------------- */
226
227 /*Forward declaration */
/* UTF-8 from-Unicode entry points; only declared here (presumably defined with the UTF-8 converter - confirm). */
228 U_CFUNC void U_CALLCONV
229 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230 UErrorCode * err);
231 U_CFUNC void U_CALLCONV
232 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233 UErrorCode * err);
234
235 #define ESC_2022 0x1B /* ESC, the byte that starts every escape sequence */
236
/* Outcome of looking up the escape-sequence bytes read so far. */
typedef enum
{
    /* the bytes do not match any known ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* the bytes are a valid beginning of an escape sequence; more are needed */
    VALID_NON_TERMINAL_2022 = 0,
    /* the bytes form a complete, known escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* the bytes form a complete sequence, but further bytes could still turn it into a different one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
244
245 /*
246 * The way these state transition arrays work is:
247 * ex : ESC$B is the sequence for JISX208
248 * a) First Iteration: char is ESC
249 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250 * int x = normalize_esq_chars_2022[27] which is equal to 1
251 * ii) Search for this value in escSeqStateTable_Key_2022[]
252 * value of x is stored at escSeqStateTable_Key_2022[0]
253 * iii) Save this index as offset
254 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256 * b) Switch on this state and continue to next char
257 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258 * which is normalize_esq_chars_2022[36] == 4
259 * ii) x is currently 1(from above)
260 * x<<=5 -- x is now 32
261 * x+=normalize_esq_chars_2022[36]
262 * now x is 36
263 * iii) Search for this value in escSeqStateTable_Key_2022[]
264 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267 * c) Switch on this state and continue to next char
268 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269 * ii) x is currently 36 (from above)
270 * x<<=5 -- x is now 1152
271 * x+=normalize_esq_chars_2022[66]
272 * now x is 1161
273 * iii) Search for this value in escSeqStateTable_Key_2022[]
274 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278 */
279
280
281 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value (the array index) to the small code that getKey_2022()
 * appends to the escape-sequence key. 0 means the byte cannot occur anywhere
 * in an escape sequence. Each row below holds ten entries.
 */
282 static const int8_t normalize_esq_chars_2022[256] = {
283 /* 0 1 2 3 4 5 6 7 8 9 */
284
285 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 /* 20..29: ESC (27) */
288 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 /* 30..39: '$' (36), '%' (37), '&' (38) */
289 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 /* 40..49: '(' ')' '*' '+' (40..43), '-' '.' '/' (45..47) */
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 /* 60..69: '@' (64), 'A'..'E' (65..69) */
292 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 /* 70..79: 'F'..'O' */
293 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 /* 80..89: 'R' (82) */
294 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 /* 90..99: 'Z' (90) */
295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
304 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
305 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
306 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
307 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
308 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
309 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
310 ,0 ,0 ,0 ,0 ,0 ,0 /* 250..255 */
311 };
312
313 #ifdef U_ENABLE_GENERIC_ISO_2022
314 /*
315 * When the generic ISO-2022 converter is completely removed, not just disabled
316 * per #ifdef, then the following state table and the associated tables that are
317 * dimensioned with MAX_STATES_2022 should be trimmed.
318 *
319 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320 * the associated escape sequences starting with ESC ( B should be removed.
321 * This includes the ones with key values 1097 and all of the ones above 1000000.
322 *
323 * For the latter, the tables can simply be truncated.
324 * For the former, since the tables must be kept parallel, it is probably best
325 * to simply duplicate an adjacent table cell, parallel in all tables.
326 *
327 * It may make sense to restructure the tables, especially by using small search
328 * tables for the variants instead of indexing them parallel to the table here.
329 */
330 #endif
331
/*
 * Keys of every escape-sequence prefix and complete sequence that is
 * recognized. A key is built byte by byte in getKey_2022() as
 * (previous key << 5) + normalize_esq_chars_2022[byte].
 * The entries must stay in ascending order because getKey_2022() uses a
 * binary search, and the position of each key is the index used for the
 * parallel tables dimensioned with MAX_STATES_2022.
 */
332 #define MAX_STATES_2022 74
333 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
334 /* 0 1 2 3 4 5 6 7 8 9 */
335
336 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
337 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
338 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
339 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
340 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
341 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
342 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
343 ,35947631 ,35947635 ,35947636 ,35947638
344 };
345
/*
 * Generic ISO-2022 converter only: the name of the converter to switch to
 * for the escape sequence at the same index in escSeqStateTable_Key_2022[].
 * NULL marks entries that have no converter (sequence prefixes, SS2/SS3).
 */
346 #ifdef U_ENABLE_GENERIC_ISO_2022
347
348 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349 /* 0 1 2 3 4 5 6 7 8 9 */
350
351 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
352 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
353 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
354 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
355 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
356 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
358 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
359 };
360
361 #endif
362
/*
 * Parse status (a UCNV_TableStates_2022 value) for the key at the same index
 * in escSeqStateTable_Key_2022[]; getKey_2022() returns this entry when the
 * key is found. Index 10 (key 1097, ESC ( B) is the only MAYBE_TERMINAL entry.
 */
363 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364 /* 0 1 2 3 4 5 6 7 8 9 */
365 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
366 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
372 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
373 };
374
/*
 * Selects the ISO-2022 flavour that changeState_2022() parses an escape
 * sequence for. Which members exist depends on the build configuration.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
386
387 /*********** ISO 2022 Converter Protos ***********/
388 static void U_CALLCONV
389 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); /* allocates cnv->extraInfo and sets up the variant selected by locale and version */
390
391 static void U_CALLCONV
392 _ISO2022Close(UConverter *converter); /* releases what _ISO2022Open acquired */
393
394 static void U_CALLCONV
395 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); /* resets the state of the direction(s) named by choice */
396
397 U_CDECL_BEGIN
398 static const char * U_CALLCONV
399 _ISO2022getName(const UConverter* cnv); /* returns the name stored in the extra data, or NULL */
400 U_CDECL_END
401
402 static void U_CALLCONV
403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405 U_CDECL_BEGIN
406 static UConverter * U_CALLCONV
407 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409 U_CDECL_END
410
411 #ifdef U_ENABLE_GENERIC_ISO_2022
412 static void U_CALLCONV
413 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); /* generic ISO-2022 converter only */
414 #endif
415
/* Shared-data objects of the language variants; _ISO2022Open installs one of them as cnv->sharedData. */
416 namespace {
417
418 /*const UConverterSharedData _ISO2022Data;*/
419 extern const UConverterSharedData _ISO2022JPData; /* locale ja */
420
421 #if !UCONFIG_ONLY_HTML_CONVERSION
422 extern const UConverterSharedData _ISO2022KRData; /* locale ko */
423 extern const UConverterSharedData _ISO2022CNData; /* locale zh */
424 #endif
425
426 } // namespace
427
428 /*************** Converter implementations ******************/
429
430 /* The purpose of this function is to get around gcc compiler warnings. */
431 static inline void
432 fromUWriteUInt8(UConverter *cnv,
433 const char *bytes, int32_t length,
434 uint8_t **target, const char *targetLimit,
435 int32_t **offsets,
436 int32_t sourceIndex,
437 UErrorCode *pErrorCode)
438 {
439 char *targetChars = (char *)*target;
440 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441 offsets, sourceIndex, pErrorCode);
442 *target = (uint8_t*)targetChars;
443
444 }
445
446 static inline void
447 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448 if(myConverterData->version == 1) {
449 UConverter *cnv = myConverterData->currentConverter;
450
451 cnv->toUnicodeStatus=0; /* offset */
452 cnv->mode=0; /* state */
453 cnv->toULength=0; /* byteIndex */
454 }
455 }
456
457 static inline void
458 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459 /* in ISO-2022-KR the designator sequence appears only once
460 * in a file so we append it only once
461 */
462 if( converter->charErrorBufferLength==0){
463
464 converter->charErrorBufferLength = 4;
465 converter->charErrorBuffer[0] = 0x1b;
466 converter->charErrorBuffer[1] = 0x24;
467 converter->charErrorBuffer[2] = 0x29;
468 converter->charErrorBuffer[3] = 0x43;
469 }
470 if(myConverterData->version == 1) {
471 UConverter *cnv = myConverterData->currentConverter;
472
473 cnv->fromUChar32=0;
474 cnv->fromUnicodeStatus=1; /* prevLength */
475 }
476 }
477
478 static void U_CALLCONV
479 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481 char myLocale[6]={' ',' ',' ',' ',' ',' '};
482
483 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484 if(cnv->extraInfo != NULL) {
485 UConverterNamePieces stackPieces;
486 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488 uint32_t version;
489
490 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493 myConverterData->currentType = ASCII1;
494 cnv->fromUnicodeStatus =FALSE;
495 if(pArgs->locale){
496 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
497 }
498 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499 myConverterData->version = version;
500 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501 (myLocale[2]=='_' || myLocale[2]=='\0'))
502 {
503 /* open the required converters and cache them */
504 if(version>MAX_JA_VERSION) {
505 // ICU 55 fails to open a converter for an unsupported version.
506 // Previously, it fell back to version 0, but that would yield
507 // unexpected behavior.
508 *errorCode = U_MISSING_RESOURCE_ERROR;
509 return;
510 }
511 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512 myConverterData->myConverterArray[ISO8859_7] =
513 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514 }
515 myConverterData->myConverterArray[JISX208] =
516 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517 if(jpCharsetMasks[version]&CSM(JISX212)) {
518 myConverterData->myConverterArray[JISX212] =
519 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520 }
521 if(jpCharsetMasks[version]&CSM(GB2312)) {
522 myConverterData->myConverterArray[GB2312] =
523 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
524 }
525 if(jpCharsetMasks[version]&CSM(KSC5601)) {
526 myConverterData->myConverterArray[KSC5601] =
527 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528 }
529
530 /* set the function pointers to appropriate funtions */
531 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532 uprv_strcpy(myConverterData->locale,"ja");
533
534 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535 size_t len = uprv_strlen(myConverterData->name);
536 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
537 myConverterData->name[len+1]='\0';
538 }
539 #if !UCONFIG_ONLY_HTML_CONVERSION
540 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541 (myLocale[2]=='_' || myLocale[2]=='\0'))
542 {
543 if(version>1) {
544 // ICU 55 fails to open a converter for an unsupported version.
545 // Previously, it fell back to version 0, but that would yield
546 // unexpected behavior.
547 *errorCode = U_MISSING_RESOURCE_ERROR;
548 return;
549 }
550 const char *cnvName;
551 if(version==1) {
552 cnvName="icu-internal-25546";
553 } else {
554 cnvName="ibm-949";
555 myConverterData->version=version=0;
556 }
557 if(pArgs->onlyTestIsLoadable) {
558 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
559 uprv_free(cnv->extraInfo);
560 cnv->extraInfo=NULL;
561 return;
562 } else {
563 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564 if (U_FAILURE(*errorCode)) {
565 _ISO2022Close(cnv);
566 return;
567 }
568
569 if(version==1) {
570 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573 }else{
574 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575 }
576
577 /* initialize the state variables */
578 setInitialStateToUnicodeKR(cnv, myConverterData);
579 setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581 /* set the function pointers to appropriate funtions */
582 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583 uprv_strcpy(myConverterData->locale,"ko");
584 }
585 }
586 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587 (myLocale[2]=='_' || myLocale[2]=='\0'))
588 {
589 if(version>2) {
590 // ICU 55 fails to open a converter for an unsupported version.
591 // Previously, it fell back to version 0, but that would yield
592 // unexpected behavior.
593 *errorCode = U_MISSING_RESOURCE_ERROR;
594 return;
595 }
596
597 /* open the required converters and cache them */
598 myConverterData->myConverterArray[GB2312_1] =
599 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600 if(version==1) {
601 myConverterData->myConverterArray[ISO_IR_165] =
602 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603 }
604 myConverterData->myConverterArray[CNS_11643] =
605 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608 /* set the function pointers to appropriate funtions */
609 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610 uprv_strcpy(myConverterData->locale,"cn");
611
612 if (version==0){
613 myConverterData->version = 0;
614 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615 }else if (version==1){
616 myConverterData->version = 1;
617 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618 }else {
619 myConverterData->version = 2;
620 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621 }
622 }
623 #endif // !UCONFIG_ONLY_HTML_CONVERSION
624 else{
625 #ifdef U_ENABLE_GENERIC_ISO_2022
626 myConverterData->isFirstBuffer = TRUE;
627
628 /* append the UTF-8 escape sequence */
629 cnv->charErrorBufferLength = 3;
630 cnv->charErrorBuffer[0] = 0x1b;
631 cnv->charErrorBuffer[1] = 0x25;
632 cnv->charErrorBuffer[2] = 0x42;
633
634 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635 /* initialize the state variables */
636 uprv_strcpy(myConverterData->name,"ISO_2022");
637 #else
638 *errorCode = U_MISSING_RESOURCE_ERROR;
639 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640 // data loading error code.
641 return;
642 #endif
643 }
644
645 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648 _ISO2022Close(cnv);
649 }
650 } else {
651 *errorCode = U_MEMORY_ALLOCATION_ERROR;
652 }
653 }
654
655
656 static void U_CALLCONV
657 _ISO2022Close(UConverter *converter) {
658 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659 UConverterSharedData **array = myData->myConverterArray;
660 int32_t i;
661
662 if (converter->extraInfo != NULL) {
663 /*close the array of converter pointers and free the memory*/
664 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665 if(array[i]!=NULL) {
666 ucnv_unloadSharedDataIfReady(array[i]);
667 }
668 }
669
670 ucnv_close(myData->currentConverter);
671
672 if(!converter->isExtraLocal){
673 uprv_free (converter->extraInfo);
674 converter->extraInfo = NULL;
675 }
676 }
677 }
678
679 static void U_CALLCONV
680 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
682 if(choice<=UCNV_RESET_TO_UNICODE) {
683 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684 myConverterData->key = 0;
685 myConverterData->isEmptySegment = FALSE;
686 }
687 if(choice!=UCNV_RESET_TO_UNICODE) {
688 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689 }
690 #ifdef U_ENABLE_GENERIC_ISO_2022
691 if(myConverterData->locale[0] == 0){
692 if(choice<=UCNV_RESET_TO_UNICODE) {
693 myConverterData->isFirstBuffer = TRUE;
694 myConverterData->key = 0;
695 if (converter->mode == UCNV_SO){
696 ucnv_close (myConverterData->currentConverter);
697 myConverterData->currentConverter=NULL;
698 }
699 converter->mode = UCNV_SI;
700 }
701 if(choice!=UCNV_RESET_TO_UNICODE) {
702 /* re-append UTF-8 escape sequence */
703 converter->charErrorBufferLength = 3;
704 converter->charErrorBuffer[0] = 0x1b;
705 converter->charErrorBuffer[1] = 0x28;
706 converter->charErrorBuffer[2] = 0x42;
707 }
708 }
709 else
710 #endif
711 {
712 /* reset the state variables */
713 if(myConverterData->locale[0] == 'k'){
714 if(choice<=UCNV_RESET_TO_UNICODE) {
715 setInitialStateToUnicodeKR(converter, myConverterData);
716 }
717 if(choice!=UCNV_RESET_TO_UNICODE) {
718 setInitialStateFromUnicodeKR(converter, myConverterData);
719 }
720 }
721 }
722 }
723
724 U_CDECL_BEGIN
725
726 static const char * U_CALLCONV
727 _ISO2022getName(const UConverter* cnv){
728 if(cnv->extraInfo){
729 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730 return myData->name;
731 }
732 return NULL;
733 }
734
735 U_CDECL_END
736
737
738 /*************** to unicode *******************/
739 /****************************************************************************
740 * Recognized escape sequences are
741 * <ESC>(B ASCII
742 * <ESC>.A ISO-8859-1
743 * <ESC>.F ISO-8859-7
744 * <ESC>(J JISX-201
745 * <ESC>(I JISX-201
746 * <ESC>$B JISX-208
747 * <ESC>$@ JISX-208
748 * <ESC>$(D JISX-212
749 * <ESC>$A GB2312
750 * <ESC>$(C KSC5601
751 */
/*
 * ISO-2022-JP: StateEnum value selected by the escape sequence at the same
 * index in escSeqStateTable_Key_2022[]; INVALID_STATE where the variant does
 * not accept the sequence. (The code that reads this table is outside this
 * excerpt - presumably it indexes with the offset from getKey_2022().)
 */
752 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
753 /* 0 1 2 3 4 5 6 7 8 9 */
754 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE /* 5: ESC N */
755 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE /* 10: ESC ( B, 16: ESC ( H, 17: ESC ( I, 18: ESC ( J */
756 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE /* 22: ESC $ @, 23: ESC $ A, 24: ESC $ B */
757 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE /* 30: ESC . A, 31: ESC . F, 32: ESC & @, 37: ESC $ ( C, 38: ESC $ ( D */
758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
760 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
761 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
762 };
763
/*
 * ISO-2022-CN: StateEnum value selected by the escape sequence at the same
 * index in escSeqStateTable_Key_2022[]; INVALID_STATE where the variant does
 * not accept the sequence. (The code that reads this table is outside this
 * excerpt - presumably it indexes with the offset from getKey_2022().)
 */
764 #if !UCONFIG_ONLY_HTML_CONVERSION
765 /*************** to unicode *******************/
766 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
767 /* 0 1 2 3 4 5 6 7 8 9 */
768 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE /* 5: ESC N, 6: ESC O */
769 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
770 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
771 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
772 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 /* 47: ESC $ ) A, 49: ESC $ ) E */
773 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE /* 50: ESC $ ) G, 51: ESC $ * H, 52..56: ESC $ + I..M */
774 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
775 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
776 };
777 #endif
778
779
780 static UCNV_TableStates_2022
781 getKey_2022(char c,int32_t* key,int32_t* offset){
782 int32_t togo;
783 int32_t low = 0;
784 int32_t hi = MAX_STATES_2022;
785 int32_t oldmid=0;
786
787 togo = normalize_esq_chars_2022[(uint8_t)c];
788 if(togo == 0) {
789 /* not a valid character anywhere in an escape sequence */
790 *key = 0;
791 *offset = 0;
792 return INVALID_2022;
793 }
794 togo = (*key << 5) + togo;
795
796 while (hi != low) /*binary search*/{
797
798 int32_t mid = (hi+low) >> 1; /*Finds median*/
799
800 if (mid == oldmid)
801 break;
802
803 if (escSeqStateTable_Key_2022[mid] > togo){
804 hi = mid;
805 }
806 else if (escSeqStateTable_Key_2022[mid] < togo){
807 low = mid;
808 }
809 else /*we found it*/{
810 *key = togo;
811 *offset = mid;
812 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
813 }
814 oldmid = mid;
815
816 }
817
818 *key = 0;
819 *offset = 0;
820 return INVALID_2022;
821 }
822
823 /*runs through a state machine to determine the escape sequence - codepage correspondence
824 */
/*
 * Consumes the bytes of one escape sequence from *source (never reading past
 * sourceLimit), feeding each byte to getKey_2022() and recording it in
 * _this->toUBytes[].
 *
 * Outcomes:
 * - Input ends while the sequence is still a valid prefix: the partial key is
 *   stored in myData2022->key (non-zero) and the function returns without
 *   touching *err; toUBytes[]/toULength keep the bytes seen so far.
 * - The bytes do not form a known sequence: *err is set to
 *   U_ILLEGAL_ESCAPE_SEQUENCE and all bytes after the first are backed out
 *   (by rewinding *source and, if needed, via the preToU replay buffer).
 * - The sequence is complete: the switch on var updates toU2022State for the
 *   variant, or sets U_UNSUPPORTED_ESCAPE_SEQUENCE / U_ILLEGAL_ESCAPE_SEQUENCE.
 *   On success toULength is reset to 0.
 *
 * @param _this       converter whose extraInfo is a UConverterDataISO2022
 * @param source      in/out pointer to the current input position
 * @param sourceLimit end of the input buffer
 * @param var         ISO-2022 variant whose designation table is applied
 * @param err         in/out error code
 */
825 static void
826 changeState_2022(UConverter* _this,
827 const char** source,
828 const char* sourceLimit,
829 Variant2022 var,
830 UErrorCode* err){
831 UCNV_TableStates_2022 value;
832 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
/* resume from the key saved by a previous call (0 when starting a new sequence) */
833 uint32_t key = myData2022->key;
834 int32_t offset = 0;
/* number of toUBytes[] already filled on entry; used below to tell how many bytes came from this buffer */
835 int8_t initialToULength = _this->toULength;
836 char c;
837
838 value = VALID_NON_TERMINAL_2022;
839 while (*source < sourceLimit) {
840 c = *(*source)++;
841 _this->toUBytes[_this->toULength++]=(uint8_t)c;
842 value = getKey_2022(c,(int32_t *) &key, &offset);
843
844 switch (value){
845
846 case VALID_NON_TERMINAL_2022 :
847 /* continue with the loop */
848 break;
849
850 case VALID_TERMINAL_2022:
851 key = 0;
852 goto DONE;
853
854 case INVALID_2022:
855 goto DONE;
856
857 case VALID_MAYBE_TERMINAL_2022:
858 #ifdef U_ENABLE_GENERIC_ISO_2022
859 /* ESC ( B is ambiguous only for ISO_2022 itself */
860 if(var == ISO_2022) {
861 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862 _this->toULength = 0;
863
864 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866 /* continue with the loop */
867 value = VALID_NON_TERMINAL_2022;
868 break;
869 } else
870 #endif
871 {
872 /* not ISO_2022 itself, finish here */
873 value = VALID_TERMINAL_2022;
874 key = 0;
875 goto DONE;
876 }
877 }
878 }
879
880 DONE:
/* persist the key so that an incomplete sequence can be continued by the next call */
881 myData2022->key = key;
882
883 if (value == VALID_NON_TERMINAL_2022) {
884 /* indicate that the escape sequence is incomplete: key!=0 */
885 return;
886 } else if (value == INVALID_2022 ) {
887 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888 } else /* value == VALID_TERMINAL_2022 */ {
/* offset is the key-table row of the recognized sequence; each variant maps it through its own table */
889 switch(var){
890 #ifdef U_ENABLE_GENERIC_ISO_2022
891 case ISO_2022:
892 {
893 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894 if(chosenConverterName == NULL) {
895 /* SS2 or SS3 */
896 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897 _this->toUCallbackReason = UCNV_UNASSIGNED;
898 return;
899 }
900
901 _this->mode = UCNV_SI;
902 ucnv_close(myData2022->currentConverter);
903 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904 if(U_SUCCESS(*err)) {
905 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906 _this->mode = UCNV_SO;
907 }
908 break;
909 }
910 #endif
911 case ISO_2022_JP:
912 {
913 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
914 switch(tempState) {
915 case INVALID_STATE:
916 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917 break;
918 case SS2_STATE:
919 if(myData2022->toU2022State.cs[2]!=0) {
/* remember the current G0/G1 selection in prevG before switching to G2 */
920 if(myData2022->toU2022State.g<2) {
921 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922 }
923 myData2022->toU2022State.g=2;
924 } else {
925 /* illegal to have SS2 before a matching designator */
926 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927 }
928 break;
929 /* case SS3_STATE: not used in ISO-2022-JP-x */
930 case ISO8859_1:
931 case ISO8859_7:
/* reject charsets that the mask for this converter version does not include */
932 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934 } else {
935 /* G2 charset for SS2 */
936 myData2022->toU2022State.cs[2]=(int8_t)tempState;
937 }
938 break;
939 default:
940 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942 } else {
943 /* G0 charset */
944 myData2022->toU2022State.cs[0]=(int8_t)tempState;
945 }
946 break;
947 }
948 }
949 break;
950 #if !UCONFIG_ONLY_HTML_CONVERSION
951 case ISO_2022_CN:
952 {
953 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
954 switch(tempState) {
955 case INVALID_STATE:
956 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957 break;
958 case SS2_STATE:
959 if(myData2022->toU2022State.cs[2]!=0) {
960 if(myData2022->toU2022State.g<2) {
961 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962 }
963 myData2022->toU2022State.g=2;
964 } else {
965 /* illegal to have SS2 before a matching designator */
966 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967 }
968 break;
969 case SS3_STATE:
970 if(myData2022->toU2022State.cs[3]!=0) {
971 if(myData2022->toU2022State.g<2) {
972 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973 }
974 myData2022->toU2022State.g=3;
975 } else {
976 /* illegal to have SS3 before a matching designator */
977 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978 }
979 break;
980 case ISO_IR_165:
/* ISO-IR-165 is rejected for version 0; otherwise it is stored in cs[1] like GB2312_1 and CNS_11643_1 */
981 if(myData2022->version==0) {
982 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983 break;
984 }
985 U_FALLTHROUGH;
986 case GB2312_1:
987 U_FALLTHROUGH;
988 case CNS_11643_1:
989 myData2022->toU2022State.cs[1]=(int8_t)tempState;
990 break;
991 case CNS_11643_2:
992 myData2022->toU2022State.cs[2]=(int8_t)tempState;
993 break;
994 default:
995 /* other CNS 11643 planes */
996 if(myData2022->version==0) {
997 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998 } else {
999 myData2022->toU2022State.cs[3]=(int8_t)tempState;
1000 }
1001 break;
1002 }
1003 }
1004 break;
1005 case ISO_2022_KR:
/* table row 0x30 is the only sequence accepted for ISO-2022-KR */
1006 if(offset==0x30){
1007 /* nothing to be done, just accept this one escape sequence */
1008 } else {
1009 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010 }
1011 break;
1012 #endif // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014 default:
1015 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016 break;
1017 }
1018 }
1019 if(U_SUCCESS(*err)) {
/* the sequence was consumed completely: nothing is left pending in toUBytes[] */
1020 _this->toULength = 0;
1021 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022 if(_this->toULength>1) {
1023 /*
1024 * Ticket 5691: consistent illegal sequences:
1025 * - We include at least the first byte (ESC) in the illegal sequence.
1026 * - If any of the non-initial bytes could be the start of a character,
1027 * we stop the illegal sequence before the first one of those.
1028 * In escape sequences, all following bytes are "printable", that is,
1029 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030 * they are valid single/lead bytes.
1031 * For simplicity, we always only report the initial ESC byte as the
1032 * illegal sequence and back out all other bytes we looked at.
1033 */
1034 /* Back out some bytes. */
1035 int8_t backOutDistance=_this->toULength-1;
1036 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037 if(backOutDistance<=bytesFromThisBuffer) {
1038 /* same as initialToULength<=1 */
1039 *source-=backOutDistance;
1040 } else {
1041 /* Back out bytes from the previous buffer: Need to replay them. */
1042 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043 /* same as -(initialToULength-1) */
1044 /* preToULength is negative! */
1045 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046 *source-=bytesFromThisBuffer;
1047 }
/* only the first byte stays in toUBytes[] as the reported illegal sequence */
1048 _this->toULength=1;
1049 }
1050 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051 _this->toUCallbackReason = UCNV_UNASSIGNED;
1052 }
1053 }
1054
1055 #if !UCONFIG_ONLY_HTML_CONVERSION
1056 /*Checks the characters of the buffer against valid 2022 escape sequences
1057 *if the match we return a pointer to the initial start of the sequence otherwise
1058 *we return sourceLimit
1059 */
1060 /*for 2022 looks ahead in the stream
1061 *to determine the longest possible convertible
1062 *data stream
1063 */
1064 static inline const char*
1065 getEndOfBuffer_2022(const char** source,
1066 const char* sourceLimit,
1067 UBool /*flush*/){
1068
1069 const char* mySource = *source;
1070
1071 #ifdef U_ENABLE_GENERIC_ISO_2022
1072 if (*source >= sourceLimit)
1073 return sourceLimit;
1074
1075 do{
1076
1077 if (*mySource == ESC_2022){
1078 int8_t i;
1079 int32_t key = 0;
1080 int32_t offset;
1081 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083 /* Kludge: I could not
1084 * figure out the reason for validating an escape sequence
1085 * twice - once here and once in changeState_2022().
1086 * is it possible to have an ESC character in a ISO2022
1087 * byte stream which is valid in a code page? Is it legal?
1088 */
1089 for (i=0;
1090 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091 i++) {
1092 value = getKey_2022(*(mySource+i), &key, &offset);
1093 }
1094 if (value > 0 || *mySource==ESC_2022)
1095 return mySource;
1096
1097 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098 return sourceLimit;
1099 }
1100 }while (++mySource < sourceLimit);
1101
1102 return sourceLimit;
1103 #else
1104 while(mySource < sourceLimit && *mySource != ESC_2022) {
1105 ++mySource;
1106 }
1107 return mySource;
1108 #endif
1109 }
1110 #endif
1111
1112 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113 * any future change in _MBCSFromUChar32() function should be reflected here.
1114 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115 */
1116 static inline int32_t
1117 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118 UChar32 c,
1119 uint32_t* value,
1120 UBool useFallback,
1121 int outputType)
1122 {
1123 const int32_t *cx;
1124 const uint16_t *table;
1125 uint32_t stage2Entry;
1126 uint32_t myValue;
1127 int32_t length;
1128 const uint8_t *p;
1129 /*
1130 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131 * Use internal version of ucnv_open() that verifies that the new structures are available,
1132 * else U_INTERNAL_PROGRAM_ERROR.
1133 */
1134 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136 table=sharedData->mbcs.fromUnicodeTable;
1137 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138 /* get the bytes and the length for the output */
1139 if(outputType==MBCS_OUTPUT_2){
1140 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141 if(myValue<=0xff) {
1142 length=1;
1143 } else {
1144 length=2;
1145 }
1146 } else /* outputType==MBCS_OUTPUT_3 */ {
1147 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1149 if(myValue<=0xff) {
1150 length=1;
1151 } else if(myValue<=0xffff) {
1152 length=2;
1153 } else {
1154 length=3;
1155 }
1156 }
1157 /* is this code point assigned, or do we use fallbacks? */
1158 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159 /* assigned */
1160 *value=myValue;
1161 return length;
1162 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163 /*
1164 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165 * There is no way with this data structure for fallback output
1166 * to be a zero byte.
1167 */
1168 *value=myValue;
1169 return -length;
1170 }
1171 }
1172
1173 cx=sharedData->mbcs.extIndexes;
1174 if(cx!=NULL) {
1175 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176 }
1177
1178 /* unassigned */
1179 return 0;
1180 }
1181
1182 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184 * @param retval pointer to output byte
1185 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1186 */
1187 static inline int32_t
1188 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189 UChar32 c,
1190 uint32_t* retval,
1191 UBool useFallback)
1192 {
1193 const uint16_t *table;
1194 int32_t value;
1195 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197 return 0;
1198 }
1199 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200 table=sharedData->mbcs.fromUnicodeTable;
1201 /* get the byte for the output */
1202 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203 /* is this code point assigned, or do we use fallbacks? */
1204 *retval=(uint32_t)(value&0xff);
1205 if(value>=0xf00) {
1206 return 1; /* roundtrip */
1207 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208 return -1; /* fallback taken */
1209 } else {
1210 return 0; /* no mapping */
1211 }
1212 }
1213
/*
 * Convert a strict EUC DBCS value (two bytes, each in A1..FE) to the
 * ISO 2022 byte range 21..7E by subtracting 0x80 from each byte.
 * Returns 0 if either byte is outside A1..FE.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* distance of the byte pair, and of the trail byte alone, from the range start */
    const uint16_t pairDelta = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(value - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return 0; /* not valid for ISO 2022 */
    }
    return value - 0x8080; /* shift down to 21..7e byte range */
}
1230
1231 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Inverse of _2022FromGR94DBCS(): add 0x80 to each byte of an ISO 2022
 * code point. If the result is not a byte pair in A1..FE, the input is
 * returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairDelta = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(shifted - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1247 #endif
1248
1249 #ifdef U_ENABLE_GENERIC_ISO_2022
1250
1251 /**********************************************************************************
1252 * ISO-2022 Converter
1253 *
1254 *
1255 */
1256
/*
 * To-Unicode conversion for the generic ISO-2022 converter.
 *
 * The input is processed in alternating steps: while no escape sequence is
 * pending (myData->key == 0), the bytes up to the position returned by
 * getEndOfBuffer_2022() are converted with myData->currentConverter; then
 * changeState_2022() is called to consume the escape sequence at that position.
 * The function returns early on any error, when the whole buffer has been
 * consumed, or when offsets are requested and progress was made.
 *
 * @param args conversion arguments (source, target, offsets, flush, converter)
 * @param err  in/out error code
 */
1257 static void U_CALLCONV
1258 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259 UErrorCode* err){
1260 const char* mySourceLimit, *realSourceLimit;
1261 const char* sourceStart;
1262 const UChar* myTargetStart;
1263 UConverter* saveThis;
1264 UConverterDataISO2022* myData;
1265 int8_t length;
1266
1267 saveThis = args->converter;
1268 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270 realSourceLimit = args->sourceLimit;
1271 while (args->source < realSourceLimit) {
1272 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276 if(args->source < mySourceLimit) {
/* no sub-converter has been selected yet: open an ASCII converter */
1277 if(myData->currentConverter==NULL) {
1278 myData->currentConverter = ucnv_open("ASCII",err);
1279 if(U_FAILURE(*err)){
1280 return;
1281 }
1282
1283 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284 saveThis->mode = UCNV_SO;
1285 }
1286
1287 /* convert to before the ESC or until the end of the buffer */
1288 myData->isFirstBuffer=FALSE;
1289 sourceStart = args->source;
1290 myTargetStart = args->target;
/* temporarily substitute the sub-converter in args for the nested call; restored right after */
1291 args->converter = myData->currentConverter;
1292 ucnv_toUnicode(args->converter,
1293 &args->target,
1294 args->targetLimit,
1295 &args->source,
1296 mySourceLimit,
1297 args->offsets,
1298 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299 err);
1300 args->converter = saveThis;
1301
1302 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303 /* move the overflow buffer */
1304 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305 myData->currentConverter->UCharErrorBufferLength = 0;
1306 if(length > 0) {
1307 uprv_memcpy(saveThis->UCharErrorBuffer,
1308 myData->currentConverter->UCharErrorBuffer,
1309 length*U_SIZEOF_UCHAR);
1310 }
1311 return;
1312 }
1313
1314 /*
1315 * At least one of:
1316 * -Error while converting
1317 * -Done with entire buffer
1318 * -Need to write offsets or update the current offset
1319 * (leave that up to the code in ucnv.c)
1320 *
1321 * or else we just stopped at an ESC byte and continue with changeState_2022()
1322 */
/* NOTE(review): && binds tighter than ||, so the offsets test and the toULength test below are separate alternatives */
1323 if (U_FAILURE(*err) ||
1324 (args->source == realSourceLimit) ||
1325 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1326 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327 ) {
1328 /* copy partial or error input for truncated detection and error handling */
1329 if(U_FAILURE(*err)) {
1330 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331 if(length > 0) {
1332 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333 }
1334 } else {
1335 length = saveThis->toULength = myData->currentConverter->toULength;
1336 if(length > 0) {
1337 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338 if(args->source < mySourceLimit) {
1339 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340 }
1341 }
1342 }
1343 return;
1344 }
1345 }
1346 }
1347
/* at an escape sequence, or continuing one that started in an earlier buffer */
1348 sourceStart = args->source;
1349 changeState_2022(args->converter,
1350 &(args->source),
1351 realSourceLimit,
1352 ISO_2022,
1353 err);
1354 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1355 /* let the ucnv.c code update its current offset */
1356 return;
1357 }
1358 }
1359 }
1360
1361 #endif
1362
1363 /*
1364 * To Unicode Callback helper function
1365 */
1366 static void
1367 toUnicodeCallback(UConverter *cnv,
1368 const uint32_t sourceChar, const uint32_t targetUniChar,
1369 UErrorCode* err){
1370 if(sourceChar>0xff){
1371 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1372 cnv->toUBytes[1] = (uint8_t)sourceChar;
1373 cnv->toULength = 2;
1374 }
1375 else{
1376 cnv->toUBytes[0] =(char) sourceChar;
1377 cnv->toULength = 1;
1378 }
1379
1380 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381 *err = U_INVALID_CHAR_FOUND;
1382 }
1383 else{
1384 *err = U_ILLEGAL_CHAR_FOUND;
1385 }
1386 }
1387
1388 /**************************************ISO-2022-JP*************************************************/
1389
1390 /************************************** IMPORTANT **************************************************
1391 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393 * The converter iterates over each Unicode codepoint
1394 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1396 * would do as far as possible.
1397 *
1398 * If the implementation of these macros or structure of sharedData struct change in the future, make
1399 * sure that ISO-2022 is also changed.
1400 ***************************************************************************************************
1401 */
1402
1403 /***************************************************************************************************
1404 * Rules for ISO-2022-jp encoding
1405 * (i) Escape sequences must be fully contained within a line they should not
1406 * span new lines or CRs
1407 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1408 * JIS-Roman character escape sequence should follow before the line terminates
1409 * (iii) If the first character on the line is represented by two bytes then a two
1410 * byte character escape sequence should precede it
1411 * (iv) If no escape sequence is encountered then the characters are ASCII
1412 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413 * and invoked with SS2 (ESC N).
1414 * (vi) If there is any G0 designation in text, there must be a switch to
1415 * ASCII or to JIS X 0201-Roman before a space character (but not
1416 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417 * characters such as tab or CRLF.
1418 * (vii) Supported encodings:
1419 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420 *
1421 * source : RFC-1554
1422 *
1423 * JISX201, JISX208,JISX212 : new .cnv data files created
1424 * KSC5601 : alias to ibm-949 mapping table
1425 * GB2312 : alias to ibm-1386 mapping table
1426 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427 * ISO-8859-7 : alias to ibm-9409 mapping table
1428 */
1429
1430 /* preference order of JP charsets */
1431 static const StateEnum jpCharsetPref[]={
1432 ASCII,
1433 JISX201,
1434 ISO8859_1,
1435 JISX208,
1436 ISO8859_7,
1437 JISX212,
1438 GB2312,
1439 KSC5601,
1440 HWKANA_7BIT
1441 };
1442
1443 /*
1444 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1445 * not in order of jpCharsetPref[]!
1446 */
static const char escSeqChars[][6] ={
    /* byte values are spelled out numerically so they do not depend on the execution charset */
    { 0x1B, 0x28, 0x42 },       /* <ESC>(B  ASCII       */
    { 0x1B, 0x2E, 0x41 },       /* <ESC>.A  ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },       /* <ESC>.F  ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },       /* <ESC>(J  JISX-201    */
    { 0x1B, 0x24, 0x42 },       /* <ESC>$B  JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 }, /* <ESC>$(D JISX-212    */
    { 0x1B, 0x24, 0x41 },       /* <ESC>$A  GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 }, /* <ESC>$(C KSC5601     */
    { 0x1B, 0x28, 0x49 }        /* <ESC>(I  HWKANA_7BIT */
};
/* Byte length of each escape sequence, one entry per sequence in the same order. */
static const int8_t escSeqCharsLen[] ={
    3, 3, 3,    /* <ESC>(B ASCII,    <ESC>.A ISO-8859-1, <ESC>.F ISO-8859-7 */
    3, 3, 4,    /* <ESC>(J JISX-201, <ESC>$B JISX-208,   <ESC>$(D JISX-212  */
    3, 4, 3     /* <ESC>$A GB2312,   <ESC>$(C KSC5601,   <ESC>(I HWKANA_7BIT */
};
1470
1471 /*
1472 * The iteration over various code pages works this way:
1473 * i) Get the currentState from myConverterData->currentState
1474 * ii) Check if the character is mapped to a valid character in the currentState
1475 * Yes -> a) set the initIterState to currentState
1476 * b) remain in this state until an invalid character is found
1477 * No -> a) go to the next code page and find the character
1478 * iii) Before changing the state, increment the current state and check if the current state
1479 * is equal to the initIterState
1480 * Yes -> A character that cannot be represented in any of the supported encodings
1481 * break and return a U_INVALID_CHARACTER error
1482 * No -> Continue and find the character in next code page
1483 *
1484 *
1485 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1486 */
1487
/*
 * Map a JIS X 0201 byte 00..7F to Unicode.
 * Only 5C (-> U+00A5) and 7E (-> U+203E) differ from the input value.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
/*
 * Map a Unicode code point to a JIS X 0201 byte 00..7F.
 * U+005C and U+007E have no mapping because their byte values are taken
 * by U+00A5 and U+203E. Returns U+FFFE if unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1516
/*
 * Convert a valid Shift-JIS byte pair to the corresponding pair of
 * JIS X 0208 bytes in 21..7E.
 * Returns 0 if the pair lies beyond the JIS X 0208 range (above EFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    const uint8_t sjisTrail = (uint8_t)value;
    uint32_t lead = value & 0xff00;
    uint32_t jisTrail;

    /* fold the two Shift-JIS lead byte ranges (up to 9F, and E0..EF) together */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    /* each Shift-JIS lead byte covers two JIS rows */
    lead <<= 1;

    if(sjisTrail <= 0x9e) {
        /* first of the two rows; the trail byte range skips 7F */
        lead -= 0x100;
        jisTrail = sjisTrail - ((sjisTrail <= 0x7e) ? 0x1f : 0x20);
    } else /* sjisTrail <= 0xfc */ {
        /* second of the two rows */
        jisTrail = sjisTrail - 0x7e;
    }
    return lead | jisTrail;
}
1552
/*
 * Convert a pair of JIS X 0208 bytes (21..7E each) to Shift-JIS.
 * A byte outside 21..7E yields an output byte that is not valid Shift-JIS,
 * so that the converter rejects it; some invalid inputs already map to
 * invalid Shift-JIS values and are not tested explicitly.
 *
 * @param c1    JIS lead byte
 * @param c2    JIS trail byte
 * @param bytes receives the Shift-JIS lead and trail bytes
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1) != 0) {
        /* odd row: first half of the Shift-JIS trail range, which skips 7F */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0; /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail range */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0; /* invalid */
        }
    }

    /* two JIS rows share one Shift-JIS lead byte */
    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0; /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1589
1590 /*
1591 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592 * Katakana.
1593 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595 * These were the only fallbacks in ICU's jisx-208.ucm file.
1596 */
1597 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1598 0x2123, /* U+FF61 */
1599 0x2156,
1600 0x2157,
1601 0x2122,
1602 0x2126,
1603 0x2572,
1604 0x2521,
1605 0x2523,
1606 0x2525,
1607 0x2527,
1608 0x2529,
1609 0x2563,
1610 0x2565,
1611 0x2567,
1612 0x2543,
1613 0x213C, /* U+FF70 */
1614 0x2522,
1615 0x2524,
1616 0x2526,
1617 0x2528,
1618 0x252A,
1619 0x252B,
1620 0x252D,
1621 0x252F,
1622 0x2531,
1623 0x2533,
1624 0x2535,
1625 0x2537,
1626 0x2539,
1627 0x253B,
1628 0x253D,
1629 0x253F, /* U+FF80 */
1630 0x2541,
1631 0x2544,
1632 0x2546,
1633 0x2548,
1634 0x254A,
1635 0x254B,
1636 0x254C,
1637 0x254D,
1638 0x254E,
1639 0x254F,
1640 0x2552,
1641 0x2555,
1642 0x2558,
1643 0x255B,
1644 0x255E,
1645 0x255F, /* U+FF90 */
1646 0x2560,
1647 0x2561,
1648 0x2562,
1649 0x2564,
1650 0x2566,
1651 0x2568,
1652 0x2569,
1653 0x256A,
1654 0x256B,
1655 0x256C,
1656 0x256D,
1657 0x256F,
1658 0x2573,
1659 0x212B,
1660 0x212C /* U+FF9F */
1661 };
1662
1663 static void U_CALLCONV
1664 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1665 UConverter *cnv = args->converter;
1666 UConverterDataISO2022 *converterData;
1667 ISO2022State *pFromU2022State;
1668 uint8_t *target = (uint8_t *) args->target;
1669 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1670 const UChar* source = args->source;
1671 const UChar* sourceLimit = args->sourceLimit;
1672 int32_t* offsets = args->offsets;
1673 UChar32 sourceChar;
1674 char buffer[8];
1675 int32_t len, outLen;
1676 int8_t choices[10];
1677 int32_t choiceCount;
1678 uint32_t targetValue = 0;
1679 UBool useFallback;
1680
1681 int32_t i;
1682 int8_t cs, g;
1683
1684 /* set up the state */
1685 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1686 pFromU2022State = &converterData->fromU2022State;
1687
1688 choiceCount = 0;
1689
1690 /* check if the last codepoint of previous buffer was a lead surrogate*/
1691 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1692 goto getTrail;
1693 }
1694
1695 while(source < sourceLimit) {
1696 if(target < targetLimit) {
1697
1698 sourceChar = *(source++);
1699 /*check if the char is a First surrogate*/
1700 if(U16_IS_SURROGATE(sourceChar)) {
1701 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1702 getTrail:
1703 /*look ahead to find the trail surrogate*/
1704 if(source < sourceLimit) {
1705 /* test the following code unit */
1706 UChar trail=(UChar) *source;
1707 if(U16_IS_TRAIL(trail)) {
1708 source++;
1709 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1710 cnv->fromUChar32=0x00;
1711 /* convert this supplementary code point */
1712 /* exit this condition tree */
1713 } else {
1714 /* this is an unmatched lead code unit (1st surrogate) */
1715 /* callback(illegal) */
1716 *err=U_ILLEGAL_CHAR_FOUND;
1717 cnv->fromUChar32=sourceChar;
1718 break;
1719 }
1720 } else {
1721 /* no more input */
1722 cnv->fromUChar32=sourceChar;
1723 break;
1724 }
1725 } else {
1726 /* this is an unmatched trail code unit (2nd surrogate) */
1727 /* callback(illegal) */
1728 *err=U_ILLEGAL_CHAR_FOUND;
1729 cnv->fromUChar32=sourceChar;
1730 break;
1731 }
1732 }
1733
1734 /* do not convert SO/SI/ESC */
1735 if(IS_2022_CONTROL(sourceChar)) {
1736 /* callback(illegal) */
1737 *err=U_ILLEGAL_CHAR_FOUND;
1738 cnv->fromUChar32=sourceChar;
1739 break;
1740 }
1741
1742 /* do the conversion */
1743
1744 if(choiceCount == 0) {
1745 uint16_t csm;
1746
1747 /*
1748 * The csm variable keeps track of which charsets are allowed
1749 * and not used yet while building the choices[].
1750 */
1751 csm = jpCharsetMasks[converterData->version];
1752 choiceCount = 0;
1753
1754 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1755 if(converterData->version == 3 || converterData->version == 4) {
1756 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1757 }
1758 /* Do not try single-byte half-width Katakana for other versions. */
1759 csm &= ~CSM(HWKANA_7BIT);
1760
1761 /* try the current G0 charset */
1762 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1763 csm &= ~CSM(cs);
1764
1765 /* try the current G2 charset */
1766 if((cs = pFromU2022State->cs[2]) != 0) {
1767 choices[choiceCount++] = cs;
1768 csm &= ~CSM(cs);
1769 }
1770
1771 /* try all the other possible charsets */
1772 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1773 cs = (int8_t)jpCharsetPref[i];
1774 if(CSM(cs) & csm) {
1775 choices[choiceCount++] = cs;
1776 csm &= ~CSM(cs);
1777 }
1778 }
1779 }
1780
1781 cs = g = 0;
1782 /*
1783 * len==0: no mapping found yet
1784 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1785 * len>0: found a roundtrip result, done
1786 */
1787 len = 0;
1788 /*
1789 * We will turn off useFallback after finding a fallback,
1790 * but we still get fallbacks from PUA code points as usual.
1791 * Therefore, we will also need to check that we don't overwrite
1792 * an early fallback with a later one.
1793 */
1794 useFallback = cnv->useFallback;
1795
1796 for(i = 0; i < choiceCount && len <= 0; ++i) {
1797 uint32_t value;
1798 int32_t len2;
1799 int8_t cs0 = choices[i];
1800 switch(cs0) {
1801 case ASCII:
1802 if(sourceChar <= 0x7f) {
1803 targetValue = (uint32_t)sourceChar;
1804 len = 1;
1805 cs = cs0;
1806 g = 0;
1807 }
1808 break;
1809 case ISO8859_1:
1810 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1811 targetValue = (uint32_t)sourceChar - 0x80;
1812 len = 1;
1813 cs = cs0;
1814 g = 2;
1815 }
1816 break;
1817 case HWKANA_7BIT:
1818 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1819 if(converterData->version==3) {
1820 /* JIS7: use G1 (SO) */
1821 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1822 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1823 len = 1;
1824 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1825 g = 1;
1826 } else if(converterData->version==4) {
1827 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1828 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1829 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1830 len = 1;
1831
1832 cs = pFromU2022State->cs[0];
1833 if(IS_JP_DBCS(cs)) {
1834 /* switch from a DBCS charset to JISX201 */
1835 cs = (int8_t)JISX201;
1836 }
1837 /* else stay in the current G0 charset */
1838 g = 0;
1839 }
1840 /* else do not use HWKANA_7BIT with other versions */
1841 }
1842 break;
1843 case JISX201:
1844 /* G0 SBCS */
1845 value = jisx201FromU(sourceChar);
1846 if(value <= 0x7f) {
1847 targetValue = value;
1848 len = 1;
1849 cs = cs0;
1850 g = 0;
1851 useFallback = FALSE;
1852 }
1853 break;
1854 case JISX208:
1855 /* G0 DBCS from Shift-JIS table */
1856 len2 = MBCS_FROM_UCHAR32_ISO2022(
1857 converterData->myConverterArray[cs0],
1858 sourceChar, &value,
1859 useFallback, MBCS_OUTPUT_2);
1860 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1861 value = _2022FromSJIS(value);
1862 if(value != 0) {
1863 targetValue = value;
1864 len = len2;
1865 cs = cs0;
1866 g = 0;
1867 useFallback = FALSE;
1868 }
1869 } else if(len == 0 && useFallback &&
1870 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1871 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1872 len = -2;
1873 cs = cs0;
1874 g = 0;
1875 useFallback = FALSE;
1876 }
1877 break;
1878 case ISO8859_7:
1879 /* G0 SBCS forced to 7-bit output */
1880 len2 = MBCS_SINGLE_FROM_UCHAR32(
1881 converterData->myConverterArray[cs0],
1882 sourceChar, &value,
1883 useFallback);
1884 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1885 targetValue = value - 0x80;
1886 len = len2;
1887 cs = cs0;
1888 g = 2;
1889 useFallback = FALSE;
1890 }
1891 break;
1892 default:
1893 /* G0 DBCS */
1894 len2 = MBCS_FROM_UCHAR32_ISO2022(
1895 converterData->myConverterArray[cs0],
1896 sourceChar, &value,
1897 useFallback, MBCS_OUTPUT_2);
1898 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1899 if(cs0 == KSC5601) {
1900 /*
1901 * Check for valid bytes for the encoding scheme.
1902 * This is necessary because the sub-converter (windows-949)
1903 * has a broader encoding scheme than is valid for 2022.
1904 */
1905 value = _2022FromGR94DBCS(value);
1906 if(value == 0) {
1907 break;
1908 }
1909 }
1910 targetValue = value;
1911 len = len2;
1912 cs = cs0;
1913 g = 0;
1914 useFallback = FALSE;
1915 }
1916 break;
1917 }
1918 }
1919
1920 if(len != 0) {
1921 if(len < 0) {
1922 len = -len; /* fallback */
1923 }
1924 outLen = 0; /* count output bytes */
1925
1926 /* write SI if necessary (only for JIS7) */
1927 if(pFromU2022State->g == 1 && g == 0) {
1928 buffer[outLen++] = UCNV_SI;
1929 pFromU2022State->g = 0;
1930 }
1931
1932 /* write the designation sequence if necessary */
1933 if(cs != pFromU2022State->cs[g]) {
1934 int32_t escLen = escSeqCharsLen[cs];
1935 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1936 outLen += escLen;
1937 pFromU2022State->cs[g] = cs;
1938
1939 /* invalidate the choices[] */
1940 choiceCount = 0;
1941 }
1942
1943 /* write the shift sequence if necessary */
1944 if(g != pFromU2022State->g) {
1945 switch(g) {
1946 /* case 0 handled before writing escapes */
1947 case 1:
1948 buffer[outLen++] = UCNV_SO;
1949 pFromU2022State->g = 1;
1950 break;
1951 default: /* case 2 */
1952 buffer[outLen++] = 0x1b;
1953 buffer[outLen++] = 0x4e;
1954 break;
1955 /* no case 3: no SS3 in ISO-2022-JP-x */
1956 }
1957 }
1958
1959 /* write the output bytes */
1960 if(len == 1) {
1961 buffer[outLen++] = (char)targetValue;
1962 } else /* len == 2 */ {
1963 buffer[outLen++] = (char)(targetValue >> 8);
1964 buffer[outLen++] = (char)targetValue;
1965 }
1966 } else {
1967 /*
1968 * if we cannot find the character after checking all codepages
1969 * then this is an error
1970 */
1971 *err = U_INVALID_CHAR_FOUND;
1972 cnv->fromUChar32=sourceChar;
1973 break;
1974 }
1975
1976 if(sourceChar == CR || sourceChar == LF) {
1977 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1978 pFromU2022State->cs[2] = 0;
1979 choiceCount = 0;
1980 }
1981
1982 /* output outLen>0 bytes in buffer[] */
1983 if(outLen == 1) {
1984 *target++ = buffer[0];
1985 if(offsets) {
1986 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1987 }
1988 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1989 *target++ = buffer[0];
1990 *target++ = buffer[1];
1991 if(offsets) {
1992 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1993 *offsets++ = sourceIndex;
1994 *offsets++ = sourceIndex;
1995 }
1996 } else {
1997 fromUWriteUInt8(
1998 cnv,
1999 buffer, outLen,
2000 &target, (const char *)targetLimit,
2001 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2002 err);
2003 if(U_FAILURE(*err)) {
2004 break;
2005 }
2006 }
2007 } /* end if(myTargetIndex<myTargetLength) */
2008 else{
2009 *err =U_BUFFER_OVERFLOW_ERROR;
2010 break;
2011 }
2012
2013 }/* end while(mySourceIndex<mySourceLength) */
2014
2015 /*
2016 * the end of the input stream and detection of truncated input
2017 * are handled by the framework, but for ISO-2022-JP conversion
2018 * we need to be in ASCII mode at the very end
2019 *
2020 * conditions:
2021 * successful
2022 * in SO mode or not in ASCII mode
2023 * end of input and no truncated input
2024 */
2025 if( U_SUCCESS(*err) &&
2026 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2027 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2028 ) {
2029 int32_t sourceIndex;
2030
2031 outLen = 0;
2032
2033 if(pFromU2022State->g != 0) {
2034 buffer[outLen++] = UCNV_SI;
2035 pFromU2022State->g = 0;
2036 }
2037
2038 if(pFromU2022State->cs[0] != ASCII) {
2039 int32_t escLen = escSeqCharsLen[ASCII];
2040 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2041 outLen += escLen;
2042 pFromU2022State->cs[0] = (int8_t)ASCII;
2043 }
2044
2045 /* get the source index of the last input character */
2046 /*
2047 * TODO this would be simpler and more reliable if we used a pair
2048 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2049 * so that we could simply use the prevSourceIndex here;
2050 * this code gives an incorrect result for the rare case of an unmatched
2051 * trail surrogate that is alone in the last buffer of the text stream
2052 */
2053 sourceIndex=(int32_t)(source-args->source);
2054 if(sourceIndex>0) {
2055 --sourceIndex;
2056 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2057 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2058 ) {
2059 --sourceIndex;
2060 }
2061 } else {
2062 sourceIndex=-1;
2063 }
2064
2065 fromUWriteUInt8(
2066 cnv,
2067 buffer, outLen,
2068 &target, (const char *)targetLimit,
2069 &offsets, sourceIndex,
2070 err);
2071 }
2072
2073 /*save the state and return */
2074 args->source = source;
2075 args->target = (char*)target;
2076 }
2077
2078 /*************** to unicode *******************/
2079
/**
 * Converts ISO-2022-JP family bytes from args->source into UTF-16 code units
 * at args->target, recording a source offset per output unit when
 * args->offsets is set.
 *
 * Designation/shift state is kept in the converter's UConverterDataISO2022
 * (toU2022State, key, version, isEmptySegment). When the input ends after the
 * first byte of a double-byte character, that byte is parked in
 * converter->toUBytes[0] with toULength==1 and picked up again by the next
 * call.
 *
 * @param args conversion arguments; args->source and args->target are
 *             advanced to reflect what was consumed and produced
 * @param err  set to U_BUFFER_OVERFLOW_ERROR when the target is full, to
 *             U_ILLEGAL_ESCAPE_SEQUENCE for an empty segment when version==0,
 *             or to whatever changeState_2022()/toUnicodeCallback() report
 */
2080 static void U_CALLCONV
2081 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2082 UErrorCode* err){
2083 char tempBuf[2];
2084 const char *mySource = (char *) args->source;
2085 UChar *myTarget = args->target;
2086 const char *mySourceLimit = args->sourceLimit;
2087 uint32_t targetUniChar = 0x0000;
2088 uint32_t mySourceChar = 0x0000;
2089 uint32_t tmpSourceChar = 0x0000;
2090 UConverterDataISO2022* myData;
2091 ISO2022State *pToU2022State;
2092 StateEnum cs;
2093
2094 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2095 pToU2022State = &myData->toU2022State;
2096
/* Two resume points: both jump into the middle of the main loop body below. */
2097 if(myData->key != 0) {
2098 /* continue with a partial escape sequence */
2099 goto escape;
2100 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101 /* continue with a partial double-byte character */
2102 mySourceChar = args->converter->toUBytes[0];
2103 args->converter->toULength = 0;
2104 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2105 targetUniChar = missingCharMarker;
2106 goto getTrailByte;
2107 }
2108
2109 while(mySource < mySourceLimit){
2110
2111 targetUniChar =missingCharMarker;
2112
2113 if(myTarget < args->targetLimit){
2114
2115 mySourceChar= (unsigned char) *mySource++;
2116
2117 switch(mySourceChar) {
2118 case UCNV_SI:
2119 if(myData->version==3) {
2120 pToU2022State->g=0;
2121 continue;
2122 } else {
2123 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2124 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
/* leaves the switch with targetUniChar still missingCharMarker, so the byte reaches toUnicodeCallback() below */
2125 break;
2126 }
2127
2128 case UCNV_SO:
2129 if(myData->version==3) {
2130 /* JIS7: switch to G1 half-width Katakana */
2131 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2132 pToU2022State->g=1;
2133 continue;
2134 } else {
2135 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2136 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
/* same as SI above: handled as an error byte after the switch */
2137 break;
2138 }
2139
2140 case ESC_2022:
/* step back onto the ESC byte; presumably changeState_2022() expects to start at the ESC itself */
2141 mySource--;
2142 escape:
2143 {
/* remembered so that the length of an irregular (empty-segment) escape sequence can be reported */
2144 const char * mySourceBefore = mySource;
2145 int8_t toULengthBefore = args->converter->toULength;
2146
2147 changeState_2022(args->converter,&(mySource),
2148 mySourceLimit, ISO_2022_JP,err);
2149
2150 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2151 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2152 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2154 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2155 }
2156 }
2157
2158 /* invalid or illegal escape sequence */
2159 if(U_FAILURE(*err)){
2160 args->target = myTarget;
2161 args->source = mySource;
2162 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2163 return;
2164 }
2165 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2166 if(myData->key==0) {
2167 myData->isEmptySegment = TRUE;
2168 }
2169 continue;
2170
2171 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2172
2173 case CR:
2174 case LF:
2175 /* automatically reset to single-byte mode */
2176 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2177 pToU2022State->cs[0] = (int8_t)ASCII;
2178 }
/* also drop the G2 designation and return to G0 before converting the CR/LF itself */
2179 pToU2022State->cs[2] = 0;
2180 pToU2022State->g = 0;
2181 U_FALLTHROUGH;
2182 default:
2183 /* convert one or two bytes */
2184 myData->isEmptySegment = FALSE;
2185 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2186 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2187 !IS_JP_DBCS(cs)
2188 ) {
2189 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2190 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2191
2192 /* return from a single-shift state to the previous one */
2193 if(pToU2022State->g >= 2) {
2194 pToU2022State->g=pToU2022State->prevG;
2195 }
2196 } else switch(cs) {
2197 case ASCII:
2198 if(mySourceChar <= 0x7f) {
2199 targetUniChar = mySourceChar;
2200 }
2201 break;
2202 case ISO8859_1:
2203 if(mySourceChar <= 0x7f) {
2204 targetUniChar = mySourceChar + 0x80;
2205 }
2206 /* return from a single-shift state to the previous one */
2207 pToU2022State->g=pToU2022State->prevG;
2208 break;
2209 case ISO8859_7:
2210 if(mySourceChar <= 0x7f) {
2211 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2212 targetUniChar =
2213 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2214 myData->myConverterArray[cs],
2215 mySourceChar + 0x80);
2216 }
2217 /* return from a single-shift state to the previous one */
2218 pToU2022State->g=pToU2022State->prevG;
2219 break;
2220 case JISX201:
2221 if(mySourceChar <= 0x7f) {
2222 targetUniChar = jisx201ToU(mySourceChar);
2223 }
2224 break;
2225 case HWKANA_7BIT:
2226 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2227 /* 7-bit halfwidth Katakana */
2228 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2229 }
2230 break;
2231 default:
2232 /* G0 DBCS */
2233 if(mySource < mySourceLimit) {
/* deliberately uninitialized: the resume path jumps to getTrailByte past these declarations */
2234 int leadIsOk, trailIsOk;
2235 uint8_t trailByte;
2236 getTrailByte:
2237 trailByte = (uint8_t)*mySource;
2238 /*
2239 * Ticket 5691: consistent illegal sequences:
2240 * - We include at least the first byte in the illegal sequence.
2241 * - If any of the non-initial bytes could be the start of a character,
2242 * we stop the illegal sequence before the first one of those.
2243 *
2244 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2245 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2246 * Otherwise we convert or report the pair of bytes.
2247 */
2248 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2249 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2250 if (leadIsOk && trailIsOk) {
2251 ++mySource;
2252 tmpSourceChar = (mySourceChar << 8) | trailByte;
2253 if(cs == JISX208) {
/* the JIS X 0208 lookup below goes through a Shift-JIS form of the pair written to tempBuf */
2254 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2255 mySourceChar = tmpSourceChar;
2256 } else {
2257 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2258 mySourceChar = tmpSourceChar;
2259 if (cs == KSC5601) {
2260 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2261 }
2262 tempBuf[0] = (char)(tmpSourceChar >> 8);
2263 tempBuf[1] = (char)(tmpSourceChar);
2264 }
2265 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2266 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2267 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2268 ++mySource;
2269 /* add another bit so that the code below writes 2 bytes in case of error */
2270 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2271 }
/* otherwise only the lead byte is reported; the trail byte stays unread in the input */
2272 } else {
/* input ended after the lead byte: park it for the next call and leave without an error */
2273 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2274 args->converter->toULength = 1;
2275 goto endloop;
2276 }
2277 } /* End of inner switch */
2278 break;
2279 } /* End of outer switch */
/*
 * Emit the result: values below 0xfffe are written as one code unit, values
 * above missingCharMarker as a surrogate pair, and the two marker values
 * are handed to toUnicodeCallback(), which ends the loop.
 */
2280 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2281 if(args->offsets){
/* offset of the first source byte of this character: one or two bytes behind mySource */
2282 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2283 }
2284 *(myTarget++)=(UChar)targetUniChar;
2285 }
2286 else if(targetUniChar > missingCharMarker){
2287 /* disassemble the surrogate pair and write to output*/
2288 targetUniChar-=0x0010000;
2289 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2290 if(args->offsets){
2291 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2292 }
2293 ++myTarget;
2294 if(myTarget< args->targetLimit){
2295 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2296 if(args->offsets){
2297 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2298 }
2299 ++myTarget;
2300 }else{
/* no room for the trail surrogate: stash it in the converter's overflow buffer */
/* NOTE(review): *err is not set here; overflow is only raised by the room check of a later iteration - confirm the framework drains UCharErrorBuffer */
2301 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2302 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2303 }
2304
2305 }
2306 else{
2307 /* Call the callback function*/
2308 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2309 break;
2310 }
2311 }
2312 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2313 *err =U_BUFFER_OVERFLOW_ERROR;
2314 break;
2315 }
2316 }
2317 endloop:
/* publish how far the source and target pointers advanced */
2318 args->target = myTarget;
2319 args->source = mySource;
2320 }
2321
2322
2323 #if !UCONFIG_ONLY_HTML_CONVERSION
2324 /***************************************************************
2325 * Rules for ISO-2022-KR encoding
2326 * i) The KSC5601 designator sequence should appear only once in a file,
2327 * at the beginning of a line before any KSC5601 characters. This usually
2328 * means that it appears by itself on the first line of the file
2329 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2330 * and SI to shift into single byte mode
2331 */
2332 static void U_CALLCONV
2333 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2334
2335 UConverter* saveConv = args->converter;
2336 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2337 args->converter=myConverterData->currentConverter;
2338
2339 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2340 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2341 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2342
2343 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2344 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2345 uprv_memcpy(
2346 saveConv->charErrorBuffer,
2347 myConverterData->currentConverter->charErrorBuffer,
2348 myConverterData->currentConverter->charErrorBufferLength);
2349 }
2350 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2351 myConverterData->currentConverter->charErrorBufferLength = 0;
2352 }
2353 args->converter=saveConv;
2354 }
2355
/**
 * Converts UTF-16 from args->source into ISO-2022-KR bytes at args->target,
 * recording a source offset per output byte when args->offsets is set.
 *
 * version==1 is delegated entirely to the _IBM variant. Otherwise each code
 * unit is looked up with MBCS_FROM_UCHAR32_ISO2022 in the current
 * sub-converter's shared data. A single-byte result must be <= 0x7f; a
 * double-byte result must lie in A1A1..FEFE with its low byte in A1..FE and
 * is written with 0x80 subtracted from each byte. SO or SI is written
 * whenever the output switches between double-byte and single-byte. The
 * current mode is loaded from and saved to converter->fromUnicodeStatus.
 *
 * @param args conversion arguments; source/target are advanced on return
 * @param err  U_ILLEGAL_CHAR_FOUND for SO/SI/ESC in the input or an unpaired
 *             surrogate, U_INVALID_CHAR_FOUND for a code point with no
 *             acceptable mapping (including every supplementary code point),
 *             U_BUFFER_OVERFLOW_ERROR when the target is full
 */
2356 static void U_CALLCONV
2357 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2358
2359 const UChar *source = args->source;
2360 const UChar *sourceLimit = args->sourceLimit;
2361 unsigned char *target = (unsigned char *) args->target;
2362 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2363 int32_t* offsets = args->offsets;
2364 uint32_t targetByteUnit = 0x0000;
2365 UChar32 sourceChar = 0x0000;
2366 UBool isTargetByteDBCS;
2367 UBool oldIsTargetByteDBCS;
2368 UConverterDataISO2022 *converterData;
2369 UConverterSharedData* sharedData;
2370 UBool useFallback;
2371 int32_t length =0;
2372
2373 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2374 /* if the version is 1 then the user is requesting
2375 * conversion with ibm-25546 pass the arguments to
2376 * MBCS converter and return
2377 */
2378 if(converterData->version==1){
2379 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2380 return;
2381 }
2382
2383 /* initialize data */
2384 sharedData = converterData->currentConverter->sharedData;
2385 useFallback = args->converter->useFallback;
2386 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
/* NOTE(review): this value is overwritten inside the loop before it is ever read */
2387 oldIsTargetByteDBCS = isTargetByteDBCS;
2388
/* NOTE(review): repeats the read of fromUnicodeStatus made three statements above */
2389 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
/* a code point left pending by an earlier call (presumably a lead surrogate) resumes at the trail lookup */
2390 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2391 goto getTrail;
2392 }
2393 while(source < sourceLimit){
2394
2395 targetByteUnit = missingCharMarker;
2396
2397 if(target < (unsigned char*) args->targetLimit){
2398 sourceChar = *source++;
2399
2400 /* do not convert SO/SI/ESC */
2401 if(IS_2022_CONTROL(sourceChar)) {
2402 /* callback(illegal) */
2403 *err=U_ILLEGAL_CHAR_FOUND;
2404 args->converter->fromUChar32=sourceChar;
2405 break;
2406 }
2407
2408 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2409 if(length < 0) {
2410 length = -length; /* fallback */
2411 }
2412 /* only DBCS or SBCS characters are expected*/
2413 /* DB characters with high bit set to 1 are expected */
2414 if( length > 2 || length==0 ||
2415 (length == 1 && targetByteUnit > 0x7f) ||
2416 (length == 2 &&
2417 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2418 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2419 ) {
2420 targetByteUnit=missingCharMarker;
2421 }
2422 if (targetByteUnit != missingCharMarker){
2423
2424 oldIsTargetByteDBCS = isTargetByteDBCS;
2425 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2426 /* append the shift sequence */
2427 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2428
/* room for this one byte is guaranteed by the target check at the top of the iteration */
2429 if (isTargetByteDBCS)
2430 *target++ = UCNV_SO;
2431 else
2432 *target++ = UCNV_SI;
2433 if(offsets)
2434 *(offsets++) = (int32_t)(source - args->source-1);
2435 }
2436 /* write the targetUniChar to target */
/* bytes that do not fit go to charErrorBuffer; there is no break, the loop ends at the next room check or when the source is exhausted */
2437 if(targetByteUnit <= 0x00FF){
2438 if( target < targetLimit){
2439 *(target++) = (unsigned char) targetByteUnit;
2440 if(offsets){
2441 *(offsets++) = (int32_t)(source - args->source-1);
2442 }
2443
2444 }else{
2445 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2446 *err = U_BUFFER_OVERFLOW_ERROR;
2447 }
2448 }else{
/* double-byte: each byte is written with 0x80 subtracted */
2449 if(target < targetLimit){
2450 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2451 if(offsets){
2452 *(offsets++) = (int32_t)(source - args->source-1);
2453 }
2454 if(target < targetLimit){
2455 *(target++) =(unsigned char) (targetByteUnit -0x80);
2456 if(offsets){
2457 *(offsets++) = (int32_t)(source - args->source-1);
2458 }
2459 }else{
2460 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2461 *err = U_BUFFER_OVERFLOW_ERROR;
2462 }
2463 }else{
2464 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2465 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2466 *err = U_BUFFER_OVERFLOW_ERROR;
2467 }
2468 }
2469
2470 }
2471 else{
2472 /* oops.. the code point is unassigned
2473 * set the error and reason
2474 */
2475
2476 /*check if the char is a First surrogate*/
2477 if(U16_IS_SURROGATE(sourceChar)) {
2478 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2479 getTrail:
2480 /*look ahead to find the trail surrogate*/
2481 if(source < sourceLimit) {
2482 /* test the following code unit */
2483 UChar trail=(UChar) *source;
2484 if(U16_IS_TRAIL(trail)) {
2485 source++;
2486 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2487 *err = U_INVALID_CHAR_FOUND;
2488 /* the combined supplementary code point is reported as unassigned; no lookup is attempted */
2489 /* control continues at the common fromUChar32 store and break below */
2490 } else {
2491 /* this is an unmatched lead code unit (1st surrogate) */
2492 /* callback(illegal) */
2493 *err=U_ILLEGAL_CHAR_FOUND;
2494 }
2495 } else {
2496 /* no more input */
/* the lead surrogate is saved in fromUChar32 below so a later call can resume at getTrail */
2497 *err = U_ZERO_ERROR;
2498 }
2499 } else {
2500 /* this is an unmatched trail code unit (2nd surrogate) */
2501 /* callback(illegal) */
2502 *err=U_ILLEGAL_CHAR_FOUND;
2503 }
2504 } else {
2505 /* callback(unassigned) for a BMP code point */
2506 *err = U_INVALID_CHAR_FOUND;
2507 }
2508
2509 args->converter->fromUChar32=sourceChar;
2510 break;
2511 }
2512 } /* end if(target < targetLimit) */
2513 else{
2514 *err =U_BUFFER_OVERFLOW_ERROR;
2515 break;
2516 }
2517
2518 }/* end while(source < sourceLimit) */
2519
2520 /*
2521 * the end of the input stream and detection of truncated input
2522 * are handled by the framework, but for ISO-2022-KR conversion
2523 * we need to be in ASCII mode at the very end
2524 *
2525 * conditions:
2526 * successful
2527 * not in ASCII mode
2528 * end of input and no truncated input
2529 */
2530 if( U_SUCCESS(*err) &&
2531 isTargetByteDBCS &&
2532 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2533 ) {
2534 int32_t sourceIndex;
2535
2536 /* we are switching to ASCII */
2537 isTargetByteDBCS=FALSE;
2538
2539 /* get the source index of the last input character */
2540 /*
2541 * TODO this would be simpler and more reliable if we used a pair
2542 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2543 * so that we could simply use the prevSourceIndex here;
2544 * this code gives an incorrect result for the rare case of an unmatched
2545 * trail surrogate that is alone in the last buffer of the text stream
2546 */
2547 sourceIndex=(int32_t)(source-args->source);
2548 if(sourceIndex>0) {
2549 --sourceIndex;
/* step back one more unit when the last character was a surrogate pair */
2550 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2551 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2552 ) {
2553 --sourceIndex;
2554 }
2555 } else {
2556 sourceIndex=-1;
2557 }
2558
/* append the closing SI; fromUWriteUInt8() receives targetLimit and err, so it presumably handles a full target itself */
2559 fromUWriteUInt8(
2560 args->converter,
2561 SHIFT_IN_STR, 1,
2562 &target, (const char *)targetLimit,
2563 &offsets, sourceIndex,
2564 err);
2565 }
2566
2567 /*save the state and return */
2568 args->source = source;
2569 args->target = (char*)target;
2570 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2571 }
2572
2573 /************************ To Unicode ***************************************/
2574
2575 static void U_CALLCONV
2576 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2577 UErrorCode* err){
2578 char const* sourceStart;
2579 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2580
2581 UConverterToUnicodeArgs subArgs;
2582 int32_t minArgsSize;
2583
2584 /* set up the subconverter arguments */
2585 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2586 minArgsSize = args->size;
2587 } else {
2588 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2589 }
2590
2591 uprv_memcpy(&subArgs, args, minArgsSize);
2592 subArgs.size = (uint16_t)minArgsSize;
2593 subArgs.converter = myData->currentConverter;
2594
2595 /* remember the original start of the input for offsets */
2596 sourceStart = args->source;
2597
2598 if(myData->key != 0) {
2599 /* continue with a partial escape sequence */
2600 goto escape;
2601 }
2602
2603 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2604 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2605 subArgs.source = args->source;
2606 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2607 if(subArgs.source != subArgs.sourceLimit) {
2608 /*
2609 * get the current partial byte sequence
2610 *
2611 * it needs to be moved between the public and the subconverter
2612 * so that the conversion framework, which only sees the public
2613 * converter, can handle truncated and illegal input etc.
2614 */
2615 if(args->converter->toULength > 0) {
2616 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2617 }
2618 subArgs.converter->toULength = args->converter->toULength;
2619
2620 /*
2621 * Convert up to the end of the input, or to before the next escape character.
2622 * Does not handle conversion extensions because the preToU[] state etc.
2623 * is not copied.
2624 */
2625 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2626
2627 if(args->offsets != NULL && sourceStart != args->source) {
2628 /* update offsets to base them on the actual start of the input */
2629 int32_t *offsets = args->offsets;
2630 UChar *target = args->target;
2631 int32_t delta = (int32_t)(args->source - sourceStart);
2632 while(target < subArgs.target) {
2633 if(*offsets >= 0) {
2634 *offsets += delta;
2635 }
2636 ++offsets;
2637 ++target;
2638 }
2639 }
2640 args->source = subArgs.source;
2641 args->target = subArgs.target;
2642 args->offsets = subArgs.offsets;
2643
2644 /* copy input/error/overflow buffers */
2645 if(subArgs.converter->toULength > 0) {
2646 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2647 }
2648 args->converter->toULength = subArgs.converter->toULength;
2649
2650 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2651 if(subArgs.converter->UCharErrorBufferLength > 0) {
2652 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2653 subArgs.converter->UCharErrorBufferLength);
2654 }
2655 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2656 subArgs.converter->UCharErrorBufferLength = 0;
2657 }
2658 }
2659
2660 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2661 return;
2662 }
2663
2664 escape:
2665 changeState_2022(args->converter,
2666 &(args->source),
2667 args->sourceLimit,
2668 ISO_2022_KR,
2669 err);
2670 }
2671 }
2672
2673 static void U_CALLCONV
2674 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2675 UErrorCode* err){
2676 char tempBuf[2];
2677 const char *mySource = ( char *) args->source;
2678 UChar *myTarget = args->target;
2679 const char *mySourceLimit = args->sourceLimit;
2680 UChar32 targetUniChar = 0x0000;
2681 UChar mySourceChar = 0x0000;
2682 UConverterDataISO2022* myData;
2683 UConverterSharedData* sharedData ;
2684 UBool useFallback;
2685
2686 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2687 if(myData->version==1){
2688 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2689 return;
2690 }
2691
2692 /* initialize state */
2693 sharedData = myData->currentConverter->sharedData;
2694 useFallback = args->converter->useFallback;
2695
2696 if(myData->key != 0) {
2697 /* continue with a partial escape sequence */
2698 goto escape;
2699 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2700 /* continue with a partial double-byte character */
2701 mySourceChar = args->converter->toUBytes[0];
2702 args->converter->toULength = 0;
2703 goto getTrailByte;
2704 }
2705
2706 while(mySource< mySourceLimit){
2707
2708 if(myTarget < args->targetLimit){
2709
2710 mySourceChar= (unsigned char) *mySource++;
2711
2712 if(mySourceChar==UCNV_SI){
2713 myData->toU2022State.g = 0;
2714 if (myData->isEmptySegment) {
2715 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2716 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2717 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2718 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2719 args->converter->toULength = 1;
2720 args->target = myTarget;
2721 args->source = mySource;
2722 return;
2723 }
2724 /*consume the source */
2725 continue;
2726 }else if(mySourceChar==UCNV_SO){
2727 myData->toU2022State.g = 1;
2728 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2729 /*consume the source */
2730 continue;
2731 }else if(mySourceChar==ESC_2022){
2732 mySource--;
2733 escape:
2734 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2735 changeState_2022(args->converter,&(mySource),
2736 mySourceLimit, ISO_2022_KR, err);
2737 if(U_FAILURE(*err)){
2738 args->target = myTarget;
2739 args->source = mySource;
2740 return;
2741 }
2742 continue;
2743 }
2744
2745 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2746 if(myData->toU2022State.g == 1) {
2747 if(mySource < mySourceLimit) {
2748 int leadIsOk, trailIsOk;
2749 uint8_t trailByte;
2750 getTrailByte:
2751 targetUniChar = missingCharMarker;
2752 trailByte = (uint8_t)*mySource;
2753 /*
2754 * Ticket 5691: consistent illegal sequences:
2755 * - We include at least the first byte in the illegal sequence.
2756 * - If any of the non-initial bytes could be the start of a character,
2757 * we stop the illegal sequence before the first one of those.
2758 *
2759 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2760 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2761 * Otherwise we convert or report the pair of bytes.
2762 */
2763 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2764 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2765 if (leadIsOk && trailIsOk) {
2766 ++mySource;
2767 tempBuf[0] = (char)(mySourceChar + 0x80);
2768 tempBuf[1] = (char)(trailByte + 0x80);
2769 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2770 mySourceChar = (mySourceChar << 8) | trailByte;
2771 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2772 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2773 ++mySource;
2774 /* add another bit so that the code below writes 2 bytes in case of error */
2775 mySourceChar = static_cast<UChar>(0x10000 | (mySourceChar << 8) | trailByte);
2776 }
2777 } else {
2778 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2779 args->converter->toULength = 1;
2780 break;
2781 }
2782 }
2783 else if(mySourceChar <= 0x7f) {
2784 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2785 } else {
2786 targetUniChar = 0xffff;
2787 }
2788 if(targetUniChar < 0xfffe){
2789 if(args->offsets) {
2790 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2791 }
2792 *(myTarget++)=(UChar)targetUniChar;
2793 }
2794 else {
2795 /* Call the callback function*/
2796 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2797 break;
2798 }
2799 }
2800 else{
2801 *err =U_BUFFER_OVERFLOW_ERROR;
2802 break;
2803 }
2804 }
2805 args->target = myTarget;
2806 args->source = mySource;
2807 }
2808
2809 /*************************** END ISO2022-KR *********************************/
2810
2811 /*************************** ISO-2022-CN *********************************
2812 *
2813 * Rules for ISO-2022-CN Encoding:
2814 * i) The designator sequence must appear once on a line before any instance
2815 * of character set it designates.
2816 * ii) If two lines contain characters from the same character set, both lines
2817 * must include the designator sequence.
2818 * iii) Once the designator sequence is known, a shifting sequence has to be found
2819 * to invoke the shifting
2820 * iv) All lines start in ASCII and end in ASCII.
2821 * v) Four shifting sequences are employed for this purpose:
2822 *
2823 * Sequence ASCII Eq Charsets
2824 * ---------- ------- ---------
2825 * SI <SI> US-ASCII
2826 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2827 * SS2 <ESC>N CNS-11643-1992 Plane 2
2828 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2829 *
2830 * vi)
2831 * SOdesignator : ESC "$" ")" finalchar_for_SO
2832 * SS2designator : ESC "$" "*" finalchar_for_SS2
2833 * SS3designator : ESC "$" "+" finalchar_for_SS3
2834 *
2835 * ESC $ ) A Indicates the bytes following SO are Chinese
2836 * characters as defined in GB 2312-80, until
2837 * another SOdesignation appears
2838 *
2839 *
2840 * ESC $ ) E Indicates the bytes following SO are as defined
2841 * in ISO-IR-165 (for details, see section 2.1),
2842 * until another SOdesignation appears
2843 *
2844 * ESC $ ) G Indicates the bytes following SO are as defined
2845 * in CNS 11643-plane-1, until another
2846 * SOdesignation appears
2847 *
2848 * ESC $ * H Indicates the two bytes immediately following
2849 * SS2 is a Chinese character as defined in CNS
2850 * 11643-plane-2, until another SS2designation
2851 * appears
2852 * (Meaning <ESC>N must precede every 2 byte
2853 * sequence.)
2854 *
2855 * ESC $ + I Indicates the immediate two bytes following SS3
2856 * is a Chinese character as defined in CNS
2857 * 11643-plane-3, until another SS3designation
2858 * appears
2859 * (Meaning <ESC>O must precede every 2 byte
2860 * sequence.)
2861 *
2862 * ESC $ + J Indicates the immediate two bytes following SS3
2863 * is a Chinese character as defined in CNS
2864 * 11643-plane-4, until another SS3designation
2865 * appears
2866 * (In English: <ESC>O must precede every 2 byte
2867 * sequence.)
2868 *
2869 * ESC $ + K Indicates the immediate two bytes following SS3
2870 * is a Chinese character as defined in CNS
2871 * 11643-plane-5, until another SS3designation
2872 * appears
2873 *
2874 * ESC $ + L Indicates the immediate two bytes following SS3
2875 * is a Chinese character as defined in CNS
2876 * 11643-plane-6, until another SS3designation
2877 * appears
2878 *
2879 * ESC $ + M Indicates the immediate two bytes following SS3
2880 * is a Chinese character as defined in CNS
2881 * 11643-plane-7, until another SS3designation
2882 * appears
2883 *
2884 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2885 * has its own designation information before any Chinese characters
2886 * appear
2887 *
2888 */
2889
/*
 * Designator escape sequences for ISO-2022-CN, one per charset.
 * Each is four bytes plus the terminating NUL: 0x1B 0x24 <0x29|0x2A|0x2B> <final>,
 * i.e. ESC "$" followed by ")" (SO designator), "*" (SS2 designator) or
 * "+" (SS3 designator) and the final character, written as hex escapes.
 * The following are defined this way to make the strings truly readonly.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */
2900
/********************** ISO2022-CN Data **************************/
/*
 * Escape/shift strings used by the ISO-2022-CN fromUnicode code, indexed by charset.
 * The fromUnicode function indexes this table with cs directly for cs < CNS_11643 and
 * with CNS_11643 + (cs - CNS_11643_1) for the CNS planes, so the entry order must
 * match that arithmetic.
 * NOTE(review): the numeric values of the charset enum constants are declared
 * outside this chunk - confirm they line up with the indexes noted below.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* CNS 11643 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* CNS 11643 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* CNS 11643 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* CNS 11643 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* CNS 11643 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* CNS 11643 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* CNS 11643 plane 7 */
};
2914
2915 static void U_CALLCONV
2916 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2917 UConverter *cnv = args->converter;
2918 UConverterDataISO2022 *converterData;
2919 ISO2022State *pFromU2022State;
2920 uint8_t *target = (uint8_t *) args->target;
2921 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2922 const UChar* source = args->source;
2923 const UChar* sourceLimit = args->sourceLimit;
2924 int32_t* offsets = args->offsets;
2925 UChar32 sourceChar;
2926 char buffer[8];
2927 int32_t len;
2928 int8_t choices[3];
2929 int32_t choiceCount;
2930 uint32_t targetValue = 0;
2931 UBool useFallback;
2932
2933 /* set up the state */
2934 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2935 pFromU2022State = &converterData->fromU2022State;
2936
2937 choiceCount = 0;
2938
2939 /* check if the last codepoint of previous buffer was a lead surrogate*/
2940 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2941 goto getTrail;
2942 }
2943
2944 while( source < sourceLimit){
2945 if(target < targetLimit){
2946
2947 sourceChar = *(source++);
2948 /*check if the char is a First surrogate*/
2949 if(U16_IS_SURROGATE(sourceChar)) {
2950 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2951 getTrail:
2952 /*look ahead to find the trail surrogate*/
2953 if(source < sourceLimit) {
2954 /* test the following code unit */
2955 UChar trail=(UChar) *source;
2956 if(U16_IS_TRAIL(trail)) {
2957 source++;
2958 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2959 cnv->fromUChar32=0x00;
2960 /* convert this supplementary code point */
2961 /* exit this condition tree */
2962 } else {
2963 /* this is an unmatched lead code unit (1st surrogate) */
2964 /* callback(illegal) */
2965 *err=U_ILLEGAL_CHAR_FOUND;
2966 cnv->fromUChar32=sourceChar;
2967 break;
2968 }
2969 } else {
2970 /* no more input */
2971 cnv->fromUChar32=sourceChar;
2972 break;
2973 }
2974 } else {
2975 /* this is an unmatched trail code unit (2nd surrogate) */
2976 /* callback(illegal) */
2977 *err=U_ILLEGAL_CHAR_FOUND;
2978 cnv->fromUChar32=sourceChar;
2979 break;
2980 }
2981 }
2982
2983 /* do the conversion */
2984 if(sourceChar <= 0x007f ){
2985 /* do not convert SO/SI/ESC */
2986 if(IS_2022_CONTROL(sourceChar)) {
2987 /* callback(illegal) */
2988 *err=U_ILLEGAL_CHAR_FOUND;
2989 cnv->fromUChar32=sourceChar;
2990 break;
2991 }
2992
2993 /* US-ASCII */
2994 if(pFromU2022State->g == 0) {
2995 buffer[0] = (char)sourceChar;
2996 len = 1;
2997 } else {
2998 buffer[0] = UCNV_SI;
2999 buffer[1] = (char)sourceChar;
3000 len = 2;
3001 pFromU2022State->g = 0;
3002 choiceCount = 0;
3003 }
3004 if(sourceChar == CR || sourceChar == LF) {
3005 /* reset the state at the end of a line */
3006 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3007 choiceCount = 0;
3008 }
3009 }
3010 else{
3011 /* convert U+0080..U+10ffff */
3012 int32_t i;
3013 int8_t cs, g;
3014
3015 if(choiceCount == 0) {
3016 /* try the current SO/G1 converter first */
3017 choices[0] = pFromU2022State->cs[1];
3018
3019 /* default to GB2312_1 if none is designated yet */
3020 if(choices[0] == 0) {
3021 choices[0] = GB2312_1;
3022 }
3023
3024 if(converterData->version == 0) {
3025 /* ISO-2022-CN */
3026
3027 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3028 if(choices[0] == GB2312_1) {
3029 choices[1] = (int8_t)CNS_11643_1;
3030 } else {
3031 choices[1] = (int8_t)GB2312_1;
3032 }
3033
3034 choiceCount = 2;
3035 } else if (converterData->version == 1) {
3036 /* ISO-2022-CN-EXT */
3037
3038 /* try one of the other converters */
3039 switch(choices[0]) {
3040 case GB2312_1:
3041 choices[1] = (int8_t)CNS_11643_1;
3042 choices[2] = (int8_t)ISO_IR_165;
3043 break;
3044 case ISO_IR_165:
3045 choices[1] = (int8_t)GB2312_1;
3046 choices[2] = (int8_t)CNS_11643_1;
3047 break;
3048 default: /* CNS_11643_x */
3049 choices[1] = (int8_t)GB2312_1;
3050 choices[2] = (int8_t)ISO_IR_165;
3051 break;
3052 }
3053
3054 choiceCount = 3;
3055 } else {
3056 choices[0] = (int8_t)CNS_11643_1;
3057 choices[1] = (int8_t)GB2312_1;
3058 }
3059 }
3060
3061 cs = g = 0;
3062 /*
3063 * len==0: no mapping found yet
3064 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3065 * len>0: found a roundtrip result, done
3066 */
3067 len = 0;
3068 /*
3069 * We will turn off useFallback after finding a fallback,
3070 * but we still get fallbacks from PUA code points as usual.
3071 * Therefore, we will also need to check that we don't overwrite
3072 * an early fallback with a later one.
3073 */
3074 useFallback = cnv->useFallback;
3075
3076 for(i = 0; i < choiceCount && len <= 0; ++i) {
3077 int8_t cs0 = choices[i];
3078 if(cs0 > 0) {
3079 uint32_t value;
3080 int32_t len2;
3081 if(cs0 >= CNS_11643_0) {
3082 len2 = MBCS_FROM_UCHAR32_ISO2022(
3083 converterData->myConverterArray[CNS_11643],
3084 sourceChar,
3085 &value,
3086 useFallback,
3087 MBCS_OUTPUT_3);
3088 if(len2 == 3 || (len2 == -3 && len == 0)) {
3089 targetValue = value;
3090 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3091 if(len2 >= 0) {
3092 len = 2;
3093 } else {
3094 len = -2;
3095 useFallback = FALSE;
3096 }
3097 if(cs == CNS_11643_1) {
3098 g = 1;
3099 } else if(cs == CNS_11643_2) {
3100 g = 2;
3101 } else /* plane 3..7 */ if(converterData->version == 1) {
3102 g = 3;
3103 } else {
3104 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3105 len = 0;
3106 }
3107 }
3108 } else {
3109 /* GB2312_1 or ISO-IR-165 */
3110 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3111 len2 = MBCS_FROM_UCHAR32_ISO2022(
3112 converterData->myConverterArray[cs0],
3113 sourceChar,
3114 &value,
3115 useFallback,
3116 MBCS_OUTPUT_2);
3117 if(len2 == 2 || (len2 == -2 && len == 0)) {
3118 targetValue = value;
3119 len = len2;
3120 cs = cs0;
3121 g = 1;
3122 useFallback = FALSE;
3123 }
3124 }
3125 }
3126 }
3127
3128 if(len != 0) {
3129 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3130
3131 /* write the designation sequence if necessary */
3132 if(cs != pFromU2022State->cs[g]) {
3133 if(cs < CNS_11643) {
3134 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3135 } else {
3136 U_ASSERT(cs >= CNS_11643_1);
3137 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3138 }
3139 len = 4;
3140 pFromU2022State->cs[g] = cs;
3141 if(g == 1) {
3142 /* changing the SO/G1 charset invalidates the choices[] */
3143 choiceCount = 0;
3144 }
3145 }
3146
3147 /* write the shift sequence if necessary */
3148 if(g != pFromU2022State->g) {
3149 switch(g) {
3150 case 1:
3151 buffer[len++] = UCNV_SO;
3152
3153 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3154 pFromU2022State->g = 1;
3155 break;
3156 case 2:
3157 buffer[len++] = 0x1b;
3158 buffer[len++] = 0x4e;
3159 break;
3160 default: /* case 3 */
3161 buffer[len++] = 0x1b;
3162 buffer[len++] = 0x4f;
3163 break;
3164 }
3165 }
3166
3167 /* write the two output bytes */
3168 buffer[len++] = (char)(targetValue >> 8);
3169 buffer[len++] = (char)targetValue;
3170 } else {
3171 /* if we cannot find the character after checking all codepages
3172 * then this is an error
3173 */
3174 *err = U_INVALID_CHAR_FOUND;
3175 cnv->fromUChar32=sourceChar;
3176 break;
3177 }
3178 }
3179
3180 /* output len>0 bytes in buffer[] */
3181 if(len == 1) {
3182 *target++ = buffer[0];
3183 if(offsets) {
3184 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3185 }
3186 } else if(len == 2 && (target + 2) <= targetLimit) {
3187 *target++ = buffer[0];
3188 *target++ = buffer[1];
3189 if(offsets) {
3190 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3191 *offsets++ = sourceIndex;
3192 *offsets++ = sourceIndex;
3193 }
3194 } else {
3195 fromUWriteUInt8(
3196 cnv,
3197 buffer, len,
3198 &target, (const char *)targetLimit,
3199 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3200 err);
3201 if(U_FAILURE(*err)) {
3202 break;
3203 }
3204 }
3205 } /* end if(myTargetIndex<myTargetLength) */
3206 else{
3207 *err =U_BUFFER_OVERFLOW_ERROR;
3208 break;
3209 }
3210
3211 }/* end while(mySourceIndex<mySourceLength) */
3212
3213 /*
3214 * the end of the input stream and detection of truncated input
3215 * are handled by the framework, but for ISO-2022-CN conversion
3216 * we need to be in ASCII mode at the very end
3217 *
3218 * conditions:
3219 * successful
3220 * not in ASCII mode
3221 * end of input and no truncated input
3222 */
3223 if( U_SUCCESS(*err) &&
3224 pFromU2022State->g!=0 &&
3225 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3226 ) {
3227 int32_t sourceIndex;
3228
3229 /* we are switching to ASCII */
3230 pFromU2022State->g=0;
3231
3232 /* get the source index of the last input character */
3233 /*
3234 * TODO this would be simpler and more reliable if we used a pair
3235 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3236 * so that we could simply use the prevSourceIndex here;
3237 * this code gives an incorrect result for the rare case of an unmatched
3238 * trail surrogate that is alone in the last buffer of the text stream
3239 */
3240 sourceIndex=(int32_t)(source-args->source);
3241 if(sourceIndex>0) {
3242 --sourceIndex;
3243 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3244 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3245 ) {
3246 --sourceIndex;
3247 }
3248 } else {
3249 sourceIndex=-1;
3250 }
3251
3252 fromUWriteUInt8(
3253 cnv,
3254 SHIFT_IN_STR, 1,
3255 &target, (const char *)targetLimit,
3256 &offsets, sourceIndex,
3257 err);
3258 }
3259
3260 /*save the state and return */
3261 args->source = source;
3262 args->target = (char*)target;
3263 }
3264
3265
3266 static void U_CALLCONV
3267 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3268 UErrorCode* err){
3269 char tempBuf[3];
3270 const char *mySource = (char *) args->source;
3271 UChar *myTarget = args->target;
3272 const char *mySourceLimit = args->sourceLimit;
3273 uint32_t targetUniChar = 0x0000;
3274 uint32_t mySourceChar = 0x0000;
3275 UConverterDataISO2022* myData;
3276 ISO2022State *pToU2022State;
3277
3278 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3279 pToU2022State = &myData->toU2022State;
3280
3281 if(myData->key != 0) {
3282 /* continue with a partial escape sequence */
3283 goto escape;
3284 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3285 /* continue with a partial double-byte character */
3286 mySourceChar = args->converter->toUBytes[0];
3287 args->converter->toULength = 0;
3288 targetUniChar = missingCharMarker;
3289 goto getTrailByte;
3290 }
3291
3292 while(mySource < mySourceLimit){
3293
3294 targetUniChar =missingCharMarker;
3295
3296 if(myTarget < args->targetLimit){
3297
3298 mySourceChar= (unsigned char) *mySource++;
3299
3300 switch(mySourceChar){
3301 case UCNV_SI:
3302 pToU2022State->g=0;
3303 if (myData->isEmptySegment) {
3304 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3305 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3306 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3307 args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
3308 args->converter->toULength = 1;
3309 args->target = myTarget;
3310 args->source = mySource;
3311 return;
3312 }
3313 continue;
3314
3315 case UCNV_SO:
3316 if(pToU2022State->cs[1] != 0) {
3317 pToU2022State->g=1;
3318 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3319 continue;
3320 } else {
3321 /* illegal to have SO before a matching designator */
3322 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
3323 break;
3324 }
3325
3326 case ESC_2022:
3327 mySource--;
3328 escape:
3329 {
3330 const char * mySourceBefore = mySource;
3331 int8_t toULengthBefore = args->converter->toULength;
3332
3333 changeState_2022(args->converter,&(mySource),
3334 mySourceLimit, ISO_2022_CN,err);
3335
3336 /* After SO there must be at least one character before a designator (designator error handled separately) */
3337 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3338 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3339 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3340 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3341 }
3342 }
3343
3344 /* invalid or illegal escape sequence */
3345 if(U_FAILURE(*err)){
3346 args->target = myTarget;
3347 args->source = mySource;
3348 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3349 return;
3350 }
3351 continue;
3352
3353 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3354
3355 case CR:
3356 case LF:
3357 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3358 U_FALLTHROUGH;
3359 default:
3360 /* convert one or two bytes */
3361 myData->isEmptySegment = FALSE;
3362 if(pToU2022State->g != 0) {
3363 if(mySource < mySourceLimit) {
3364 UConverterSharedData *cnv;
3365 StateEnum tempState;
3366 int32_t tempBufLen;
3367 int leadIsOk, trailIsOk;
3368 uint8_t trailByte;
3369 getTrailByte:
3370 trailByte = (uint8_t)*mySource;
3371 /*
3372 * Ticket 5691: consistent illegal sequences:
3373 * - We include at least the first byte in the illegal sequence.
3374 * - If any of the non-initial bytes could be the start of a character,
3375 * we stop the illegal sequence before the first one of those.
3376 *
3377 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379 * Otherwise we convert or report the pair of bytes.
3380 */
3381 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3382 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3383 if (leadIsOk && trailIsOk) {
3384 ++mySource;
3385 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3386 if(tempState >= CNS_11643_0) {
3387 cnv = myData->myConverterArray[CNS_11643];
3388 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3389 tempBuf[1] = (char) (mySourceChar);
3390 tempBuf[2] = (char) trailByte;
3391 tempBufLen = 3;
3392
3393 }else{
3394 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3395 cnv = myData->myConverterArray[tempState];
3396 tempBuf[0] = (char) (mySourceChar);
3397 tempBuf[1] = (char) trailByte;
3398 tempBufLen = 2;
3399 }
3400 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3401 mySourceChar = (mySourceChar << 8) | trailByte;
3402 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3403 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3404 ++mySource;
3405 /* add another bit so that the code below writes 2 bytes in case of error */
3406 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3407 }
3408 if(pToU2022State->g>=2) {
3409 /* return from a single-shift state to the previous one */
3410 pToU2022State->g=pToU2022State->prevG;
3411 }
3412 } else {
3413 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3414 args->converter->toULength = 1;
3415 goto endloop;
3416 }
3417 }
3418 else{
3419 if(mySourceChar <= 0x7f) {
3420 targetUniChar = (UChar) mySourceChar;
3421 }
3422 }
3423 break;
3424 }
3425 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3426 if(args->offsets){
3427 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3428 }
3429 *(myTarget++)=(UChar)targetUniChar;
3430 }
3431 else if(targetUniChar > missingCharMarker){
3432 /* disassemble the surrogate pair and write to output*/
3433 targetUniChar-=0x0010000;
3434 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3435 if(args->offsets){
3436 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3437 }
3438 ++myTarget;
3439 if(myTarget< args->targetLimit){
3440 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3441 if(args->offsets){
3442 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3443 }
3444 ++myTarget;
3445 }else{
3446 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3447 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3448 }
3449
3450 }
3451 else{
3452 /* Call the callback function*/
3453 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3454 break;
3455 }
3456 }
3457 else{
3458 *err =U_BUFFER_OVERFLOW_ERROR;
3459 break;
3460 }
3461 }
3462 endloop:
3463 args->target = myTarget;
3464 args->source = mySource;
3465 }
3466 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3467
3468 static void U_CALLCONV
3469 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3470 UConverter *cnv = args->converter;
3471 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3472 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3473 char *p, *subchar;
3474 char buffer[8];
3475 int32_t length;
3476
3477 subchar=(char *)cnv->subChars;
3478 length=cnv->subCharLen; /* assume length==1 for most variants */
3479
3480 p = buffer;
3481 switch(myConverterData->locale[0]){
3482 case 'j':
3483 {
3484 int8_t cs;
3485
3486 if(pFromU2022State->g == 1) {
3487 /* JIS7: switch from G1 to G0 */
3488 pFromU2022State->g = 0;
3489 *p++ = UCNV_SI;
3490 }
3491
3492 cs = pFromU2022State->cs[0];
3493 if(cs != ASCII && cs != JISX201) {
3494 /* not in ASCII or JIS X 0201: switch to ASCII */
3495 pFromU2022State->cs[0] = (int8_t)ASCII;
3496 *p++ = '\x1b';
3497 *p++ = '\x28';
3498 *p++ = '\x42';
3499 }
3500
3501 *p++ = subchar[0];
3502 break;
3503 }
3504 case 'c':
3505 if(pFromU2022State->g != 0) {
3506 /* not in ASCII mode: switch to ASCII */
3507 pFromU2022State->g = 0;
3508 *p++ = UCNV_SI;
3509 }
3510 *p++ = subchar[0];
3511 break;
3512 case 'k':
3513 if(myConverterData->version == 0) {
3514 if(length == 1) {
3515 if(args->converter->fromUnicodeStatus) {
3516 /* in DBCS mode: switch to SBCS */
3517 args->converter->fromUnicodeStatus = 0;
3518 *p++ = UCNV_SI;
3519 }
3520 *p++ = subchar[0];
3521 } else /* length == 2*/ {
3522 if(!args->converter->fromUnicodeStatus) {
3523 /* in SBCS mode: switch to DBCS */
3524 args->converter->fromUnicodeStatus = 1;
3525 *p++ = UCNV_SO;
3526 }
3527 *p++ = subchar[0];
3528 *p++ = subchar[1];
3529 }
3530 break;
3531 } else {
3532 /* save the subconverter's substitution string */
3533 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3534 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3535
3536 /* set our substitution string into the subconverter */
3537 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3538 myConverterData->currentConverter->subCharLen = (int8_t)length;
3539
3540 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3541 args->converter = myConverterData->currentConverter;
3542 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3543 ucnv_cbFromUWriteSub(args, 0, err);
3544 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3545 args->converter = cnv;
3546
3547 /* restore the subconverter's substitution string */
3548 myConverterData->currentConverter->subChars = currentSubChars;
3549 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3550
3551 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3552 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3553 uprv_memcpy(
3554 cnv->charErrorBuffer,
3555 myConverterData->currentConverter->charErrorBuffer,
3556 myConverterData->currentConverter->charErrorBufferLength);
3557 }
3558 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3559 myConverterData->currentConverter->charErrorBufferLength = 0;
3560 }
3561 return;
3562 }
3563 default:
3564 /* not expected */
3565 break;
3566 }
3567 ucnv_cbFromUWriteBytes(args,
3568 buffer, (int32_t)(p - buffer),
3569 offsetIndex, err);
3570 }
3571
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 *
 * Field order matters: _ISO_2022_SafeClone() hands out
 * sizeof(UConverter) + sizeof(UAlignedMemory) bytes starting at
 * currentConverter for the sub-converter clone, so deadSpace must
 * directly follow currentConverter.
 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned main converter; its address is what the clone function returns */
    UConverter currentConverter;    /* storage for the cloned sub-converter, if there is one */
    UAlignedMemory deadSpace;       /* slack so the sub-converter clone can be re-aligned */
    UConverterDataISO2022 mydata;   /* private copy of the ISO-2022 extra data */
};
3590
3591
3592 U_CDECL_BEGIN
3593
3594 static UConverter * U_CALLCONV
3595 _ISO_2022_SafeClone(
3596 const UConverter *cnv,
3597 void *stackBuffer,
3598 int32_t *pBufferSize,
3599 UErrorCode *status)
3600 {
3601 struct cloneStruct * localClone;
3602 UConverterDataISO2022 *cnvData;
3603 int32_t i, size;
3604
3605 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3606 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3607 return NULL;
3608 }
3609
3610 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3611 localClone = (struct cloneStruct *)stackBuffer;
3612
3613 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3614
3615 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3616 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3617 localClone->cnv.isExtraLocal = TRUE;
3618
3619 /* share the subconverters */
3620
3621 if(cnvData->currentConverter != NULL) {
3622 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3623 localClone->mydata.currentConverter =
3624 ucnv_safeClone(cnvData->currentConverter,
3625 &localClone->currentConverter,
3626 &size, status);
3627 if(U_FAILURE(*status)) {
3628 return NULL;
3629 }
3630 }
3631
3632 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3633 if(cnvData->myConverterArray[i] != NULL) {
3634 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3635 }
3636 }
3637
3638 return &localClone->cnv;
3639 }
3640
3641 U_CDECL_END
3642
3643 static void U_CALLCONV
3644 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3645 const USetAdder *sa,
3646 UConverterUnicodeSet which,
3647 UErrorCode *pErrorCode)
3648 {
3649 int32_t i;
3650 UConverterDataISO2022* cnvData;
3651
3652 if (U_FAILURE(*pErrorCode)) {
3653 return;
3654 }
3655 #ifdef U_ENABLE_GENERIC_ISO_2022
3656 if (cnv->sharedData == &_ISO2022Data) {
3657 /* We use UTF-8 in this case */
3658 sa->addRange(sa->set, 0, 0xd7FF);
3659 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3660 return;
3661 }
3662 #endif
3663
3664 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3665
3666 /* open a set and initialize it with code points that are algorithmically round-tripped */
3667 switch(cnvData->locale[0]){
3668 case 'j':
3669 /* include JIS X 0201 which is hardcoded */
3670 sa->add(sa->set, 0xa5);
3671 sa->add(sa->set, 0x203e);
3672 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3673 /* include Latin-1 for some variants of JP */
3674 sa->addRange(sa->set, 0, 0xff);
3675 } else {
3676 /* include ASCII for JP */
3677 sa->addRange(sa->set, 0, 0x7f);
3678 }
3679 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3680 /*
3681 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3682 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3683 * use half-width Katakana.
3684 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3685 * half-width Katakana via the ESC ( I sequence.
3686 * However, we only emit (fromUnicode) half-width Katakana according to the
3687 * definition of each variant.
3688 *
3689 * When including fallbacks,
3690 * we need to include half-width Katakana Unicode code points for all JP variants because
3691 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3692 */
3693 /* include half-width Katakana for JP */
3694 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3695 }
3696 break;
3697 #if !UCONFIG_ONLY_HTML_CONVERSION
3698 case 'c':
3699 case 'z':
3700 /* include ASCII for CN */
3701 sa->addRange(sa->set, 0, 0x7f);
3702 break;
3703 case 'k':
3704 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3705 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3706 cnvData->currentConverter, sa, which, pErrorCode);
3707 /* the loop over myConverterArray[] will simply not find another converter */
3708 break;
3709 #endif
3710 default:
3711 break;
3712 }
3713
3714 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3715 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3716 cnvData->version==0 && i==CNS_11643
3717 ) {
3718 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3719 ucnv_MBCSGetUnicodeSetForBytes(
3720 cnvData->myConverterArray[i],
3721 sa, UCNV_ROUNDTRIP_SET,
3722 0, 0x81, 0x82,
3723 pErrorCode);
3724 }
3725 #endif
3726
3727 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3728 UConverterSetFilter filter;
3729 if(cnvData->myConverterArray[i]!=NULL) {
3730 if(cnvData->locale[0]=='j' && i==JISX208) {
3731 /*
3732 * Only add code points that map to Shift-JIS codes
3733 * corresponding to JIS X 0208.
3734 */
3735 filter=UCNV_SET_FILTER_SJIS;
3736 #if !UCONFIG_ONLY_HTML_CONVERSION
3737 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3738 cnvData->version==0 && i==CNS_11643) {
3739 /*
3740 * Version-specific for CN:
3741 * CN version 0 does not map CNS planes 3..7 although
3742 * they are all available in the CNS conversion table;
3743 * CN version 1 (-EXT) does map them all.
3744 * The two versions create different Unicode sets.
3745 */
3746 filter=UCNV_SET_FILTER_2022_CN;
3747 } else if(i==KSC5601) {
3748 /*
3749 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3750 * are broader than GR94.
3751 */
3752 filter=UCNV_SET_FILTER_GR94DBCS;
3753 #endif
3754 } else {
3755 filter=UCNV_SET_FILTER_NONE;
3756 }
3757 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3758 }
3759 }
3760
3761 /*
3762 * ISO 2022 converters must not convert SO/SI/ESC despite what
3763 * sub-converters do by themselves.
3764 * Remove these characters from the set.
3765 */
3766 sa->remove(sa->set, 0x0e);
3767 sa->remove(sa->set, 0x0f);
3768 sa->remove(sa->set, 0x1b);
3769
3770 /* ISO 2022 converters do not convert C1 controls either */
3771 sa->removeRange(sa->set, 0x80, 0x9f);
3772 }
3773
/*
 * Implementation table for the generic ISO-2022 converter.
 * The initializers are positional.
 * NOTE(review): the meaning of each slot comes from the UConverterImpl
 * declaration, which is outside this chunk - confirm against it before
 * reordering or inserting entries.
 * The four conversion slots are only populated when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the generic ISO-2022 converter, plus the shared-data
 * object that ties it to _ISO2022Impl.
 * NOTE(review): the initializers are positional; field meanings come from the
 * UConverterStaticData declaration, which is outside this chunk - confirm there.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData),
    "ISO_2022",
    2022,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* not static: _ISO_2022_GetUnicodeSet() compares cnv->sharedData against this object's address */
const UConverterSharedData _ISO2022Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3824
/*************JP****************/
/*
 * Implementation table for the ISO-2022-JP variants.
 * It shares open/close/reset, getName, writeSub, safeClone and getUnicodeSet
 * with the generic table and plugs in the JP-specific conversion functions.
 * The same function is listed twice for each direction.
 * NOTE(review): presumably one slot is the plain and one the with-offsets
 * entry point - confirm against the UConverterImpl declaration (not shown here).
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static descriptor for the ISO-2022-JP converter.
 *
 * Positional initializer for UConverterStaticData, which is declared outside
 * this file. Field meanings marked "presumably" are assumptions about that
 * declaration -- TODO confirm before relying on them.
 */
3851 static const UConverterStaticData _ISO2022JPStaticData={
3852 sizeof(UConverterStaticData), /* size of this structure */
3853 "ISO_2022_JP", /* converter name */
3854 0, /* presumably the codepage number (none assigned) -- confirm */
3855 UCNV_IBM, /* presumably the platform -- confirm */
3856 UCNV_ISO_2022, /* same type constant as used in _ISO2022JPImpl */
3857 1, /* presumably min bytes per UChar -- confirm */
3858 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3859 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence -- confirm */
3860 1, /* presumably the length of the substitution sequence -- confirm */
3861 FALSE, /* the four values from here to the reserved array are flags/masks */
3862 FALSE, /* whose meaning is not visible in this file; all are off/zero */
3863 0,
3864 0,
3865 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3866 };
3867
/*
 * Shared-data record for ISO-2022-JP, combining _ISO2022JPStaticData and
 * _ISO2022JPImpl via UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER (macro defined
 * elsewhere). The anonymous namespace gives the object internal linkage, so
 * it can only be referenced from within this translation unit.
 */
3868 namespace {
3869
3870 const UConverterSharedData _ISO2022JPData=
3871 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3872
3873 } // namespace
3874
3875 #if !UCONFIG_ONLY_HTML_CONVERSION
3876 /************* KR ***************/
/*
 * Function table for the ISO-2022-KR converter.
 *
 * Positional initializer: entry order must match the member order of
 * UConverterImpl, which is declared outside this file. Names of the NULL
 * slots are assumptions -- TODO confirm against that declaration.
 * Open/close/reset and the name/sub/clone/set helpers are the same functions
 * used by the other ISO-2022 tables in this file; only the four conversion
 * functions are KR-specific.
 */
3877 static const UConverterImpl _ISO2022KRImpl={
3878 UCNV_ISO_2022, /* converter type constant, shared by all tables in this file */
3879
3880 NULL, /* presumably the load hook -- confirm */
3881 NULL, /* presumably the unload hook -- confirm */
3882
3883 _ISO2022Open,
3884 _ISO2022Close,
3885 _ISO2022Reset,
3886
3887 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* one function fills both to-Unicode slots */
3888 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3889 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* one function fills both from-Unicode slots */
3890 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3891 NULL, /* presumably getNextUChar -- confirm */
3892
3893 NULL, /* presumably getStarters -- confirm */
3894 _ISO2022getName,
3895 _ISO_2022_WriteSub,
3896 _ISO_2022_SafeClone,
3897 _ISO_2022_GetUnicodeSet,
3898
3899 NULL, /* last two slots unused; presumably toUTF8/fromUTF8 -- confirm */
3900 NULL
3901 };
/*
 * Static descriptor for the ISO-2022-KR converter.
 *
 * Positional initializer for UConverterStaticData, which is declared outside
 * this file. Field meanings marked "presumably" are assumptions about that
 * declaration -- TODO confirm before relying on them.
 */
3902 static const UConverterStaticData _ISO2022KRStaticData={
3903 sizeof(UConverterStaticData), /* size of this structure */
3904 "ISO_2022_KR", /* converter name */
3905 0, /* presumably the codepage number (none assigned) -- confirm */
3906 UCNV_IBM, /* presumably the platform -- confirm */
3907 UCNV_ISO_2022, /* same type constant as used in _ISO2022KRImpl */
3908 1, /* presumably min bytes per UChar -- confirm */
3909 8, /* max 8 bytes per UChar */
3910 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence -- confirm */
3911 1, /* presumably the length of the substitution sequence -- confirm */
3912 FALSE, /* the four values from here to the reserved array are flags/masks */
3913 FALSE, /* whose meaning is not visible in this file; all are off/zero */
3914 0,
3915 0,
3916 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3917 };
3918
/*
 * Shared-data record for ISO-2022-KR, combining _ISO2022KRStaticData and
 * _ISO2022KRImpl via UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER (macro defined
 * elsewhere). The anonymous namespace gives the object internal linkage, so
 * it can only be referenced from within this translation unit.
 */
3919 namespace {
3920
3921 const UConverterSharedData _ISO2022KRData=
3922 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3923
3924 } // namespace
3925
3926 /*************** CN ***************/
/*
 * Function table for the ISO-2022-CN converter.
 *
 * Positional initializer: entry order must match the member order of
 * UConverterImpl, which is declared outside this file. Names of the NULL
 * slots are assumptions -- TODO confirm against that declaration.
 * Open/close/reset and the name/sub/clone/set helpers are the same functions
 * used by the other ISO-2022 tables in this file; only the four conversion
 * functions are CN-specific.
 */
3927 static const UConverterImpl _ISO2022CNImpl={
3928
3929 UCNV_ISO_2022, /* converter type constant, shared by all tables in this file */
3930
3931 NULL, /* presumably the load hook -- confirm */
3932 NULL, /* presumably the unload hook -- confirm */
3933
3934 _ISO2022Open,
3935 _ISO2022Close,
3936 _ISO2022Reset,
3937
3938 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* one function fills both to-Unicode slots */
3939 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3940 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* one function fills both from-Unicode slots */
3941 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3942 NULL, /* presumably getNextUChar -- confirm */
3943
3944 NULL, /* presumably getStarters -- confirm */
3945 _ISO2022getName,
3946 _ISO_2022_WriteSub,
3947 _ISO_2022_SafeClone,
3948 _ISO_2022_GetUnicodeSet,
3949
3950 NULL, /* last two slots unused; presumably toUTF8/fromUTF8 -- confirm */
3951 NULL
3952 };
/*
 * Static descriptor for the ISO-2022-CN converter.
 *
 * Positional initializer for UConverterStaticData, which is declared outside
 * this file. Field meanings marked "presumably" are assumptions about that
 * declaration -- TODO confirm before relying on them.
 */
3953 static const UConverterStaticData _ISO2022CNStaticData={
3954 sizeof(UConverterStaticData), /* size of this structure */
3955 "ISO_2022_CN", /* converter name */
3956 0, /* presumably the codepage number (none assigned) -- confirm */
3957 UCNV_IBM, /* presumably the platform -- confirm */
3958 UCNV_ISO_2022, /* same type constant as used in _ISO2022CNImpl */
3959 1, /* presumably min bytes per UChar -- confirm */
3960 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3961 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence -- confirm */
3962 1, /* presumably the length of the substitution sequence -- confirm */
3963 FALSE, /* the four values from here to the reserved array are flags/masks */
3964 FALSE, /* whose meaning is not visible in this file; all are off/zero */
3965 0,
3966 0,
3967 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3968 };
3969
/*
 * Shared-data record for ISO-2022-CN, combining _ISO2022CNStaticData and
 * _ISO2022CNImpl via UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER (macro defined
 * elsewhere). The anonymous namespace gives the object internal linkage, so
 * it can only be referenced from within this translation unit.
 */
3970 namespace {
3971
3972 const UConverterSharedData _ISO2022CNData=
3973 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3974
3975 } // namespace
3976 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3977
3978 #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */