]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv2022.c
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
46f4442e 3* Copyright (C) 2000-2008, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv2022.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2000feb03
12* created by: Markus W. Scherer
13*
14* Change history:
15*
16* 06/29/2000 helena Major rewrite of the callback APIs.
17* 08/08/2000 Ram Included support for ISO-2022-JP-2
18* Changed implementation of toUnicode
19* function
20* 08/21/2000 Ram Added support for ISO-2022-KR
21* 08/29/2000 Ram Separated implementation of EBCDIC to
22* ucnvebdc.c
23* 09/20/2000 Ram Added support for ISO-2022-CN
24* Added implementations for getNextUChar()
25* for specific 2022 country variants.
26* 10/31/2000 Ram Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
374ca955 31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
374ca955 37#include "ucnv_imp.h"
b75a7d8f
A
38#include "ucnv_bld.h"
39#include "ucnv_cnv.h"
40#include "ucnvmbcs.h"
41#include "cstring.h"
42#include "cmemory.h"
43
374ca955
A
44#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
45
46#ifdef U_ENABLE_GENERIC_ISO_2022
47/*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76#endif
77
78static const char SHIFT_IN_STR[] = "\x0F";
79static const char SHIFT_OUT_STR[] = "\x0E";
b75a7d8f
A
80
81#define CR 0x0D
82#define LF 0x0A
83#define H_TAB 0x09
84#define V_TAB 0x0B
85#define SPACE 0x20
86
46f4442e
A
87enum {
88 HWKANA_START=0xff61,
89 HWKANA_END=0xff9f
90};
91
92/*
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
99 */
100enum {
101 GR94_START=0xa1,
102 GR94_END=0xfe,
103 GR96_START=0xa0,
104 GR96_END=0xff
105};
106
73c04bcf
A
/*
 * The ISO 2022 control codes SO (0x0e), SI (0x0f) and ESC (0x1b) must not be
 * produced when converting from Unicode, because they would mess up the
 * byte stream.
 * 0x0800c000 is a bit set with exactly the bits 0xe, 0xf and 0x1b turned on;
 * the macro tests whether bit number (c) of that set is on.
 * Note that (c) is evaluated twice.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)0x0800c000>>(c))&1)!=0)
114
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups reuse the same small values (1..3), so an enumerator
 * is only meaningful together with the variant being converted.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII = 0,

    /* SS2/SS3 markers */
    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* JP charsets */
    ISO8859_1 = 1,
    ISO8859_7 = 2,
    JISX201 = 3,
    JISX208 = 4,
    JISX212 = 5,
    GB2312 = 6,
    KSC5601 = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /*
     * CN charsets.
     * The first few constants must keep their values because they
     * correspond to myConverterArray[].
     */
    GB2312_1 = 1,
    ISO_IR_165 = 2,
    CNS_11643 = 3,

    /*
     * These are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
153
374ca955
A
154/* is the StateEnum charset value for a DBCS charset? */
155#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
156
157#define CSM(cs) ((uint16_t)1<<(cs))
b75a7d8f 158
374ca955
A
159/*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
168static const uint16_t jpCharsetMasks[5]={
169 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
170 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
171 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
174};
b75a7d8f
A
175
/* Converter type values; this is the type of UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS = 2,
    DBCS = 3,
    MBCS = 4,
    HWKANA = 5
} Cnv2022Type;
184
374ca955
A
/* Designation and shift state; one instance is kept per conversion direction. */
typedef struct ISO2022State {
    /* charset number for each of SI (G0), SO (G1), SS2 (G2), SS3 (G3) */
    int8_t cs[4];
    /* currently selected set: 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
190
b75a7d8f
A
191#define UCNV_OPTIONS_VERSION_MASK 0xf
192#define UCNV_2022_MAX_CONVERTERS 10
193
194typedef struct{
73c04bcf 195 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
b75a7d8f 196 UConverter *currentConverter;
b75a7d8f 197 Cnv2022Type currentType;
374ca955 198 ISO2022State toU2022State, fromU2022State;
b75a7d8f
A
199 uint32_t key;
200 uint32_t version;
73c04bcf
A
201#ifdef U_ENABLE_GENERIC_ISO_2022
202 UBool isFirstBuffer;
203#endif
d5d484b0 204 UBool isEmptySegment;
b75a7d8f 205 char name[30];
73c04bcf 206 char locale[3];
b75a7d8f
A
207}UConverterDataISO2022;
208
374ca955 209/* Protos */
b75a7d8f
A
210/* ISO-2022 ----------------------------------------------------------------- */
211
212/*Forward declaration */
46f4442e 213U_CFUNC void
374ca955
A
214ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
215 UErrorCode * err);
46f4442e 216U_CFUNC void
374ca955
A
217ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
218 UErrorCode * err);
b75a7d8f
A
219
220#define ESC_2022 0x1B /*ESC*/
221
/* Classification of the bytes collected so far against the ISO 2022 escape sequence table. */
typedef enum {
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid, complete ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* matches one escape sequence so far, but adding more characters might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
229
230/*
231* The way these state transition arrays work is:
232* ex : ESC$B is the sequence for JISX208
233* a) First Iteration: char is ESC
234* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
235* int x = normalize_esq_chars_2022[27] which is equal to 1
236* ii) Search for this value in escSeqStateTable_Key_2022[]
237* value of x is stored at escSeqStateTable_Key_2022[0]
238* iii) Save this index as offset
239* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
240* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
241* b) Switch on this state and continue to next char
242* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
243* which is normalize_esq_chars_2022[36] == 4
244* ii) x is currently 1(from above)
245* x<<=5 -- x is now 32
246* x+=normalize_esq_chars_2022[36]
247* now x is 36
248* iii) Search for this value in escSeqStateTable_Key_2022[]
249* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
250* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
251* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
252* c) Switch on this state and continue to next char
253* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
254* ii) x is currently 36 (from above)
255* x<<=5 -- x is now 1152
256* x+=normalize_esq_chars_2022[66]
257* now x is 1161
258* iii) Search for this value in escSeqStateTable_Key_2022[]
259* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
260* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
261* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
262* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
263*/
264
265
266/*Below are the 3 arrays depicting a state transition table*/
267static const int8_t normalize_esq_chars_2022[256] = {
268/* 0 1 2 3 4 5 6 7 8 9 */
269
270 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
271 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
272 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
273 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
274 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
275 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
276 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
277 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
278 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
279 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
280 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
281 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
282 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
295 ,0 ,0 ,0 ,0 ,0 ,0
296};
297
374ca955
A
298#ifdef U_ENABLE_GENERIC_ISO_2022
299/*
300 * When the generic ISO-2022 converter is completely removed, not just disabled
301 * per #ifdef, then the following state table and the associated tables that are
302 * dimensioned with MAX_STATES_2022 should be trimmed.
303 *
304 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
305 * the associated escape sequences starting with ESC ( B should be removed.
306 * This includes the ones with key values 1097 and all of the ones above 1000000.
307 *
308 * For the latter, the tables can simply be truncated.
309 * For the former, since the tables must be kept parallel, it is probably best
310 * to simply duplicate an adjacent table cell, parallel in all tables.
311 *
312 * It may make sense to restructure the tables, especially by using small search
313 * tables for the variants instead of indexing them parallel to the table here.
314 */
315#endif
316
b75a7d8f
A
317#define MAX_STATES_2022 74
318static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
319/* 0 1 2 3 4 5 6 7 8 9 */
320
321 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
322 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
323 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
324 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
325 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
326 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
327 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
328 ,35947631 ,35947635 ,35947636 ,35947638
329};
330
374ca955 331#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
332
333static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
334 /* 0 1 2 3 4 5 6 7 8 9 */
335
336 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
374ca955 337 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
b75a7d8f
A
338 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
339 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
340 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
341 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
342 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
343 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
344};
345
374ca955
A
346#endif
347
46f4442e 348static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
b75a7d8f 349/* 0 1 2 3 4 5 6 7 8 9 */
374ca955 350 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
b75a7d8f
A
351 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
352 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
353 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
354 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
358};
359
360
b75a7d8f
A
/* Selects the ISO-2022 variant that changeState_2022() works for. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
370
b75a7d8f 371/*********** ISO 2022 Converter Protos ***********/
46f4442e 372static void
b75a7d8f
A
373_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
374
375static void
376 _ISO2022Close(UConverter *converter);
377
46f4442e 378static void
b75a7d8f
A
379_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
380
46f4442e 381static const char*
b75a7d8f
A
382_ISO2022getName(const UConverter* cnv);
383
46f4442e 384static void
b75a7d8f
A
385_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
386
46f4442e 387static UConverter *
b75a7d8f
A
388_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
389
374ca955 390#ifdef U_ENABLE_GENERIC_ISO_2022
46f4442e 391static void
374ca955
A
392T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
393#endif
b75a7d8f 394
374ca955
A
395/*const UConverterSharedData _ISO2022Data;*/
396static const UConverterSharedData _ISO2022JPData;
397static const UConverterSharedData _ISO2022KRData;
398static const UConverterSharedData _ISO2022CNData;
b75a7d8f 399
374ca955 400/*************** Converter implementations ******************/
b75a7d8f 401
73c04bcf
A
402/* The purpose of this function is to get around gcc compiler warnings. */
403static U_INLINE void
404fromUWriteUInt8(UConverter *cnv,
405 const char *bytes, int32_t length,
406 uint8_t **target, const char *targetLimit,
407 int32_t **offsets,
408 int32_t sourceIndex,
409 UErrorCode *pErrorCode)
410{
411 char *targetChars = (char *)*target;
412 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
413 offsets, sourceIndex, pErrorCode);
414 *target = (uint8_t*)targetChars;
415
416}
417
46f4442e 418static U_INLINE void
374ca955
A
419setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
420 if(myConverterData->version == 1) {
421 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 422
374ca955
A
423 cnv->toUnicodeStatus=0; /* offset */
424 cnv->mode=0; /* state */
425 cnv->toULength=0; /* byteIndex */
426 }
427}
b75a7d8f 428
46f4442e 429static U_INLINE void
374ca955
A
430setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
431 /* in ISO-2022-KR the designator sequence appears only once
432 * in a file so we append it only once
433 */
434 if( converter->charErrorBufferLength==0){
b75a7d8f 435
374ca955
A
436 converter->charErrorBufferLength = 4;
437 converter->charErrorBuffer[0] = 0x1b;
438 converter->charErrorBuffer[1] = 0x24;
439 converter->charErrorBuffer[2] = 0x29;
440 converter->charErrorBuffer[3] = 0x43;
441 }
442 if(myConverterData->version == 1) {
443 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 444
374ca955
A
445 cnv->fromUChar32=0;
446 cnv->fromUnicodeStatus=1; /* prevLength */
447 }
448}
b75a7d8f 449
46f4442e 450static void
374ca955 451_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
b75a7d8f 452
374ca955 453 char myLocale[6]={' ',' ',' ',' ',' ',' '};
b75a7d8f 454
374ca955
A
455 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
456 if(cnv->extraInfo != NULL) {
457 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
458 uint32_t version;
b75a7d8f 459
374ca955 460 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
374ca955 461 myConverterData->currentType = ASCII1;
374ca955
A
462 cnv->fromUnicodeStatus =FALSE;
463 if(locale){
464 uprv_strncpy(myLocale, locale, sizeof(myLocale));
465 }
374ca955 466 version = options & UCNV_OPTIONS_VERSION_MASK;
73c04bcf 467 myConverterData->version = version;
46f4442e 468 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
73c04bcf
A
469 (myLocale[2]=='_' || myLocale[2]=='\0'))
470 {
471 size_t len=0;
374ca955
A
472 /* open the required converters and cache them */
473 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
474 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
475 }
46f4442e 476 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
374ca955
A
477 if(jpCharsetMasks[version]&CSM(JISX212)) {
478 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
479 }
480 if(jpCharsetMasks[version]&CSM(GB2312)) {
481 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
482 }
483 if(jpCharsetMasks[version]&CSM(KSC5601)) {
484 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
485 }
b75a7d8f 486
374ca955
A
487 /* set the function pointers to appropriate funtions */
488 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
489 uprv_strcpy(myConverterData->locale,"ja");
b75a7d8f 490
46f4442e 491 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
374ca955
A
492 len = uprv_strlen(myConverterData->name);
493 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
494 myConverterData->name[len+1]='\0';
495 }
46f4442e 496 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
73c04bcf
A
497 (myLocale[2]=='_' || myLocale[2]=='\0'))
498 {
499 if (version==1){
500 myConverterData->currentConverter=
501 ucnv_open("icu-internal-25546",errorCode);
b75a7d8f 502
73c04bcf
A
503 if (U_FAILURE(*errorCode)) {
504 _ISO2022Close(cnv);
505 return;
506 }
b75a7d8f 507
46f4442e 508 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
73c04bcf
A
509 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
510 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
374ca955 511 }else{
73c04bcf 512 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
b75a7d8f 513
73c04bcf
A
514 if (U_FAILURE(*errorCode)) {
515 _ISO2022Close(cnv);
516 return;
517 }
b75a7d8f 518
73c04bcf 519 myConverterData->version = 0;
46f4442e 520 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
374ca955 521 }
b75a7d8f 522
374ca955
A
523 /* initialize the state variables */
524 setInitialStateToUnicodeKR(cnv, myConverterData);
73c04bcf 525 setInitialStateFromUnicodeKR(cnv, myConverterData);
b75a7d8f
A
526
527 /* set the function pointers to appropriate funtions */
528 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
b75a7d8f
A
529 uprv_strcpy(myConverterData->locale,"ko");
530 }
46f4442e 531 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
73c04bcf
A
532 (myLocale[2]=='_' || myLocale[2]=='\0'))
533 {
b75a7d8f
A
534
535 /* open the required converters and cache them */
374ca955
A
536 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
537 if(version==1) {
538 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
539 }
540 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
b75a7d8f 541
b75a7d8f
A
542
543 /* set the function pointers to appropriate funtions */
544 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
545 uprv_strcpy(myConverterData->locale,"cn");
546
73c04bcf 547 if (version==1){
46f4442e 548 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
b75a7d8f 549 }else{
b75a7d8f 550 myConverterData->version = 0;
46f4442e 551 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
b75a7d8f
A
552 }
553 }
554 else{
374ca955 555#ifdef U_ENABLE_GENERIC_ISO_2022
73c04bcf
A
556 myConverterData->isFirstBuffer = TRUE;
557
b75a7d8f
A
558 /* append the UTF-8 escape sequence */
559 cnv->charErrorBufferLength = 3;
560 cnv->charErrorBuffer[0] = 0x1b;
561 cnv->charErrorBuffer[1] = 0x25;
562 cnv->charErrorBuffer[2] = 0x42;
563
564 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
565 /* initialize the state variables */
b75a7d8f 566 uprv_strcpy(myConverterData->name,"ISO_2022");
374ca955
A
567#else
568 *errorCode = U_UNSUPPORTED_ERROR;
569 return;
570#endif
b75a7d8f
A
571 }
572
374ca955
A
573 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
574
575 if(U_FAILURE(*errorCode)) {
576 _ISO2022Close(cnv);
577 }
b75a7d8f
A
578 } else {
579 *errorCode = U_MEMORY_ALLOCATION_ERROR;
580 }
b75a7d8f
A
581}
582
583
584static void
585_ISO2022Close(UConverter *converter) {
374ca955
A
586 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
587 UConverterSharedData **array = myData->myConverterArray;
588 int32_t i;
b75a7d8f
A
589
590 if (converter->extraInfo != NULL) {
591 /*close the array of converter pointers and free the memory*/
374ca955
A
592 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
593 if(array[i]!=NULL) {
594 ucnv_unloadSharedDataIfReady(array[i]);
b75a7d8f 595 }
b75a7d8f
A
596 }
597
374ca955 598 ucnv_close(myData->currentConverter);
b75a7d8f
A
599
600 if(!converter->isExtraLocal){
601 uprv_free (converter->extraInfo);
374ca955 602 converter->extraInfo = NULL;
b75a7d8f
A
603 }
604 }
605}
606
607static void
608_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
609 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
374ca955
A
610 if(choice<=UCNV_RESET_TO_UNICODE) {
611 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
612 myConverterData->key = 0;
d5d484b0 613 myConverterData->isEmptySegment = FALSE;
374ca955
A
614 }
615 if(choice!=UCNV_RESET_TO_UNICODE) {
616 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
617 }
618#ifdef U_ENABLE_GENERIC_ISO_2022
619 if(myConverterData->locale[0] == 0){
b75a7d8f
A
620 if(choice<=UCNV_RESET_TO_UNICODE) {
621 myConverterData->isFirstBuffer = TRUE;
374ca955 622 myConverterData->key = 0;
b75a7d8f
A
623 if (converter->mode == UCNV_SO){
624 ucnv_close (myConverterData->currentConverter);
625 myConverterData->currentConverter=NULL;
626 }
46f4442e 627 converter->mode = UCNV_SI;
b75a7d8f
A
628 }
629 if(choice!=UCNV_RESET_TO_UNICODE) {
630 /* re-append UTF-8 escape sequence */
631 converter->charErrorBufferLength = 3;
632 converter->charErrorBuffer[0] = 0x1b;
633 converter->charErrorBuffer[1] = 0x28;
634 converter->charErrorBuffer[2] = 0x42;
635 }
636 }
374ca955
A
637 else
638#endif
639 {
b75a7d8f 640 /* reset the state variables */
374ca955 641 if(myConverterData->locale[0] == 'k'){
b75a7d8f
A
642 if(choice<=UCNV_RESET_TO_UNICODE) {
643 setInitialStateToUnicodeKR(converter, myConverterData);
644 }
645 if(choice!=UCNV_RESET_TO_UNICODE) {
646 setInitialStateFromUnicodeKR(converter, myConverterData);
647 }
648 }
649 }
650}
651
46f4442e 652static const char*
b75a7d8f
A
653_ISO2022getName(const UConverter* cnv){
654 if(cnv->extraInfo){
655 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
656 return myData->name;
657 }
658 return NULL;
659}
660
b75a7d8f 661
374ca955
A
662/*************** to unicode *******************/
663/****************************************************************************
664 * Recognized escape sequences are
665 * <ESC>(B ASCII
666 * <ESC>.A ISO-8859-1
667 * <ESC>.F ISO-8859-7
668 * <ESC>(J JISX-201
669 * <ESC>(I JISX-201
670 * <ESC>$B JISX-208
671 * <ESC>$@ JISX-208
672 * <ESC>$(D JISX-212
673 * <ESC>$A GB2312
674 * <ESC>$(C KSC5601
675 */
46f4442e 676static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
374ca955
A
677/* 0 1 2 3 4 5 6 7 8 9 */
678 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
679 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
680 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
681 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
682 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
683 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
684 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
685 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
686};
b75a7d8f 687
374ca955 688/*************** to unicode *******************/
46f4442e 689static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
374ca955
A
690/* 0 1 2 3 4 5 6 7 8 9 */
691 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
692 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
693 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
694 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
695 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
696 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
697 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
698 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
699};
b75a7d8f 700
b75a7d8f 701
46f4442e 702static UCNV_TableStates_2022
374ca955
A
703getKey_2022(char c,int32_t* key,int32_t* offset){
704 int32_t togo;
705 int32_t low = 0;
706 int32_t hi = MAX_STATES_2022;
707 int32_t oldmid=0;
b75a7d8f 708
374ca955
A
709 togo = normalize_esq_chars_2022[(uint8_t)c];
710 if(togo == 0) {
711 /* not a valid character anywhere in an escape sequence */
712 *key = 0;
713 *offset = 0;
714 return INVALID_2022;
715 }
716 togo = (*key << 5) + togo;
b75a7d8f 717
374ca955 718 while (hi != low) /*binary search*/{
b75a7d8f 719
374ca955
A
720 register int32_t mid = (hi+low) >> 1; /*Finds median*/
721
46f4442e 722 if (mid == oldmid)
374ca955
A
723 break;
724
725 if (escSeqStateTable_Key_2022[mid] > togo){
726 hi = mid;
727 }
728 else if (escSeqStateTable_Key_2022[mid] < togo){
729 low = mid;
730 }
731 else /*we found it*/{
732 *key = togo;
733 *offset = mid;
46f4442e 734 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
374ca955
A
735 }
736 oldmid = mid;
b75a7d8f 737
b75a7d8f 738 }
b75a7d8f 739
374ca955
A
740 *key = 0;
741 *offset = 0;
742 return INVALID_2022;
b75a7d8f
A
743}
744
374ca955
A
745/*runs through a state machine to determine the escape sequence - codepage correspondence
746 */
46f4442e 747static void
374ca955 748changeState_2022(UConverter* _this,
46f4442e 749 const char** source,
374ca955
A
750 const char* sourceLimit,
751 Variant2022 var,
752 UErrorCode* err){
753 UCNV_TableStates_2022 value;
754 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
755 uint32_t key = myData2022->key;
73c04bcf 756 int32_t offset = 0;
fd0068a8 757 int8_t initialToULength = _this->toULength;
374ca955
A
758 char c;
759
760 value = VALID_NON_TERMINAL_2022;
761 while (*source < sourceLimit) {
762 c = *(*source)++;
763 _this->toUBytes[_this->toULength++]=(uint8_t)c;
764 value = getKey_2022(c,(int32_t *) &key, &offset);
46f4442e 765
374ca955 766 switch (value){
b75a7d8f 767
374ca955
A
768 case VALID_NON_TERMINAL_2022 :
769 /* continue with the loop */
770 break;
b75a7d8f 771
374ca955
A
772 case VALID_TERMINAL_2022:
773 key = 0;
774 goto DONE;
b75a7d8f 775
374ca955
A
776 case INVALID_2022:
777 goto DONE;
b75a7d8f 778
374ca955
A
779 case VALID_MAYBE_TERMINAL_2022:
780#ifdef U_ENABLE_GENERIC_ISO_2022
781 /* ESC ( B is ambiguous only for ISO_2022 itself */
782 if(var == ISO_2022) {
783 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
784 _this->toULength = 0;
b75a7d8f 785
374ca955
A
786 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
787
788 /* continue with the loop */
789 value = VALID_NON_TERMINAL_2022;
790 break;
791 } else
792#endif
793 {
794 /* not ISO_2022 itself, finish here */
795 value = VALID_TERMINAL_2022;
796 key = 0;
797 goto DONE;
b75a7d8f
A
798 }
799 }
b75a7d8f 800 }
b75a7d8f 801
374ca955
A
802DONE:
803 myData2022->key = key;
b75a7d8f 804
374ca955
A
805 if (value == VALID_NON_TERMINAL_2022) {
806 /* indicate that the escape sequence is incomplete: key!=0 */
807 return;
808 } else if (value == INVALID_2022 ) {
809 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
374ca955
A
810 } else /* value == VALID_TERMINAL_2022 */ {
811 switch(var){
812#ifdef U_ENABLE_GENERIC_ISO_2022
813 case ISO_2022:
814 {
815 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
816 if(chosenConverterName == NULL) {
817 /* SS2 or SS3 */
818 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
46f4442e 819 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955 820 return;
b75a7d8f 821 }
374ca955
A
822
823 _this->mode = UCNV_SI;
824 ucnv_close(myData2022->currentConverter);
825 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
826 if(U_SUCCESS(*err)) {
827 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
828 _this->mode = UCNV_SO;
829 }
830 break;
831 }
832#endif
833 case ISO_2022_JP:
834 {
46f4442e 835 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
374ca955
A
836 switch(tempState) {
837 case INVALID_STATE:
838 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
839 break;
840 case SS2_STATE:
841 if(myData2022->toU2022State.cs[2]!=0) {
842 if(myData2022->toU2022State.g<2) {
843 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
844 }
845 myData2022->toU2022State.g=2;
846 } else {
847 /* illegal to have SS2 before a matching designator */
848 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
849 }
850 break;
851 /* case SS3_STATE: not used in ISO-2022-JP-x */
852 case ISO8859_1:
853 case ISO8859_7:
854 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
855 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
856 } else {
857 /* G2 charset for SS2 */
858 myData2022->toU2022State.cs[2]=(int8_t)tempState;
859 }
860 break;
861 default:
862 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
863 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
864 } else {
865 /* G0 charset */
866 myData2022->toU2022State.cs[0]=(int8_t)tempState;
867 }
868 break;
869 }
870 }
871 break;
872 case ISO_2022_CN:
873 {
46f4442e 874 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
374ca955
A
875 switch(tempState) {
876 case INVALID_STATE:
877 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
878 break;
879 case SS2_STATE:
880 if(myData2022->toU2022State.cs[2]!=0) {
881 if(myData2022->toU2022State.g<2) {
882 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
883 }
884 myData2022->toU2022State.g=2;
885 } else {
886 /* illegal to have SS2 before a matching designator */
887 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888 }
889 break;
890 case SS3_STATE:
891 if(myData2022->toU2022State.cs[3]!=0) {
892 if(myData2022->toU2022State.g<2) {
893 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
894 }
895 myData2022->toU2022State.g=3;
896 } else {
897 /* illegal to have SS3 before a matching designator */
898 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
899 }
900 break;
901 case ISO_IR_165:
902 if(myData2022->version==0) {
903 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
904 break;
905 }
73c04bcf 906 /*fall through*/
374ca955 907 case GB2312_1:
73c04bcf 908 /*fall through*/
374ca955
A
909 case CNS_11643_1:
910 myData2022->toU2022State.cs[1]=(int8_t)tempState;
911 break;
912 case CNS_11643_2:
913 myData2022->toU2022State.cs[2]=(int8_t)tempState;
914 break;
915 default:
916 /* other CNS 11643 planes */
917 if(myData2022->version==0) {
918 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
919 } else {
920 myData2022->toU2022State.cs[3]=(int8_t)tempState;
921 }
922 break;
923 }
924 }
925 break;
926 case ISO_2022_KR:
927 if(offset==0x30){
928 /* nothing to be done, just accept this one escape sequence */
929 } else {
930 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
931 }
932 break;
933
934 default:
935 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
936 break;
937 }
938 }
939 if(U_SUCCESS(*err)) {
940 _this->toULength = 0;
fd0068a8
A
941 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
942 if(_this->toULength>1) {
943 /*
944 * Ticket 5691: consistent illegal sequences:
945 * - We include at least the first byte (ESC) in the illegal sequence.
946 * - If any of the non-initial bytes could be the start of a character,
947 * we stop the illegal sequence before the first one of those.
948 * In escape sequences, all following bytes are "printable", that is,
949 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
950 * they are valid single/lead bytes.
951 * For simplicity, we always only report the initial ESC byte as the
952 * illegal sequence and back out all other bytes we looked at.
953 */
954 /* Back out some bytes. */
955 int8_t backOutDistance=_this->toULength-1;
956 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
957 if(backOutDistance<=bytesFromThisBuffer) {
958 /* same as initialToULength<=1 */
959 *source-=backOutDistance;
960 } else {
961 /* Back out bytes from the previous buffer: Need to replay them. */
962 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
963 /* same as -(initialToULength-1) */
964 /* preToULength is negative! */
965 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
966 *source-=bytesFromThisBuffer;
967 }
968 _this->toULength=1;
969 }
46f4442e
A
970 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
971 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955
A
972 }
973}
974
975/*Checks the characters of the buffer against valid 2022 escape sequences
976*if the match we return a pointer to the initial start of the sequence otherwise
977*we return sourceLimit
978*/
979/*for 2022 looks ahead in the stream
980 *to determine the longest possible convertible
981 *data stream
982 */
46f4442e 983static U_INLINE const char*
374ca955
A
984getEndOfBuffer_2022(const char** source,
985 const char* sourceLimit,
986 UBool flush){
987
988 const char* mySource = *source;
989
990#ifdef U_ENABLE_GENERIC_ISO_2022
46f4442e 991 if (*source >= sourceLimit)
374ca955
A
992 return sourceLimit;
993
994 do{
995
996 if (*mySource == ESC_2022){
997 int8_t i;
998 int32_t key = 0;
999 int32_t offset;
1000 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1001
1002 /* Kludge: I could not
1003 * figure out the reason for validating an escape sequence
1004 * twice - once here and once in changeState_2022().
1005 * is it possible to have an ESC character in a ISO2022
1006 * byte stream which is valid in a code page? Is it legal?
1007 */
46f4442e 1008 for (i=0;
374ca955
A
1009 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1010 i++) {
1011 value = getKey_2022(*(mySource+i), &key, &offset);
1012 }
46f4442e 1013 if (value > 0 || *mySource==ESC_2022)
374ca955
A
1014 return mySource;
1015
46f4442e 1016 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
374ca955
A
1017 return sourceLimit;
1018 }
1019 }while (++mySource < sourceLimit);
1020
1021 return sourceLimit;
1022#else
1023 while(mySource < sourceLimit && *mySource != ESC_2022) {
1024 ++mySource;
1025 }
1026 return mySource;
1027#endif
1028}
1029
1030
1031/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
46f4442e
A
1032 * any future change in _MBCSFromUChar32() function should be reflected here.
1033 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
374ca955 1034 */
46f4442e 1035static U_INLINE int32_t
374ca955 1036MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
46f4442e
A
1037 UChar32 c,
1038 uint32_t* value,
1039 UBool useFallback,
374ca955
A
1040 int outputType)
1041{
1042 const int32_t *cx;
1043 const uint16_t *table;
1044 uint32_t stage2Entry;
1045 uint32_t myValue;
46f4442e 1046 int32_t length;
374ca955 1047 const uint8_t *p;
46f4442e
A
1048 /*
1049 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1050 * Use internal version of ucnv_open() that verifies that the new structures are available,
1051 * else U_INTERNAL_PROGRAM_ERROR.
1052 */
374ca955
A
1053 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1054 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1055 table=sharedData->mbcs.fromUnicodeTable;
1056 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1057 /* get the bytes and the length for the output */
1058 if(outputType==MBCS_OUTPUT_2){
1059 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1060 if(myValue<=0xff) {
46f4442e 1061 length=1;
374ca955 1062 } else {
46f4442e 1063 length=2;
374ca955
A
1064 }
1065 } else /* outputType==MBCS_OUTPUT_3 */ {
1066 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1067 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1068 if(myValue<=0xff) {
46f4442e 1069 length=1;
374ca955 1070 } else if(myValue<=0xffff) {
46f4442e 1071 length=2;
374ca955 1072 } else {
46f4442e 1073 length=3;
b75a7d8f
A
1074 }
1075 }
1076 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1077 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1078 /* assigned */
1079 *value=myValue;
1080 return length;
1081 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
b75a7d8f 1082 /*
374ca955 1083 * We allow a 0 byte output if the "assigned" bit is set for this entry.
b75a7d8f 1084 * There is no way with this data structure for fallback output
374ca955 1085 * to be a zero byte.
b75a7d8f 1086 */
b75a7d8f 1087 *value=myValue;
46f4442e 1088 return -length;
b75a7d8f 1089 }
b75a7d8f 1090 }
374ca955
A
1091
1092 cx=sharedData->mbcs.extIndexes;
1093 if(cx!=NULL) {
46f4442e 1094 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
374ca955
A
1095 }
1096
1097 /* unassigned */
46f4442e 1098 return 0;
b75a7d8f
A
1099}
1100
1101/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
46f4442e
A
1102 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1103 * @param retval pointer to output byte
1104 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
b75a7d8f 1105 */
46f4442e 1106static U_INLINE int32_t
b75a7d8f 1107MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
46f4442e
A
1108 UChar32 c,
1109 uint32_t* retval,
b75a7d8f
A
1110 UBool useFallback)
1111{
46f4442e 1112 const uint16_t *table;
b75a7d8f
A
1113 int32_t value;
1114 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955 1115 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
46f4442e 1116 return 0;
b75a7d8f
A
1117 }
1118 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
374ca955 1119 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 1120 /* get the byte for the output */
374ca955 1121 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
b75a7d8f 1122 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1123 *retval=(uint32_t)(value&0xff);
1124 if(value>=0xf00) {
1125 return 1; /* roundtrip */
1126 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1127 return -1; /* fallback taken */
b75a7d8f 1128 } else {
46f4442e 1129 return 0; /* no mapping */
b75a7d8f 1130 }
b75a7d8f
A
1131}
1132
46f4442e
A
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * Values above 0xFEFE (including anything wider than two bytes) are rejected
 * explicitly; the 16-bit/8-bit casts below would otherwise ignore their
 * high-order bits and accept them.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    if( value <= 0xfefe &&
        (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1149
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Inverse of _2022FromGR94DBCS(): add 0x80 to each byte of the 2022 code point.
 * If the shifted value has both bytes in the range A1..FE it is returned;
 * otherwise the 2022 code point is returned unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t shifted = value + 0x8080;

    if( (uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1) ||
        (uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        /* out of range: hand back the input untouched */
        return value;
    }
    return shifted;
}
#endif
1167
374ca955
A
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * Generic ISO-2022 to-Unicode conversion.
 * Alternates between converting the bytes up to the next ESC with the
 * currently selected sub-converter and handing the escape sequence to
 * changeState_2022().
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                           UErrorCode* err){
    UConverter* outerCnv = args->converter;
    UConverterDataISO2022* state = (UConverterDataISO2022*)(outerCnv->extraInfo);
    const char* bufferLimit = args->sourceLimit;
    const char* chunkLimit;
    const char* chunkStart;
    const UChar* targetBefore;
    int8_t count;

    while (args->source < bufferLimit) {
        if(state->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            chunkLimit = getEndOfBuffer_2022(&(args->source), bufferLimit, args->flush);

            if(args->source < chunkLimit) {
                if(state->currentConverter==NULL) {
                    /* no designation seen yet: start out with ASCII */
                    state->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    state->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    outerCnv->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                state->isFirstBuffer=FALSE;
                chunkStart = args->source;
                targetBefore = args->target;
                /* temporarily let args point at the sub-converter */
                args->converter = state->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    chunkLimit,
                    args->offsets,
                    (UBool)(args->flush && chunkLimit == bufferLimit),
                    err);
                args->converter = outerCnv;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    outerCnv->UCharErrorBufferLength = state->currentConverter->UCharErrorBufferLength;
                    count = outerCnv->UCharErrorBufferLength;
                    state->currentConverter->UCharErrorBufferLength = 0;
                    if(count > 0) {
                        uprv_memcpy(outerCnv->UCharErrorBuffer,
                                    state->currentConverter->UCharErrorBuffer,
                                    count*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == bufferLimit) ||
                    (args->offsets != NULL && (args->target != targetBefore || args->source != chunkStart)) ||
                    (chunkLimit < bufferLimit && state->currentConverter->toULength > 0)
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        outerCnv->invalidCharLength = state->currentConverter->invalidCharLength;
                        count = outerCnv->invalidCharLength;
                        if(count > 0) {
                            uprv_memcpy(outerCnv->invalidCharBuffer, state->currentConverter->invalidCharBuffer, count);
                        }
                    } else {
                        outerCnv->toULength = state->currentConverter->toULength;
                        count = outerCnv->toULength;
                        if(count > 0) {
                            uprv_memcpy(outerCnv->toUBytes, state->currentConverter->toUBytes, count);
                            if(args->source < chunkLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC, or continuing an escape sequence from the previous buffer */
        chunkStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               bufferLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != chunkStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
b75a7d8f
A
1281
1282/*
1283 * To Unicode Callback helper function
1284 */
46f4442e 1285static void
374ca955
A
1286toUnicodeCallback(UConverter *cnv,
1287 const uint32_t sourceChar, const uint32_t targetUniChar,
1288 UErrorCode* err){
b75a7d8f 1289 if(sourceChar>0xff){
374ca955
A
1290 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1291 cnv->toUBytes[1] = (uint8_t)sourceChar;
1292 cnv->toULength = 2;
b75a7d8f
A
1293 }
1294 else{
374ca955 1295 cnv->toUBytes[0] =(char) sourceChar;
fd0068a8 1296 cnv->toULength = 1;
b75a7d8f
A
1297 }
1298
1299 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
b75a7d8f
A
1300 *err = U_INVALID_CHAR_FOUND;
1301 }
1302 else{
b75a7d8f
A
1303 *err = U_ILLEGAL_CHAR_FOUND;
1304 }
b75a7d8f
A
1305}
1306
1307/**************************************ISO-2022-JP*************************************************/
1308
1309/************************************** IMPORTANT **************************************************
1310* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1311* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
46f4442e
A
1312* The converter iterates over each Unicode codepoint
1313* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1314* processed one char at a time it would make sense to reduce the extra processing a canned converter
b75a7d8f
A
1315* would do as far as possible.
1316*
46f4442e
A
1317* If the implementation of these macros or structure of sharedData struct change in the future, make
1318* sure that ISO-2022 is also changed.
b75a7d8f
A
1319***************************************************************************************************
1320*/
1321
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*       source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
b75a7d8f 1348
374ca955
A
1349/* preference order of JP charsets */
1350static const StateEnum jpCharsetPref[]={
1351 ASCII,
1352 JISX201,
1353 ISO8859_1,
1354 ISO8859_7,
1355 JISX208,
1356 JISX212,
1357 GB2312,
1358 KSC5601,
1359 HWKANA_7BIT
b75a7d8f
A
1360};
1361
73c04bcf
A
/*
 * Designation escape sequences for the JP charsets.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * The bytes are given as numeric values; unused trailing bytes are zero.
 */
static const char escSeqChars[][6] ={
    { 0x1B, 0x28, 0x42 },        /* <ESC>(B  ASCII       */
    { 0x1B, 0x2E, 0x41 },        /* <ESC>.A  ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },        /* <ESC>.F  ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },        /* <ESC>(J  JISX-201    */
    { 0x1B, 0x24, 0x42 },        /* <ESC>$B  JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 },  /* <ESC>$(D JISX-212    */
    { 0x1B, 0x24, 0x41 },        /* <ESC>$A  GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 },  /* <ESC>$(C KSC5601     */
    { 0x1B, 0x28, 0x49 }         /* <ESC>(I  HWKANA_7BIT */
};
/* Number of bytes in each JP designation escape sequence, in the same order as the sequences. */
static const int8_t escSeqCharsLen[] ={
    3,  /* <ESC>(B  ASCII       */
    3,  /* <ESC>.A  ISO-8859-1  */
    3,  /* <ESC>.F  ISO-8859-7  */
    3,  /* <ESC>(J  JISX-201    */
    3,  /* <ESC>$B  JISX-208    */
    4,  /* <ESC>$(D JISX-212    */
    3,  /* <ESC>$A  GB2312      */
    4,  /* <ESC>$(C KSC5601     */
    3   /* <ESC>(I  HWKANA_7BIT */
};
1389
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check whether it
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings
*              break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1406
46f4442e
A
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * 5C maps to U+00A5 and 7E maps to U+203E; every other value maps to itself.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1420
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 maps to 5C and U+203E maps to 7E; U+005C and U+007E themselves are unmappable.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte values are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}
1435
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range (above EFFC).
 * The input is not otherwise validated.
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint32_t result;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    lead = value & 0xff00;
    trail = (uint8_t)value;

    /* lead bytes 81..9F and E0..EF each cover two JIS X 0208 rows */
    result = (lead - (lead <= 0x9f00 ? 0x7000 : 0xb000)) << 1;

    if(trail > 0x9e) /* trail <= 0xfc */ {
        /* second row of the pair */
        return result | (trail - 0x7e);
    }

    /* first row of the pair; trail bytes skip 7F */
    result -= 0x100;
    return result | (trail - (trail <= 0x7e ? 0x1f : 0x20));
}
1471
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first (row) byte
 * @param c2    second (cell) byte
 * @param bytes out: the two Shift-JIS bytes; a byte of 0 marks an invalid input
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if(c1&1) {
        /* odd row: first half of the Shift-JIS trail byte range */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail byte range */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    }

    lead >>= 1;
    if(lead <= 0x2f) {
        lead += 0x70;
    } else if(lead <= 0x3f) {
        lead += 0xb0;
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1508
1509/*
1510 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1511 * Katakana.
1512 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1513 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1514 * These were the only fallbacks in ICU's jisx-208.ucm file.
1515 */
1516static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1517 0x2123, /* U+FF61 */
1518 0x2156,
1519 0x2157,
1520 0x2122,
1521 0x2126,
1522 0x2572,
1523 0x2521,
1524 0x2523,
1525 0x2525,
1526 0x2527,
1527 0x2529,
1528 0x2563,
1529 0x2565,
1530 0x2567,
1531 0x2543,
1532 0x213C, /* U+FF70 */
1533 0x2522,
1534 0x2524,
1535 0x2526,
1536 0x2528,
1537 0x252A,
1538 0x252B,
1539 0x252D,
1540 0x252F,
1541 0x2531,
1542 0x2533,
1543 0x2535,
1544 0x2537,
1545 0x2539,
1546 0x253B,
1547 0x253D,
1548 0x253F, /* U+FF80 */
1549 0x2541,
1550 0x2544,
1551 0x2546,
1552 0x2548,
1553 0x254A,
1554 0x254B,
1555 0x254C,
1556 0x254D,
1557 0x254E,
1558 0x254F,
1559 0x2552,
1560 0x2555,
1561 0x2558,
1562 0x255B,
1563 0x255E,
1564 0x255F, /* U+FF90 */
1565 0x2560,
1566 0x2561,
1567 0x2562,
1568 0x2564,
1569 0x2566,
1570 0x2568,
1571 0x2569,
1572 0x256A,
1573 0x256B,
1574 0x256C,
1575 0x256D,
1576 0x256F,
1577 0x2573,
1578 0x212B,
1579 0x212C /* U+FF9F */
1580};
1581
1582static void
374ca955 1583UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
46f4442e 1584 UConverter *cnv = args->converter;
b75a7d8f 1585 UConverterDataISO2022 *converterData;
374ca955
A
1586 ISO2022State *pFromU2022State;
1587 uint8_t *target = (uint8_t *) args->target;
1588 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
1589 const UChar* source = args->source;
1590 const UChar* sourceLimit = args->sourceLimit;
1591 int32_t* offsets = args->offsets;
374ca955
A
1592 UChar32 sourceChar;
1593 char buffer[8];
1594 int32_t len, outLen;
1595 int8_t choices[10];
1596 int32_t choiceCount;
73c04bcf 1597 uint32_t targetValue = 0;
374ca955
A
1598 UBool useFallback;
1599
1600 int32_t i;
1601 int8_t cs, g;
1602
1603 /* set up the state */
46f4442e 1604 converterData = (UConverterDataISO2022*)cnv->extraInfo;
374ca955 1605 pFromU2022State = &converterData->fromU2022State;
374ca955
A
1606
1607 choiceCount = 0;
b75a7d8f 1608
b75a7d8f 1609 /* check if the last codepoint of previous buffer was a lead surrogate*/
46f4442e 1610 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
1611 goto getTrail;
1612 }
b75a7d8f 1613
374ca955
A
1614 while(source < sourceLimit) {
1615 if(target < targetLimit) {
b75a7d8f 1616
b75a7d8f 1617 sourceChar = *(source++);
374ca955 1618 /*check if the char is a First surrogate*/
73c04bcf 1619 if(UTF_IS_SURROGATE(sourceChar)) {
374ca955
A
1620 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1621getTrail:
1622 /*look ahead to find the trail surrogate*/
1623 if(source < sourceLimit) {
1624 /* test the following code unit */
1625 UChar trail=(UChar) *source;
1626 if(UTF_IS_SECOND_SURROGATE(trail)) {
1627 source++;
1628 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
46f4442e 1629 cnv->fromUChar32=0x00;
374ca955
A
1630 /* convert this supplementary code point */
1631 /* exit this condition tree */
1632 } else {
1633 /* this is an unmatched lead code unit (1st surrogate) */
1634 /* callback(illegal) */
1635 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1636 cnv->fromUChar32=sourceChar;
374ca955 1637 break;
b75a7d8f 1638 }
374ca955
A
1639 } else {
1640 /* no more input */
46f4442e 1641 cnv->fromUChar32=sourceChar;
b75a7d8f
A
1642 break;
1643 }
374ca955
A
1644 } else {
1645 /* this is an unmatched trail code unit (2nd surrogate) */
1646 /* callback(illegal) */
1647 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1648 cnv->fromUChar32=sourceChar;
374ca955
A
1649 break;
1650 }
b75a7d8f
A
1651 }
1652
73c04bcf
A
1653 /* do not convert SO/SI/ESC */
1654 if(IS_2022_CONTROL(sourceChar)) {
1655 /* callback(illegal) */
1656 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1657 cnv->fromUChar32=sourceChar;
73c04bcf
A
1658 break;
1659 }
1660
374ca955 1661 /* do the conversion */
b75a7d8f 1662
374ca955
A
1663 if(choiceCount == 0) {
1664 uint16_t csm;
b75a7d8f 1665
374ca955
A
1666 /*
1667 * The csm variable keeps track of which charsets are allowed
1668 * and not used yet while building the choices[].
1669 */
1670 csm = jpCharsetMasks[converterData->version];
1671 choiceCount = 0;
1672
1673 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1674 if(converterData->version == 3 || converterData->version == 4) {
46f4442e 1675 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
374ca955 1676 }
46f4442e
A
1677 /* Do not try single-byte half-width Katakana for other versions. */
1678 csm &= ~CSM(HWKANA_7BIT);
b75a7d8f 1679
374ca955
A
1680 /* try the current G0 charset */
1681 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1682 csm &= ~CSM(cs);
b75a7d8f 1683
374ca955
A
1684 /* try the current G2 charset */
1685 if((cs = pFromU2022State->cs[2]) != 0) {
1686 choices[choiceCount++] = cs;
1687 csm &= ~CSM(cs);
1688 }
1689
1690 /* try all the other possible charsets */
1691 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1692 cs = (int8_t)jpCharsetPref[i];
1693 if(CSM(cs) & csm) {
1694 choices[choiceCount++] = cs;
1695 csm &= ~CSM(cs);
b75a7d8f
A
1696 }
1697 }
374ca955 1698 }
b75a7d8f 1699
374ca955 1700 cs = g = 0;
46f4442e
A
1701 /*
1702 * len==0: no mapping found yet
1703 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1704 * len>0: found a roundtrip result, done
1705 */
374ca955 1706 len = 0;
46f4442e
A
1707 /*
1708 * We will turn off useFallback after finding a fallback,
1709 * but we still get fallbacks from PUA code points as usual.
1710 * Therefore, we will also need to check that we don't overwrite
1711 * an early fallback with a later one.
1712 */
1713 useFallback = cnv->useFallback;
374ca955 1714
46f4442e
A
1715 for(i = 0; i < choiceCount && len <= 0; ++i) {
1716 uint32_t value;
1717 int32_t len2;
1718 int8_t cs0 = choices[i];
1719 switch(cs0) {
374ca955
A
1720 case ASCII:
1721 if(sourceChar <= 0x7f) {
1722 targetValue = (uint32_t)sourceChar;
1723 len = 1;
46f4442e
A
1724 cs = cs0;
1725 g = 0;
b75a7d8f 1726 }
374ca955
A
1727 break;
1728 case ISO8859_1:
46f4442e 1729 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
374ca955
A
1730 targetValue = (uint32_t)sourceChar - 0x80;
1731 len = 1;
46f4442e 1732 cs = cs0;
374ca955
A
1733 g = 2;
1734 }
1735 break;
1736 case HWKANA_7BIT:
46f4442e 1737 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
374ca955
A
1738 if(converterData->version==3) {
1739 /* JIS7: use G1 (SO) */
46f4442e
A
1740 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1741 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1742 len = 1;
1743 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
374ca955
A
1744 g = 1;
1745 } else if(converterData->version==4) {
1746 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
46f4442e
A
1747 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1748 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1749 len = 1;
374ca955 1750
46f4442e
A
1751 cs = pFromU2022State->cs[0];
1752 if(IS_JP_DBCS(cs)) {
374ca955
A
1753 /* switch from a DBCS charset to JISX201 */
1754 cs = (int8_t)JISX201;
b75a7d8f 1755 }
46f4442e
A
1756 /* else stay in the current G0 charset */
1757 g = 0;
b75a7d8f 1758 }
46f4442e 1759 /* else do not use HWKANA_7BIT with other versions */
b75a7d8f 1760 }
374ca955
A
1761 break;
1762 case JISX201:
1763 /* G0 SBCS */
46f4442e
A
1764 value = jisx201FromU(sourceChar);
1765 if(value <= 0x7f) {
1766 targetValue = value;
374ca955 1767 len = 1;
46f4442e
A
1768 cs = cs0;
1769 g = 0;
1770 useFallback = FALSE;
1771 }
1772 break;
1773 case JISX208:
1774 /* G0 DBCS from Shift-JIS table */
1775 len2 = MBCS_FROM_UCHAR32_ISO2022(
1776 converterData->myConverterArray[cs0],
1777 sourceChar, &value,
1778 useFallback, MBCS_OUTPUT_2);
1779 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1780 value = _2022FromSJIS(value);
1781 if(value != 0) {
1782 targetValue = value;
1783 len = len2;
1784 cs = cs0;
1785 g = 0;
1786 useFallback = FALSE;
1787 }
1788 } else if(len == 0 && useFallback &&
1789 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1790 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1791 len = -2;
1792 cs = cs0;
1793 g = 0;
1794 useFallback = FALSE;
374ca955
A
1795 }
1796 break;
1797 case ISO8859_7:
1798 /* G0 SBCS forced to 7-bit output */
46f4442e
A
1799 len2 = MBCS_SINGLE_FROM_UCHAR32(
1800 converterData->myConverterArray[cs0],
1801 sourceChar, &value,
1802 useFallback);
1803 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1804 targetValue = value - 0x80;
1805 len = len2;
1806 cs = cs0;
374ca955 1807 g = 2;
46f4442e 1808 useFallback = FALSE;
374ca955
A
1809 }
1810 break;
1811 default:
1812 /* G0 DBCS */
46f4442e
A
1813 len2 = MBCS_FROM_UCHAR32_ISO2022(
1814 converterData->myConverterArray[cs0],
1815 sourceChar, &value,
1816 useFallback, MBCS_OUTPUT_2);
1817 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1818 if(cs0 == KSC5601) {
1819 /*
1820 * Check for valid bytes for the encoding scheme.
1821 * This is necessary because the sub-converter (windows-949)
1822 * has a broader encoding scheme than is valid for 2022.
1823 */
1824 value = _2022FromGR94DBCS(value);
1825 if(value == 0) {
1826 break;
1827 }
1828 }
1829 targetValue = value;
1830 len = len2;
1831 cs = cs0;
1832 g = 0;
1833 useFallback = FALSE;
374ca955
A
1834 }
1835 break;
b75a7d8f
A
1836 }
1837 }
b75a7d8f 1838
46f4442e
A
1839 if(len != 0) {
1840 if(len < 0) {
1841 len = -len; /* fallback */
1842 }
374ca955
A
1843 outLen = 0; /* count output bytes */
1844
1845 /* write SI if necessary (only for JIS7) */
1846 if(pFromU2022State->g == 1 && g == 0) {
1847 buffer[outLen++] = UCNV_SI;
1848 pFromU2022State->g = 0;
1849 }
1850
1851 /* write the designation sequence if necessary */
1852 if(cs != pFromU2022State->cs[g]) {
1853 int32_t escLen = escSeqCharsLen[cs];
1854 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1855 outLen += escLen;
1856 pFromU2022State->cs[g] = cs;
1857
1858 /* invalidate the choices[] */
1859 choiceCount = 0;
1860 }
1861
1862 /* write the shift sequence if necessary */
1863 if(g != pFromU2022State->g) {
1864 switch(g) {
1865 /* case 0 handled before writing escapes */
1866 case 1:
1867 buffer[outLen++] = UCNV_SO;
1868 pFromU2022State->g = 1;
1869 break;
1870 default: /* case 2 */
1871 buffer[outLen++] = 0x1b;
1872 buffer[outLen++] = 0x4e;
1873 break;
1874 /* no case 3: no SS3 in ISO-2022-JP-x */
1875 }
1876 }
1877
1878 /* write the output bytes */
1879 if(len == 1) {
1880 buffer[outLen++] = (char)targetValue;
1881 } else /* len == 2 */ {
1882 buffer[outLen++] = (char)(targetValue >> 8);
1883 buffer[outLen++] = (char)targetValue;
1884 }
1885 } else {
1886 /*
46f4442e 1887 * if we cannot find the character after checking all codepages
b75a7d8f
A
1888 * then this is an error
1889 */
b75a7d8f 1890 *err = U_INVALID_CHAR_FOUND;
46f4442e 1891 cnv->fromUChar32=sourceChar;
374ca955
A
1892 break;
1893 }
1894
1895 if(sourceChar == CR || sourceChar == LF) {
1896 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1897 pFromU2022State->cs[2] = 0;
1898 choiceCount = 0;
1899 }
1900
1901 /* output outLen>0 bytes in buffer[] */
1902 if(outLen == 1) {
1903 *target++ = buffer[0];
1904 if(offsets) {
73c04bcf 1905 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
b75a7d8f 1906 }
374ca955
A
1907 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1908 *target++ = buffer[0];
1909 *target++ = buffer[1];
1910 if(offsets) {
1911 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1912 *offsets++ = sourceIndex;
1913 *offsets++ = sourceIndex;
1914 }
1915 } else {
73c04bcf 1916 fromUWriteUInt8(
46f4442e 1917 cnv,
374ca955 1918 buffer, outLen,
73c04bcf 1919 &target, (const char *)targetLimit,
374ca955
A
1920 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1921 err);
1922 if(U_FAILURE(*err)) {
b75a7d8f
A
1923 break;
1924 }
1925 }
1926 } /* end if(myTargetIndex<myTargetLength) */
1927 else{
1928 *err =U_BUFFER_OVERFLOW_ERROR;
1929 break;
1930 }
1931
1932 }/* end while(mySourceIndex<mySourceLength) */
1933
374ca955
A
1934 /*
1935 * the end of the input stream and detection of truncated input
1936 * are handled by the framework, but for ISO-2022-JP conversion
1937 * we need to be in ASCII mode at the very end
1938 *
1939 * conditions:
1940 * successful
1941 * in SO mode or not in ASCII mode
1942 * end of input and no truncated input
b75a7d8f 1943 */
374ca955
A
1944 if( U_SUCCESS(*err) &&
1945 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
46f4442e 1946 args->flush && source>=sourceLimit && cnv->fromUChar32==0
374ca955
A
1947 ) {
1948 int32_t sourceIndex;
1949
1950 outLen = 0;
1951
1952 if(pFromU2022State->g != 0) {
1953 buffer[outLen++] = UCNV_SI;
1954 pFromU2022State->g = 0;
1955 }
1956
1957 if(pFromU2022State->cs[0] != ASCII) {
1958 int32_t escLen = escSeqCharsLen[ASCII];
1959 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1960 outLen += escLen;
1961 pFromU2022State->cs[0] = (int8_t)ASCII;
1962 }
1963
1964 /* get the source index of the last input character */
1965 /*
1966 * TODO this would be simpler and more reliable if we used a pair
1967 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1968 * so that we could simply use the prevSourceIndex here;
1969 * this code gives an incorrect result for the rare case of an unmatched
1970 * trail surrogate that is alone in the last buffer of the text stream
1971 */
1972 sourceIndex=(int32_t)(source-args->source);
1973 if(sourceIndex>0) {
1974 --sourceIndex;
1975 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1976 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1977 ) {
1978 --sourceIndex;
1979 }
1980 } else {
1981 sourceIndex=-1;
1982 }
1983
73c04bcf 1984 fromUWriteUInt8(
46f4442e 1985 cnv,
374ca955 1986 buffer, outLen,
73c04bcf 1987 &target, (const char *)targetLimit,
374ca955
A
1988 &offsets, sourceIndex,
1989 err);
b75a7d8f
A
1990 }
1991
1992 /*save the state and return */
1993 args->source = source;
1994 args->target = (char*)target;
1995}
1996
1997/*************** to unicode *******************/
1998
/*
 * Converts ISO-2022-JP family bytes in [args->source, args->sourceLimit) to
 * UTF-16 code units written at args->target, filling args->offsets when it
 * is non-NULL.
 *
 * The decoder state (designated charsets cs[], the active G set g/prevG, a
 * pending escape-sequence key and the isEmptySegment flag) is kept in the
 * converter's UConverterDataISO2022. On entry the function resumes either a
 * partial escape sequence (key != 0) or a double-byte character whose lead
 * byte was saved in toUBytes[0] by a previous call.
 *
 * On return args->source and args->target are advanced past what was
 * consumed/produced. *err is set to U_BUFFER_OVERFLOW_ERROR when the target
 * is full, to an escape-sequence error reported by changeState_2022() or by
 * the empty-segment check, or to whatever toUnicodeCallback() sets for an
 * unconvertible byte sequence.
 */
46f4442e 1999static void
b75a7d8f 2000UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
374ca955 2001 UErrorCode* err){
46f4442e 2002 char tempBuf[2];
374ca955 2003 const char *mySource = (char *) args->source;
b75a7d8f
A
2004 UChar *myTarget = args->target;
2005 const char *mySourceLimit = args->sourceLimit;
2006 uint32_t targetUniChar = 0x0000;
2007 uint32_t mySourceChar = 0x0000;
46f4442e 2008 uint32_t tmpSourceChar = 0x0000;
b75a7d8f 2009 UConverterDataISO2022* myData;
374ca955
A
2010 ISO2022State *pToU2022State;
2011 StateEnum cs;
b75a7d8f 2012
b75a7d8f 2013 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
374ca955 2014 pToU2022State = &myData->toU2022State;
b75a7d8f 2015
374ca955
A
2016 if(myData->key != 0) {
2017 /* continue with a partial escape sequence */
2018 goto escape;
2019 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2020 /* continue with a partial double-byte character */
/* the saved lead byte is taken back out of toUBytes[] and the trail byte is read from the new input */
2021 mySourceChar = args->converter->toUBytes[0];
2022 args->converter->toULength = 0;
2023 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
fd0068a8 2024 targetUniChar = missingCharMarker;
374ca955
A
2025 goto getTrailByte;
2026 }
2027
2028 while(mySource < mySourceLimit){
2029
2030 targetUniChar =missingCharMarker;
b75a7d8f
A
2031
2032 if(myTarget < args->targetLimit){
2033
2034 mySourceChar= (unsigned char) *mySource++;
374ca955
A
2035
/* dispatch on the byte just read: SI/SO shifts, ESC, line ends, or ordinary data */
2036 switch(mySourceChar) {
2037 case UCNV_SI:
2038 if(myData->version==3) {
2039 pToU2022State->g=0;
b75a7d8f 2040 continue;
374ca955
A
2041 } else {
2042 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2043 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2044 break;
b75a7d8f 2045 }
b75a7d8f 2046
374ca955
A
2047 case UCNV_SO:
2048 if(myData->version==3) {
2049 /* JIS7: switch to G1 half-width Katakana */
2050 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2051 pToU2022State->g=1;
b75a7d8f 2052 continue;
374ca955
A
2053 } else {
2054 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2055 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2056 break;
b75a7d8f 2057 }
b75a7d8f 2058
374ca955
A
2059 case ESC_2022:
/* un-read the ESC so that changeState_2022() sees the whole escape sequence */
2060 mySource--;
2061escape:
d5d484b0
A
2062 {
2063 const char * mySourceBefore = mySource;
2064 int8_t toULengthBefore = args->converter->toULength;
2065
46f4442e 2066 changeState_2022(args->converter,&(mySource),
d5d484b0
A
2067 mySourceLimit, ISO_2022_JP,err);
2068
2069 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
46f4442e
A
2070 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2071 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2072 args->converter->toUCallbackReason = UCNV_IRREGULAR;
/* report as many bytes as this call consumed for the escape sequence, plus any carried over from before */
d5d484b0
A
2073 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
2074 }
d5d484b0 2075 }
46f4442e 2076
374ca955
A
2077 /* invalid or illegal escape sequence */
2078 if(U_FAILURE(*err)){
2079 args->target = myTarget;
2080 args->source = mySource;
d5d484b0 2081 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
374ca955 2082 return;
b75a7d8f 2083 }
d5d484b0 2084 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
46f4442e 2085 if(myData->key==0) {
d5d484b0
A
2086 myData->isEmptySegment = TRUE;
2087 }
374ca955 2088 continue;
b75a7d8f 2089
374ca955 2090 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
b75a7d8f 2091
374ca955
A
2092 case CR:
2093 /*falls through*/
2094 case LF:
2095 /* automatically reset to single-byte mode */
2096 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2097 pToU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f 2098 }
374ca955
A
2099 pToU2022State->cs[2] = 0;
2100 pToU2022State->g = 0;
2101 /* falls through */
b75a7d8f 2102 default:
374ca955 2103 /* convert one or two bytes */
d5d484b0 2104 myData->isEmptySegment = FALSE;
374ca955
A
2105 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2106 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2107 !IS_JP_DBCS(cs)
2108 ) {
2109 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
46f4442e 2110 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
374ca955
A
2111
2112 /* return from a single-shift state to the previous one */
2113 if(pToU2022State->g >= 2) {
2114 pToU2022State->g=pToU2022State->prevG;
2115 }
2116 } else switch(cs) {
2117 case ASCII:
2118 if(mySourceChar <= 0x7f) {
2119 targetUniChar = mySourceChar;
2120 }
2121 break;
2122 case ISO8859_1:
2123 if(mySourceChar <= 0x7f) {
2124 targetUniChar = mySourceChar + 0x80;
2125 }
2126 /* return from a single-shift state to the previous one */
2127 pToU2022State->g=pToU2022State->prevG;
2128 break;
2129 case ISO8859_7:
2130 if(mySourceChar <= 0x7f) {
2131 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2132 targetUniChar =
2133 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2134 myData->myConverterArray[cs],
2135 mySourceChar + 0x80);
2136 }
2137 /* return from a single-shift state to the previous one */
2138 pToU2022State->g=pToU2022State->prevG;
2139 break;
2140 case JISX201:
2141 if(mySourceChar <= 0x7f) {
46f4442e 2142 targetUniChar = jisx201ToU(mySourceChar);
374ca955
A
2143 }
2144 break;
2145 case HWKANA_7BIT:
2146 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2147 /* 7-bit halfwidth Katakana */
46f4442e 2148 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
374ca955
A
2149 }
2150 break;
2151 default:
2152 /* G0 DBCS */
2153 if(mySource < mySourceLimit) {
fd0068a8
A
2154 int leadIsOk, trailIsOk;
2155 uint8_t trailByte;
374ca955 2156getTrailByte:
/* peek at the trail byte; it is consumed below only if it is converted or reported together with the lead byte */
fd0068a8 2157 trailByte = (uint8_t)*mySource;
fd0068a8
A
2158 /*
2159 * Ticket 5691: consistent illegal sequences:
2160 * - We include at least the first byte in the illegal sequence.
2161 * - If any of the non-initial bytes could be the start of a character,
46f4442e 2162 * we stop the illegal sequence before the first one of those.
fd0068a8
A
2163 *
2164 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2165 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2166 * Otherwise we convert or report the pair of bytes.
2167 */
2168 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2169 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2170 if (leadIsOk && trailIsOk) {
2171 ++mySource;
46f4442e
A
2172 tmpSourceChar = (mySourceChar << 8) | trailByte;
2173 if(cs == JISX208) {
2174 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2175 mySourceChar = tmpSourceChar;
2176 } else {
2177 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2178 mySourceChar = tmpSourceChar;
2179 if (cs == KSC5601) {
2180 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2181 }
2182 tempBuf[0] = (char)(tmpSourceChar >> 8);
2183 tempBuf[1] = (char)(tmpSourceChar);
2184 }
fd0068a8
A
2185 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2186 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2187 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2188 ++mySource;
2189 /* add another bit so that the code below writes 2 bytes in case of error */
2190 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2191 }
374ca955
A
2192 } else {
/* input ends after the lead byte: save it in toUBytes[] so the next call can resume at getTrailByte */
2193 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2194 args->converter->toULength = 1;
2195 goto endloop;
2196 }
46f4442e 2197 } /* End of inner switch */
b75a7d8f 2198 break;
46f4442e 2199 } /* End of outer switch */
b75a7d8f
A
/*
 * Emit the result: one UTF-16 unit, a surrogate pair for values above
 * missingCharMarker, or hand the bytes to the error callback.
 * In the offset computations, mySourceChar <= 0xff means the character
 * came from one input byte, otherwise from two.
 */
2200 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2201 if(args->offsets){
73c04bcf 2202 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
2203 }
2204 *(myTarget++)=(UChar)targetUniChar;
b75a7d8f 2205 }
374ca955
A
2206 else if(targetUniChar > missingCharMarker){
2207 /* disassemble the surrogate pair and write to output*/
2208 targetUniChar-=0x0010000;
2209 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2210 if(args->offsets){
73c04bcf 2211 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2212 }
2213 ++myTarget;
46f4442e 2214 if(myTarget< args->targetLimit){
374ca955
A
2215 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2216 if(args->offsets){
73c04bcf 2217 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2218 }
2219 ++myTarget;
2220 }else{
/* no room left for the trail surrogate: store it in the converter's UCharErrorBuffer */
2221 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2222 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2223 }
b75a7d8f 2224
374ca955
A
2225 }
2226 else{
b75a7d8f 2227 /* Call the callback function*/
374ca955
A
2228 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2229 break;
b75a7d8f
A
2230 }
2231 }
46f4442e 2232 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
b75a7d8f
A
2233 *err =U_BUFFER_OVERFLOW_ERROR;
2234 break;
2235 }
2236 }
374ca955 2237endloop:
b75a7d8f
A
/* publish how far the source and target pointers advanced */
2238 args->target = myTarget;
2239 args->source = mySource;
2240}
2241
2242
b75a7d8f
A
2243/***************************************************************
2244* Rules for ISO-2022-KR encoding
46f4442e 2245* i) The KSC5601 designator sequence should appear only once in a file,
b75a7d8f
A
2246* at the beginning of a line before any KSC5601 characters. This usually
2247* means that it appears by itself on the first line of the file
2248* ii) There are only 2 shifting sequences SO to shift into double byte mode
2249* and SI to shift into single byte mode
2250*/
46f4442e 2251static void
b75a7d8f
A
2252UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2253
374ca955
A
2254 UConverter* saveConv = args->converter;
2255 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2256 args->converter=myConverterData->currentConverter;
2257
2258 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2259 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2260 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2261
2262 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2263 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2264 uprv_memcpy(
2265 saveConv->charErrorBuffer,
2266 myConverterData->currentConverter->charErrorBuffer,
2267 myConverterData->currentConverter->charErrorBufferLength);
2268 }
2269 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2270 myConverterData->currentConverter->charErrorBufferLength = 0;
2271 }
2272 args->converter=saveConv;
b75a7d8f
A
2273}
2274
/*
 * Converts UTF-16 in [args->source, args->sourceLimit) to ISO-2022-KR bytes
 * written at args->target, filling args->offsets when it is non-NULL.
 *
 * When the converter data's version is 1 the work is handed to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(). Otherwise each
 * code unit is looked up with MBCS_FROM_UCHAR32_ISO2022(); a UCNV_SO or
 * UCNV_SI byte is written whenever the output switches between single-byte
 * and double-byte, and double-byte results are written with 0x80 subtracted
 * from each byte. The single/double-byte state is loaded from and stored
 * back to args->converter->fromUnicodeStatus.
 *
 * Errors reported through *err: U_ILLEGAL_CHAR_FOUND for SO/SI/ESC input and
 * unmatched surrogates, U_INVALID_CHAR_FOUND for unmappable code points
 * (including every matched surrogate pair), U_BUFFER_OVERFLOW_ERROR when the
 * target is full. For the two character errors the offending code point is
 * left in args->converter->fromUChar32. With args->flush set and all input
 * consumed, SHIFT_IN_STR is written if the output is still in double-byte
 * mode.
 */
46f4442e 2275static void
b75a7d8f
A
2276UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2277
2278 const UChar *source = args->source;
2279 const UChar *sourceLimit = args->sourceLimit;
2280 unsigned char *target = (unsigned char *) args->target;
2281 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2282 int32_t* offsets = args->offsets;
2283 uint32_t targetByteUnit = 0x0000;
2284 UChar32 sourceChar = 0x0000;
2285 UBool isTargetByteDBCS;
2286 UBool oldIsTargetByteDBCS;
2287 UConverterDataISO2022 *converterData;
b75a7d8f
A
2288 UConverterSharedData* sharedData;
2289 UBool useFallback;
2290 int32_t length =0;
2291
b75a7d8f 2292 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
46f4442e
A
2293 /* if the version is 1 then the user is requesting
2294 * conversion with ibm-25546 pass the arguments to
b75a7d8f
A
2295 * MBCS converter and return
2296 */
2297 if(converterData->version==1){
2298 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2299 return;
2300 }
374ca955
A
2301
2302 /* initialize data */
2303 sharedData = converterData->currentConverter->sharedData;
2304 useFallback = args->converter->useFallback;
2305 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2306 oldIsTargetByteDBCS = isTargetByteDBCS;
46f4442e 2307
/* NOTE(review): this repeats the assignment made just above; it looks redundant - confirm before removing */
b75a7d8f 2308 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
/* a code point left pending by a previous call resumes at the trail-surrogate lookup */
374ca955 2309 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
b75a7d8f
A
2310 goto getTrail;
2311 }
2312 while(source < sourceLimit){
46f4442e 2313
b75a7d8f
A
2314 targetByteUnit = missingCharMarker;
2315
2316 if(target < (unsigned char*) args->targetLimit){
2317 sourceChar = *source++;
73c04bcf
A
2318
2319 /* do not convert SO/SI/ESC */
2320 if(IS_2022_CONTROL(sourceChar)) {
2321 /* callback(illegal) */
2322 *err=U_ILLEGAL_CHAR_FOUND;
2323 args->converter->fromUChar32=sourceChar;
2324 break;
2325 }
2326
46f4442e
A
2327 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2328 if(length < 0) {
2329 length = -length; /* fallback */
2330 }
b75a7d8f 2331 /* only DBCS or SBCS characters are expected*/
374ca955 2332 /* DB characters with high bit set to 1 are expected */
/* accepted: one byte <= 0x7f, or two bytes that are each in the range a1..fe */
fd0068a8
A
2333 if( length > 2 || length==0 ||
2334 (length == 1 && targetByteUnit > 0x7f) ||
2335 (length == 2 &&
2336 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2337 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2338 ) {
b75a7d8f
A
2339 targetByteUnit=missingCharMarker;
2340 }
2341 if (targetByteUnit != missingCharMarker){
2342
2343 oldIsTargetByteDBCS = isTargetByteDBCS;
2344 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2345 /* append the shift sequence */
2346 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
46f4442e
A
2347
2348 if (isTargetByteDBCS)
b75a7d8f 2349 *target++ = UCNV_SO;
46f4442e 2350 else
b75a7d8f
A
2351 *target++ = UCNV_SI;
2352 if(offsets)
73c04bcf 2353 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2354 }
2355 /* write the targetUniChar to target */
/* bytes that do not fit in the target are queued in charErrorBuffer and overflow is reported */
2356 if(targetByteUnit <= 0x00FF){
2357 if( target < targetLimit){
2358 *(target++) = (unsigned char) targetByteUnit;
2359 if(offsets){
73c04bcf 2360 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2361 }
2362
2363 }else{
2364 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2365 *err = U_BUFFER_OVERFLOW_ERROR;
2366 }
2367 }else{
2368 if(target < targetLimit){
2369 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2370 if(offsets){
73c04bcf 2371 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2372 }
2373 if(target < targetLimit){
2374 *(target++) =(unsigned char) (targetByteUnit -0x80);
2375 if(offsets){
73c04bcf 2376 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2377 }
2378 }else{
2379 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2380 *err = U_BUFFER_OVERFLOW_ERROR;
2381 }
2382 }else{
2383 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2384 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2385 *err = U_BUFFER_OVERFLOW_ERROR;
2386 }
2387 }
2388
2389 }
2390 else{
2391 /* oops.. the code point is unassigned
2392 * set the error and reason
2393 */
b75a7d8f
A
2394
2395 /*check if the char is a First surrogate*/
2396 if(UTF_IS_SURROGATE(sourceChar)) {
2397 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
b75a7d8f
A
2398getTrail:
2399 /*look ahead to find the trail surrogate*/
2400 if(source < sourceLimit) {
2401 /* test the following code unit */
2402 UChar trail=(UChar) *source;
2403 if(UTF_IS_SECOND_SURROGATE(trail)) {
2404 source++;
374ca955 2405 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
/* the assembled supplementary code point is always reported as unmappable here */
b75a7d8f 2406 *err = U_INVALID_CHAR_FOUND;
b75a7d8f
A
2407 /* convert this surrogate code point */
2408 /* exit this condition tree */
2409 } else {
2410 /* this is an unmatched lead code unit (1st surrogate) */
2411 /* callback(illegal) */
b75a7d8f
A
2412 *err=U_ILLEGAL_CHAR_FOUND;
2413 }
2414 } else {
2415 /* no more input */
/* not an error: the lead surrogate is kept in fromUChar32 below and retried on the next call */
2416 *err = U_ZERO_ERROR;
b75a7d8f
A
2417 }
2418 } else {
2419 /* this is an unmatched trail code unit (2nd surrogate) */
2420 /* callback(illegal) */
b75a7d8f
A
2421 *err=U_ILLEGAL_CHAR_FOUND;
2422 }
374ca955
A
2423 } else {
2424 /* callback(unassigned) for a BMP code point */
2425 *err = U_INVALID_CHAR_FOUND;
b75a7d8f 2426 }
b75a7d8f 2427
374ca955 2428 args->converter->fromUChar32=sourceChar;
374ca955 2429 break;
b75a7d8f
A
2430 }
2431 } /* end if(myTargetIndex<myTargetLength) */
2432 else{
2433 *err =U_BUFFER_OVERFLOW_ERROR;
2434 break;
2435 }
2436
2437 }/* end while(mySourceIndex<mySourceLength) */
2438
374ca955
A
2439 /*
2440 * the end of the input stream and detection of truncated input
2441 * are handled by the framework, but for ISO-2022-KR conversion
2442 * we need to be in ASCII mode at the very end
2443 *
2444 * conditions:
2445 * successful
2446 * not in ASCII mode
2447 * end of input and no truncated input
b75a7d8f 2448 */
374ca955
A
2449 if( U_SUCCESS(*err) &&
2450 isTargetByteDBCS &&
2451 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2452 ) {
2453 int32_t sourceIndex;
2454
2455 /* we are switching to ASCII */
2456 isTargetByteDBCS=FALSE;
2457
2458 /* get the source index of the last input character */
2459 /*
2460 * TODO this would be simpler and more reliable if we used a pair
2461 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2462 * so that we could simply use the prevSourceIndex here;
2463 * this code gives an incorrect result for the rare case of an unmatched
2464 * trail surrogate that is alone in the last buffer of the text stream
2465 */
2466 sourceIndex=(int32_t)(source-args->source);
2467 if(sourceIndex>0) {
2468 --sourceIndex;
2469 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2470 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2471 ) {
2472 --sourceIndex;
2473 }
2474 } else {
2475 sourceIndex=-1;
2476 }
2477
73c04bcf 2478 fromUWriteUInt8(
374ca955
A
2479 args->converter,
2480 SHIFT_IN_STR, 1,
73c04bcf 2481 &target, (const char *)targetLimit,
374ca955
A
2482 &offsets, sourceIndex,
2483 err);
b75a7d8f
A
2484 }
2485
2486 /*save the state and return */
2487 args->source = source;
2488 args->target = (char*)target;
2489 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2490}
2491
2492/************************ To Unicode ***************************************/
2493
46f4442e 2494static void
b75a7d8f
A
2495UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2496 UErrorCode* err){
b75a7d8f 2497 char const* sourceStart;
b75a7d8f 2498 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
b75a7d8f 2499
374ca955
A
2500 UConverterToUnicodeArgs subArgs;
2501 int32_t minArgsSize;
2502
2503 /* set up the subconverter arguments */
2504 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2505 minArgsSize = args->size;
2506 } else {
2507 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2508 }
2509
2510 uprv_memcpy(&subArgs, args, minArgsSize);
2511 subArgs.size = (uint16_t)minArgsSize;
2512 subArgs.converter = myData->currentConverter;
2513
2514 /* remember the original start of the input for offsets */
2515 sourceStart = args->source;
2516
2517 if(myData->key != 0) {
2518 /* continue with a partial escape sequence */
2519 goto escape;
2520 }
2521
2522 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
b75a7d8f 2523 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
374ca955
A
2524 subArgs.source = args->source;
2525 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2526 if(subArgs.source != subArgs.sourceLimit) {
2527 /*
2528 * get the current partial byte sequence
2529 *
2530 * it needs to be moved between the public and the subconverter
2531 * so that the conversion framework, which only sees the public
2532 * converter, can handle truncated and illegal input etc.
2533 */
2534 if(args->converter->toULength > 0) {
2535 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2536 }
2537 subArgs.converter->toULength = args->converter->toULength;
2538
2539 /*
2540 * Convert up to the end of the input, or to before the next escape character.
2541 * Does not handle conversion extensions because the preToU[] state etc.
2542 * is not copied.
2543 */
2544 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2545
2546 if(args->offsets != NULL && sourceStart != args->source) {
2547 /* update offsets to base them on the actual start of the input */
2548 int32_t *offsets = args->offsets;
2549 UChar *target = args->target;
2550 int32_t delta = (int32_t)(args->source - sourceStart);
2551 while(target < subArgs.target) {
2552 if(*offsets >= 0) {
2553 *offsets += delta;
2554 }
2555 ++offsets;
2556 ++target;
2557 }
2558 }
2559 args->source = subArgs.source;
2560 args->target = subArgs.target;
2561 args->offsets = subArgs.offsets;
2562
2563 /* copy input/error/overflow buffers */
2564 if(subArgs.converter->toULength > 0) {
2565 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2566 }
2567 args->converter->toULength = subArgs.converter->toULength;
2568
2569 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2570 if(subArgs.converter->UCharErrorBufferLength > 0) {
2571 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2572 subArgs.converter->UCharErrorBufferLength);
2573 }
2574 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2575 subArgs.converter->UCharErrorBufferLength = 0;
b75a7d8f 2576 }
b75a7d8f
A
2577 }
2578
374ca955 2579 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
b75a7d8f 2580 return;
374ca955 2581 }
b75a7d8f 2582
374ca955 2583escape:
b75a7d8f 2584 changeState_2022(args->converter,
46f4442e 2585 &(args->source),
b75a7d8f 2586 args->sourceLimit,
b75a7d8f 2587 ISO_2022_KR,
b75a7d8f 2588 err);
374ca955 2589 }
b75a7d8f
A
2590}
2591
46f4442e 2592static void
b75a7d8f
A
2593UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2594 UErrorCode* err){
374ca955 2595 char tempBuf[2];
b75a7d8f
A
2596 const char *mySource = ( char *) args->source;
2597 UChar *myTarget = args->target;
2598 const char *mySourceLimit = args->sourceLimit;
2599 UChar32 targetUniChar = 0x0000;
2600 UChar mySourceChar = 0x0000;
2601 UConverterDataISO2022* myData;
b75a7d8f
A
2602 UConverterSharedData* sharedData ;
2603 UBool useFallback;
2604
374ca955
A
2605 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2606 if(myData->version==1){
2607 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
b75a7d8f
A
2608 return;
2609 }
374ca955 2610
b75a7d8f 2611 /* initialize state */
374ca955 2612 sharedData = myData->currentConverter->sharedData;
b75a7d8f 2613 useFallback = args->converter->useFallback;
46f4442e 2614
374ca955
A
2615 if(myData->key != 0) {
2616 /* continue with a partial escape sequence */
2617 goto escape;
2618 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2619 /* continue with a partial double-byte character */
2620 mySourceChar = args->converter->toUBytes[0];
2621 args->converter->toULength = 0;
2622 goto getTrailByte;
b75a7d8f 2623 }
b75a7d8f 2624
374ca955 2625 while(mySource< mySourceLimit){
b75a7d8f
A
2626
2627 if(myTarget < args->targetLimit){
2628
2629 mySourceChar= (unsigned char) *mySource++;
2630
2631 if(mySourceChar==UCNV_SI){
374ca955 2632 myData->toU2022State.g = 0;
d5d484b0
A
2633 if (myData->isEmptySegment) {
2634 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
46f4442e
A
2635 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2636 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2637 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
d5d484b0
A
2638 args->converter->toULength = 1;
2639 args->target = myTarget;
2640 args->source = mySource;
2641 return;
2642 }
b75a7d8f
A
2643 /*consume the source */
2644 continue;
2645 }else if(mySourceChar==UCNV_SO){
374ca955 2646 myData->toU2022State.g = 1;
d5d484b0 2647 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
b75a7d8f
A
2648 /*consume the source */
2649 continue;
374ca955
A
2650 }else if(mySourceChar==ESC_2022){
2651 mySource--;
2652escape:
d5d484b0 2653 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
46f4442e 2654 changeState_2022(args->converter,&(mySource),
374ca955 2655 mySourceLimit, ISO_2022_KR, err);
b75a7d8f
A
2656 if(U_FAILURE(*err)){
2657 args->target = myTarget;
2658 args->source = mySource;
2659 return;
2660 }
2661 continue;
46f4442e 2662 }
b75a7d8f 2663
d5d484b0 2664 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
374ca955
A
2665 if(myData->toU2022State.g == 1) {
2666 if(mySource < mySourceLimit) {
fd0068a8
A
2667 int leadIsOk, trailIsOk;
2668 uint8_t trailByte;
374ca955 2669getTrailByte:
fd0068a8
A
2670 targetUniChar = missingCharMarker;
2671 trailByte = (uint8_t)*mySource;
2672 /*
2673 * Ticket 5691: consistent illegal sequences:
2674 * - We include at least the first byte in the illegal sequence.
2675 * - If any of the non-initial bytes could be the start of a character,
2676 * we stop the illegal sequence before the first one of those.
2677 *
2678 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2679 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2680 * Otherwise we convert or report the pair of bytes.
2681 */
2682 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2683 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2684 if (leadIsOk && trailIsOk) {
2685 ++mySource;
2686 tempBuf[0] = (char)(mySourceChar + 0x80);
2687 tempBuf[1] = (char)(trailByte + 0x80);
2688 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2689 mySourceChar = (mySourceChar << 8) | trailByte;
2690 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2691 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2692 ++mySource;
2693 /* add another bit so that the code below writes 2 bytes in case of error */
2694 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
374ca955
A
2695 }
2696 } else {
2697 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2698 args->converter->toULength = 1;
2699 break;
b75a7d8f
A
2700 }
2701 }
fd0068a8 2702 else if(mySourceChar <= 0x7f) {
374ca955 2703 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
fd0068a8
A
2704 } else {
2705 targetUniChar = 0xffff;
b75a7d8f 2706 }
374ca955
A
2707 if(targetUniChar < 0xfffe){
2708 if(args->offsets) {
73c04bcf 2709 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955 2710 }
b75a7d8f
A
2711 *(myTarget++)=(UChar)targetUniChar;
2712 }
2713 else {
b75a7d8f 2714 /* Call the callback function*/
374ca955
A
2715 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2716 break;
b75a7d8f
A
2717 }
2718 }
2719 else{
2720 *err =U_BUFFER_OVERFLOW_ERROR;
2721 break;
2722 }
2723 }
b75a7d8f
A
2724 args->target = myTarget;
2725 args->source = mySource;
2726}
2727
2728/*************************** END ISO2022-KR *********************************/
2729
2730/*************************** ISO-2022-CN *********************************
2731*
2732* Rules for ISO-2022-CN Encoding:
374ca955 2733* i) The designator sequence must appear once on a line before any instance
b75a7d8f
A
2734* of character set it designates.
2735* ii) If two lines contain characters from the same character set, both lines
2736* must include the designator sequence.
374ca955 2737* iii) Once the designator sequence is known, a shifting sequence has to be found
b75a7d8f
A
2738* to invoke the shifting
2739* iv) All lines start in ASCII and end in ASCII.
2740* v) Four shifting sequences are employed for this purpose:
2741*
2742* Sequence ASCII Eq Charsets
2743* ---------- ------- ---------
374ca955
A
2744* SI <SI> US-ASCII
2745* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2746* SS2 <ESC>N CNS-11643-1992 Plane 2
2747* SS3 <ESC>O CNS-11643-1992 Planes 3-7
b75a7d8f
A
2748*
2749* vi)
2750* SOdesignator : ESC "$" ")" finalchar_for_SO
2751* SS2designator : ESC "$" "*" finalchar_for_SS2
2752* SS3designator : ESC "$" "+" finalchar_for_SS3
2753*
2754* ESC $ ) A Indicates the bytes following SO are Chinese
2755* characters as defined in GB 2312-80, until
2756* another SOdesignation appears
2757*
2758*
2759* ESC $ ) E Indicates the bytes following SO are as defined
2760* in ISO-IR-165 (for details, see section 2.1),
2761* until another SOdesignation appears
2762*
2763* ESC $ ) G Indicates the bytes following SO are as defined
2764* in CNS 11643-plane-1, until another
2765* SOdesignation appears
2766*
2767* ESC $ * H Indicates the two bytes immediately following
2768* SS2 is a Chinese character as defined in CNS
2769* 11643-plane-2, until another SS2designation
2770* appears
46f4442e 2771* (Meaning <ESC>N must precede every 2 byte
b75a7d8f
A
2772* sequence.)
2773*
2774* ESC $ + I Indicates the immediate two bytes following SS3
2775* is a Chinese character as defined in CNS
2776* 11643-plane-3, until another SS3designation
2777* appears
46f4442e 2778* (Meaning <ESC>O must precede every 2 byte
b75a7d8f
A
2779* sequence.)
2780*
2781* ESC $ + J Indicates the immediate two bytes following SS3
2782* is a Chinese character as defined in CNS
2783* 11643-plane-4, until another SS3designation
2784* appears
46f4442e 2785* (In English: <ESC>O must precede every 2 byte
b75a7d8f
A
2786* sequence.)
2787*
2788* ESC $ + K Indicates the immediate two bytes following SS3
2789* is a Chinese character as defined in CNS
2790* 11643-plane-5, until another SS3designation
2791* appears
2792*
2793* ESC $ + L Indicates the immediate two bytes following SS3
2794* is a Chinese character as defined in CNS
2795* 11643-plane-6, until another SS3designation
2796* appears
2797*
2798* ESC $ + M Indicates the immediate two bytes following SS3
2799* is a Chinese character as defined in CNS
2800* 11643-plane-7, until another SS3designation
2801* appears
2802*
2803* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2804* has its own designation information before any Chinese characters
2805* appear
2806*
2807*/
2808
2809/* The following are defined as arrays this way to make the strings truly read-only. The bytes are written as hex escapes; in ASCII terms each one is ESC '$' <intermediate> <final>. */
b75a7d8f
A
2810static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
2811static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
2812static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2813static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2814static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2815static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2816static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2817static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2818static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2819
2820/********************** ISO2022-CN Data **************************/
/*
 * Designation sequences used by the ISO-2022-CN fromUnicode code, which copies
 * 4 bytes from the selected entry. That code indexes the table with the
 * charset value directly for values below CNS_11643, and with
 * CNS_11643 + (cs - CNS_11643_1) for the CNS planes.
 */
2821static const char* const escSeqCharsCN[10] ={
2822 SHIFT_IN_STR, /* ASCII */
2823 GB_2312_80_STR, /* GB 2312-80 */
2824 ISO_IR_165_STR, /* ISO-IR-165 */
2825 CNS_11643_1992_Plane_1_STR, /* CNS 11643 plane 1 */
2826 CNS_11643_1992_Plane_2_STR, /* CNS 11643 plane 2 */
2827 CNS_11643_1992_Plane_3_STR, /* CNS 11643 plane 3 */
2828 CNS_11643_1992_Plane_4_STR, /* CNS 11643 plane 4 */
2829 CNS_11643_1992_Plane_5_STR, /* CNS 11643 plane 5 */
2830 CNS_11643_1992_Plane_6_STR, /* CNS 11643 plane 6 */
2831 CNS_11643_1992_Plane_7_STR /* CNS 11643 plane 7 */
b75a7d8f 2833
46f4442e 2834static void
b75a7d8f 2835UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
46f4442e 2836 UConverter *cnv = args->converter;
b75a7d8f 2837 UConverterDataISO2022 *converterData;
374ca955
A
2838 ISO2022State *pFromU2022State;
2839 uint8_t *target = (uint8_t *) args->target;
2840 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
2841 const UChar* source = args->source;
2842 const UChar* sourceLimit = args->sourceLimit;
2843 int32_t* offsets = args->offsets;
374ca955
A
2844 UChar32 sourceChar;
2845 char buffer[8];
2846 int32_t len;
2847 int8_t choices[3];
2848 int32_t choiceCount;
73c04bcf 2849 uint32_t targetValue = 0;
b75a7d8f
A
2850 UBool useFallback;
2851
b75a7d8f 2852 /* set up the state */
46f4442e 2853 converterData = (UConverterDataISO2022*)cnv->extraInfo;
374ca955 2854 pFromU2022State = &converterData->fromU2022State;
374ca955
A
2855
2856 choiceCount = 0;
b75a7d8f
A
2857
2858 /* check if the last codepoint of previous buffer was a lead surrogate*/
46f4442e 2859 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
2860 goto getTrail;
2861 }
2862
b75a7d8f 2863 while( source < sourceLimit){
b75a7d8f
A
2864 if(target < targetLimit){
2865
2866 sourceChar = *(source++);
2867 /*check if the char is a First surrogate*/
2868 if(UTF_IS_SURROGATE(sourceChar)) {
2869 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
b75a7d8f
A
2870getTrail:
2871 /*look ahead to find the trail surrogate*/
2872 if(source < sourceLimit) {
2873 /* test the following code unit */
2874 UChar trail=(UChar) *source;
2875 if(UTF_IS_SECOND_SURROGATE(trail)) {
2876 source++;
374ca955 2877 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
46f4442e 2878 cnv->fromUChar32=0x00;
374ca955 2879 /* convert this supplementary code point */
b75a7d8f
A
2880 /* exit this condition tree */
2881 } else {
2882 /* this is an unmatched lead code unit (1st surrogate) */
2883 /* callback(illegal) */
b75a7d8f 2884 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2885 cnv->fromUChar32=sourceChar;
374ca955 2886 break;
b75a7d8f
A
2887 }
2888 } else {
2889 /* no more input */
46f4442e 2890 cnv->fromUChar32=sourceChar;
b75a7d8f
A
2891 break;
2892 }
2893 } else {
2894 /* this is an unmatched trail code unit (2nd surrogate) */
2895 /* callback(illegal) */
b75a7d8f 2896 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2897 cnv->fromUChar32=sourceChar;
374ca955 2898 break;
b75a7d8f
A
2899 }
2900 }
2901
2902 /* do the conversion */
374ca955 2903 if(sourceChar <= 0x007f ){
73c04bcf
A
2904 /* do not convert SO/SI/ESC */
2905 if(IS_2022_CONTROL(sourceChar)) {
2906 /* callback(illegal) */
2907 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2908 cnv->fromUChar32=sourceChar;
73c04bcf
A
2909 break;
2910 }
2911
374ca955
A
2912 /* US-ASCII */
2913 if(pFromU2022State->g == 0) {
2914 buffer[0] = (char)sourceChar;
2915 len = 1;
2916 } else {
2917 buffer[0] = UCNV_SI;
2918 buffer[1] = (char)sourceChar;
2919 len = 2;
2920 pFromU2022State->g = 0;
2921 choiceCount = 0;
2922 }
2923 if(sourceChar == CR || sourceChar == LF) {
2924 /* reset the state at the end of a line */
2925 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2926 choiceCount = 0;
b75a7d8f 2927 }
b75a7d8f
A
2928 }
2929 else{
374ca955 2930 /* convert U+0080..U+10ffff */
374ca955
A
2931 int32_t i;
2932 int8_t cs, g;
2933
2934 if(choiceCount == 0) {
2935 /* try the current SO/G1 converter first */
2936 choices[0] = pFromU2022State->cs[1];
2937
2938 /* default to GB2312_1 if none is designated yet */
2939 if(choices[0] == 0) {
2940 choices[0] = GB2312_1;
2941 }
b75a7d8f 2942
374ca955
A
2943 if(converterData->version == 0) {
2944 /* ISO-2022-CN */
2945
2946 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2947 if(choices[0] == GB2312_1) {
2948 choices[1] = (int8_t)CNS_11643_1;
2949 } else {
2950 choices[1] = (int8_t)GB2312_1;
b75a7d8f 2951 }
374ca955
A
2952
2953 choiceCount = 2;
2954 } else {
2955 /* ISO-2022-CN-EXT */
2956
2957 /* try one of the other converters */
2958 switch(choices[0]) {
2959 case GB2312_1:
2960 choices[1] = (int8_t)CNS_11643_1;
2961 choices[2] = (int8_t)ISO_IR_165;
2962 break;
2963 case ISO_IR_165:
2964 choices[1] = (int8_t)GB2312_1;
2965 choices[2] = (int8_t)CNS_11643_1;
2966 break;
2967 default: /* CNS_11643_x */
2968 choices[1] = (int8_t)GB2312_1;
2969 choices[2] = (int8_t)ISO_IR_165;
2970 break;
b75a7d8f 2971 }
b75a7d8f 2972
374ca955
A
2973 choiceCount = 3;
2974 }
b75a7d8f
A
2975 }
2976
374ca955 2977 cs = g = 0;
46f4442e
A
2978 /*
2979 * len==0: no mapping found yet
2980 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2981 * len>0: found a roundtrip result, done
2982 */
374ca955 2983 len = 0;
46f4442e
A
2984 /*
2985 * We will turn off useFallback after finding a fallback,
2986 * but we still get fallbacks from PUA code points as usual.
2987 * Therefore, we will also need to check that we don't overwrite
2988 * an early fallback with a later one.
2989 */
2990 useFallback = cnv->useFallback;
2991
2992 for(i = 0; i < choiceCount && len <= 0; ++i) {
2993 int8_t cs0 = choices[i];
2994 if(cs0 > 0) {
2995 uint32_t value;
2996 int32_t len2;
2997 if(cs0 >= CNS_11643_0) {
2998 len2 = MBCS_FROM_UCHAR32_ISO2022(
2999 converterData->myConverterArray[CNS_11643],
3000 sourceChar,
3001 &value,
3002 useFallback,
3003 MBCS_OUTPUT_3);
3004 if(len2 == 3 || (len2 == -3 && len == 0)) {
3005 targetValue = value;
3006 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3007 if(len2 >= 0) {
3008 len = 2;
3009 } else {
3010 len = -2;
3011 useFallback = FALSE;
3012 }
374ca955
A
3013 if(cs == CNS_11643_1) {
3014 g = 1;
3015 } else if(cs == CNS_11643_2) {
3016 g = 2;
3017 } else /* plane 3..7 */ if(converterData->version == 1) {
3018 g = 3;
3019 } else {
3020 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3021 len = 0;
3022 }
3023 }
3024 } else {
3025 /* GB2312_1 or ISO-IR-165 */
46f4442e
A
3026 len2 = MBCS_FROM_UCHAR32_ISO2022(
3027 converterData->myConverterArray[cs0],
3028 sourceChar,
3029 &value,
3030 useFallback,
3031 MBCS_OUTPUT_2);
3032 if(len2 == 2 || (len2 == -2 && len == 0)) {
3033 targetValue = value;
3034 len = len2;
3035 cs = cs0;
3036 g = 1;
3037 useFallback = FALSE;
3038 }
374ca955 3039 }
b75a7d8f 3040 }
b75a7d8f
A
3041 }
3042
46f4442e
A
3043 if(len != 0) {
3044 len = 0; /* count output bytes; it must have been abs(len) == 2 */
b75a7d8f 3045
374ca955
A
3046 /* write the designation sequence if necessary */
3047 if(cs != pFromU2022State->cs[g]) {
3048 if(cs < CNS_11643) {
3049 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3050 } else {
3051 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
b75a7d8f 3052 }
374ca955
A
3053 len = 4;
3054 pFromU2022State->cs[g] = cs;
3055 if(g == 1) {
3056 /* changing the SO/G1 charset invalidates the choices[] */
3057 choiceCount = 0;
b75a7d8f 3058 }
374ca955
A
3059 }
3060
3061 /* write the shift sequence if necessary */
3062 if(g != pFromU2022State->g) {
3063 switch(g) {
3064 case 1:
3065 buffer[len++] = UCNV_SO;
3066
3067 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3068 pFromU2022State->g = 1;
3069 break;
3070 case 2:
3071 buffer[len++] = 0x1b;
3072 buffer[len++] = 0x4e;
3073 break;
3074 default: /* case 3 */
3075 buffer[len++] = 0x1b;
3076 buffer[len++] = 0x4f;
3077 break;
b75a7d8f 3078 }
b75a7d8f 3079 }
b75a7d8f 3080
374ca955
A
3081 /* write the two output bytes */
3082 buffer[len++] = (char)(targetValue >> 8);
3083 buffer[len++] = (char)targetValue;
3084 } else {
46f4442e 3085 /* if we cannot find the character after checking all codepages
374ca955
A
3086 * then this is an error
3087 */
3088 *err = U_INVALID_CHAR_FOUND;
46f4442e 3089 cnv->fromUChar32=sourceChar;
374ca955
A
3090 break;
3091 }
b75a7d8f 3092 }
b75a7d8f 3093
374ca955
A
3094 /* output len>0 bytes in buffer[] */
3095 if(len == 1) {
3096 *target++ = buffer[0];
3097 if(offsets) {
73c04bcf 3098 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
374ca955
A
3099 }
3100 } else if(len == 2 && (target + 2) <= targetLimit) {
3101 *target++ = buffer[0];
3102 *target++ = buffer[1];
3103 if(offsets) {
3104 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3105 *offsets++ = sourceIndex;
3106 *offsets++ = sourceIndex;
3107 }
3108 } else {
73c04bcf 3109 fromUWriteUInt8(
46f4442e 3110 cnv,
374ca955 3111 buffer, len,
73c04bcf 3112 &target, (const char *)targetLimit,
374ca955
A
3113 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3114 err);
3115 if(U_FAILURE(*err)) {
b75a7d8f
A
3116 break;
3117 }
3118 }
3119 } /* end if(myTargetIndex<myTargetLength) */
3120 else{
3121 *err =U_BUFFER_OVERFLOW_ERROR;
3122 break;
3123 }
3124
3125 }/* end while(mySourceIndex<mySourceLength) */
3126
374ca955
A
3127 /*
3128 * the end of the input stream and detection of truncated input
3129 * are handled by the framework, but for ISO-2022-CN conversion
3130 * we need to be in ASCII mode at the very end
3131 *
3132 * conditions:
3133 * successful
3134 * not in ASCII mode
3135 * end of input and no truncated input
b75a7d8f 3136 */
374ca955
A
3137 if( U_SUCCESS(*err) &&
3138 pFromU2022State->g!=0 &&
46f4442e 3139 args->flush && source>=sourceLimit && cnv->fromUChar32==0
374ca955
A
3140 ) {
3141 int32_t sourceIndex;
3142
3143 /* we are switching to ASCII */
3144 pFromU2022State->g=0;
3145
3146 /* get the source index of the last input character */
3147 /*
3148 * TODO this would be simpler and more reliable if we used a pair
3149 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3150 * so that we could simply use the prevSourceIndex here;
3151 * this code gives an incorrect result for the rare case of an unmatched
3152 * trail surrogate that is alone in the last buffer of the text stream
3153 */
3154 sourceIndex=(int32_t)(source-args->source);
3155 if(sourceIndex>0) {
3156 --sourceIndex;
3157 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3158 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3159 ) {
3160 --sourceIndex;
b75a7d8f 3161 }
374ca955
A
3162 } else {
3163 sourceIndex=-1;
b75a7d8f 3164 }
b75a7d8f 3165
73c04bcf 3166 fromUWriteUInt8(
46f4442e 3167 cnv,
374ca955 3168 SHIFT_IN_STR, 1,
73c04bcf 3169 &target, (const char *)targetLimit,
374ca955
A
3170 &offsets, sourceIndex,
3171 err);
b75a7d8f 3172 }
b75a7d8f 3173
374ca955
A
3174 /*save the state and return */
3175 args->source = source;
3176 args->target = (char*)target;
b75a7d8f
A
3177}
3178
3179
46f4442e 3180static void
b75a7d8f
A
3181UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3182 UErrorCode* err){
3183 char tempBuf[3];
374ca955 3184 const char *mySource = (char *) args->source;
b75a7d8f 3185 UChar *myTarget = args->target;
b75a7d8f
A
3186 const char *mySourceLimit = args->sourceLimit;
3187 uint32_t targetUniChar = 0x0000;
3188 uint32_t mySourceChar = 0x0000;
3189 UConverterDataISO2022* myData;
374ca955 3190 ISO2022State *pToU2022State;
b75a7d8f 3191
374ca955
A
3192 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3193 pToU2022State = &myData->toU2022State;
3194
3195 if(myData->key != 0) {
3196 /* continue with a partial escape sequence */
3197 goto escape;
3198 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3199 /* continue with a partial double-byte character */
3200 mySourceChar = args->converter->toUBytes[0];
3201 args->converter->toULength = 0;
fd0068a8 3202 targetUniChar = missingCharMarker;
374ca955 3203 goto getTrailByte;
b75a7d8f 3204 }
374ca955
A
3205
3206 while(mySource < mySourceLimit){
b75a7d8f
A
3207
3208 targetUniChar =missingCharMarker;
3209
3210 if(myTarget < args->targetLimit){
3211
3212 mySourceChar= (unsigned char) *mySource++;
3213
b75a7d8f
A
3214 switch(mySourceChar){
3215 case UCNV_SI:
374ca955 3216 pToU2022State->g=0;
d5d484b0
A
3217 if (myData->isEmptySegment) {
3218 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
46f4442e
A
3219 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3220 args->converter->toUCallbackReason = UCNV_IRREGULAR;
d5d484b0
A
3221 args->converter->toUBytes[0] = mySourceChar;
3222 args->converter->toULength = 1;
3223 args->target = myTarget;
3224 args->source = mySource;
3225 return;
3226 }
b75a7d8f
A
3227 continue;
3228
3229 case UCNV_SO:
374ca955
A
3230 if(pToU2022State->cs[1] != 0) {
3231 pToU2022State->g=1;
d5d484b0 3232 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
374ca955
A
3233 continue;
3234 } else {
3235 /* illegal to have SO before a matching designator */
d5d484b0 3236 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
b75a7d8f
A
3237 break;
3238 }
3239
b75a7d8f 3240 case ESC_2022:
b75a7d8f 3241 mySource--;
374ca955 3242escape:
d5d484b0
A
3243 {
3244 const char * mySourceBefore = mySource;
3245 int8_t toULengthBefore = args->converter->toULength;
3246
46f4442e 3247 changeState_2022(args->converter,&(mySource),
d5d484b0
A
3248 mySourceLimit, ISO_2022_CN,err);
3249
3250 /* After SO there must be at least one character before a designator (designator error handled separately) */
46f4442e
A
3251 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3252 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3253 args->converter->toUCallbackReason = UCNV_IRREGULAR;
d5d484b0
A
3254 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
3255 }
3256 }
b75a7d8f
A
3257
3258 /* invalid or illegal escape sequence */
3259 if(U_FAILURE(*err)){
3260 args->target = myTarget;
3261 args->source = mySource;
d5d484b0 3262 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
b75a7d8f
A
3263 return;
3264 }
3265 continue;
3266
374ca955
A
3267 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3268
3269 case CR:
3270 /*falls through*/
3271 case LF:
3272 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3273 /* falls through */
3274 default:
3275 /* convert one or two bytes */
d5d484b0 3276 myData->isEmptySegment = FALSE;
374ca955
A
3277 if(pToU2022State->g != 0) {
3278 if(mySource < mySourceLimit) {
3279 UConverterSharedData *cnv;
3280 StateEnum tempState;
3281 int32_t tempBufLen;
fd0068a8
A
3282 int leadIsOk, trailIsOk;
3283 uint8_t trailByte;
374ca955 3284getTrailByte:
fd0068a8
A
3285 trailByte = (uint8_t)*mySource;
3286 /*
3287 * Ticket 5691: consistent illegal sequences:
3288 * - We include at least the first byte in the illegal sequence.
3289 * - If any of the non-initial bytes could be the start of a character,
3290 * we stop the illegal sequence before the first one of those.
3291 *
3292 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3293 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3294 * Otherwise we convert or report the pair of bytes.
3295 */
3296 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3297 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3298 if (leadIsOk && trailIsOk) {
3299 ++mySource;
3300 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3301 if(tempState >= CNS_11643_0) {
3302 cnv = myData->myConverterArray[CNS_11643];
3303 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3304 tempBuf[1] = (char) (mySourceChar);
3305 tempBuf[2] = (char) trailByte;
3306 tempBufLen = 3;
3307
3308 }else{
3309 cnv = myData->myConverterArray[tempState];
3310 tempBuf[0] = (char) (mySourceChar);
3311 tempBuf[1] = (char) trailByte;
3312 tempBufLen = 2;
3313 }
3314 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3315 mySourceChar = (mySourceChar << 8) | trailByte;
3316 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3317 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3318 ++mySource;
3319 /* add another bit so that the code below writes 2 bytes in case of error */
3320 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
374ca955 3321 }
374ca955
A
3322 if(pToU2022State->g>=2) {
3323 /* return from a single-shift state to the previous one */
3324 pToU2022State->g=pToU2022State->prevG;
3325 }
374ca955
A
3326 } else {
3327 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3328 args->converter->toULength = 1;
3329 goto endloop;
3330 }
3331 }
3332 else{
3333 if(mySourceChar <= 0x7f) {
3334 targetUniChar = (UChar) mySourceChar;
3335 }
3336 }
3337 break;
b75a7d8f
A
3338 }
3339 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3340 if(args->offsets){
73c04bcf 3341 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
3342 }
3343 *(myTarget++)=(UChar)targetUniChar;
3344 }
3345 else if(targetUniChar > missingCharMarker){
3346 /* disassemble the surrogate pair and write to output*/
3347 targetUniChar-=0x0010000;
374ca955 3348 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
b75a7d8f 3349 if(args->offsets){
73c04bcf 3350 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 3351 }
374ca955 3352 ++myTarget;
46f4442e 3353 if(myTarget< args->targetLimit){
374ca955 3354 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
b75a7d8f 3355 if(args->offsets){
73c04bcf 3356 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 3357 }
374ca955 3358 ++myTarget;
b75a7d8f
A
3359 }else{
3360 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3361 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3362 }
3363
3364 }
3365 else{
3366 /* Call the callback function*/
374ca955
A
3367 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3368 break;
b75a7d8f
A
3369 }
3370 }
3371 else{
3372 *err =U_BUFFER_OVERFLOW_ERROR;
3373 break;
3374 }
3375 }
374ca955 3376endloop:
b75a7d8f
A
3377 args->target = myTarget;
3378 args->source = mySource;
3379}
3380
3381static void
3382_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3383 UConverter *cnv = args->converter;
3384 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
374ca955
A
3385 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3386 char *p, *subchar;
3387 char buffer[8];
3388 int32_t length;
3389
73c04bcf 3390 subchar=(char *)cnv->subChars;
374ca955 3391 length=cnv->subCharLen; /* assume length==1 for most variants */
b75a7d8f
A
3392
3393 p = buffer;
3394 switch(myConverterData->locale[0]){
3395 case 'j':
374ca955
A
3396 {
3397 int8_t cs;
3398
3399 if(pFromU2022State->g == 1) {
3400 /* JIS7: switch from G1 to G0 */
3401 pFromU2022State->g = 0;
3402 *p++ = UCNV_SI;
3403 }
3404
3405 cs = pFromU2022State->cs[0];
3406 if(cs != ASCII && cs != JISX201) {
3407 /* not in ASCII or JIS X 0201: switch to ASCII */
3408 pFromU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f
A
3409 *p++ = '\x1b';
3410 *p++ = '\x28';
3411 *p++ = '\x42';
b75a7d8f 3412 }
374ca955
A
3413
3414 *p++ = subchar[0];
b75a7d8f 3415 break;
374ca955 3416 }
b75a7d8f 3417 case 'c':
374ca955
A
3418 if(pFromU2022State->g != 0) {
3419 /* not in ASCII mode: switch to ASCII */
3420 pFromU2022State->g = 0;
3421 *p++ = UCNV_SI;
3422 }
3423 *p++ = subchar[0];
b75a7d8f
A
3424 break;
3425 case 'k':
374ca955
A
3426 if(myConverterData->version == 0) {
3427 if(length == 1) {
3428 if((UBool)args->converter->fromUnicodeStatus) {
3429 /* in DBCS mode: switch to SBCS */
3430 args->converter->fromUnicodeStatus = 0;
3431 *p++ = UCNV_SI;
3432 }
3433 *p++ = subchar[0];
3434 } else /* length == 2*/ {
3435 if(!(UBool)args->converter->fromUnicodeStatus) {
3436 /* in SBCS mode: switch to DBCS */
3437 args->converter->fromUnicodeStatus = 1;
3438 *p++ = UCNV_SO;
3439 }
3440 *p++ = subchar[0];
3441 *p++ = subchar[1];
3442 }
3443 break;
3444 } else {
73c04bcf
A
3445 /* save the subconverter's substitution string */
3446 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3447 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3448
3449 /* set our substitution string into the subconverter */
3450 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
374ca955
A
3451 myConverterData->currentConverter->subCharLen = (int8_t)length;
3452
73c04bcf
A
3453 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3454 args->converter = myConverterData->currentConverter;
374ca955
A
3455 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3456 ucnv_cbFromUWriteSub(args, 0, err);
3457 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
73c04bcf
A
3458 args->converter = cnv;
3459
3460 /* restore the subconverter's substitution string */
3461 myConverterData->currentConverter->subChars = currentSubChars;
3462 myConverterData->currentConverter->subCharLen = currentSubCharLen;
374ca955
A
3463
3464 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3465 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3466 uprv_memcpy(
3467 cnv->charErrorBuffer,
3468 myConverterData->currentConverter->charErrorBuffer,
3469 myConverterData->currentConverter->charErrorBufferLength);
3470 }
3471 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3472 myConverterData->currentConverter->charErrorBufferLength = 0;
3473 }
374ca955 3474 return;
b75a7d8f 3475 }
b75a7d8f
A
3476 default:
3477 /* not expected */
3478 break;
3479 }
3480 ucnv_cbFromUWriteBytes(args,
3481 buffer, (int32_t)(p - buffer),
3482 offsetIndex, err);
3483}
3484
73c04bcf
A
3485/*
3486 * Structure for cloning an ISO 2022 converter into a single memory block.
3487 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3488 * and then ucnv_safeClone() of the sub-converter may additionally align
3489 * currentConverter inside the cloneStruct, for which we need the deadSpace
3490 * after currentConverter.
3491 * This is because UAlignedMemory may be larger than the actually
3492 * necessary alignment size for the platform.
3493 * The other cloneStruct fields will not be moved around,
3494 * and are aligned properly with cloneStruct's alignment.
3495 */
b75a7d8f
A
3496struct cloneStruct
3497{
3498 UConverter cnv; /* the cloned converter itself; _ISO_2022_SafeClone() returns its address */
374ca955 3499 UConverter currentConverter; /* buffer handed to ucnv_safeClone() for the sub-converter clone */
73c04bcf
A
3500 UAlignedMemory deadSpace; /* slack after currentConverter for possible realignment, see above */
3501 UConverterDataISO2022 mydata; /* copy of the ISO-2022 state; cnv.extraInfo is pointed at it */
b75a7d8f
A
3502};
3503
3504
46f4442e 3505static UConverter *
b75a7d8f 3506_ISO_2022_SafeClone(
46f4442e
A
3507 const UConverter *cnv,
3508 void *stackBuffer,
3509 int32_t *pBufferSize,
b75a7d8f
A
3510 UErrorCode *status)
3511{
3512 struct cloneStruct * localClone;
374ca955
A
3513 UConverterDataISO2022 *cnvData;
3514 int32_t i, size;
b75a7d8f
A
3515
3516 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
374ca955
A
3517 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3518 return NULL;
b75a7d8f
A
3519 }
3520
374ca955 3521 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
b75a7d8f 3522 localClone = (struct cloneStruct *)stackBuffer;
b75a7d8f 3523
374ca955 3524 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
b75a7d8f 3525
374ca955 3526 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
73c04bcf
A
3527 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3528 localClone->cnv.isExtraLocal = TRUE;
b75a7d8f 3529
374ca955 3530 /* share the subconverters */
b75a7d8f 3531
374ca955 3532 if(cnvData->currentConverter != NULL) {
73c04bcf 3533 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
374ca955
A
3534 localClone->mydata.currentConverter =
3535 ucnv_safeClone(cnvData->currentConverter,
3536 &localClone->currentConverter,
3537 &size, status);
3538 if(U_FAILURE(*status)) {
3539 return NULL;
b75a7d8f 3540 }
b75a7d8f
A
3541 }
3542
374ca955
A
3543 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3544 if(cnvData->myConverterArray[i] != NULL) {
3545 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3546 }
b75a7d8f
A
3547 }
3548
b75a7d8f
A
3549 return &localClone->cnv;
3550}
3551
3552static void
3553_ISO_2022_GetUnicodeSet(const UConverter *cnv,
73c04bcf 3554 const USetAdder *sa,
b75a7d8f
A
3555 UConverterUnicodeSet which,
3556 UErrorCode *pErrorCode)
3557{
3558 int32_t i;
b75a7d8f
A
3559 UConverterDataISO2022* cnvData;
3560
3561 if (U_FAILURE(*pErrorCode)) {
3562 return;
3563 }
374ca955 3564#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
3565 if (cnv->sharedData == &_ISO2022Data) {
3566 /* We use UTF-8 in this case */
374ca955
A
3567 sa->addRange(sa->set, 0, 0xd7FF);
3568 sa->addRange(sa->set, 0xE000, 0x10FFFF);
b75a7d8f
A
3569 return;
3570 }
374ca955 3571#endif
b75a7d8f
A
3572
3573 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
b75a7d8f 3574
374ca955
A
3575 /* open a set and initialize it with code points that are algorithmically round-tripped */
3576 switch(cnvData->locale[0]){
3577 case 'j':
46f4442e
A
3578 /* include JIS X 0201 which is hardcoded */
3579 sa->add(sa->set, 0xa5);
3580 sa->add(sa->set, 0x203e);
374ca955
A
3581 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3582 /* include Latin-1 for some variants of JP */
3583 sa->addRange(sa->set, 0, 0xff);
3584 } else {
3585 /* include ASCII for JP */
3586 sa->addRange(sa->set, 0, 0x7f);
3587 }
46f4442e
A
3588 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3589 /*
3590 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3591 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3592 * use half-width Katakana.
3593 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3594 * half-width Katakana via the ESC ( I sequence.
3595 * However, we only emit (fromUnicode) half-width Katakana according to the
3596 * definition of each variant.
3597 *
3598 * When including fallbacks,
3599 * we need to include half-width Katakana Unicode code points for all JP variants because
3600 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3601 */
374ca955 3602 /* include half-width Katakana for JP */
46f4442e 3603 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
374ca955
A
3604 }
3605 break;
3606 case 'c':
3607 case 'z':
3608 /* include ASCII for CN */
3609 sa->addRange(sa->set, 0, 0x7f);
3610 break;
3611 case 'k':
3612 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3613 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3614 cnvData->currentConverter, sa, which, pErrorCode);
73c04bcf
A
3615 /* the loop over myConverterArray[] will simply not find another converter */
3616 break;
374ca955
A
3617 default:
3618 break;
b75a7d8f
A
3619 }
3620
46f4442e 3621#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
374ca955
A
3622 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3623 cnvData->version==0 && i==CNS_11643
3624 ) {
3625 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3626 ucnv_MBCSGetUnicodeSetForBytes(
3627 cnvData->myConverterArray[i],
3628 sa, UCNV_ROUNDTRIP_SET,
3629 0, 0x81, 0x82,
3630 pErrorCode);
46f4442e
A
3631 }
3632#endif
3633
3634 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3635 UConverterSetFilter filter;
3636 if(cnvData->myConverterArray[i]!=NULL) {
3637 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3638 cnvData->version==0 && i==CNS_11643
3639 ) {
3640 /*
3641 * Version-specific for CN:
3642 * CN version 0 does not map CNS planes 3..7 although
3643 * they are all available in the CNS conversion table;
3644 * CN version 1 (-EXT) does map them all.
3645 * The two versions create different Unicode sets.
3646 */
3647 filter=UCNV_SET_FILTER_2022_CN;
3648 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3649 /*
3650 * Only add code points that map to Shift-JIS codes
3651 * corresponding to JIS X 0208.
3652 */
3653 filter=UCNV_SET_FILTER_SJIS;
3654 } else if(i==KSC5601) {
3655 /*
3656 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3657 * are broader than GR94.
3658 */
3659 filter=UCNV_SET_FILTER_GR94DBCS;
374ca955 3660 } else {
46f4442e 3661 filter=UCNV_SET_FILTER_NONE;
374ca955 3662 }
46f4442e 3663 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
374ca955 3664 }
b75a7d8f 3665 }
73c04bcf
A
3666
3667 /*
3668 * ISO 2022 converters must not convert SO/SI/ESC despite what
3669 * sub-converters do by themselves.
3670 * Remove these characters from the set.
3671 */
3672 sa->remove(sa->set, 0x0e);
3673 sa->remove(sa->set, 0x0f);
3674 sa->remove(sa->set, 0x1b);
46f4442e
A
3675
3676 /* ISO 2022 converters do not convert C1 controls either */
3677 sa->removeRange(sa->set, 0x80, 0x9f);
b75a7d8f
A
3678}
3679
374ca955
A
3680static const UConverterImpl _ISO2022Impl={
3681 UCNV_ISO_2022,
3682
3683 NULL,
3684 NULL,
3685
3686 _ISO2022Open,
3687 _ISO2022Close,
3688 _ISO2022Reset,
3689
3690#ifdef U_ENABLE_GENERIC_ISO_2022
3691 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3692 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3693 ucnv_fromUnicode_UTF8,
3694 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3695#else
3696 NULL,
3697 NULL,
3698 NULL,
3699 NULL,
3700#endif
3701 NULL,
3702
3703 NULL,
3704 _ISO2022getName,
3705 _ISO_2022_WriteSub,
3706 _ISO_2022_SafeClone,
3707 _ISO_2022_GetUnicodeSet
3708};
3709static const UConverterStaticData _ISO2022StaticData={
3710 sizeof(UConverterStaticData),
3711 "ISO_2022",
3712 2022,
3713 UCNV_IBM,
3714 UCNV_ISO_2022,
3715 1,
3716 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3717 { 0x1a, 0, 0, 0 },
3718 1,
3719 FALSE,
3720 FALSE,
3721 0,
3722 0,
3723 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3724};
3725const UConverterSharedData _ISO2022Data={
3726 sizeof(UConverterSharedData),
3727 ~((uint32_t) 0),
3728 NULL,
3729 NULL,
3730 &_ISO2022StaticData,
3731 FALSE,
3732 &_ISO2022Impl,
3733 0
3734};
3735
3736/*************JP****************/
3737static const UConverterImpl _ISO2022JPImpl={
3738 UCNV_ISO_2022,
3739
3740 NULL,
3741 NULL,
3742
3743 _ISO2022Open,
3744 _ISO2022Close,
3745 _ISO2022Reset,
3746
3747 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3748 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3749 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3750 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3751 NULL,
3752
3753 NULL,
3754 _ISO2022getName,
3755 _ISO_2022_WriteSub,
3756 _ISO_2022_SafeClone,
3757 _ISO_2022_GetUnicodeSet
3758};
3759static const UConverterStaticData _ISO2022JPStaticData={
3760 sizeof(UConverterStaticData),
3761 "ISO_2022_JP",
3762 0,
3763 UCNV_IBM,
3764 UCNV_ISO_2022,
3765 1,
3766 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3767 { 0x1a, 0, 0, 0 },
3768 1,
3769 FALSE,
3770 FALSE,
3771 0,
3772 0,
3773 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3774};
3775static const UConverterSharedData _ISO2022JPData={
3776 sizeof(UConverterSharedData),
3777 ~((uint32_t) 0),
3778 NULL,
3779 NULL,
3780 &_ISO2022JPStaticData,
3781 FALSE,
3782 &_ISO2022JPImpl,
3783 0
3784};
3785
3786/************* KR ***************/
3787static const UConverterImpl _ISO2022KRImpl={
3788 UCNV_ISO_2022,
3789
3790 NULL,
3791 NULL,
3792
3793 _ISO2022Open,
3794 _ISO2022Close,
3795 _ISO2022Reset,
3796
3797 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3798 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3799 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3800 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3801 NULL,
3802
3803 NULL,
3804 _ISO2022getName,
3805 _ISO_2022_WriteSub,
3806 _ISO_2022_SafeClone,
3807 _ISO_2022_GetUnicodeSet
3808};
3809static const UConverterStaticData _ISO2022KRStaticData={
3810 sizeof(UConverterStaticData),
3811 "ISO_2022_KR",
3812 0,
3813 UCNV_IBM,
3814 UCNV_ISO_2022,
3815 1,
3816 3, /* max 3 bytes per UChar: SO+DBCS */
3817 { 0x1a, 0, 0, 0 },
3818 1,
3819 FALSE,
3820 FALSE,
3821 0,
3822 0,
3823 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3824};
3825static const UConverterSharedData _ISO2022KRData={
3826 sizeof(UConverterSharedData),
3827 ~((uint32_t) 0),
3828 NULL,
3829 NULL,
3830 &_ISO2022KRStaticData,
3831 FALSE,
3832 &_ISO2022KRImpl,
3833 0
3834};
3835
3836/*************** CN ***************/
3837static const UConverterImpl _ISO2022CNImpl={
3838
3839 UCNV_ISO_2022,
3840
3841 NULL,
3842 NULL,
3843
3844 _ISO2022Open,
3845 _ISO2022Close,
3846 _ISO2022Reset,
3847
3848 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3849 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3850 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3851 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3852 NULL,
3853
3854 NULL,
3855 _ISO2022getName,
3856 _ISO_2022_WriteSub,
3857 _ISO_2022_SafeClone,
3858 _ISO_2022_GetUnicodeSet
3859};
3860static const UConverterStaticData _ISO2022CNStaticData={
3861 sizeof(UConverterStaticData),
3862 "ISO_2022_CN",
3863 0,
3864 UCNV_IBM,
3865 UCNV_ISO_2022,
73c04bcf 3866 1,
374ca955
A
3867 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3868 { 0x1a, 0, 0, 0 },
3869 1,
3870 FALSE,
3871 FALSE,
3872 0,
3873 0,
3874 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3875};
3876static const UConverterSharedData _ISO2022CNData={
3877 sizeof(UConverterSharedData),
3878 ~((uint32_t) 0),
3879 NULL,
3880 NULL,
3881 &_ISO2022CNStaticData,
3882 FALSE,
3883 &_ISO2022CNImpl,
3884 0
3885};
3886
3887
3888
b75a7d8f 3889#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */