]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv2022.cpp
ICU-511.35.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
51004dcb 3* Copyright (C) 2000-2012, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
4388f060 6* file name: ucnv2022.cpp
b75a7d8f
A
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2000feb03
12* created by: Markus W. Scherer
13*
14* Change history:
15*
16* 06/29/2000 helena Major rewrite of the callback APIs.
17* 08/08/2000 Ram Included support for ISO-2022-JP-2
18* Changed implementation of toUnicode
19* function
20* 08/21/2000 Ram Added support for ISO-2022-KR
21* 08/29/2000 Ram Separated implementation of EBCDIC to
22* ucnvebdc.c
23* 09/20/2000 Ram Added support for ISO-2022-CN
24* Added implementations for getNextUChar()
25* for specific 2022 country variants.
26* 10/31/2000 Ram Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
374ca955 31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
4388f060 37#include "unicode/utf16.h"
374ca955 38#include "ucnv_imp.h"
b75a7d8f
A
39#include "ucnv_bld.h"
40#include "ucnv_cnv.h"
41#include "ucnvmbcs.h"
42#include "cstring.h"
43#include "cmemory.h"
4388f060 44#include "uassert.h"
b75a7d8f 45
374ca955
A
46#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
47
48#ifdef U_ENABLE_GENERIC_ISO_2022
49/*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
74 * for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78#endif
79
80static const char SHIFT_IN_STR[] = "\x0F";
51004dcb 81// static const char SHIFT_OUT_STR[] = "\x0E";
b75a7d8f
A
82
83#define CR 0x0D
84#define LF 0x0A
85#define H_TAB 0x09
86#define V_TAB 0x0B
87#define SPACE 0x20
88
46f4442e
A
89enum {
90 HWKANA_START=0xff61,
91 HWKANA_END=0xff9f
92};
93
94/*
95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
96 * as bytes 21..7E. (Subtract 0x80.)
97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
98 * as bytes 20..7F. (Subtract 0x80.)
99 * Do not encode C1 control codes with native bytes 80..9F
100 * as bytes 00..1F (C0 control codes).
101 */
102enum {
103 GR94_START=0xa1,
104 GR94_END=0xfe,
105 GR96_START=0xa0,
106 GR96_END=0xff
107};
108
73c04bcf
A
109/*
110 * ISO 2022 control codes must not be converted from Unicode
111 * because they would mess up the byte stream.
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
113 * corresponding to SO, SI, and ESC.
114 */
115#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
116
374ca955 117/* for ISO-2022-JP and -CN implementations */
b75a7d8f 118typedef enum {
374ca955
A
119 /* shared values */
120 INVALID_STATE=-1,
b75a7d8f 121 ASCII = 0,
374ca955
A
122
123 SS2_STATE=0x10,
124 SS3_STATE,
125
126 /* JP */
b75a7d8f
A
127 ISO8859_1 = 1 ,
128 ISO8859_7 = 2 ,
129 JISX201 = 3,
130 JISX208 = 4,
131 JISX212 = 5,
132 GB2312 =6,
133 KSC5601 =7,
134 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */
b75a7d8f 135
374ca955
A
136 /* CN */
137 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
138 GB2312_1=1,
139 ISO_IR_165=2,
140 CNS_11643=3,
141
142 /*
143 * these are used in StateEnum and ISO2022State variables,
144 * but CNS_11643 must be used to index into myConverterArray[]
145 */
146 CNS_11643_0=0x20,
147 CNS_11643_1,
148 CNS_11643_2,
149 CNS_11643_3,
150 CNS_11643_4,
151 CNS_11643_5,
152 CNS_11643_6,
153 CNS_11643_7
b75a7d8f
A
154} StateEnum;
155
374ca955
A
156/* is the StateEnum charset value for a DBCS charset? */
157#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
158
159#define CSM(cs) ((uint16_t)1<<(cs))
b75a7d8f 160
374ca955
A
161/*
162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
164 *
165 * Note: The converter uses some leniency:
166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
167 * all versions, not just JIS7 and JIS8.
168 * - ICU does not distinguish between different versions of JIS X 0208.
169 */
729e4ab9
A
170enum { MAX_JA_VERSION=4 };
171static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
374ca955
A
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
175 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
177};
b75a7d8f
A
178
/*
 * Type tag stored in UConverterDataISO2022.currentType.
 * _ISO2022Open() initializes that field to ASCII1.
 */
179typedef enum {
180 ASCII1=0,
181 LATIN1,
182 SBCS,
183 DBCS,
374ca955
A
184 MBCS,
185 HWKANA
b75a7d8f
A
186}Cnv2022Type;
187
374ca955
A
188typedef struct ISO2022State {
189 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
190 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
191 int8_t prevG; /* g before single shift (SS2 or SS3) */
192} ISO2022State;
193
b75a7d8f
A
194#define UCNV_OPTIONS_VERSION_MASK 0xf
195#define UCNV_2022_MAX_CONVERTERS 10
196
/*
 * Per-converter ISO-2022 state. _ISO2022Open() allocates and zero-fills one of
 * these and stores it in UConverter.extraInfo; _ISO2022Close() releases it.
 */
197typedef struct{
73c04bcf 198 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; /* shared data loaded by _ISO2022Open(), indexed by StateEnum charset values */
b75a7d8f 199 UConverter *currentConverter; /* opened with ucnv_open() for the Korean variant in _ISO2022Open(); closed in _ISO2022Close() */
b75a7d8f 200 Cnv2022Type currentType; /* starts out as ASCII1 */
374ca955 201 ISO2022State toU2022State, fromU2022State; /* zeroed by _ISO2022Reset() for the respective direction */
b75a7d8f
A
202 uint32_t key; /* escape sequence key accumulated across getKey_2022() calls; 0 when no sequence is pending */
203 uint32_t version; /* from (options & UCNV_OPTIONS_VERSION_MASK), adjusted per locale in _ISO2022Open() */
73c04bcf
A
204#ifdef U_ENABLE_GENERIC_ISO_2022
205 UBool isFirstBuffer;
206#endif
d5d484b0 207 UBool isEmptySegment; /* cleared by _ISO2022Reset() when the toUnicode side is reset */
b75a7d8f 208 char name[30]; /* string returned by _ISO2022getName() */
73c04bcf 209 char locale[3]; /* "ja", "ko" or "cn"; left empty for the generic converter */
b75a7d8f
A
210}UConverterDataISO2022;
211
374ca955 212/* Protos */
b75a7d8f
A
213/* ISO-2022 ----------------------------------------------------------------- */
214
215/*Forward declaration */
46f4442e 216U_CFUNC void
374ca955
A
217ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
218 UErrorCode * err);
46f4442e 219U_CFUNC void
374ca955
A
220ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
221 UErrorCode * err);
b75a7d8f
A
222
223#define ESC_2022 0x1B /*ESC*/
224
225typedef enum
226{
227 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
228 VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
229 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
374ca955 230 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
b75a7d8f
A
231} UCNV_TableStates_2022;
232
233/*
234* The way these state transition arrays work is:
235* ex : ESC$B is the sequence for JISX208
236* a) First Iteration: char is ESC
237* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
238* int x = normalize_esq_chars_2022[27] which is equal to 1
239* ii) Search for this value in escSeqStateTable_Key_2022[]
240* value of x is stored at escSeqStateTable_Key_2022[0]
241* iii) Save this index as offset
242* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
243* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
244* b) Switch on this state and continue to next char
245* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
246* which is normalize_esq_chars_2022[36] == 4
247* ii) x is currently 1(from above)
248* x<<=5 -- x is now 32
249* x+=normalize_esq_chars_2022[36]
250* now x is 36
251* iii) Search for this value in escSeqStateTable_Key_2022[]
252* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
253* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
254* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
255* c) Switch on this state and continue to next char
256* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
257* ii) x is currently 36 (from above)
258* x<<=5 -- x is now 1152
259* x+=normalize_esq_chars_2022[66]
260* now x is 1161
261* iii) Search for this value in escSeqStateTable_Key_2022[]
262* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
263* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
264* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
265* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
266*/
267
268
269/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value (0..255) to the small code used when building escape
 * sequence keys: a nonzero code (1..29 in this table) for bytes that can occur
 * in a recognized escape sequence, 0 for every other byte.
 * getKey_2022() rejects a byte whose code is 0 and otherwise folds the code
 * into the running key as (key<<5)+code.
 */
270static const int8_t normalize_esq_chars_2022[256] = {
271/* 0 1 2 3 4 5 6 7 8 9 */
272
273 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
274 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
275 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
276 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
277 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
278 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
279 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
280 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
281 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
282 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298 ,0 ,0 ,0 ,0 ,0 ,0
299};
300
374ca955
A
301#ifdef U_ENABLE_GENERIC_ISO_2022
302/*
303 * When the generic ISO-2022 converter is completely removed, not just disabled
304 * per #ifdef, then the following state table and the associated tables that are
305 * dimensioned with MAX_STATES_2022 should be trimmed.
306 *
307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
308 * the associated escape sequences starting with ESC ( B should be removed.
309 * This includes the ones with key values 1097 and all of the ones above 1000000.
310 *
311 * For the latter, the tables can simply be truncated.
312 * For the former, since the tables must be kept parallel, it is probably best
313 * to simply duplicate an adjacent table cell, parallel in all tables.
314 *
315 * It may make sense to restructure the tables, especially by using small search
316 * tables for the variants instead of indexing them parallel to the table here.
317 */
318#endif
319
b75a7d8f
A
/*
 * Keys of all recognized (partial and complete) escape sequences, in ascending
 * order. getKey_2022() binary-searches this table, so the order must be kept.
 * The index of a key is also the index of its entry in
 * escSeqStateTable_Value_2022[]; MAX_STATES_2022 dimensions the other
 * escape sequence tables in this file as well.
 */
320#define MAX_STATES_2022 74
321static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
322/* 0 1 2 3 4 5 6 7 8 9 */
323
324 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
325 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
326 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
327 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
328 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
329 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
330 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
331 ,35947631 ,35947635 ,35947636 ,35947638
332};
333
374ca955 334#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
335
/*
 * Converter name for each escape sequence key, or NULL where no name is given.
 * Same dimension as escSeqStateTable_Key_2022[] and meant to be read with the
 * same index. Only compiled when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
336static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
337 /* 0 1 2 3 4 5 6 7 8 9 */
338
339 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
374ca955 340 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
b75a7d8f
A
341 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
342 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
343 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
344 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
345 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
346 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
347};
348
374ca955
A
349#endif
350
/*
 * UCNV_TableStates_2022 value for each entry of escSeqStateTable_Key_2022[],
 * at the same index. getKey_2022() returns the entry for the key it finds:
 * whether the bytes seen so far are a prefix of an escape sequence
 * (non-terminal), a complete one (terminal), or both (maybe-terminal).
 */
46f4442e 351static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
b75a7d8f 352/* 0 1 2 3 4 5 6 7 8 9 */
374ca955 353 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
b75a7d8f
A
354 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
359 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
360 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
361};
362
363
b75a7d8f
A
364/* Type def for refactoring changeState_2022 code*/
/*
 * Identifies the ISO-2022 variant on whose behalf changeState_2022() runs
 * (its "var" parameter). ISO_2022 exists only when the generic converter is
 * compiled in.
 */
365typedef enum{
374ca955 366#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f 367 ISO_2022=0,
374ca955 368#endif
b75a7d8f
A
369 ISO_2022_JP=1,
370 ISO_2022_KR=2,
371 ISO_2022_CN=3
372} Variant2022;
373
b75a7d8f 374/*********** ISO 2022 Converter Protos ***********/
46f4442e 375static void
729e4ab9 376_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
b75a7d8f
A
377
378static void
379 _ISO2022Close(UConverter *converter);
380
46f4442e 381static void
b75a7d8f
A
382_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
383
46f4442e 384static const char*
b75a7d8f
A
385_ISO2022getName(const UConverter* cnv);
386
46f4442e 387static void
b75a7d8f
A
388_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
389
46f4442e 390static UConverter *
b75a7d8f
A
391_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
392
374ca955 393#ifdef U_ENABLE_GENERIC_ISO_2022
46f4442e 394static void
374ca955
A
395T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
396#endif
b75a7d8f 397
4388f060
A
398namespace {
399
374ca955 400/*const UConverterSharedData _ISO2022Data;*/
4388f060
A
401extern const UConverterSharedData _ISO2022JPData;
402extern const UConverterSharedData _ISO2022KRData;
403extern const UConverterSharedData _ISO2022CNData;
404
405} // namespace
b75a7d8f 406
374ca955 407/*************** Converter implementations ******************/
b75a7d8f 408
73c04bcf 409/* The purpose of this function is to get around gcc compiler warnings. */
4388f060 410static inline void
73c04bcf
A
411fromUWriteUInt8(UConverter *cnv,
412 const char *bytes, int32_t length,
413 uint8_t **target, const char *targetLimit,
414 int32_t **offsets,
415 int32_t sourceIndex,
416 UErrorCode *pErrorCode)
417{
418 char *targetChars = (char *)*target;
419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
420 offsets, sourceIndex, pErrorCode);
421 *target = (uint8_t*)targetChars;
422
423}
424
4388f060
A
425static inline void
426setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
374ca955
A
427 if(myConverterData->version == 1) {
428 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 429
374ca955
A
430 cnv->toUnicodeStatus=0; /* offset */
431 cnv->mode=0; /* state */
432 cnv->toULength=0; /* byteIndex */
433 }
434}
b75a7d8f 435
4388f060 436static inline void
374ca955
A
437setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
438 /* in ISO-2022-KR the designator sequence appears only once
439 * in a file so we append it only once
440 */
441 if( converter->charErrorBufferLength==0){
b75a7d8f 442
374ca955
A
443 converter->charErrorBufferLength = 4;
444 converter->charErrorBuffer[0] = 0x1b;
445 converter->charErrorBuffer[1] = 0x24;
446 converter->charErrorBuffer[2] = 0x29;
447 converter->charErrorBuffer[3] = 0x43;
448 }
449 if(myConverterData->version == 1) {
450 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 451
374ca955
A
452 cnv->fromUChar32=0;
453 cnv->fromUnicodeStatus=1; /* prevLength */
454 }
455}
b75a7d8f 456
46f4442e 457static void
729e4ab9 458_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
b75a7d8f 459
374ca955 460 char myLocale[6]={' ',' ',' ',' ',' ',' '};
b75a7d8f 461
374ca955
A
462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
463 if(cnv->extraInfo != NULL) {
729e4ab9 464 UConverterNamePieces stackPieces;
4388f060 465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
374ca955
A
466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
467 uint32_t version;
b75a7d8f 468
729e4ab9
A
469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
470
374ca955 471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
374ca955 472 myConverterData->currentType = ASCII1;
374ca955 473 cnv->fromUnicodeStatus =FALSE;
729e4ab9
A
474 if(pArgs->locale){
475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
374ca955 476 }
729e4ab9 477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
73c04bcf 478 myConverterData->version = version;
46f4442e 479 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
73c04bcf
A
480 (myLocale[2]=='_' || myLocale[2]=='\0'))
481 {
482 size_t len=0;
374ca955 483 /* open the required converters and cache them */
729e4ab9
A
484 if(version>MAX_JA_VERSION) {
485 /* prevent indexing beyond jpCharsetMasks[] */
486 myConverterData->version = version = 0;
487 }
374ca955 488 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
729e4ab9
A
489 myConverterData->myConverterArray[ISO8859_7] =
490 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
374ca955 491 }
729e4ab9
A
492 myConverterData->myConverterArray[JISX208] =
493 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
374ca955 494 if(jpCharsetMasks[version]&CSM(JISX212)) {
729e4ab9
A
495 myConverterData->myConverterArray[JISX212] =
496 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
374ca955
A
497 }
498 if(jpCharsetMasks[version]&CSM(GB2312)) {
729e4ab9
A
499 myConverterData->myConverterArray[GB2312] =
500 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
374ca955
A
501 }
502 if(jpCharsetMasks[version]&CSM(KSC5601)) {
729e4ab9
A
503 myConverterData->myConverterArray[KSC5601] =
504 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
374ca955 505 }
b75a7d8f 506
374ca955
A
507 /* set the function pointers to appropriate funtions */
508 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
509 uprv_strcpy(myConverterData->locale,"ja");
b75a7d8f 510
46f4442e 511 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
374ca955
A
512 len = uprv_strlen(myConverterData->name);
513 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
514 myConverterData->name[len+1]='\0';
515 }
46f4442e 516 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
73c04bcf
A
517 (myLocale[2]=='_' || myLocale[2]=='\0'))
518 {
729e4ab9
A
519 const char *cnvName;
520 if(version==1) {
521 cnvName="icu-internal-25546";
522 } else {
523 cnvName="ibm-949";
524 myConverterData->version=version=0;
525 }
526 if(pArgs->onlyTestIsLoadable) {
527 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
528 uprv_free(cnv->extraInfo);
529 cnv->extraInfo=NULL;
530 return;
531 } else {
532 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
73c04bcf
A
533 if (U_FAILURE(*errorCode)) {
534 _ISO2022Close(cnv);
535 return;
536 }
b75a7d8f 537
729e4ab9
A
538 if(version==1) {
539 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
540 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
541 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
542 }else{
543 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
73c04bcf 544 }
b75a7d8f 545
729e4ab9
A
546 /* initialize the state variables */
547 setInitialStateToUnicodeKR(cnv, myConverterData);
548 setInitialStateFromUnicodeKR(cnv, myConverterData);
b75a7d8f 549
729e4ab9
A
550 /* set the function pointers to appropriate funtions */
551 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
552 uprv_strcpy(myConverterData->locale,"ko");
553 }
b75a7d8f 554 }
46f4442e 555 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
73c04bcf
A
556 (myLocale[2]=='_' || myLocale[2]=='\0'))
557 {
b75a7d8f
A
558
559 /* open the required converters and cache them */
729e4ab9
A
560 myConverterData->myConverterArray[GB2312_1] =
561 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
374ca955 562 if(version==1) {
729e4ab9
A
563 myConverterData->myConverterArray[ISO_IR_165] =
564 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
374ca955 565 }
729e4ab9
A
566 myConverterData->myConverterArray[CNS_11643] =
567 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
b75a7d8f 568
b75a7d8f
A
569
570 /* set the function pointers to appropriate funtions */
571 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
572 uprv_strcpy(myConverterData->locale,"cn");
573
729e4ab9 574 if (version==0){
b75a7d8f 575 myConverterData->version = 0;
46f4442e 576 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
729e4ab9
A
577 }else if (version==1){
578 myConverterData->version = 1;
579 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
580 }else {
581 myConverterData->version = 2;
582 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
b75a7d8f
A
583 }
584 }
585 else{
374ca955 586#ifdef U_ENABLE_GENERIC_ISO_2022
73c04bcf
A
587 myConverterData->isFirstBuffer = TRUE;
588
b75a7d8f
A
589 /* append the UTF-8 escape sequence */
590 cnv->charErrorBufferLength = 3;
591 cnv->charErrorBuffer[0] = 0x1b;
592 cnv->charErrorBuffer[1] = 0x25;
593 cnv->charErrorBuffer[2] = 0x42;
594
595 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
596 /* initialize the state variables */
b75a7d8f 597 uprv_strcpy(myConverterData->name,"ISO_2022");
374ca955
A
598#else
599 *errorCode = U_UNSUPPORTED_ERROR;
600 return;
601#endif
b75a7d8f
A
602 }
603
374ca955
A
604 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
605
729e4ab9 606 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
374ca955
A
607 _ISO2022Close(cnv);
608 }
b75a7d8f
A
609 } else {
610 *errorCode = U_MEMORY_ALLOCATION_ERROR;
611 }
b75a7d8f
A
612}
613
614
615static void
616_ISO2022Close(UConverter *converter) {
374ca955
A
617 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
618 UConverterSharedData **array = myData->myConverterArray;
619 int32_t i;
b75a7d8f
A
620
621 if (converter->extraInfo != NULL) {
622 /*close the array of converter pointers and free the memory*/
374ca955
A
623 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
624 if(array[i]!=NULL) {
625 ucnv_unloadSharedDataIfReady(array[i]);
b75a7d8f 626 }
b75a7d8f
A
627 }
628
374ca955 629 ucnv_close(myData->currentConverter);
b75a7d8f
A
630
631 if(!converter->isExtraLocal){
632 uprv_free (converter->extraInfo);
374ca955 633 converter->extraInfo = NULL;
b75a7d8f
A
634 }
635 }
636}
637
638static void
639_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
640 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
374ca955
A
641 if(choice<=UCNV_RESET_TO_UNICODE) {
642 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
643 myConverterData->key = 0;
d5d484b0 644 myConverterData->isEmptySegment = FALSE;
374ca955
A
645 }
646 if(choice!=UCNV_RESET_TO_UNICODE) {
647 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
648 }
649#ifdef U_ENABLE_GENERIC_ISO_2022
650 if(myConverterData->locale[0] == 0){
b75a7d8f
A
651 if(choice<=UCNV_RESET_TO_UNICODE) {
652 myConverterData->isFirstBuffer = TRUE;
374ca955 653 myConverterData->key = 0;
b75a7d8f
A
654 if (converter->mode == UCNV_SO){
655 ucnv_close (myConverterData->currentConverter);
656 myConverterData->currentConverter=NULL;
657 }
46f4442e 658 converter->mode = UCNV_SI;
b75a7d8f
A
659 }
660 if(choice!=UCNV_RESET_TO_UNICODE) {
661 /* re-append UTF-8 escape sequence */
662 converter->charErrorBufferLength = 3;
663 converter->charErrorBuffer[0] = 0x1b;
664 converter->charErrorBuffer[1] = 0x28;
665 converter->charErrorBuffer[2] = 0x42;
666 }
667 }
374ca955
A
668 else
669#endif
670 {
b75a7d8f 671 /* reset the state variables */
374ca955 672 if(myConverterData->locale[0] == 'k'){
b75a7d8f
A
673 if(choice<=UCNV_RESET_TO_UNICODE) {
674 setInitialStateToUnicodeKR(converter, myConverterData);
675 }
676 if(choice!=UCNV_RESET_TO_UNICODE) {
677 setInitialStateFromUnicodeKR(converter, myConverterData);
678 }
679 }
680 }
681}
682
46f4442e 683static const char*
b75a7d8f
A
684_ISO2022getName(const UConverter* cnv){
685 if(cnv->extraInfo){
686 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
687 return myData->name;
688 }
689 return NULL;
690}
691
b75a7d8f 692
374ca955
A
693/*************** to unicode *******************/
694/****************************************************************************
695 * Recognized escape sequences are
696 * <ESC>(B ASCII
697 * <ESC>.A ISO-8859-1
698 * <ESC>.F ISO-8859-7
699 * <ESC>(J JISX-201
700 * <ESC>(I JISX-201
701 * <ESC>$B JISX-208
702 * <ESC>$@ JISX-208
703 * <ESC>$(D JISX-212
704 * <ESC>$A GB2312
705 * <ESC>$(C KSC5601
706 */
/*
 * ISO-2022-JP: StateEnum charset selected by each escape sequence, or
 * SS2_STATE for the single shift, or INVALID_STATE if the sequence is not used
 * by this variant. Same dimension as escSeqStateTable_Key_2022[];
 * presumably indexed with the offset that getKey_2022() reports - confirm
 * against the caller.
 */
46f4442e 707static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
374ca955
A
708/* 0 1 2 3 4 5 6 7 8 9 */
709 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
710 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
711 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
712 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
713 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
714 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
715 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
716 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
717};
b75a7d8f 718
374ca955 719/*************** to unicode *******************/
/*
 * ISO-2022-CN: StateEnum charset selected by each escape sequence, or
 * SS2_STATE / SS3_STATE for the single shifts, or INVALID_STATE if the
 * sequence is not used by this variant. Same dimension as
 * escSeqStateTable_Key_2022[]; presumably indexed with the offset that
 * getKey_2022() reports - confirm against the caller.
 */
46f4442e 720static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
374ca955
A
721/* 0 1 2 3 4 5 6 7 8 9 */
722 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
723 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
724 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
725 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
726 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
727 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
728 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
729 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
730};
b75a7d8f 731
b75a7d8f 732
46f4442e 733static UCNV_TableStates_2022
374ca955
A
734getKey_2022(char c,int32_t* key,int32_t* offset){
735 int32_t togo;
736 int32_t low = 0;
737 int32_t hi = MAX_STATES_2022;
738 int32_t oldmid=0;
b75a7d8f 739
374ca955
A
740 togo = normalize_esq_chars_2022[(uint8_t)c];
741 if(togo == 0) {
742 /* not a valid character anywhere in an escape sequence */
743 *key = 0;
744 *offset = 0;
745 return INVALID_2022;
746 }
747 togo = (*key << 5) + togo;
b75a7d8f 748
374ca955 749 while (hi != low) /*binary search*/{
b75a7d8f 750
374ca955
A
751 register int32_t mid = (hi+low) >> 1; /*Finds median*/
752
46f4442e 753 if (mid == oldmid)
374ca955
A
754 break;
755
756 if (escSeqStateTable_Key_2022[mid] > togo){
757 hi = mid;
758 }
759 else if (escSeqStateTable_Key_2022[mid] < togo){
760 low = mid;
761 }
762 else /*we found it*/{
763 *key = togo;
764 *offset = mid;
46f4442e 765 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
374ca955
A
766 }
767 oldmid = mid;
b75a7d8f 768
b75a7d8f 769 }
b75a7d8f 770
374ca955
A
771 *key = 0;
772 *offset = 0;
773 return INVALID_2022;
b75a7d8f
A
774}
775
374ca955
A
776/*runs through a state machine to determine the escape sequence - codepage correspondance
777 */
46f4442e 778static void
374ca955 779changeState_2022(UConverter* _this,
46f4442e 780 const char** source,
374ca955
A
781 const char* sourceLimit,
782 Variant2022 var,
783 UErrorCode* err){
784 UCNV_TableStates_2022 value;
785 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
786 uint32_t key = myData2022->key;
73c04bcf 787 int32_t offset = 0;
fd0068a8 788 int8_t initialToULength = _this->toULength;
374ca955
A
789 char c;
790
791 value = VALID_NON_TERMINAL_2022;
792 while (*source < sourceLimit) {
793 c = *(*source)++;
794 _this->toUBytes[_this->toULength++]=(uint8_t)c;
795 value = getKey_2022(c,(int32_t *) &key, &offset);
46f4442e 796
374ca955 797 switch (value){
b75a7d8f 798
374ca955
A
799 case VALID_NON_TERMINAL_2022 :
800 /* continue with the loop */
801 break;
b75a7d8f 802
374ca955
A
803 case VALID_TERMINAL_2022:
804 key = 0;
805 goto DONE;
b75a7d8f 806
374ca955
A
807 case INVALID_2022:
808 goto DONE;
b75a7d8f 809
374ca955
A
810 case VALID_MAYBE_TERMINAL_2022:
811#ifdef U_ENABLE_GENERIC_ISO_2022
812 /* ESC ( B is ambiguous only for ISO_2022 itself */
813 if(var == ISO_2022) {
814 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
815 _this->toULength = 0;
b75a7d8f 816
374ca955
A
817 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
818
819 /* continue with the loop */
820 value = VALID_NON_TERMINAL_2022;
821 break;
822 } else
823#endif
824 {
825 /* not ISO_2022 itself, finish here */
826 value = VALID_TERMINAL_2022;
827 key = 0;
828 goto DONE;
b75a7d8f
A
829 }
830 }
b75a7d8f 831 }
b75a7d8f 832
374ca955
A
833DONE:
834 myData2022->key = key;
b75a7d8f 835
374ca955
A
836 if (value == VALID_NON_TERMINAL_2022) {
837 /* indicate that the escape sequence is incomplete: key!=0 */
838 return;
839 } else if (value == INVALID_2022 ) {
840 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
374ca955
A
841 } else /* value == VALID_TERMINAL_2022 */ {
842 switch(var){
843#ifdef U_ENABLE_GENERIC_ISO_2022
844 case ISO_2022:
845 {
846 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
847 if(chosenConverterName == NULL) {
848 /* SS2 or SS3 */
849 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
46f4442e 850 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955 851 return;
b75a7d8f 852 }
374ca955
A
853
854 _this->mode = UCNV_SI;
855 ucnv_close(myData2022->currentConverter);
856 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
857 if(U_SUCCESS(*err)) {
858 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
859 _this->mode = UCNV_SO;
860 }
861 break;
862 }
863#endif
864 case ISO_2022_JP:
865 {
46f4442e 866 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
374ca955
A
867 switch(tempState) {
868 case INVALID_STATE:
869 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
870 break;
871 case SS2_STATE:
872 if(myData2022->toU2022State.cs[2]!=0) {
873 if(myData2022->toU2022State.g<2) {
874 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
875 }
876 myData2022->toU2022State.g=2;
877 } else {
878 /* illegal to have SS2 before a matching designator */
879 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
880 }
881 break;
882 /* case SS3_STATE: not used in ISO-2022-JP-x */
883 case ISO8859_1:
884 case ISO8859_7:
885 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
886 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
887 } else {
888 /* G2 charset for SS2 */
889 myData2022->toU2022State.cs[2]=(int8_t)tempState;
890 }
891 break;
892 default:
893 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
894 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
895 } else {
896 /* G0 charset */
897 myData2022->toU2022State.cs[0]=(int8_t)tempState;
898 }
899 break;
900 }
901 }
902 break;
903 case ISO_2022_CN:
904 {
46f4442e 905 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
374ca955
A
906 switch(tempState) {
907 case INVALID_STATE:
908 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
909 break;
910 case SS2_STATE:
911 if(myData2022->toU2022State.cs[2]!=0) {
912 if(myData2022->toU2022State.g<2) {
913 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
914 }
915 myData2022->toU2022State.g=2;
916 } else {
917 /* illegal to have SS2 before a matching designator */
918 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
919 }
920 break;
921 case SS3_STATE:
922 if(myData2022->toU2022State.cs[3]!=0) {
923 if(myData2022->toU2022State.g<2) {
924 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
925 }
926 myData2022->toU2022State.g=3;
927 } else {
928 /* illegal to have SS3 before a matching designator */
929 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
930 }
931 break;
932 case ISO_IR_165:
933 if(myData2022->version==0) {
934 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
935 break;
936 }
73c04bcf 937 /*fall through*/
374ca955 938 case GB2312_1:
73c04bcf 939 /*fall through*/
374ca955
A
940 case CNS_11643_1:
941 myData2022->toU2022State.cs[1]=(int8_t)tempState;
942 break;
943 case CNS_11643_2:
944 myData2022->toU2022State.cs[2]=(int8_t)tempState;
945 break;
946 default:
947 /* other CNS 11643 planes */
948 if(myData2022->version==0) {
949 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
950 } else {
951 myData2022->toU2022State.cs[3]=(int8_t)tempState;
952 }
953 break;
954 }
955 }
956 break;
957 case ISO_2022_KR:
958 if(offset==0x30){
959 /* nothing to be done, just accept this one escape sequence */
960 } else {
961 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
962 }
963 break;
964
965 default:
966 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967 break;
968 }
969 }
970 if(U_SUCCESS(*err)) {
971 _this->toULength = 0;
fd0068a8
A
972 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
973 if(_this->toULength>1) {
974 /*
975 * Ticket 5691: consistent illegal sequences:
976 * - We include at least the first byte (ESC) in the illegal sequence.
977 * - If any of the non-initial bytes could be the start of a character,
978 * we stop the illegal sequence before the first one of those.
979 * In escape sequences, all following bytes are "printable", that is,
980 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
981 * they are valid single/lead bytes.
982 * For simplicity, we always only report the initial ESC byte as the
983 * illegal sequence and back out all other bytes we looked at.
984 */
985 /* Back out some bytes. */
986 int8_t backOutDistance=_this->toULength-1;
987 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
988 if(backOutDistance<=bytesFromThisBuffer) {
989 /* same as initialToULength<=1 */
990 *source-=backOutDistance;
991 } else {
992 /* Back out bytes from the previous buffer: Need to replay them. */
993 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
994 /* same as -(initialToULength-1) */
995 /* preToULength is negative! */
996 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
997 *source-=bytesFromThisBuffer;
998 }
999 _this->toULength=1;
1000 }
46f4442e
A
1001 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1002 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955
A
1003 }
1004}
1005
1006/*Checks the characters of the buffer against valid 2022 escape sequences
1007*if the match we return a pointer to the initial start of the sequence otherwise
1008*we return sourceLimit
1009*/
1010/*for 2022 looks ahead in the stream
1011 *to determine the longest possible convertible
1012 *data stream
1013 */
4388f060 1014static inline const char*
374ca955
A
1015getEndOfBuffer_2022(const char** source,
1016 const char* sourceLimit,
4388f060 1017 UBool /*flush*/){
374ca955
A
1018
1019 const char* mySource = *source;
1020
1021#ifdef U_ENABLE_GENERIC_ISO_2022
46f4442e 1022 if (*source >= sourceLimit)
374ca955
A
1023 return sourceLimit;
1024
1025 do{
1026
1027 if (*mySource == ESC_2022){
1028 int8_t i;
1029 int32_t key = 0;
1030 int32_t offset;
1031 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1032
1033 /* Kludge: I could not
1034 * figure out the reason for validating an escape sequence
1035 * twice - once here and once in changeState_2022().
1036 * is it possible to have an ESC character in a ISO2022
1037 * byte stream which is valid in a code page? Is it legal?
1038 */
46f4442e 1039 for (i=0;
374ca955
A
1040 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1041 i++) {
1042 value = getKey_2022(*(mySource+i), &key, &offset);
1043 }
46f4442e 1044 if (value > 0 || *mySource==ESC_2022)
374ca955
A
1045 return mySource;
1046
46f4442e 1047 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
374ca955
A
1048 return sourceLimit;
1049 }
1050 }while (++mySource < sourceLimit);
1051
1052 return sourceLimit;
1053#else
1054 while(mySource < sourceLimit && *mySource != ESC_2022) {
1055 ++mySource;
1056 }
1057 return mySource;
1058#endif
1059}
1060
1061
1062/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
46f4442e
A
1063 * any future change in _MBCSFromUChar32() function should be reflected here.
1064 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
374ca955 1065 */
4388f060 1066static inline int32_t
374ca955 1067MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
46f4442e
A
1068 UChar32 c,
1069 uint32_t* value,
1070 UBool useFallback,
374ca955
A
1071 int outputType)
1072{
1073 const int32_t *cx;
1074 const uint16_t *table;
1075 uint32_t stage2Entry;
1076 uint32_t myValue;
46f4442e 1077 int32_t length;
374ca955 1078 const uint8_t *p;
46f4442e
A
1079 /*
1080 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1081 * Use internal version of ucnv_open() that verifies that the new structures are available,
1082 * else U_INTERNAL_PROGRAM_ERROR.
1083 */
374ca955
A
1084 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1085 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1086 table=sharedData->mbcs.fromUnicodeTable;
1087 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1088 /* get the bytes and the length for the output */
1089 if(outputType==MBCS_OUTPUT_2){
1090 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1091 if(myValue<=0xff) {
46f4442e 1092 length=1;
374ca955 1093 } else {
46f4442e 1094 length=2;
374ca955
A
1095 }
1096 } else /* outputType==MBCS_OUTPUT_3 */ {
1097 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1098 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1099 if(myValue<=0xff) {
46f4442e 1100 length=1;
374ca955 1101 } else if(myValue<=0xffff) {
46f4442e 1102 length=2;
374ca955 1103 } else {
46f4442e 1104 length=3;
b75a7d8f
A
1105 }
1106 }
1107 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1108 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1109 /* assigned */
1110 *value=myValue;
1111 return length;
1112 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
b75a7d8f 1113 /*
374ca955 1114 * We allow a 0 byte output if the "assigned" bit is set for this entry.
b75a7d8f 1115 * There is no way with this data structure for fallback output
374ca955 1116 * to be a zero byte.
b75a7d8f 1117 */
b75a7d8f 1118 *value=myValue;
46f4442e 1119 return -length;
b75a7d8f 1120 }
b75a7d8f 1121 }
374ca955
A
1122
1123 cx=sharedData->mbcs.extIndexes;
1124 if(cx!=NULL) {
46f4442e 1125 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
374ca955
A
1126 }
1127
1128 /* unassigned */
46f4442e 1129 return 0;
b75a7d8f
A
1130}
1131
1132/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
46f4442e
A
1133 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1134 * @param retval pointer to output byte
1135 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
b75a7d8f 1136 */
4388f060 1137static inline int32_t
b75a7d8f 1138MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
46f4442e
A
1139 UChar32 c,
1140 uint32_t* retval,
b75a7d8f
A
1141 UBool useFallback)
1142{
46f4442e 1143 const uint16_t *table;
b75a7d8f
A
1144 int32_t value;
1145 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955 1146 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
46f4442e 1147 return 0;
b75a7d8f
A
1148 }
1149 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
374ca955 1150 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 1151 /* get the byte for the output */
374ca955 1152 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
b75a7d8f 1153 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1154 *retval=(uint32_t)(value&0xff);
1155 if(value>=0xf00) {
1156 return 1; /* roundtrip */
1157 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1158 return -1; /* fallback taken */
b75a7d8f 1159 } else {
46f4442e 1160 return 0; /* no mapping */
b75a7d8f 1161 }
b75a7d8f
A
1162}
1163
46f4442e
A
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Only the two low-order bytes of value are range-checked.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint8_t lead = (uint8_t)(value >> 8);
    const uint8_t trail = (uint8_t)value;

    if(lead < 0xa1 || lead > 0xfe || trail < 0xa1 || trail > 0xfe) {
        return 0;               /* not valid for ISO 2022 */
    }
    return value - 0x8080;      /* shift down to 21..7e byte range */
}
1180
1181#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1182/*
1183 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1184 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1185 * unchanged.
1186 */
4388f060 1187static inline uint32_t
46f4442e
A
1188_2022ToGR94DBCS(uint32_t value) {
1189 uint32_t returnValue = value + 0x8080;
1190 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1191 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
1192 return returnValue;
1193 } else {
1194 return value;
1195 }
1196}
1197#endif
1198
374ca955
A
1199#ifdef U_ENABLE_GENERIC_ISO_2022
1200
b75a7d8f
A
1201/**********************************************************************************
1202* ISO-2022 Converter
1203*
1204*
1205*/
1206
46f4442e 1207static void
b75a7d8f
A
1208T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1209 UErrorCode* err){
374ca955
A
1210 const char* mySourceLimit, *realSourceLimit;
1211 const char* sourceStart;
1212 const UChar* myTargetStart;
b75a7d8f 1213 UConverter* saveThis;
b75a7d8f 1214 UConverterDataISO2022* myData;
374ca955
A
1215 int8_t length;
1216
1217 saveThis = args->converter;
1218 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1219
1220 realSourceLimit = args->sourceLimit;
1221 while (args->source < realSourceLimit) {
1222 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1223 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1224 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1225
1226 if(args->source < mySourceLimit) {
1227 if(myData->currentConverter==NULL) {
1228 myData->currentConverter = ucnv_open("ASCII",err);
1229 if(U_FAILURE(*err)){
1230 return;
1231 }
b75a7d8f 1232
374ca955
A
1233 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1234 saveThis->mode = UCNV_SO;
b75a7d8f 1235 }
b75a7d8f 1236
374ca955
A
1237 /* convert to before the ESC or until the end of the buffer */
1238 myData->isFirstBuffer=FALSE;
1239 sourceStart = args->source;
1240 myTargetStart = args->target;
1241 args->converter = myData->currentConverter;
1242 ucnv_toUnicode(args->converter,
1243 &args->target,
1244 args->targetLimit,
1245 &args->source,
1246 mySourceLimit,
1247 args->offsets,
1248 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1249 err);
1250 args->converter = saveThis;
1251
1252 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1253 /* move the overflow buffer */
1254 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1255 myData->currentConverter->UCharErrorBufferLength = 0;
1256 if(length > 0) {
1257 uprv_memcpy(saveThis->UCharErrorBuffer,
1258 myData->currentConverter->UCharErrorBuffer,
1259 length*U_SIZEOF_UCHAR);
1260 }
1261 return;
1262 }
b75a7d8f 1263
374ca955
A
1264 /*
1265 * At least one of:
1266 * -Error while converting
1267 * -Done with entire buffer
1268 * -Need to write offsets or update the current offset
1269 * (leave that up to the code in ucnv.c)
1270 *
1271 * or else we just stopped at an ESC byte and continue with changeState_2022()
1272 */
1273 if (U_FAILURE(*err) ||
1274 (args->source == realSourceLimit) ||
1275 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1276 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1277 ) {
1278 /* copy partial or error input for truncated detection and error handling */
1279 if(U_FAILURE(*err)) {
1280 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1281 if(length > 0) {
1282 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1283 }
1284 } else {
1285 length = saveThis->toULength = myData->currentConverter->toULength;
1286 if(length > 0) {
1287 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1288 if(args->source < mySourceLimit) {
1289 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1290 }
1291 }
1292 }
1293 return;
b75a7d8f 1294 }
b75a7d8f
A
1295 }
1296 }
b75a7d8f
A
1297
1298 sourceStart = args->source;
1299 changeState_2022(args->converter,
46f4442e 1300 &(args->source),
374ca955 1301 realSourceLimit,
b75a7d8f 1302 ISO_2022,
b75a7d8f 1303 err);
374ca955
A
1304 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1305 /* let the ucnv.c code update its current offset */
1306 return;
b75a7d8f 1307 }
b75a7d8f 1308 }
b75a7d8f
A
1309}
1310
374ca955 1311#endif
b75a7d8f
A
1312
1313/*
1314 * To Unicode Callback helper function
1315 */
46f4442e 1316static void
374ca955
A
1317toUnicodeCallback(UConverter *cnv,
1318 const uint32_t sourceChar, const uint32_t targetUniChar,
1319 UErrorCode* err){
b75a7d8f 1320 if(sourceChar>0xff){
374ca955
A
1321 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1322 cnv->toUBytes[1] = (uint8_t)sourceChar;
1323 cnv->toULength = 2;
b75a7d8f
A
1324 }
1325 else{
374ca955 1326 cnv->toUBytes[0] =(char) sourceChar;
fd0068a8 1327 cnv->toULength = 1;
b75a7d8f
A
1328 }
1329
1330 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
b75a7d8f
A
1331 *err = U_INVALID_CHAR_FOUND;
1332 }
1333 else{
b75a7d8f
A
1334 *err = U_ILLEGAL_CHAR_FOUND;
1335 }
b75a7d8f
A
1336}
1337
1338/**************************************ISO-2022-JP*************************************************/
1339
1340/************************************** IMPORTANT **************************************************
1341* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1342* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
46f4442e
A
1343* The converter iterates over each Unicode codepoint
1344* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1345* processed one char at a time it would make sense to reduce the extra processing a canned converter
b75a7d8f
A
1346* would do as far as possible.
1347*
46f4442e
A
1348* If the implementation of these macros or structure of sharedData struct change in the future, make
1349* sure that ISO-2022 is also changed.
b75a7d8f
A
1350***************************************************************************************************
1351*/
1352
1353/***************************************************************************************************
1354* Rules for ISO-2022-jp encoding
46f4442e 1355* (i) Escape sequences must be fully contained within a line they should not
b75a7d8f
A
1356* span new lines or CRs
1357* (ii) If the last character on a line is represented by two bytes then an ASCII or
1358* JIS-Roman character escape sequence should follow before the line terminates
46f4442e
A
1359* (iii) If the first character on the line is represented by two bytes then a two
1360* byte character escape sequence should precede it
b75a7d8f
A
1361* (iv) If no escape sequence is encountered then the characters are ASCII
1362* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1363* and invoked with SS2 (ESC N).
1364* (vi) If there is any G0 designation in text, there must be a switch to
1365* ASCII or to JIS X 0201-Roman before a space character (but not
1366* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1367* characters such as tab or CRLF.
1368* (vii) Supported encodings:
1369* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1370*
1371* source : RFC-1554
1372*
1373* JISX201, JISX208,JISX212 : new .cnv data files created
1374* KSC5601 : alias to ibm-949 mapping table
1375* GB2312 : alias to ibm-1386 mapping table
1376* ISO-8859-1 : Algorithmic implemented as LATIN1 case
1377* ISO-8859-7 : alias to ibm-9409 mapping table
1378*/
b75a7d8f 1379
374ca955
A
1380/* preference order of JP charsets */
1381static const StateEnum jpCharsetPref[]={
1382 ASCII,
1383 JISX201,
1384 ISO8859_1,
1385 ISO8859_7,
1386 JISX208,
1387 JISX212,
1388 GB2312,
1389 KSC5601,
1390 HWKANA_7BIT
b75a7d8f
A
1391};
1392
73c04bcf
A
/*
 * Designation escape sequences, one per JP charset.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1b(B",   /* 1B 28 42     ASCII       */
    "\x1b.A",   /* 1B 2E 41     ISO-8859-1  */
    "\x1b.F",   /* 1B 2E 46     ISO-8859-7  */
    "\x1b(J",   /* 1B 28 4A     JISX-201    */
    "\x1b$B",   /* 1B 24 42     JISX-208    */
    "\x1b$(D",  /* 1B 24 28 44  JISX-212    */
    "\x1b$A",   /* 1B 24 41     GB2312      */
    "\x1b$(C",  /* 1B 24 28 43  KSC5601     */
    "\x1b(I"    /* 1B 28 49     HWKANA_7BIT */
};
/* Lengths in bytes of the escape sequences, in the same order as the sequences themselves. */
static const int8_t escSeqCharsLen[] ={
    /* <ESC>(B ASCII, <ESC>.A ISO-8859-1, <ESC>.F ISO-8859-7, <ESC>(J JISX-201 */
    3, 3, 3, 3,
    /* <ESC>$B JISX-208, <ESC>$(D JISX-212 */
    3, 4,
    /* <ESC>$A GB2312, <ESC>$(C KSC5601 */
    3, 4,
    /* <ESC>(I HWKANA_7BIT */
    3
};
1420
1421/*
1422* The iteration over various code pages works this way:
1423* i) Get the currentState from myConverterData->currentState
1424* ii) Check if the character is mapped to a valid character in the currentState
1425* Yes -> a) set the initIterState to currentState
1426* b) remain in this state until an invalid character is found
1427* No -> a) go to the next code page and find the character
46f4442e 1428* iii) Before changing the state increment the current state check if the current state
b75a7d8f
A
1429* is equal to the initIterState
1430* Yes -> A character that cannot be represented in any of the supported encodings
1431* break and return a U_INVALID_CHARACTER error
1432* No -> Continue and find the character in next code page
1433*
1434*
46f4442e 1435* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
b75a7d8f
A
1436*/
1437
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * 5C becomes U+00A5, 7E becomes U+203E, every other value maps to itself.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1451
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte values are used for
 * U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1466
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t row;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* lead byte: 81..9F and E0..EF are folded into one contiguous range, then doubled */
    row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000; /* else 0xe000 <= row <= 0xef00 */
    row <<= 1;

    if(trail > 0x9e) /* trail <= 0xfc */ {
        /* second half of the Shift-JIS row: even JIS row */
        return row | (trail - 0x7e);
    }

    /* first half of the Shift-JIS row: odd JIS row; trail bytes skip 0x7f */
    row -= 0x100;
    return row | (trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}
1502
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if((c1&1) == 0) {
        /* even row: second half of the Shift-JIS row */
        c2 = ((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) ? (uint8_t)(c2 + 0x7e) : 0 /* invalid */;
    } else {
        /* odd row: first half of the Shift-JIS row, trail bytes skip 0x7f */
        ++c1;
        if(c2 > 0x7e) {
            c2 = 0; /* invalid */
        } else if(c2 > 0x5f) {
            c2 += 0x20;
        } else {
            c2 += 0x1f;
        }
    }

    c1 >>= 1;
    if(c1 > 0x3f) {
        c1 = 0; /* invalid */
    } else if(c1 > 0x2f) {
        c1 += 0xb0;
    } else {
        c1 += 0x70;
    }

    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1539
1540/*
1541 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1542 * Katakana.
1543 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1544 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1545 * These were the only fallbacks in ICU's jisx-208.ucm file.
1546 */
1547static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1548 0x2123, /* U+FF61 */
1549 0x2156,
1550 0x2157,
1551 0x2122,
1552 0x2126,
1553 0x2572,
1554 0x2521,
1555 0x2523,
1556 0x2525,
1557 0x2527,
1558 0x2529,
1559 0x2563,
1560 0x2565,
1561 0x2567,
1562 0x2543,
1563 0x213C, /* U+FF70 */
1564 0x2522,
1565 0x2524,
1566 0x2526,
1567 0x2528,
1568 0x252A,
1569 0x252B,
1570 0x252D,
1571 0x252F,
1572 0x2531,
1573 0x2533,
1574 0x2535,
1575 0x2537,
1576 0x2539,
1577 0x253B,
1578 0x253D,
1579 0x253F, /* U+FF80 */
1580 0x2541,
1581 0x2544,
1582 0x2546,
1583 0x2548,
1584 0x254A,
1585 0x254B,
1586 0x254C,
1587 0x254D,
1588 0x254E,
1589 0x254F,
1590 0x2552,
1591 0x2555,
1592 0x2558,
1593 0x255B,
1594 0x255E,
1595 0x255F, /* U+FF90 */
1596 0x2560,
1597 0x2561,
1598 0x2562,
1599 0x2564,
1600 0x2566,
1601 0x2568,
1602 0x2569,
1603 0x256A,
1604 0x256B,
1605 0x256C,
1606 0x256D,
1607 0x256F,
1608 0x2573,
1609 0x212B,
1610 0x212C /* U+FF9F */
1611};
1612
1613static void
374ca955 1614UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
46f4442e 1615 UConverter *cnv = args->converter;
b75a7d8f 1616 UConverterDataISO2022 *converterData;
374ca955
A
1617 ISO2022State *pFromU2022State;
1618 uint8_t *target = (uint8_t *) args->target;
1619 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
1620 const UChar* source = args->source;
1621 const UChar* sourceLimit = args->sourceLimit;
1622 int32_t* offsets = args->offsets;
374ca955
A
1623 UChar32 sourceChar;
1624 char buffer[8];
1625 int32_t len, outLen;
1626 int8_t choices[10];
1627 int32_t choiceCount;
73c04bcf 1628 uint32_t targetValue = 0;
374ca955
A
1629 UBool useFallback;
1630
1631 int32_t i;
1632 int8_t cs, g;
1633
1634 /* set up the state */
46f4442e 1635 converterData = (UConverterDataISO2022*)cnv->extraInfo;
374ca955 1636 pFromU2022State = &converterData->fromU2022State;
374ca955
A
1637
1638 choiceCount = 0;
b75a7d8f 1639
b75a7d8f 1640 /* check if the last codepoint of previous buffer was a lead surrogate*/
46f4442e 1641 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
1642 goto getTrail;
1643 }
b75a7d8f 1644
374ca955
A
1645 while(source < sourceLimit) {
1646 if(target < targetLimit) {
b75a7d8f 1647
b75a7d8f 1648 sourceChar = *(source++);
374ca955 1649 /*check if the char is a First surrogate*/
4388f060
A
1650 if(U16_IS_SURROGATE(sourceChar)) {
1651 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
374ca955
A
1652getTrail:
1653 /*look ahead to find the trail surrogate*/
1654 if(source < sourceLimit) {
1655 /* test the following code unit */
1656 UChar trail=(UChar) *source;
4388f060 1657 if(U16_IS_TRAIL(trail)) {
374ca955 1658 source++;
4388f060 1659 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
46f4442e 1660 cnv->fromUChar32=0x00;
374ca955
A
1661 /* convert this supplementary code point */
1662 /* exit this condition tree */
1663 } else {
1664 /* this is an unmatched lead code unit (1st surrogate) */
1665 /* callback(illegal) */
1666 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1667 cnv->fromUChar32=sourceChar;
374ca955 1668 break;
b75a7d8f 1669 }
374ca955
A
1670 } else {
1671 /* no more input */
46f4442e 1672 cnv->fromUChar32=sourceChar;
b75a7d8f
A
1673 break;
1674 }
374ca955
A
1675 } else {
1676 /* this is an unmatched trail code unit (2nd surrogate) */
1677 /* callback(illegal) */
1678 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1679 cnv->fromUChar32=sourceChar;
374ca955
A
1680 break;
1681 }
b75a7d8f
A
1682 }
1683
73c04bcf
A
1684 /* do not convert SO/SI/ESC */
1685 if(IS_2022_CONTROL(sourceChar)) {
1686 /* callback(illegal) */
1687 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1688 cnv->fromUChar32=sourceChar;
73c04bcf
A
1689 break;
1690 }
1691
374ca955 1692 /* do the conversion */
b75a7d8f 1693
374ca955
A
1694 if(choiceCount == 0) {
1695 uint16_t csm;
b75a7d8f 1696
374ca955
A
1697 /*
1698 * The csm variable keeps track of which charsets are allowed
1699 * and not used yet while building the choices[].
1700 */
1701 csm = jpCharsetMasks[converterData->version];
1702 choiceCount = 0;
1703
1704 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1705 if(converterData->version == 3 || converterData->version == 4) {
46f4442e 1706 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
374ca955 1707 }
46f4442e
A
1708 /* Do not try single-byte half-width Katakana for other versions. */
1709 csm &= ~CSM(HWKANA_7BIT);
b75a7d8f 1710
374ca955
A
1711 /* try the current G0 charset */
1712 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1713 csm &= ~CSM(cs);
b75a7d8f 1714
374ca955
A
1715 /* try the current G2 charset */
1716 if((cs = pFromU2022State->cs[2]) != 0) {
1717 choices[choiceCount++] = cs;
1718 csm &= ~CSM(cs);
1719 }
1720
1721 /* try all the other possible charsets */
1722 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1723 cs = (int8_t)jpCharsetPref[i];
1724 if(CSM(cs) & csm) {
1725 choices[choiceCount++] = cs;
1726 csm &= ~CSM(cs);
b75a7d8f
A
1727 }
1728 }
374ca955 1729 }
b75a7d8f 1730
374ca955 1731 cs = g = 0;
46f4442e
A
1732 /*
1733 * len==0: no mapping found yet
1734 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1735 * len>0: found a roundtrip result, done
1736 */
374ca955 1737 len = 0;
46f4442e
A
1738 /*
1739 * We will turn off useFallback after finding a fallback,
1740 * but we still get fallbacks from PUA code points as usual.
1741 * Therefore, we will also need to check that we don't overwrite
1742 * an early fallback with a later one.
1743 */
1744 useFallback = cnv->useFallback;
374ca955 1745
46f4442e
A
1746 for(i = 0; i < choiceCount && len <= 0; ++i) {
1747 uint32_t value;
1748 int32_t len2;
1749 int8_t cs0 = choices[i];
1750 switch(cs0) {
374ca955
A
1751 case ASCII:
1752 if(sourceChar <= 0x7f) {
1753 targetValue = (uint32_t)sourceChar;
1754 len = 1;
46f4442e
A
1755 cs = cs0;
1756 g = 0;
b75a7d8f 1757 }
374ca955
A
1758 break;
1759 case ISO8859_1:
46f4442e 1760 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
374ca955
A
1761 targetValue = (uint32_t)sourceChar - 0x80;
1762 len = 1;
46f4442e 1763 cs = cs0;
374ca955
A
1764 g = 2;
1765 }
1766 break;
1767 case HWKANA_7BIT:
46f4442e 1768 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
374ca955
A
1769 if(converterData->version==3) {
1770 /* JIS7: use G1 (SO) */
46f4442e
A
1771 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1772 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1773 len = 1;
1774 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
374ca955
A
1775 g = 1;
1776 } else if(converterData->version==4) {
1777 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
46f4442e
A
1778 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1779 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1780 len = 1;
374ca955 1781
46f4442e
A
1782 cs = pFromU2022State->cs[0];
1783 if(IS_JP_DBCS(cs)) {
374ca955
A
1784 /* switch from a DBCS charset to JISX201 */
1785 cs = (int8_t)JISX201;
b75a7d8f 1786 }
46f4442e
A
1787 /* else stay in the current G0 charset */
1788 g = 0;
b75a7d8f 1789 }
46f4442e 1790 /* else do not use HWKANA_7BIT with other versions */
b75a7d8f 1791 }
374ca955
A
1792 break;
1793 case JISX201:
1794 /* G0 SBCS */
46f4442e
A
1795 value = jisx201FromU(sourceChar);
1796 if(value <= 0x7f) {
1797 targetValue = value;
374ca955 1798 len = 1;
46f4442e
A
1799 cs = cs0;
1800 g = 0;
1801 useFallback = FALSE;
1802 }
1803 break;
1804 case JISX208:
1805 /* G0 DBCS from Shift-JIS table */
1806 len2 = MBCS_FROM_UCHAR32_ISO2022(
1807 converterData->myConverterArray[cs0],
1808 sourceChar, &value,
1809 useFallback, MBCS_OUTPUT_2);
1810 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1811 value = _2022FromSJIS(value);
1812 if(value != 0) {
1813 targetValue = value;
1814 len = len2;
1815 cs = cs0;
1816 g = 0;
1817 useFallback = FALSE;
1818 }
1819 } else if(len == 0 && useFallback &&
1820 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1821 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1822 len = -2;
1823 cs = cs0;
1824 g = 0;
1825 useFallback = FALSE;
374ca955
A
1826 }
1827 break;
1828 case ISO8859_7:
1829 /* G0 SBCS forced to 7-bit output */
46f4442e
A
1830 len2 = MBCS_SINGLE_FROM_UCHAR32(
1831 converterData->myConverterArray[cs0],
1832 sourceChar, &value,
1833 useFallback);
1834 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1835 targetValue = value - 0x80;
1836 len = len2;
1837 cs = cs0;
374ca955 1838 g = 2;
46f4442e 1839 useFallback = FALSE;
374ca955
A
1840 }
1841 break;
1842 default:
1843 /* G0 DBCS */
46f4442e
A
1844 len2 = MBCS_FROM_UCHAR32_ISO2022(
1845 converterData->myConverterArray[cs0],
1846 sourceChar, &value,
1847 useFallback, MBCS_OUTPUT_2);
1848 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1849 if(cs0 == KSC5601) {
1850 /*
1851 * Check for valid bytes for the encoding scheme.
1852 * This is necessary because the sub-converter (windows-949)
1853 * has a broader encoding scheme than is valid for 2022.
1854 */
1855 value = _2022FromGR94DBCS(value);
1856 if(value == 0) {
1857 break;
1858 }
1859 }
1860 targetValue = value;
1861 len = len2;
1862 cs = cs0;
1863 g = 0;
1864 useFallback = FALSE;
374ca955
A
1865 }
1866 break;
b75a7d8f
A
1867 }
1868 }
b75a7d8f 1869
46f4442e
A
1870 if(len != 0) {
1871 if(len < 0) {
1872 len = -len; /* fallback */
1873 }
374ca955
A
1874 outLen = 0; /* count output bytes */
1875
1876 /* write SI if necessary (only for JIS7) */
1877 if(pFromU2022State->g == 1 && g == 0) {
1878 buffer[outLen++] = UCNV_SI;
1879 pFromU2022State->g = 0;
1880 }
1881
1882 /* write the designation sequence if necessary */
1883 if(cs != pFromU2022State->cs[g]) {
1884 int32_t escLen = escSeqCharsLen[cs];
1885 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1886 outLen += escLen;
1887 pFromU2022State->cs[g] = cs;
1888
1889 /* invalidate the choices[] */
1890 choiceCount = 0;
1891 }
1892
1893 /* write the shift sequence if necessary */
1894 if(g != pFromU2022State->g) {
1895 switch(g) {
1896 /* case 0 handled before writing escapes */
1897 case 1:
1898 buffer[outLen++] = UCNV_SO;
1899 pFromU2022State->g = 1;
1900 break;
1901 default: /* case 2 */
1902 buffer[outLen++] = 0x1b;
1903 buffer[outLen++] = 0x4e;
1904 break;
1905 /* no case 3: no SS3 in ISO-2022-JP-x */
1906 }
1907 }
1908
1909 /* write the output bytes */
1910 if(len == 1) {
1911 buffer[outLen++] = (char)targetValue;
1912 } else /* len == 2 */ {
1913 buffer[outLen++] = (char)(targetValue >> 8);
1914 buffer[outLen++] = (char)targetValue;
1915 }
1916 } else {
1917 /*
46f4442e 1918 * if we cannot find the character after checking all codepages
b75a7d8f
A
1919 * then this is an error
1920 */
b75a7d8f 1921 *err = U_INVALID_CHAR_FOUND;
46f4442e 1922 cnv->fromUChar32=sourceChar;
374ca955
A
1923 break;
1924 }
1925
1926 if(sourceChar == CR || sourceChar == LF) {
1927 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1928 pFromU2022State->cs[2] = 0;
1929 choiceCount = 0;
1930 }
1931
1932 /* output outLen>0 bytes in buffer[] */
1933 if(outLen == 1) {
1934 *target++ = buffer[0];
1935 if(offsets) {
73c04bcf 1936 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
b75a7d8f 1937 }
374ca955
A
1938 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1939 *target++ = buffer[0];
1940 *target++ = buffer[1];
1941 if(offsets) {
1942 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1943 *offsets++ = sourceIndex;
1944 *offsets++ = sourceIndex;
1945 }
1946 } else {
73c04bcf 1947 fromUWriteUInt8(
46f4442e 1948 cnv,
374ca955 1949 buffer, outLen,
73c04bcf 1950 &target, (const char *)targetLimit,
374ca955
A
1951 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1952 err);
1953 if(U_FAILURE(*err)) {
b75a7d8f
A
1954 break;
1955 }
1956 }
1957 } /* end if(myTargetIndex<myTargetLength) */
1958 else{
1959 *err =U_BUFFER_OVERFLOW_ERROR;
1960 break;
1961 }
1962
1963 }/* end while(mySourceIndex<mySourceLength) */
1964
374ca955
A
1965 /*
1966 * the end of the input stream and detection of truncated input
1967 * are handled by the framework, but for ISO-2022-JP conversion
1968 * we need to be in ASCII mode at the very end
1969 *
1970 * conditions:
1971 * successful
1972 * in SO mode or not in ASCII mode
1973 * end of input and no truncated input
b75a7d8f 1974 */
374ca955
A
1975 if( U_SUCCESS(*err) &&
1976 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
46f4442e 1977 args->flush && source>=sourceLimit && cnv->fromUChar32==0
374ca955
A
1978 ) {
1979 int32_t sourceIndex;
1980
1981 outLen = 0;
1982
1983 if(pFromU2022State->g != 0) {
1984 buffer[outLen++] = UCNV_SI;
1985 pFromU2022State->g = 0;
1986 }
1987
1988 if(pFromU2022State->cs[0] != ASCII) {
1989 int32_t escLen = escSeqCharsLen[ASCII];
1990 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1991 outLen += escLen;
1992 pFromU2022State->cs[0] = (int8_t)ASCII;
1993 }
1994
1995 /* get the source index of the last input character */
1996 /*
1997 * TODO this would be simpler and more reliable if we used a pair
1998 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1999 * so that we could simply use the prevSourceIndex here;
2000 * this code gives an incorrect result for the rare case of an unmatched
2001 * trail surrogate that is alone in the last buffer of the text stream
2002 */
2003 sourceIndex=(int32_t)(source-args->source);
2004 if(sourceIndex>0) {
2005 --sourceIndex;
2006 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2007 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2008 ) {
2009 --sourceIndex;
2010 }
2011 } else {
2012 sourceIndex=-1;
2013 }
2014
73c04bcf 2015 fromUWriteUInt8(
46f4442e 2016 cnv,
374ca955 2017 buffer, outLen,
73c04bcf 2018 &target, (const char *)targetLimit,
374ca955
A
2019 &offsets, sourceIndex,
2020 err);
b75a7d8f
A
2021 }
2022
2023 /*save the state and return */
2024 args->source = source;
2025 args->target = (char*)target;
2026}
2027
2028/*************** to unicode *******************/
2029
46f4442e 2030static void
b75a7d8f 2031UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
374ca955 2032 UErrorCode* err){
46f4442e 2033 char tempBuf[2];
374ca955 2034 const char *mySource = (char *) args->source;
b75a7d8f
A
2035 UChar *myTarget = args->target;
2036 const char *mySourceLimit = args->sourceLimit;
2037 uint32_t targetUniChar = 0x0000;
2038 uint32_t mySourceChar = 0x0000;
46f4442e 2039 uint32_t tmpSourceChar = 0x0000;
b75a7d8f 2040 UConverterDataISO2022* myData;
374ca955
A
2041 ISO2022State *pToU2022State;
2042 StateEnum cs;
b75a7d8f 2043
b75a7d8f 2044 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
374ca955 2045 pToU2022State = &myData->toU2022State;
b75a7d8f 2046
374ca955
A
2047 if(myData->key != 0) {
2048 /* continue with a partial escape sequence */
2049 goto escape;
2050 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2051 /* continue with a partial double-byte character */
2052 mySourceChar = args->converter->toUBytes[0];
2053 args->converter->toULength = 0;
2054 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
fd0068a8 2055 targetUniChar = missingCharMarker;
374ca955
A
2056 goto getTrailByte;
2057 }
2058
2059 while(mySource < mySourceLimit){
2060
2061 targetUniChar =missingCharMarker;
b75a7d8f
A
2062
2063 if(myTarget < args->targetLimit){
2064
2065 mySourceChar= (unsigned char) *mySource++;
374ca955
A
2066
2067 switch(mySourceChar) {
2068 case UCNV_SI:
2069 if(myData->version==3) {
2070 pToU2022State->g=0;
b75a7d8f 2071 continue;
374ca955
A
2072 } else {
2073 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2074 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2075 break;
b75a7d8f 2076 }
b75a7d8f 2077
374ca955
A
2078 case UCNV_SO:
2079 if(myData->version==3) {
2080 /* JIS7: switch to G1 half-width Katakana */
2081 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2082 pToU2022State->g=1;
b75a7d8f 2083 continue;
374ca955
A
2084 } else {
2085 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2086 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2087 break;
b75a7d8f 2088 }
b75a7d8f 2089
374ca955
A
2090 case ESC_2022:
2091 mySource--;
2092escape:
d5d484b0
A
2093 {
2094 const char * mySourceBefore = mySource;
2095 int8_t toULengthBefore = args->converter->toULength;
2096
46f4442e 2097 changeState_2022(args->converter,&(mySource),
d5d484b0
A
2098 mySourceLimit, ISO_2022_JP,err);
2099
2100 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
46f4442e
A
2101 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2102 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2103 args->converter->toUCallbackReason = UCNV_IRREGULAR;
729e4ab9 2104 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
d5d484b0 2105 }
d5d484b0 2106 }
46f4442e 2107
374ca955
A
2108 /* invalid or illegal escape sequence */
2109 if(U_FAILURE(*err)){
2110 args->target = myTarget;
2111 args->source = mySource;
d5d484b0 2112 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
374ca955 2113 return;
b75a7d8f 2114 }
d5d484b0 2115 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
46f4442e 2116 if(myData->key==0) {
d5d484b0
A
2117 myData->isEmptySegment = TRUE;
2118 }
374ca955 2119 continue;
b75a7d8f 2120
374ca955 2121 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
b75a7d8f 2122
374ca955
A
2123 case CR:
2124 /*falls through*/
2125 case LF:
2126 /* automatically reset to single-byte mode */
2127 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2128 pToU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f 2129 }
374ca955
A
2130 pToU2022State->cs[2] = 0;
2131 pToU2022State->g = 0;
2132 /* falls through */
b75a7d8f 2133 default:
374ca955 2134 /* convert one or two bytes */
d5d484b0 2135 myData->isEmptySegment = FALSE;
374ca955
A
2136 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2137 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2138 !IS_JP_DBCS(cs)
2139 ) {
2140 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
46f4442e 2141 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
374ca955
A
2142
2143 /* return from a single-shift state to the previous one */
2144 if(pToU2022State->g >= 2) {
2145 pToU2022State->g=pToU2022State->prevG;
2146 }
2147 } else switch(cs) {
2148 case ASCII:
2149 if(mySourceChar <= 0x7f) {
2150 targetUniChar = mySourceChar;
2151 }
2152 break;
2153 case ISO8859_1:
2154 if(mySourceChar <= 0x7f) {
2155 targetUniChar = mySourceChar + 0x80;
2156 }
2157 /* return from a single-shift state to the previous one */
2158 pToU2022State->g=pToU2022State->prevG;
2159 break;
2160 case ISO8859_7:
2161 if(mySourceChar <= 0x7f) {
2162 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2163 targetUniChar =
2164 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2165 myData->myConverterArray[cs],
2166 mySourceChar + 0x80);
2167 }
2168 /* return from a single-shift state to the previous one */
2169 pToU2022State->g=pToU2022State->prevG;
2170 break;
2171 case JISX201:
2172 if(mySourceChar <= 0x7f) {
46f4442e 2173 targetUniChar = jisx201ToU(mySourceChar);
374ca955
A
2174 }
2175 break;
2176 case HWKANA_7BIT:
2177 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2178 /* 7-bit halfwidth Katakana */
46f4442e 2179 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
374ca955
A
2180 }
2181 break;
2182 default:
2183 /* G0 DBCS */
2184 if(mySource < mySourceLimit) {
fd0068a8
A
2185 int leadIsOk, trailIsOk;
2186 uint8_t trailByte;
374ca955 2187getTrailByte:
fd0068a8 2188 trailByte = (uint8_t)*mySource;
fd0068a8
A
2189 /*
2190 * Ticket 5691: consistent illegal sequences:
2191 * - We include at least the first byte in the illegal sequence.
2192 * - If any of the non-initial bytes could be the start of a character,
46f4442e 2193 * we stop the illegal sequence before the first one of those.
fd0068a8
A
2194 *
2195 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2196 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2197 * Otherwise we convert or report the pair of bytes.
2198 */
2199 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2200 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2201 if (leadIsOk && trailIsOk) {
2202 ++mySource;
46f4442e
A
2203 tmpSourceChar = (mySourceChar << 8) | trailByte;
2204 if(cs == JISX208) {
2205 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2206 mySourceChar = tmpSourceChar;
2207 } else {
2208 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2209 mySourceChar = tmpSourceChar;
2210 if (cs == KSC5601) {
2211 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2212 }
2213 tempBuf[0] = (char)(tmpSourceChar >> 8);
2214 tempBuf[1] = (char)(tmpSourceChar);
2215 }
fd0068a8
A
2216 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2217 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2218 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2219 ++mySource;
2220 /* add another bit so that the code below writes 2 bytes in case of error */
2221 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2222 }
374ca955
A
2223 } else {
2224 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2225 args->converter->toULength = 1;
2226 goto endloop;
2227 }
46f4442e 2228 } /* End of inner switch */
b75a7d8f 2229 break;
46f4442e 2230 } /* End of outer switch */
b75a7d8f
A
2231 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2232 if(args->offsets){
73c04bcf 2233 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
2234 }
2235 *(myTarget++)=(UChar)targetUniChar;
b75a7d8f 2236 }
374ca955
A
2237 else if(targetUniChar > missingCharMarker){
2238 /* disassemble the surrogate pair and write to output*/
2239 targetUniChar-=0x0010000;
2240 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2241 if(args->offsets){
73c04bcf 2242 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2243 }
2244 ++myTarget;
46f4442e 2245 if(myTarget< args->targetLimit){
374ca955
A
2246 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2247 if(args->offsets){
73c04bcf 2248 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2249 }
2250 ++myTarget;
2251 }else{
2252 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2253 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2254 }
b75a7d8f 2255
374ca955
A
2256 }
2257 else{
b75a7d8f 2258 /* Call the callback function*/
374ca955
A
2259 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2260 break;
b75a7d8f
A
2261 }
2262 }
46f4442e 2263 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
b75a7d8f
A
2264 *err =U_BUFFER_OVERFLOW_ERROR;
2265 break;
2266 }
2267 }
374ca955 2268endloop:
b75a7d8f
A
2269 args->target = myTarget;
2270 args->source = mySource;
2271}
2272
2273
b75a7d8f
A
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
*/
46f4442e 2282static void
b75a7d8f
A
2283UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2284
374ca955
A
2285 UConverter* saveConv = args->converter;
2286 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2287 args->converter=myConverterData->currentConverter;
2288
2289 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2290 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2291 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2292
2293 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2294 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2295 uprv_memcpy(
2296 saveConv->charErrorBuffer,
2297 myConverterData->currentConverter->charErrorBuffer,
2298 myConverterData->currentConverter->charErrorBufferLength);
2299 }
2300 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2301 myConverterData->currentConverter->charErrorBufferLength = 0;
2302 }
2303 args->converter=saveConv;
b75a7d8f
A
2304}
2305
46f4442e 2306static void
b75a7d8f
A
2307UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2308
2309 const UChar *source = args->source;
2310 const UChar *sourceLimit = args->sourceLimit;
2311 unsigned char *target = (unsigned char *) args->target;
2312 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2313 int32_t* offsets = args->offsets;
2314 uint32_t targetByteUnit = 0x0000;
2315 UChar32 sourceChar = 0x0000;
2316 UBool isTargetByteDBCS;
2317 UBool oldIsTargetByteDBCS;
2318 UConverterDataISO2022 *converterData;
b75a7d8f
A
2319 UConverterSharedData* sharedData;
2320 UBool useFallback;
2321 int32_t length =0;
2322
b75a7d8f 2323 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
46f4442e
A
2324 /* if the version is 1 then the user is requesting
2325 * conversion with ibm-25546 pass the arguments to
b75a7d8f
A
2326 * MBCS converter and return
2327 */
2328 if(converterData->version==1){
2329 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2330 return;
2331 }
374ca955
A
2332
2333 /* initialize data */
2334 sharedData = converterData->currentConverter->sharedData;
2335 useFallback = args->converter->useFallback;
2336 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2337 oldIsTargetByteDBCS = isTargetByteDBCS;
46f4442e 2338
b75a7d8f 2339 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
374ca955 2340 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
b75a7d8f
A
2341 goto getTrail;
2342 }
2343 while(source < sourceLimit){
46f4442e 2344
b75a7d8f
A
2345 targetByteUnit = missingCharMarker;
2346
2347 if(target < (unsigned char*) args->targetLimit){
2348 sourceChar = *source++;
73c04bcf
A
2349
2350 /* do not convert SO/SI/ESC */
2351 if(IS_2022_CONTROL(sourceChar)) {
2352 /* callback(illegal) */
2353 *err=U_ILLEGAL_CHAR_FOUND;
2354 args->converter->fromUChar32=sourceChar;
2355 break;
2356 }
2357
46f4442e
A
2358 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2359 if(length < 0) {
2360 length = -length; /* fallback */
2361 }
b75a7d8f 2362 /* only DBCS or SBCS characters are expected*/
374ca955 2363 /* DB characters with high bit set to 1 are expected */
fd0068a8
A
2364 if( length > 2 || length==0 ||
2365 (length == 1 && targetByteUnit > 0x7f) ||
2366 (length == 2 &&
2367 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2368 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2369 ) {
b75a7d8f
A
2370 targetByteUnit=missingCharMarker;
2371 }
2372 if (targetByteUnit != missingCharMarker){
2373
2374 oldIsTargetByteDBCS = isTargetByteDBCS;
2375 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2376 /* append the shift sequence */
2377 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
46f4442e
A
2378
2379 if (isTargetByteDBCS)
b75a7d8f 2380 *target++ = UCNV_SO;
46f4442e 2381 else
b75a7d8f
A
2382 *target++ = UCNV_SI;
2383 if(offsets)
73c04bcf 2384 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2385 }
2386 /* write the targetUniChar to target */
2387 if(targetByteUnit <= 0x00FF){
2388 if( target < targetLimit){
2389 *(target++) = (unsigned char) targetByteUnit;
2390 if(offsets){
73c04bcf 2391 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2392 }
2393
2394 }else{
2395 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2396 *err = U_BUFFER_OVERFLOW_ERROR;
2397 }
2398 }else{
2399 if(target < targetLimit){
2400 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2401 if(offsets){
73c04bcf 2402 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2403 }
2404 if(target < targetLimit){
2405 *(target++) =(unsigned char) (targetByteUnit -0x80);
2406 if(offsets){
73c04bcf 2407 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2408 }
2409 }else{
2410 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2411 *err = U_BUFFER_OVERFLOW_ERROR;
2412 }
2413 }else{
2414 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2415 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2416 *err = U_BUFFER_OVERFLOW_ERROR;
2417 }
2418 }
2419
2420 }
2421 else{
2422 /* oops.. the code point is unassingned
2423 * set the error and reason
2424 */
b75a7d8f
A
2425
2426 /*check if the char is a First surrogate*/
4388f060
A
2427 if(U16_IS_SURROGATE(sourceChar)) {
2428 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
b75a7d8f
A
2429getTrail:
2430 /*look ahead to find the trail surrogate*/
2431 if(source < sourceLimit) {
2432 /* test the following code unit */
2433 UChar trail=(UChar) *source;
4388f060 2434 if(U16_IS_TRAIL(trail)) {
b75a7d8f 2435 source++;
4388f060 2436 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
b75a7d8f 2437 *err = U_INVALID_CHAR_FOUND;
b75a7d8f
A
2438 /* convert this surrogate code point */
2439 /* exit this condition tree */
2440 } else {
2441 /* this is an unmatched lead code unit (1st surrogate) */
2442 /* callback(illegal) */
b75a7d8f
A
2443 *err=U_ILLEGAL_CHAR_FOUND;
2444 }
2445 } else {
2446 /* no more input */
2447 *err = U_ZERO_ERROR;
b75a7d8f
A
2448 }
2449 } else {
2450 /* this is an unmatched trail code unit (2nd surrogate) */
2451 /* callback(illegal) */
b75a7d8f
A
2452 *err=U_ILLEGAL_CHAR_FOUND;
2453 }
374ca955
A
2454 } else {
2455 /* callback(unassigned) for a BMP code point */
2456 *err = U_INVALID_CHAR_FOUND;
b75a7d8f 2457 }
b75a7d8f 2458
374ca955 2459 args->converter->fromUChar32=sourceChar;
374ca955 2460 break;
b75a7d8f
A
2461 }
2462 } /* end if(myTargetIndex<myTargetLength) */
2463 else{
2464 *err =U_BUFFER_OVERFLOW_ERROR;
2465 break;
2466 }
2467
2468 }/* end while(mySourceIndex<mySourceLength) */
2469
374ca955
A
2470 /*
2471 * the end of the input stream and detection of truncated input
2472 * are handled by the framework, but for ISO-2022-KR conversion
2473 * we need to be in ASCII mode at the very end
2474 *
2475 * conditions:
2476 * successful
2477 * not in ASCII mode
2478 * end of input and no truncated input
b75a7d8f 2479 */
374ca955
A
2480 if( U_SUCCESS(*err) &&
2481 isTargetByteDBCS &&
2482 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2483 ) {
2484 int32_t sourceIndex;
2485
2486 /* we are switching to ASCII */
2487 isTargetByteDBCS=FALSE;
2488
2489 /* get the source index of the last input character */
2490 /*
2491 * TODO this would be simpler and more reliable if we used a pair
2492 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2493 * so that we could simply use the prevSourceIndex here;
2494 * this code gives an incorrect result for the rare case of an unmatched
2495 * trail surrogate that is alone in the last buffer of the text stream
2496 */
2497 sourceIndex=(int32_t)(source-args->source);
2498 if(sourceIndex>0) {
2499 --sourceIndex;
2500 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2501 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2502 ) {
2503 --sourceIndex;
2504 }
2505 } else {
2506 sourceIndex=-1;
2507 }
2508
73c04bcf 2509 fromUWriteUInt8(
374ca955
A
2510 args->converter,
2511 SHIFT_IN_STR, 1,
73c04bcf 2512 &target, (const char *)targetLimit,
374ca955
A
2513 &offsets, sourceIndex,
2514 err);
b75a7d8f
A
2515 }
2516
2517 /*save the state and return */
2518 args->source = source;
2519 args->target = (char*)target;
2520 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2521}
2522
2523/************************ To Unicode ***************************************/
2524
46f4442e 2525static void
b75a7d8f
A
2526UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2527 UErrorCode* err){
b75a7d8f 2528 char const* sourceStart;
b75a7d8f 2529 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
b75a7d8f 2530
374ca955
A
2531 UConverterToUnicodeArgs subArgs;
2532 int32_t minArgsSize;
2533
2534 /* set up the subconverter arguments */
2535 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2536 minArgsSize = args->size;
2537 } else {
2538 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2539 }
2540
2541 uprv_memcpy(&subArgs, args, minArgsSize);
2542 subArgs.size = (uint16_t)minArgsSize;
2543 subArgs.converter = myData->currentConverter;
2544
2545 /* remember the original start of the input for offsets */
2546 sourceStart = args->source;
2547
2548 if(myData->key != 0) {
2549 /* continue with a partial escape sequence */
2550 goto escape;
2551 }
2552
2553 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
b75a7d8f 2554 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
374ca955
A
2555 subArgs.source = args->source;
2556 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2557 if(subArgs.source != subArgs.sourceLimit) {
2558 /*
2559 * get the current partial byte sequence
2560 *
2561 * it needs to be moved between the public and the subconverter
2562 * so that the conversion framework, which only sees the public
2563 * converter, can handle truncated and illegal input etc.
2564 */
2565 if(args->converter->toULength > 0) {
2566 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2567 }
2568 subArgs.converter->toULength = args->converter->toULength;
2569
2570 /*
2571 * Convert up to the end of the input, or to before the next escape character.
2572 * Does not handle conversion extensions because the preToU[] state etc.
2573 * is not copied.
2574 */
2575 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2576
2577 if(args->offsets != NULL && sourceStart != args->source) {
2578 /* update offsets to base them on the actual start of the input */
2579 int32_t *offsets = args->offsets;
2580 UChar *target = args->target;
2581 int32_t delta = (int32_t)(args->source - sourceStart);
2582 while(target < subArgs.target) {
2583 if(*offsets >= 0) {
2584 *offsets += delta;
2585 }
2586 ++offsets;
2587 ++target;
2588 }
2589 }
2590 args->source = subArgs.source;
2591 args->target = subArgs.target;
2592 args->offsets = subArgs.offsets;
2593
2594 /* copy input/error/overflow buffers */
2595 if(subArgs.converter->toULength > 0) {
2596 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2597 }
2598 args->converter->toULength = subArgs.converter->toULength;
2599
2600 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2601 if(subArgs.converter->UCharErrorBufferLength > 0) {
2602 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2603 subArgs.converter->UCharErrorBufferLength);
2604 }
2605 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2606 subArgs.converter->UCharErrorBufferLength = 0;
b75a7d8f 2607 }
b75a7d8f
A
2608 }
2609
374ca955 2610 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
b75a7d8f 2611 return;
374ca955 2612 }
b75a7d8f 2613
374ca955 2614escape:
b75a7d8f 2615 changeState_2022(args->converter,
46f4442e 2616 &(args->source),
b75a7d8f 2617 args->sourceLimit,
b75a7d8f 2618 ISO_2022_KR,
b75a7d8f 2619 err);
374ca955 2620 }
b75a7d8f
A
2621}
2622
46f4442e 2623static void
b75a7d8f
A
2624UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2625 UErrorCode* err){
374ca955 2626 char tempBuf[2];
b75a7d8f
A
2627 const char *mySource = ( char *) args->source;
2628 UChar *myTarget = args->target;
2629 const char *mySourceLimit = args->sourceLimit;
2630 UChar32 targetUniChar = 0x0000;
2631 UChar mySourceChar = 0x0000;
2632 UConverterDataISO2022* myData;
b75a7d8f
A
2633 UConverterSharedData* sharedData ;
2634 UBool useFallback;
2635
374ca955
A
2636 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2637 if(myData->version==1){
2638 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
b75a7d8f
A
2639 return;
2640 }
374ca955 2641
b75a7d8f 2642 /* initialize state */
374ca955 2643 sharedData = myData->currentConverter->sharedData;
b75a7d8f 2644 useFallback = args->converter->useFallback;
46f4442e 2645
374ca955
A
2646 if(myData->key != 0) {
2647 /* continue with a partial escape sequence */
2648 goto escape;
2649 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2650 /* continue with a partial double-byte character */
2651 mySourceChar = args->converter->toUBytes[0];
2652 args->converter->toULength = 0;
2653 goto getTrailByte;
b75a7d8f 2654 }
b75a7d8f 2655
374ca955 2656 while(mySource< mySourceLimit){
b75a7d8f
A
2657
2658 if(myTarget < args->targetLimit){
2659
2660 mySourceChar= (unsigned char) *mySource++;
2661
2662 if(mySourceChar==UCNV_SI){
374ca955 2663 myData->toU2022State.g = 0;
d5d484b0
A
2664 if (myData->isEmptySegment) {
2665 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
46f4442e
A
2666 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2667 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2668 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
d5d484b0
A
2669 args->converter->toULength = 1;
2670 args->target = myTarget;
2671 args->source = mySource;
2672 return;
2673 }
b75a7d8f
A
2674 /*consume the source */
2675 continue;
2676 }else if(mySourceChar==UCNV_SO){
374ca955 2677 myData->toU2022State.g = 1;
d5d484b0 2678 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
b75a7d8f
A
2679 /*consume the source */
2680 continue;
374ca955
A
2681 }else if(mySourceChar==ESC_2022){
2682 mySource--;
2683escape:
d5d484b0 2684 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
46f4442e 2685 changeState_2022(args->converter,&(mySource),
374ca955 2686 mySourceLimit, ISO_2022_KR, err);
b75a7d8f
A
2687 if(U_FAILURE(*err)){
2688 args->target = myTarget;
2689 args->source = mySource;
2690 return;
2691 }
2692 continue;
46f4442e 2693 }
b75a7d8f 2694
d5d484b0 2695 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
374ca955
A
2696 if(myData->toU2022State.g == 1) {
2697 if(mySource < mySourceLimit) {
fd0068a8
A
2698 int leadIsOk, trailIsOk;
2699 uint8_t trailByte;
374ca955 2700getTrailByte:
fd0068a8
A
2701 targetUniChar = missingCharMarker;
2702 trailByte = (uint8_t)*mySource;
2703 /*
2704 * Ticket 5691: consistent illegal sequences:
2705 * - We include at least the first byte in the illegal sequence.
2706 * - If any of the non-initial bytes could be the start of a character,
2707 * we stop the illegal sequence before the first one of those.
2708 *
2709 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2710 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2711 * Otherwise we convert or report the pair of bytes.
2712 */
2713 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2714 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2715 if (leadIsOk && trailIsOk) {
2716 ++mySource;
2717 tempBuf[0] = (char)(mySourceChar + 0x80);
2718 tempBuf[1] = (char)(trailByte + 0x80);
2719 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2720 mySourceChar = (mySourceChar << 8) | trailByte;
2721 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2722 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2723 ++mySource;
2724 /* add another bit so that the code below writes 2 bytes in case of error */
2725 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
374ca955
A
2726 }
2727 } else {
2728 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2729 args->converter->toULength = 1;
2730 break;
b75a7d8f
A
2731 }
2732 }
fd0068a8 2733 else if(mySourceChar <= 0x7f) {
374ca955 2734 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
fd0068a8
A
2735 } else {
2736 targetUniChar = 0xffff;
b75a7d8f 2737 }
374ca955
A
2738 if(targetUniChar < 0xfffe){
2739 if(args->offsets) {
73c04bcf 2740 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955 2741 }
b75a7d8f
A
2742 *(myTarget++)=(UChar)targetUniChar;
2743 }
2744 else {
b75a7d8f 2745 /* Call the callback function*/
374ca955
A
2746 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2747 break;
b75a7d8f
A
2748 }
2749 }
2750 else{
2751 *err =U_BUFFER_OVERFLOW_ERROR;
2752 break;
2753 }
2754 }
b75a7d8f
A
2755 args->target = myTarget;
2756 args->source = mySource;
2757}
2758
2759/*************************** END ISO2022-KR *********************************/
2760
2761/*************************** ISO-2022-CN *********************************
2762*
2763* Rules for ISO-2022-CN Encoding:
374ca955 2764* i) The designator sequence must appear once on a line before any instance
b75a7d8f
A
2765* of character set it designates.
2766* ii) If two lines contain characters from the same character set, both lines
2767* must include the designator sequence.
374ca955 2768* iii) Once the designator sequence is known, a shifting sequence has to be found
b75a7d8f
A
2769* to invoke the shifting
2770* iv) All lines start in ASCII and end in ASCII.
2771* v) Four shifting sequences are employed for this purpose:
2772*
2773* Sequence ASCII Eq Charsets
2774* ---------- ------- ---------
374ca955
A
2775* SI <SI> US-ASCII
2776* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2777* SS2 <ESC>N CNS-11643-1992 Plane 2
2778* SS3 <ESC>O CNS-11643-1992 Planes 3-7
b75a7d8f
A
2779*
2780* vi)
2781* SOdesignator : ESC "$" ")" finalchar_for_SO
2782* SS2designator : ESC "$" "*" finalchar_for_SS2
2783* SS3designator : ESC "$" "+" finalchar_for_SS3
2784*
2785* ESC $ ) A Indicates the bytes following SO are Chinese
2786* characters as defined in GB 2312-80, until
2787* another SOdesignation appears
2788*
2789*
2790* ESC $ ) E Indicates the bytes following SO are as defined
2791* in ISO-IR-165 (for details, see section 2.1),
2792* until another SOdesignation appears
2793*
2794* ESC $ ) G Indicates the bytes following SO are as defined
2795* in CNS 11643-plane-1, until another
2796* SOdesignation appears
2797*
2798* ESC $ * H Indicates the two bytes immediately following
2799* SS2 is a Chinese character as defined in CNS
2800* 11643-plane-2, until another SS2designation
2801* appears
46f4442e 2802* (Meaning <ESC>N must precede every 2 byte
b75a7d8f
A
2803* sequence.)
2804*
2805* ESC $ + I Indicates the immediate two bytes following SS3
2806* is a Chinese character as defined in CNS
2807* 11643-plane-3, until another SS3designation
2808* appears
46f4442e 2809* (Meaning <ESC>O must precede every 2 byte
b75a7d8f
A
2810* sequence.)
2811*
2812* ESC $ + J Indicates the immediate two bytes following SS3
2813* is a Chinese character as defined in CNS
2814* 11643-plane-4, until another SS3designation
2815* appears
46f4442e 2816* (In English: <ESC>O must precede every 2 byte
b75a7d8f
A
2817* sequence.)
2818*
2819* ESC $ + K Indicates the immediate two bytes following SS3
2820* is a Chinese character as defined in CNS
2821* 11643-plane-5, until another SS3designation
2822* appears
2823*
2824* ESC $ + L Indicates the immediate two bytes following SS3
2825* is a Chinese character as defined in CNS
2826* 11643-plane-6, until another SS3designation
2827* appears
2828*
2829* ESC $ + M Indicates the immediate two bytes following SS3
2830* is a Chinese character as defined in CNS
2831* 11643-plane-7, until another SS3designation
2832* appears
2833*
2834* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2835* has its own designation information before any Chinese characters
2836* appear
2837*
2838*/
2839
4388f060 2840/* The following are defined this way to make the strings truly readonly */
/* Each designator is four bytes: ESC (0x1B), '$' (0x24), ')' / '*' / '+', and a final character. */
b75a7d8f
A
2841static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
2842static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
2843static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2844static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2845static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2846static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2847static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2848static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2849static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2850
2851/********************** ISO2022-CN Data **************************/
/*
 * Designation sequences indexed by charset number. The fromUnicode code
 * indexes entries below CNS_11643 directly and CNS planes as
 * CNS_11643 + (cs - CNS_11643_1).
 */
2852static const char* const escSeqCharsCN[10] ={
4388f060
A
2853 SHIFT_IN_STR, /* 0 ASCII */
2854 GB_2312_80_STR, /* 1 GB2312_1 */
2855 ISO_IR_165_STR, /* 2 ISO_IR_165 */
b75a7d8f
A
2856 CNS_11643_1992_Plane_1_STR, /* 3 */
2857 CNS_11643_1992_Plane_2_STR, /* 4 */
2858 CNS_11643_1992_Plane_3_STR, /* 5 */
2859 CNS_11643_1992_Plane_4_STR, /* 6 */
2860 CNS_11643_1992_Plane_5_STR, /* 7 */
2861 CNS_11643_1992_Plane_6_STR, /* 8 */
2862 CNS_11643_1992_Plane_7_STR /* 9 */
2863};
b75a7d8f 2864
46f4442e 2865static void
b75a7d8f 2866UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
46f4442e 2867 UConverter *cnv = args->converter;
b75a7d8f 2868 UConverterDataISO2022 *converterData;
374ca955
A
2869 ISO2022State *pFromU2022State;
2870 uint8_t *target = (uint8_t *) args->target;
2871 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
2872 const UChar* source = args->source;
2873 const UChar* sourceLimit = args->sourceLimit;
2874 int32_t* offsets = args->offsets;
374ca955
A
2875 UChar32 sourceChar;
2876 char buffer[8];
2877 int32_t len;
2878 int8_t choices[3];
2879 int32_t choiceCount;
73c04bcf 2880 uint32_t targetValue = 0;
b75a7d8f
A
2881 UBool useFallback;
2882
b75a7d8f 2883 /* set up the state */
46f4442e 2884 converterData = (UConverterDataISO2022*)cnv->extraInfo;
374ca955 2885 pFromU2022State = &converterData->fromU2022State;
374ca955
A
2886
2887 choiceCount = 0;
b75a7d8f
A
2888
2889 /* check if the last codepoint of previous buffer was a lead surrogate*/
46f4442e 2890 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
2891 goto getTrail;
2892 }
2893
b75a7d8f 2894 while( source < sourceLimit){
b75a7d8f
A
2895 if(target < targetLimit){
2896
2897 sourceChar = *(source++);
2898 /*check if the char is a First surrogate*/
4388f060
A
2899 if(U16_IS_SURROGATE(sourceChar)) {
2900 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
b75a7d8f
A
2901getTrail:
2902 /*look ahead to find the trail surrogate*/
2903 if(source < sourceLimit) {
2904 /* test the following code unit */
2905 UChar trail=(UChar) *source;
4388f060 2906 if(U16_IS_TRAIL(trail)) {
b75a7d8f 2907 source++;
4388f060 2908 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
46f4442e 2909 cnv->fromUChar32=0x00;
374ca955 2910 /* convert this supplementary code point */
b75a7d8f
A
2911 /* exit this condition tree */
2912 } else {
2913 /* this is an unmatched lead code unit (1st surrogate) */
2914 /* callback(illegal) */
b75a7d8f 2915 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2916 cnv->fromUChar32=sourceChar;
374ca955 2917 break;
b75a7d8f
A
2918 }
2919 } else {
2920 /* no more input */
46f4442e 2921 cnv->fromUChar32=sourceChar;
b75a7d8f
A
2922 break;
2923 }
2924 } else {
2925 /* this is an unmatched trail code unit (2nd surrogate) */
2926 /* callback(illegal) */
b75a7d8f 2927 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2928 cnv->fromUChar32=sourceChar;
374ca955 2929 break;
b75a7d8f
A
2930 }
2931 }
2932
2933 /* do the conversion */
374ca955 2934 if(sourceChar <= 0x007f ){
73c04bcf
A
2935 /* do not convert SO/SI/ESC */
2936 if(IS_2022_CONTROL(sourceChar)) {
2937 /* callback(illegal) */
2938 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2939 cnv->fromUChar32=sourceChar;
73c04bcf
A
2940 break;
2941 }
2942
374ca955
A
2943 /* US-ASCII */
2944 if(pFromU2022State->g == 0) {
2945 buffer[0] = (char)sourceChar;
2946 len = 1;
2947 } else {
2948 buffer[0] = UCNV_SI;
2949 buffer[1] = (char)sourceChar;
2950 len = 2;
2951 pFromU2022State->g = 0;
2952 choiceCount = 0;
2953 }
2954 if(sourceChar == CR || sourceChar == LF) {
2955 /* reset the state at the end of a line */
2956 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2957 choiceCount = 0;
b75a7d8f 2958 }
b75a7d8f
A
2959 }
2960 else{
374ca955 2961 /* convert U+0080..U+10ffff */
374ca955
A
2962 int32_t i;
2963 int8_t cs, g;
2964
2965 if(choiceCount == 0) {
2966 /* try the current SO/G1 converter first */
2967 choices[0] = pFromU2022State->cs[1];
2968
2969 /* default to GB2312_1 if none is designated yet */
2970 if(choices[0] == 0) {
2971 choices[0] = GB2312_1;
2972 }
b75a7d8f 2973
374ca955
A
2974 if(converterData->version == 0) {
2975 /* ISO-2022-CN */
2976
2977 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2978 if(choices[0] == GB2312_1) {
2979 choices[1] = (int8_t)CNS_11643_1;
2980 } else {
2981 choices[1] = (int8_t)GB2312_1;
b75a7d8f 2982 }
374ca955
A
2983
2984 choiceCount = 2;
729e4ab9 2985 } else if (converterData->version == 1) {
374ca955
A
2986 /* ISO-2022-CN-EXT */
2987
2988 /* try one of the other converters */
2989 switch(choices[0]) {
2990 case GB2312_1:
2991 choices[1] = (int8_t)CNS_11643_1;
2992 choices[2] = (int8_t)ISO_IR_165;
2993 break;
2994 case ISO_IR_165:
2995 choices[1] = (int8_t)GB2312_1;
2996 choices[2] = (int8_t)CNS_11643_1;
2997 break;
2998 default: /* CNS_11643_x */
2999 choices[1] = (int8_t)GB2312_1;
3000 choices[2] = (int8_t)ISO_IR_165;
3001 break;
b75a7d8f 3002 }
b75a7d8f 3003
374ca955 3004 choiceCount = 3;
729e4ab9
A
3005 } else {
3006 choices[0] = (int8_t)CNS_11643_1;
3007 choices[1] = (int8_t)GB2312_1;
374ca955 3008 }
b75a7d8f
A
3009 }
3010
374ca955 3011 cs = g = 0;
46f4442e
A
3012 /*
3013 * len==0: no mapping found yet
3014 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3015 * len>0: found a roundtrip result, done
3016 */
374ca955 3017 len = 0;
46f4442e
A
3018 /*
3019 * We will turn off useFallback after finding a fallback,
3020 * but we still get fallbacks from PUA code points as usual.
3021 * Therefore, we will also need to check that we don't overwrite
3022 * an early fallback with a later one.
3023 */
3024 useFallback = cnv->useFallback;
3025
3026 for(i = 0; i < choiceCount && len <= 0; ++i) {
3027 int8_t cs0 = choices[i];
3028 if(cs0 > 0) {
3029 uint32_t value;
3030 int32_t len2;
3031 if(cs0 >= CNS_11643_0) {
3032 len2 = MBCS_FROM_UCHAR32_ISO2022(
3033 converterData->myConverterArray[CNS_11643],
3034 sourceChar,
3035 &value,
3036 useFallback,
3037 MBCS_OUTPUT_3);
3038 if(len2 == 3 || (len2 == -3 && len == 0)) {
3039 targetValue = value;
3040 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3041 if(len2 >= 0) {
3042 len = 2;
3043 } else {
3044 len = -2;
3045 useFallback = FALSE;
3046 }
374ca955
A
3047 if(cs == CNS_11643_1) {
3048 g = 1;
3049 } else if(cs == CNS_11643_2) {
3050 g = 2;
3051 } else /* plane 3..7 */ if(converterData->version == 1) {
3052 g = 3;
3053 } else {
3054 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3055 len = 0;
3056 }
3057 }
3058 } else {
3059 /* GB2312_1 or ISO-IR-165 */
4388f060 3060 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
46f4442e
A
3061 len2 = MBCS_FROM_UCHAR32_ISO2022(
3062 converterData->myConverterArray[cs0],
3063 sourceChar,
3064 &value,
3065 useFallback,
3066 MBCS_OUTPUT_2);
3067 if(len2 == 2 || (len2 == -2 && len == 0)) {
3068 targetValue = value;
3069 len = len2;
3070 cs = cs0;
3071 g = 1;
3072 useFallback = FALSE;
3073 }
374ca955 3074 }
b75a7d8f 3075 }
b75a7d8f
A
3076 }
3077
46f4442e
A
3078 if(len != 0) {
3079 len = 0; /* count output bytes; it must have been abs(len) == 2 */
b75a7d8f 3080
374ca955
A
3081 /* write the designation sequence if necessary */
3082 if(cs != pFromU2022State->cs[g]) {
3083 if(cs < CNS_11643) {
3084 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3085 } else {
4388f060 3086 U_ASSERT(cs >= CNS_11643_1);
374ca955 3087 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
b75a7d8f 3088 }
374ca955
A
3089 len = 4;
3090 pFromU2022State->cs[g] = cs;
3091 if(g == 1) {
3092 /* changing the SO/G1 charset invalidates the choices[] */
3093 choiceCount = 0;
b75a7d8f 3094 }
374ca955
A
3095 }
3096
3097 /* write the shift sequence if necessary */
3098 if(g != pFromU2022State->g) {
3099 switch(g) {
3100 case 1:
3101 buffer[len++] = UCNV_SO;
3102
3103 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3104 pFromU2022State->g = 1;
3105 break;
3106 case 2:
3107 buffer[len++] = 0x1b;
3108 buffer[len++] = 0x4e;
3109 break;
3110 default: /* case 3 */
3111 buffer[len++] = 0x1b;
3112 buffer[len++] = 0x4f;
3113 break;
b75a7d8f 3114 }
b75a7d8f 3115 }
b75a7d8f 3116
374ca955
A
3117 /* write the two output bytes */
3118 buffer[len++] = (char)(targetValue >> 8);
3119 buffer[len++] = (char)targetValue;
3120 } else {
46f4442e 3121 /* if we cannot find the character after checking all codepages
374ca955
A
3122 * then this is an error
3123 */
3124 *err = U_INVALID_CHAR_FOUND;
46f4442e 3125 cnv->fromUChar32=sourceChar;
374ca955
A
3126 break;
3127 }
b75a7d8f 3128 }
b75a7d8f 3129
374ca955
A
3130 /* output len>0 bytes in buffer[] */
3131 if(len == 1) {
3132 *target++ = buffer[0];
3133 if(offsets) {
73c04bcf 3134 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
374ca955
A
3135 }
3136 } else if(len == 2 && (target + 2) <= targetLimit) {
3137 *target++ = buffer[0];
3138 *target++ = buffer[1];
3139 if(offsets) {
3140 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3141 *offsets++ = sourceIndex;
3142 *offsets++ = sourceIndex;
3143 }
3144 } else {
73c04bcf 3145 fromUWriteUInt8(
46f4442e 3146 cnv,
374ca955 3147 buffer, len,
73c04bcf 3148 &target, (const char *)targetLimit,
374ca955
A
3149 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3150 err);
3151 if(U_FAILURE(*err)) {
b75a7d8f
A
3152 break;
3153 }
3154 }
3155 } /* end if(myTargetIndex<myTargetLength) */
3156 else{
3157 *err =U_BUFFER_OVERFLOW_ERROR;
3158 break;
3159 }
3160
3161 }/* end while(mySourceIndex<mySourceLength) */
3162
374ca955
A
3163 /*
3164 * the end of the input stream and detection of truncated input
3165 * are handled by the framework, but for ISO-2022-CN conversion
3166 * we need to be in ASCII mode at the very end
3167 *
3168 * conditions:
3169 * successful
3170 * not in ASCII mode
3171 * end of input and no truncated input
b75a7d8f 3172 */
374ca955
A
3173 if( U_SUCCESS(*err) &&
3174 pFromU2022State->g!=0 &&
46f4442e 3175 args->flush && source>=sourceLimit && cnv->fromUChar32==0
374ca955
A
3176 ) {
3177 int32_t sourceIndex;
3178
3179 /* we are switching to ASCII */
3180 pFromU2022State->g=0;
3181
3182 /* get the source index of the last input character */
3183 /*
3184 * TODO this would be simpler and more reliable if we used a pair
3185 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3186 * so that we could simply use the prevSourceIndex here;
3187 * this code gives an incorrect result for the rare case of an unmatched
3188 * trail surrogate that is alone in the last buffer of the text stream
3189 */
3190 sourceIndex=(int32_t)(source-args->source);
3191 if(sourceIndex>0) {
3192 --sourceIndex;
3193 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3194 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3195 ) {
3196 --sourceIndex;
b75a7d8f 3197 }
374ca955
A
3198 } else {
3199 sourceIndex=-1;
b75a7d8f 3200 }
b75a7d8f 3201
73c04bcf 3202 fromUWriteUInt8(
46f4442e 3203 cnv,
374ca955 3204 SHIFT_IN_STR, 1,
73c04bcf 3205 &target, (const char *)targetLimit,
374ca955
A
3206 &offsets, sourceIndex,
3207 err);
b75a7d8f 3208 }
b75a7d8f 3209
374ca955
A
3210 /*save the state and return */
3211 args->source = source;
3212 args->target = (char*)target;
b75a7d8f
A
3213}
3214
3215
/*
 * Converts ISO-2022-CN family bytes from args->source into UTF-16 in
 * args->target, recording per-output-unit source offsets when args->offsets
 * is set.
 *
 * SI/SO switch between G0 and G1, ESC sequences are handed to
 * changeState_2022(), CR/LF reset the whole designation state, and all other
 * bytes are converted as one byte (G0) or as a two-byte character through the
 * table designated for the current G set.
 *
 * On return args->source and args->target are advanced; *err carries
 * U_BUFFER_OVERFLOW_ERROR when the target filled up,
 * U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO segment, or whatever
 * changeState_2022()/toUnicodeCallback() set.
 */
46f4442e 3216static void
b75a7d8f
A
3217UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3218 UErrorCode* err){
3219 char tempBuf[3];
374ca955 3220 const char *mySource = (char *) args->source;
b75a7d8f 3221 UChar *myTarget = args->target;
b75a7d8f
A
3222 const char *mySourceLimit = args->sourceLimit;
3223 uint32_t targetUniChar = 0x0000;
3224 uint32_t mySourceChar = 0x0000;
3225 UConverterDataISO2022* myData;
374ca955 3226 ISO2022State *pToU2022State;
b75a7d8f 3227
374ca955
A
3228 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3229 pToU2022State = &myData->toU2022State;
3230
/* resume work left unfinished by the previous call, if any */
3231 if(myData->key != 0) {
3232 /* continue with a partial escape sequence */
3233 goto escape;
3234 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3235 /* continue with a partial double-byte character */
3236 mySourceChar = args->converter->toUBytes[0];
3237 args->converter->toULength = 0;
fd0068a8 3238 targetUniChar = missingCharMarker;
374ca955 3239 goto getTrailByte;
b75a7d8f 3240 }
374ca955
A
3241
3242 while(mySource < mySourceLimit){
b75a7d8f
A
3243
/* default result: "no mapping"; only a successful lookup overwrites it */
3244 targetUniChar =missingCharMarker;
3245
3246 if(myTarget < args->targetLimit){
3247
3248 mySourceChar= (unsigned char) *mySource++;
3249
b75a7d8f
A
3250 switch(mySourceChar){
3251 case UCNV_SI:
374ca955 3252 pToU2022State->g=0;
d5d484b0
A
/* SI directly after SO: the segment is empty, report it as irregular */
3253 if (myData->isEmptySegment) {
3254 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
46f4442e
A
3255 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3256 args->converter->toUCallbackReason = UCNV_IRREGULAR;
d5d484b0
A
3257 args->converter->toUBytes[0] = mySourceChar;
3258 args->converter->toULength = 1;
3259 args->target = myTarget;
3260 args->source = mySource;
3261 return;
3262 }
b75a7d8f
A
3263 continue;
3264
3265 case UCNV_SO:
374ca955
A
/* SO is accepted only once a G1 charset has been designated */
3266 if(pToU2022State->cs[1] != 0) {
3267 pToU2022State->g=1;
d5d484b0 3268 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
374ca955
A
3269 continue;
3270 } else {
3271 /* illegal to have SO before a matching designator */
d5d484b0 3272 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
b75a7d8f
A
/* leaves the switch with targetUniChar == missingCharMarker, so the callback branch below runs */
3273 break;
3274 }
3275
b75a7d8f 3276 case ESC_2022:
/* back up so that changeState_2022() sees the ESC byte itself */
b75a7d8f 3277 mySource--;
374ca955 3278escape:
d5d484b0
A
3279 {
/* remembered so the length of the consumed escape sequence can be computed below */
3280 const char * mySourceBefore = mySource;
3281 int8_t toULengthBefore = args->converter->toULength;
3282
46f4442e 3283 changeState_2022(args->converter,&(mySource),
d5d484b0
A
3284 mySourceLimit, ISO_2022_CN,err);
3285
3286 /* After SO there must be at least one character before a designator (designator error handled separately) */
46f4442e
A
3287 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3288 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3289 args->converter->toUCallbackReason = UCNV_IRREGULAR;
729e4ab9 3290 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
d5d484b0
A
3291 }
3292 }
b75a7d8f
A
3293
3294 /* invalid or illegal escape sequence */
3295 if(U_FAILURE(*err)){
3296 args->target = myTarget;
3297 args->source = mySource;
d5d484b0 3298 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
b75a7d8f
A
3299 return;
3300 }
3301 continue;
3302
374ca955
A
3303 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3304
3305 case CR:
3306 /*falls through*/
3307 case LF:
/* end of line: clear all designations and shifts, then convert the byte itself in the default case */
3308 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3309 /* falls through */
3310 default:
3311 /* convert one or two bytes */
d5d484b0 3312 myData->isEmptySegment = FALSE;
374ca955
A
3313 if(pToU2022State->g != 0) {
3314 if(mySource < mySourceLimit) {
3315 UConverterSharedData *cnv;
3316 StateEnum tempState;
3317 int32_t tempBufLen;
fd0068a8
A
3318 int leadIsOk, trailIsOk;
3319 uint8_t trailByte;
374ca955 3320getTrailByte:
fd0068a8
A
3321 trailByte = (uint8_t)*mySource;
3322 /*
3323 * Ticket 5691: consistent illegal sequences:
3324 * - We include at least the first byte in the illegal sequence.
3325 * - If any of the non-initial bytes could be the start of a character,
3326 * we stop the illegal sequence before the first one of those.
3327 *
3328 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3329 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3330 * Otherwise we convert or report the pair of bytes.
3331 */
3332 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3333 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3334 if (leadIsOk && trailIsOk) {
3335 ++mySource;
3336 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3337 if(tempState >= CNS_11643_0) {
/* CNS 11643: three-byte lookup, the first byte selects the plane */
3338 cnv = myData->myConverterArray[CNS_11643];
3339 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3340 tempBuf[1] = (char) (mySourceChar);
3341 tempBuf[2] = (char) trailByte;
3342 tempBufLen = 3;
3343
3344 }else{
4388f060 3345 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
fd0068a8
A
3346 cnv = myData->myConverterArray[tempState];
3347 tempBuf[0] = (char) (mySourceChar);
3348 tempBuf[1] = (char) trailByte;
3349 tempBufLen = 2;
3350 }
3351 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3352 mySourceChar = (mySourceChar << 8) | trailByte;
3353 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3354 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3355 ++mySource;
3356 /* add another bit so that the code below writes 2 bytes in case of error */
3357 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
374ca955 3358 }
374ca955
A
3359 if(pToU2022State->g>=2) {
3360 /* return from a single-shift state to the previous one */
3361 pToU2022State->g=pToU2022State->prevG;
3362 }
374ca955
A
3363 } else {
/* lead byte is the last input byte: remember it for the next call */
3364 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3365 args->converter->toULength = 1;
3366 goto endloop;
3367 }
3368 }
3369 else{
/* G0: only 0x00..0x7f map (to themselves); anything else stays missingCharMarker */
3370 if(mySourceChar <= 0x7f) {
3371 targetUniChar = (UChar) mySourceChar;
3372 }
3373 }
3374 break;
b75a7d8f
A
3375 }
/* BMP result: write one UTF-16 unit */
3376 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3377 if(args->offsets){
73c04bcf 3378 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
3379 }
3380 *(myTarget++)=(UChar)targetUniChar;
3381 }
3382 else if(targetUniChar > missingCharMarker){
3383 /* disassemble the surrogate pair and write to output*/
3384 targetUniChar-=0x0010000;
374ca955 3385 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
b75a7d8f 3386 if(args->offsets){
73c04bcf 3387 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 3388 }
374ca955 3389 ++myTarget;
46f4442e 3390 if(myTarget< args->targetLimit){
374ca955 3391 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
b75a7d8f 3392 if(args->offsets){
73c04bcf 3393 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 3394 }
374ca955 3395 ++myTarget;
b75a7d8f
A
3396 }else{
/* no room for the trail surrogate: park it in the converter's UCharErrorBuffer */
3397 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3398 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3399 }
3400
3401 }
3402 else{
3403 /* Call the callback function*/
374ca955
A
3404 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3405 break;
b75a7d8f
A
3406 }
3407 }
3408 else{
3409 *err =U_BUFFER_OVERFLOW_ERROR;
3410 break;
3411 }
3412 }
374ca955 3413endloop:
b75a7d8f
A
/* publish how far source and target advanced */
3414 args->target = myTarget;
3415 args->source = mySource;
3416}
3417
3418static void
3419_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3420 UConverter *cnv = args->converter;
3421 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
374ca955
A
3422 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3423 char *p, *subchar;
3424 char buffer[8];
3425 int32_t length;
3426
73c04bcf 3427 subchar=(char *)cnv->subChars;
374ca955 3428 length=cnv->subCharLen; /* assume length==1 for most variants */
b75a7d8f
A
3429
3430 p = buffer;
3431 switch(myConverterData->locale[0]){
3432 case 'j':
374ca955
A
3433 {
3434 int8_t cs;
3435
3436 if(pFromU2022State->g == 1) {
3437 /* JIS7: switch from G1 to G0 */
3438 pFromU2022State->g = 0;
3439 *p++ = UCNV_SI;
3440 }
3441
3442 cs = pFromU2022State->cs[0];
3443 if(cs != ASCII && cs != JISX201) {
3444 /* not in ASCII or JIS X 0201: switch to ASCII */
3445 pFromU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f
A
3446 *p++ = '\x1b';
3447 *p++ = '\x28';
3448 *p++ = '\x42';
b75a7d8f 3449 }
374ca955
A
3450
3451 *p++ = subchar[0];
b75a7d8f 3452 break;
374ca955 3453 }
b75a7d8f 3454 case 'c':
374ca955
A
3455 if(pFromU2022State->g != 0) {
3456 /* not in ASCII mode: switch to ASCII */
3457 pFromU2022State->g = 0;
3458 *p++ = UCNV_SI;
3459 }
3460 *p++ = subchar[0];
b75a7d8f
A
3461 break;
3462 case 'k':
374ca955
A
3463 if(myConverterData->version == 0) {
3464 if(length == 1) {
3465 if((UBool)args->converter->fromUnicodeStatus) {
3466 /* in DBCS mode: switch to SBCS */
3467 args->converter->fromUnicodeStatus = 0;
3468 *p++ = UCNV_SI;
3469 }
3470 *p++ = subchar[0];
3471 } else /* length == 2*/ {
3472 if(!(UBool)args->converter->fromUnicodeStatus) {
3473 /* in SBCS mode: switch to DBCS */
3474 args->converter->fromUnicodeStatus = 1;
3475 *p++ = UCNV_SO;
3476 }
3477 *p++ = subchar[0];
3478 *p++ = subchar[1];
3479 }
3480 break;
3481 } else {
73c04bcf
A
3482 /* save the subconverter's substitution string */
3483 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3484 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3485
3486 /* set our substitution string into the subconverter */
3487 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
374ca955
A
3488 myConverterData->currentConverter->subCharLen = (int8_t)length;
3489
73c04bcf
A
3490 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3491 args->converter = myConverterData->currentConverter;
374ca955
A
3492 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3493 ucnv_cbFromUWriteSub(args, 0, err);
3494 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
73c04bcf
A
3495 args->converter = cnv;
3496
3497 /* restore the subconverter's substitution string */
3498 myConverterData->currentConverter->subChars = currentSubChars;
3499 myConverterData->currentConverter->subCharLen = currentSubCharLen;
374ca955
A
3500
3501 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3502 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3503 uprv_memcpy(
3504 cnv->charErrorBuffer,
3505 myConverterData->currentConverter->charErrorBuffer,
3506 myConverterData->currentConverter->charErrorBufferLength);
3507 }
3508 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3509 myConverterData->currentConverter->charErrorBufferLength = 0;
3510 }
374ca955 3511 return;
b75a7d8f 3512 }
b75a7d8f
A
3513 default:
3514 /* not expected */
3515 break;
3516 }
3517 ucnv_cbFromUWriteBytes(args,
3518 buffer, (int32_t)(p - buffer),
3519 offsetIndex, err);
3520}
3521
73c04bcf
A
3522/*
3523 * Structure for cloning an ISO 2022 converter into a single memory block.
3524 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3525 * and then ucnv_safeClone() of the sub-converter may additionally align
3526 * currentConverter inside the cloneStruct, for which we need the deadSpace
3527 * after currentConverter.
3528 * This is because UAlignedMemory may be larger than the actually
3529 * necessary alignment size for the platform.
3530 * The other cloneStruct fields will not be moved around,
3531 * and are aligned properly with cloneStruct's alignment.
3532 */
b75a7d8f
A
3533struct cloneStruct
3534{
3535 UConverter cnv; /* the cloned ISO 2022 converter itself; _ISO_2022_SafeClone returns its address */
374ca955 3536 UConverter currentConverter; /* storage handed to ucnv_safeClone() for the sub-converter clone */
73c04bcf
A
3537 UAlignedMemory deadSpace; /* slack after currentConverter for the sub-converter clone's alignment */
3538 UConverterDataISO2022 mydata; /* copy of the source converter's extraInfo */
b75a7d8f
A
3539};
3540
3541
46f4442e 3542static UConverter *
b75a7d8f 3543_ISO_2022_SafeClone(
46f4442e
A
3544 const UConverter *cnv,
3545 void *stackBuffer,
3546 int32_t *pBufferSize,
b75a7d8f
A
3547 UErrorCode *status)
3548{
3549 struct cloneStruct * localClone;
374ca955
A
3550 UConverterDataISO2022 *cnvData;
3551 int32_t i, size;
b75a7d8f
A
3552
3553 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
374ca955
A
3554 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3555 return NULL;
b75a7d8f
A
3556 }
3557
374ca955 3558 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
b75a7d8f 3559 localClone = (struct cloneStruct *)stackBuffer;
b75a7d8f 3560
374ca955 3561 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
b75a7d8f 3562
374ca955 3563 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
73c04bcf
A
3564 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3565 localClone->cnv.isExtraLocal = TRUE;
b75a7d8f 3566
374ca955 3567 /* share the subconverters */
b75a7d8f 3568
374ca955 3569 if(cnvData->currentConverter != NULL) {
73c04bcf 3570 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
374ca955
A
3571 localClone->mydata.currentConverter =
3572 ucnv_safeClone(cnvData->currentConverter,
3573 &localClone->currentConverter,
3574 &size, status);
3575 if(U_FAILURE(*status)) {
3576 return NULL;
b75a7d8f 3577 }
b75a7d8f
A
3578 }
3579
374ca955
A
3580 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3581 if(cnvData->myConverterArray[i] != NULL) {
3582 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3583 }
b75a7d8f
A
3584 }
3585
b75a7d8f
A
3586 return &localClone->cnv;
3587}
3588
3589static void
3590_ISO_2022_GetUnicodeSet(const UConverter *cnv,
73c04bcf 3591 const USetAdder *sa,
b75a7d8f
A
3592 UConverterUnicodeSet which,
3593 UErrorCode *pErrorCode)
3594{
3595 int32_t i;
b75a7d8f
A
3596 UConverterDataISO2022* cnvData;
3597
3598 if (U_FAILURE(*pErrorCode)) {
3599 return;
3600 }
374ca955 3601#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
3602 if (cnv->sharedData == &_ISO2022Data) {
3603 /* We use UTF-8 in this case */
374ca955
A
3604 sa->addRange(sa->set, 0, 0xd7FF);
3605 sa->addRange(sa->set, 0xE000, 0x10FFFF);
b75a7d8f
A
3606 return;
3607 }
374ca955 3608#endif
b75a7d8f
A
3609
3610 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
b75a7d8f 3611
374ca955
A
3612 /* open a set and initialize it with code points that are algorithmically round-tripped */
3613 switch(cnvData->locale[0]){
3614 case 'j':
46f4442e
A
3615 /* include JIS X 0201 which is hardcoded */
3616 sa->add(sa->set, 0xa5);
3617 sa->add(sa->set, 0x203e);
374ca955
A
3618 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3619 /* include Latin-1 for some variants of JP */
3620 sa->addRange(sa->set, 0, 0xff);
3621 } else {
3622 /* include ASCII for JP */
3623 sa->addRange(sa->set, 0, 0x7f);
3624 }
46f4442e
A
3625 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3626 /*
3627 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3628 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3629 * use half-width Katakana.
3630 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3631 * half-width Katakana via the ESC ( I sequence.
3632 * However, we only emit (fromUnicode) half-width Katakana according to the
3633 * definition of each variant.
3634 *
3635 * When including fallbacks,
3636 * we need to include half-width Katakana Unicode code points for all JP variants because
3637 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3638 */
374ca955 3639 /* include half-width Katakana for JP */
46f4442e 3640 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
374ca955
A
3641 }
3642 break;
3643 case 'c':
3644 case 'z':
3645 /* include ASCII for CN */
3646 sa->addRange(sa->set, 0, 0x7f);
3647 break;
3648 case 'k':
3649 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3650 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3651 cnvData->currentConverter, sa, which, pErrorCode);
73c04bcf
A
3652 /* the loop over myConverterArray[] will simply not find another converter */
3653 break;
374ca955
A
3654 default:
3655 break;
b75a7d8f
A
3656 }
3657
46f4442e 3658#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
374ca955
A
3659 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3660 cnvData->version==0 && i==CNS_11643
3661 ) {
3662 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3663 ucnv_MBCSGetUnicodeSetForBytes(
3664 cnvData->myConverterArray[i],
3665 sa, UCNV_ROUNDTRIP_SET,
3666 0, 0x81, 0x82,
3667 pErrorCode);
46f4442e
A
3668 }
3669#endif
3670
3671 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3672 UConverterSetFilter filter;
3673 if(cnvData->myConverterArray[i]!=NULL) {
3674 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3675 cnvData->version==0 && i==CNS_11643
3676 ) {
3677 /*
3678 * Version-specific for CN:
3679 * CN version 0 does not map CNS planes 3..7 although
3680 * they are all available in the CNS conversion table;
3681 * CN version 1 (-EXT) does map them all.
3682 * The two versions create different Unicode sets.
3683 */
3684 filter=UCNV_SET_FILTER_2022_CN;
3685 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3686 /*
3687 * Only add code points that map to Shift-JIS codes
3688 * corresponding to JIS X 0208.
3689 */
3690 filter=UCNV_SET_FILTER_SJIS;
3691 } else if(i==KSC5601) {
3692 /*
3693 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3694 * are broader than GR94.
3695 */
3696 filter=UCNV_SET_FILTER_GR94DBCS;
374ca955 3697 } else {
46f4442e 3698 filter=UCNV_SET_FILTER_NONE;
374ca955 3699 }
46f4442e 3700 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
374ca955 3701 }
b75a7d8f 3702 }
73c04bcf
A
3703
3704 /*
3705 * ISO 2022 converters must not convert SO/SI/ESC despite what
3706 * sub-converters do by themselves.
3707 * Remove these characters from the set.
3708 */
3709 sa->remove(sa->set, 0x0e);
3710 sa->remove(sa->set, 0x0f);
3711 sa->remove(sa->set, 0x1b);
46f4442e
A
3712
3713 /* ISO 2022 converters do not convert C1 controls either */
3714 sa->removeRange(sa->set, 0x80, 0x9f);
b75a7d8f
A
3715}
3716
374ca955
A
3717static const UConverterImpl _ISO2022Impl={
3718 UCNV_ISO_2022,
3719
3720 NULL,
3721 NULL,
3722
3723 _ISO2022Open,
3724 _ISO2022Close,
3725 _ISO2022Reset,
3726
3727#ifdef U_ENABLE_GENERIC_ISO_2022
3728 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3729 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3730 ucnv_fromUnicode_UTF8,
3731 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3732#else
3733 NULL,
3734 NULL,
3735 NULL,
3736 NULL,
3737#endif
3738 NULL,
3739
3740 NULL,
3741 _ISO2022getName,
3742 _ISO_2022_WriteSub,
3743 _ISO_2022_SafeClone,
4388f060
A
3744 _ISO_2022_GetUnicodeSet,
3745
3746 NULL,
3747 NULL
374ca955
A
3748};
3749static const UConverterStaticData _ISO2022StaticData={
3750 sizeof(UConverterStaticData),
3751 "ISO_2022",
3752 2022,
3753 UCNV_IBM,
3754 UCNV_ISO_2022,
3755 1,
3756 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3757 { 0x1a, 0, 0, 0 },
3758 1,
3759 FALSE,
3760 FALSE,
3761 0,
3762 0,
3763 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3764};
3765const UConverterSharedData _ISO2022Data={
3766 sizeof(UConverterSharedData),
3767 ~((uint32_t) 0),
3768 NULL,
3769 NULL,
3770 &_ISO2022StaticData,
3771 FALSE,
3772 &_ISO2022Impl,
4388f060 3773 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3774};
3775
3776/*************JP****************/
3777static const UConverterImpl _ISO2022JPImpl={
3778 UCNV_ISO_2022,
3779
3780 NULL,
3781 NULL,
3782
3783 _ISO2022Open,
3784 _ISO2022Close,
3785 _ISO2022Reset,
3786
3787 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3788 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3789 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3790 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3791 NULL,
3792
3793 NULL,
3794 _ISO2022getName,
3795 _ISO_2022_WriteSub,
3796 _ISO_2022_SafeClone,
4388f060
A
3797 _ISO_2022_GetUnicodeSet,
3798
3799 NULL,
3800 NULL
374ca955
A
3801};
3802static const UConverterStaticData _ISO2022JPStaticData={
3803 sizeof(UConverterStaticData),
3804 "ISO_2022_JP",
3805 0,
3806 UCNV_IBM,
3807 UCNV_ISO_2022,
3808 1,
3809 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3810 { 0x1a, 0, 0, 0 },
3811 1,
3812 FALSE,
3813 FALSE,
3814 0,
3815 0,
3816 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3817};
4388f060
A
3818
3819namespace {
3820
3821const UConverterSharedData _ISO2022JPData={
374ca955
A
3822 sizeof(UConverterSharedData),
3823 ~((uint32_t) 0),
3824 NULL,
3825 NULL,
3826 &_ISO2022JPStaticData,
3827 FALSE,
3828 &_ISO2022JPImpl,
4388f060 3829 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3830};
3831
4388f060
A
3832} // namespace
3833
374ca955
A
3834/************* KR ***************/
3835static const UConverterImpl _ISO2022KRImpl={
3836 UCNV_ISO_2022,
3837
3838 NULL,
3839 NULL,
3840
3841 _ISO2022Open,
3842 _ISO2022Close,
3843 _ISO2022Reset,
3844
3845 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3846 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3847 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3848 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3849 NULL,
3850
3851 NULL,
3852 _ISO2022getName,
3853 _ISO_2022_WriteSub,
3854 _ISO_2022_SafeClone,
4388f060
A
3855 _ISO_2022_GetUnicodeSet,
3856
3857 NULL,
3858 NULL
374ca955
A
3859};
3860static const UConverterStaticData _ISO2022KRStaticData={
3861 sizeof(UConverterStaticData),
3862 "ISO_2022_KR",
3863 0,
3864 UCNV_IBM,
3865 UCNV_ISO_2022,
3866 1,
3867 3, /* max 3 bytes per UChar: SO+DBCS */
3868 { 0x1a, 0, 0, 0 },
3869 1,
3870 FALSE,
3871 FALSE,
3872 0,
3873 0,
3874 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3875};
4388f060
A
3876
3877namespace {
3878
3879const UConverterSharedData _ISO2022KRData={
374ca955
A
3880 sizeof(UConverterSharedData),
3881 ~((uint32_t) 0),
3882 NULL,
3883 NULL,
3884 &_ISO2022KRStaticData,
3885 FALSE,
3886 &_ISO2022KRImpl,
4388f060 3887 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3888};
3889
4388f060
A
3890} // namespace
3891
374ca955
A
3892/*************** CN ***************/
3893static const UConverterImpl _ISO2022CNImpl={
3894
3895 UCNV_ISO_2022,
3896
3897 NULL,
3898 NULL,
3899
3900 _ISO2022Open,
3901 _ISO2022Close,
3902 _ISO2022Reset,
3903
3904 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3905 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3906 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3907 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3908 NULL,
3909
3910 NULL,
3911 _ISO2022getName,
3912 _ISO_2022_WriteSub,
3913 _ISO_2022_SafeClone,
4388f060
A
3914 _ISO_2022_GetUnicodeSet,
3915
3916 NULL,
3917 NULL
374ca955
A
3918};
3919static const UConverterStaticData _ISO2022CNStaticData={
3920 sizeof(UConverterStaticData),
3921 "ISO_2022_CN",
3922 0,
3923 UCNV_IBM,
3924 UCNV_ISO_2022,
73c04bcf 3925 1,
374ca955
A
3926 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3927 { 0x1a, 0, 0, 0 },
3928 1,
3929 FALSE,
3930 FALSE,
3931 0,
3932 0,
3933 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3934};
4388f060
A
3935
3936namespace {
3937
3938const UConverterSharedData _ISO2022CNData={
374ca955
A
3939 sizeof(UConverterSharedData),
3940 ~((uint32_t) 0),
3941 NULL,
3942 NULL,
3943 &_ISO2022CNStaticData,
3944 FALSE,
3945 &_ISO2022CNImpl,
4388f060 3946 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3947};
3948
4388f060 3949} // namespace
374ca955 3950
b75a7d8f 3951#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */