]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv2022.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (C) 2000-2016, International Business Machines
b75a7d8f
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
4388f060 8* file name: ucnv2022.cpp
f3c0d7a5 9* encoding: UTF-8
b75a7d8f
A
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2000feb03
14* created by: Markus W. Scherer
15*
16* Change history:
17*
18* 06/29/2000 helena Major rewrite of the callback APIs.
19* 08/08/2000 Ram Included support for ISO-2022-JP-2
20* Changed implementation of toUnicode
21* function
22* 08/21/2000 Ram Added support for ISO-2022-KR
23* 08/29/2000 Ram Separated implementation of EBCDIC to
24* ucnvebdc.c
25* 09/20/2000 Ram Added support for ISO-2022-CN
26* Added implementations for getNextUChar()
27* for specific 2022 country variants.
28* 10/31/2000 Ram Implemented offsets logic functions
29*/
30
31#include "unicode/utypes.h"
32
374ca955 33#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
34
35#include "unicode/ucnv.h"
36#include "unicode/uset.h"
37#include "unicode/ucnv_err.h"
38#include "unicode/ucnv_cb.h"
4388f060 39#include "unicode/utf16.h"
374ca955 40#include "ucnv_imp.h"
b75a7d8f
A
41#include "ucnv_bld.h"
42#include "ucnv_cnv.h"
43#include "ucnvmbcs.h"
44#include "cstring.h"
45#include "cmemory.h"
4388f060 46#include "uassert.h"
b75a7d8f 47
374ca955
A
48#ifdef U_ENABLE_GENERIC_ISO_2022
49/*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
74 * for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78#endif
79
b331163b 80#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955 81static const char SHIFT_IN_STR[] = "\x0F";
51004dcb 82// static const char SHIFT_OUT_STR[] = "\x0E";
b331163b 83#endif
b75a7d8f
A
84
/* ASCII control/whitespace byte values used by the ISO-2022 converters */
#define H_TAB  0x09  /* horizontal tab */
#define LF     0x0A  /* line feed */
#define V_TAB  0x0B  /* vertical tab */
#define CR     0x0D  /* carriage return */
#define SPACE  0x20  /* space */
90
46f4442e
A
/* first and last code point of the range named HWKANA (U+FF61..U+FF9F) */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
95
96/*
97 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98 * as bytes 21..7E. (Subtract 0x80.)
99 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100 * as bytes 20..7F. (Subtract 0x80.)
101 * Do not encode C1 control codes with native bytes 80..9F
102 * as bytes 00..1F (C0 control codes).
103 */
/* native (GR) byte ranges of 94-character and 96-character sets */
enum {
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
110
73c04bcf
A
111/*
112 * ISO 2022 control codes must not be converted from Unicode
113 * because they would mess up the byte stream.
114 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115 * corresponding to SO, SI, and ESC.
116 */
/* true for c == SO (0x0e), SI (0x0f) or ESC (0x1b): the bits set in 0x0800c000; c is evaluated more than once */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)0x0800c000>>(c))&(uint32_t)1)!=0)
118
374ca955 119/* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups deliberately reuse the small values 1..3.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,  /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
157
374ca955 158/* is the StateEnum charset value for a DBCS charset? */
b331163b
A
/* IS_JP_DBCS(cs): does the StateEnum value cs fall in the JISX208..KSC5601 range (only JISX208 in HTML-only builds)? */
#if !UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#else
#define IS_JP_DBCS(cs) (JISX208==(cs))
#endif

/* CSM(cs): charset mask with only bit number cs set */
#define CSM(cs) ((uint16_t)1<<(cs))
b75a7d8f 166
374ca955
A
167/*
168 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170 *
171 * Note: The converter uses some leniency:
172 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173 * all versions, not just JIS7 and JIS8.
174 * - ICU does not distinguish between different versions of JIS X 0208.
175 */
b331163b
A
176#if UCONFIG_ONLY_HTML_CONVERSION
177enum { MAX_JA_VERSION=0 };
178#else
729e4ab9 179enum { MAX_JA_VERSION=4 };
b331163b 180#endif
729e4ab9 181static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
374ca955 182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
b331163b 183#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
b331163b 188#endif
374ca955 189};
b75a7d8f
A
190
/* converter type tag stored in UConverterDataISO2022::currentType */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
199
374ca955
A
/* designation and shift state for one conversion direction */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
205
b75a7d8f
A
206#define UCNV_OPTIONS_VERSION_MASK 0xf
207#define UCNV_2022_MAX_CONVERTERS 10
208
209typedef struct{
73c04bcf 210 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
b75a7d8f 211 UConverter *currentConverter;
b75a7d8f 212 Cnv2022Type currentType;
374ca955 213 ISO2022State toU2022State, fromU2022State;
b75a7d8f
A
214 uint32_t key;
215 uint32_t version;
73c04bcf
A
216#ifdef U_ENABLE_GENERIC_ISO_2022
217 UBool isFirstBuffer;
218#endif
d5d484b0 219 UBool isEmptySegment;
b75a7d8f 220 char name[30];
73c04bcf 221 char locale[3];
b75a7d8f
A
222}UConverterDataISO2022;
223
374ca955 224/* Protos */
b75a7d8f
A
225/* ISO-2022 ----------------------------------------------------------------- */
226
227/*Forward declaration */
f3c0d7a5 228U_CFUNC void U_CALLCONV
374ca955
A
229ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230 UErrorCode * err);
f3c0d7a5 231U_CFUNC void U_CALLCONV
374ca955
A
232ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233 UErrorCode * err);
b75a7d8f
A
234
/* the ESC byte that starts every escape sequence */
#define ESC_2022 0x1B

/* classification of a (partial) escape sequence, as stored in escSeqStateTable_Value_2022[] */
typedef enum
{
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
244
245/*
246* The way these state transition arrays work is:
247* ex : ESC$B is the sequence for JISX208
248* a) First Iteration: char is ESC
249* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250* int x = normalize_esq_chars_2022[27] which is equal to 1
251* ii) Search for this value in escSeqStateTable_Key_2022[]
252* value of x is stored at escSeqStateTable_Key_2022[0]
253* iii) Save this index as offset
254* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256* b) Switch on this state and continue to next char
257* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258* which is normalize_esq_chars_2022[36] == 4
259* ii) x is currently 1(from above)
260* x<<=5 -- x is now 32
261* x+=normalize_esq_chars_2022[36]
262* now x is 36
263* iii) Search for this value in escSeqStateTable_Key_2022[]
264* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267* c) Switch on this state and continue to next char
268* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269* ii) x is currently 36 (from above)
270* x<<=5 -- x is now 1152
271* x+=normalize_esq_chars_2022[66]
272* now x is 1161
273* iii) Search for this value in escSeqStateTable_Key_2022[]
274* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278*/
279
280
281/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte to its small code (1..29) used for building escape sequence keys;
 * 0 marks a byte that is not valid anywhere in an escape sequence.
 * Rows are 16 bytes wide; entries 0x60..0xFF are left to zero-initialization.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
/* 0x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,   /* 0x1B ESC */
/* 2x */     0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,   /* $ % & ( ) * + - . / */
/* 3x */     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */     5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,   /* @ A..O */
/* 5x */     0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0    /* R and Z */
};
312
374ca955
A
313#ifdef U_ENABLE_GENERIC_ISO_2022
314/*
315 * When the generic ISO-2022 converter is completely removed, not just disabled
316 * per #ifdef, then the following state table and the associated tables that are
317 * dimensioned with MAX_STATES_2022 should be trimmed.
318 *
319 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320 * the associated escape sequences starting with ESC ( B should be removed.
321 * This includes the ones with key values 1097 and all of the ones above 1000000.
322 *
323 * For the latter, the tables can simply be truncated.
324 * For the former, since the tables must be kept parallel, it is probably best
325 * to simply duplicate an adjacent table cell, parallel in all tables.
326 *
327 * It may make sense to restructure the tables, especially by using small search
328 * tables for the variants instead of indexing them parallel to the table here.
329 */
330#endif
331
b75a7d8f
A
/* number of entries in each of the parallel escape sequence tables */
#define MAX_STATES_2022 74

/*
 * Keys of all known (partial) escape sequences in ascending order,
 * as required by the binary search in getKey_2022().
 * A key is built from the normalized bytes: key = (key << 5) + normalized byte.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,
    /*  5 */ 57,       60,       61,       1093,     1096,
    /* 10 */ 1097,     1098,     1099,     1100,     1101,
    /* 15 */ 1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,     1154,     1157,     1160,     1161,
    /* 25 */ 1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,     1773,     1957,     35105,    36933,
    /* 35 */ 36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,
    /* 45 */ 36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,    37711,    37744,    37745,    37746,
    /* 55 */ 37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,    40140,    40141,    1123363,  35947624,
    /* 65 */ 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
345
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each escape sequence, parallel to escSeqStateTable_Key_2022[];
 * NULL where the key has no converter name.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL,                   NULL,                   NULL,                   NULL,                   NULL,
    /*  5 */ NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
    /* 10 */ "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
    /* 15 */ "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
    /* 20 */ "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
    /* 25 */ NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
    /* 30 */ "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
    /* 35 */ "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
    /* 40 */ "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
    /* 45 */ "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
    /* 65 */ "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
    /* 70 */ "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
362
46f4442e 363static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
b75a7d8f 364/* 0 1 2 3 4 5 6 7 8 9 */
374ca955 365 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
b75a7d8f
A
366 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
372 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
373};
374
b75a7d8f
A
375/* Type def for refactoring changeState_2022 code*/
/* identifies which ISO-2022 variant is in use; the members depend on the build configuration */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
386
b75a7d8f 387/*********** ISO 2022 Converter Protos ***********/
f3c0d7a5 388static void U_CALLCONV
729e4ab9 389_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
b75a7d8f 390
f3c0d7a5 391static void U_CALLCONV
b75a7d8f
A
392 _ISO2022Close(UConverter *converter);
393
f3c0d7a5 394static void U_CALLCONV
b75a7d8f
A
395_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
f3c0d7a5
A
397U_CDECL_BEGIN
398static const char * U_CALLCONV
b75a7d8f 399_ISO2022getName(const UConverter* cnv);
f3c0d7a5 400U_CDECL_END
b75a7d8f 401
f3c0d7a5 402static void U_CALLCONV
b75a7d8f
A
403_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
f3c0d7a5
A
405U_CDECL_BEGIN
406static UConverter * U_CALLCONV
b75a7d8f
A
407_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
f3c0d7a5
A
409U_CDECL_END
410
374ca955 411#ifdef U_ENABLE_GENERIC_ISO_2022
f3c0d7a5 412static void U_CALLCONV
374ca955
A
413T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414#endif
b75a7d8f 415
4388f060
A
416namespace {
417
374ca955 418/*const UConverterSharedData _ISO2022Data;*/
4388f060 419extern const UConverterSharedData _ISO2022JPData;
b331163b
A
420
421#if !UCONFIG_ONLY_HTML_CONVERSION
4388f060
A
422extern const UConverterSharedData _ISO2022KRData;
423extern const UConverterSharedData _ISO2022CNData;
b331163b 424#endif
4388f060
A
425
426} // namespace
b75a7d8f 427
374ca955 428/*************** Converter implementations ******************/
b75a7d8f 429
73c04bcf 430/* The purpose of this function is to get around gcc compiler warnings. */
4388f060 431static inline void
73c04bcf
A
432fromUWriteUInt8(UConverter *cnv,
433 const char *bytes, int32_t length,
434 uint8_t **target, const char *targetLimit,
435 int32_t **offsets,
436 int32_t sourceIndex,
437 UErrorCode *pErrorCode)
438{
439 char *targetChars = (char *)*target;
440 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441 offsets, sourceIndex, pErrorCode);
442 *target = (uint8_t*)targetChars;
443
444}
445
4388f060
A
446static inline void
447setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
374ca955
A
448 if(myConverterData->version == 1) {
449 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 450
374ca955
A
451 cnv->toUnicodeStatus=0; /* offset */
452 cnv->mode=0; /* state */
453 cnv->toULength=0; /* byteIndex */
454 }
455}
b75a7d8f 456
4388f060 457static inline void
374ca955
A
458setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459 /* in ISO-2022-KR the designator sequence appears only once
460 * in a file so we append it only once
461 */
462 if( converter->charErrorBufferLength==0){
b75a7d8f 463
374ca955
A
464 converter->charErrorBufferLength = 4;
465 converter->charErrorBuffer[0] = 0x1b;
466 converter->charErrorBuffer[1] = 0x24;
467 converter->charErrorBuffer[2] = 0x29;
468 converter->charErrorBuffer[3] = 0x43;
469 }
470 if(myConverterData->version == 1) {
471 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 472
374ca955
A
473 cnv->fromUChar32=0;
474 cnv->fromUnicodeStatus=1; /* prevLength */
475 }
476}
b75a7d8f 477
f3c0d7a5 478static void U_CALLCONV
729e4ab9 479_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
b75a7d8f 480
374ca955 481 char myLocale[6]={' ',' ',' ',' ',' ',' '};
b75a7d8f 482
374ca955
A
483 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484 if(cnv->extraInfo != NULL) {
729e4ab9 485 UConverterNamePieces stackPieces;
4388f060 486 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
374ca955
A
487 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488 uint32_t version;
b75a7d8f 489
729e4ab9
A
490 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
374ca955 492 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
374ca955 493 myConverterData->currentType = ASCII1;
374ca955 494 cnv->fromUnicodeStatus =FALSE;
729e4ab9
A
495 if(pArgs->locale){
496 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
374ca955 497 }
729e4ab9 498 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
73c04bcf 499 myConverterData->version = version;
46f4442e 500 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
73c04bcf
A
501 (myLocale[2]=='_' || myLocale[2]=='\0'))
502 {
374ca955 503 /* open the required converters and cache them */
729e4ab9 504 if(version>MAX_JA_VERSION) {
b331163b
A
505 // ICU 55 fails to open a converter for an unsupported version.
506 // Previously, it fell back to version 0, but that would yield
507 // unexpected behavior.
508 *errorCode = U_MISSING_RESOURCE_ERROR;
509 return;
729e4ab9 510 }
374ca955 511 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
729e4ab9
A
512 myConverterData->myConverterArray[ISO8859_7] =
513 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
374ca955 514 }
729e4ab9
A
515 myConverterData->myConverterArray[JISX208] =
516 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
374ca955 517 if(jpCharsetMasks[version]&CSM(JISX212)) {
729e4ab9
A
518 myConverterData->myConverterArray[JISX212] =
519 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
374ca955
A
520 }
521 if(jpCharsetMasks[version]&CSM(GB2312)) {
729e4ab9
A
522 myConverterData->myConverterArray[GB2312] =
523 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
374ca955
A
524 }
525 if(jpCharsetMasks[version]&CSM(KSC5601)) {
729e4ab9
A
526 myConverterData->myConverterArray[KSC5601] =
527 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
374ca955 528 }
b75a7d8f 529
374ca955
A
530 /* set the function pointers to appropriate funtions */
531 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532 uprv_strcpy(myConverterData->locale,"ja");
b75a7d8f 533
46f4442e 534 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
b331163b 535 size_t len = uprv_strlen(myConverterData->name);
374ca955
A
536 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
537 myConverterData->name[len+1]='\0';
538 }
b331163b 539#if !UCONFIG_ONLY_HTML_CONVERSION
46f4442e 540 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
73c04bcf
A
541 (myLocale[2]=='_' || myLocale[2]=='\0'))
542 {
b331163b
A
543 if(version>1) {
544 // ICU 55 fails to open a converter for an unsupported version.
545 // Previously, it fell back to version 0, but that would yield
546 // unexpected behavior.
547 *errorCode = U_MISSING_RESOURCE_ERROR;
548 return;
549 }
729e4ab9
A
550 const char *cnvName;
551 if(version==1) {
552 cnvName="icu-internal-25546";
553 } else {
554 cnvName="ibm-949";
555 myConverterData->version=version=0;
556 }
557 if(pArgs->onlyTestIsLoadable) {
558 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
559 uprv_free(cnv->extraInfo);
560 cnv->extraInfo=NULL;
561 return;
562 } else {
563 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
73c04bcf
A
564 if (U_FAILURE(*errorCode)) {
565 _ISO2022Close(cnv);
566 return;
567 }
b75a7d8f 568
729e4ab9
A
569 if(version==1) {
570 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573 }else{
574 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
73c04bcf 575 }
b75a7d8f 576
729e4ab9
A
577 /* initialize the state variables */
578 setInitialStateToUnicodeKR(cnv, myConverterData);
579 setInitialStateFromUnicodeKR(cnv, myConverterData);
b75a7d8f 580
729e4ab9
A
581 /* set the function pointers to appropriate funtions */
582 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583 uprv_strcpy(myConverterData->locale,"ko");
584 }
b75a7d8f 585 }
46f4442e 586 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
73c04bcf
A
587 (myLocale[2]=='_' || myLocale[2]=='\0'))
588 {
b331163b
A
589 if(version>2) {
590 // ICU 55 fails to open a converter for an unsupported version.
591 // Previously, it fell back to version 0, but that would yield
592 // unexpected behavior.
593 *errorCode = U_MISSING_RESOURCE_ERROR;
594 return;
595 }
b75a7d8f
A
596
597 /* open the required converters and cache them */
729e4ab9
A
598 myConverterData->myConverterArray[GB2312_1] =
599 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
374ca955 600 if(version==1) {
729e4ab9
A
601 myConverterData->myConverterArray[ISO_IR_165] =
602 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
374ca955 603 }
729e4ab9
A
604 myConverterData->myConverterArray[CNS_11643] =
605 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
b75a7d8f 606
b75a7d8f
A
607
608 /* set the function pointers to appropriate funtions */
609 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610 uprv_strcpy(myConverterData->locale,"cn");
611
729e4ab9 612 if (version==0){
b75a7d8f 613 myConverterData->version = 0;
46f4442e 614 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
729e4ab9
A
615 }else if (version==1){
616 myConverterData->version = 1;
617 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618 }else {
619 myConverterData->version = 2;
620 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
b75a7d8f
A
621 }
622 }
b331163b 623#endif // !UCONFIG_ONLY_HTML_CONVERSION
b75a7d8f 624 else{
374ca955 625#ifdef U_ENABLE_GENERIC_ISO_2022
73c04bcf
A
626 myConverterData->isFirstBuffer = TRUE;
627
b75a7d8f
A
628 /* append the UTF-8 escape sequence */
629 cnv->charErrorBufferLength = 3;
630 cnv->charErrorBuffer[0] = 0x1b;
631 cnv->charErrorBuffer[1] = 0x25;
632 cnv->charErrorBuffer[2] = 0x42;
633
634 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635 /* initialize the state variables */
b75a7d8f 636 uprv_strcpy(myConverterData->name,"ISO_2022");
374ca955 637#else
b331163b
A
638 *errorCode = U_MISSING_RESOURCE_ERROR;
639 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640 // data loading error code.
374ca955
A
641 return;
642#endif
b75a7d8f
A
643 }
644
374ca955
A
645 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
729e4ab9 647 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
374ca955
A
648 _ISO2022Close(cnv);
649 }
b75a7d8f
A
650 } else {
651 *errorCode = U_MEMORY_ALLOCATION_ERROR;
652 }
b75a7d8f
A
653}
654
655
f3c0d7a5 656static void U_CALLCONV
b75a7d8f 657_ISO2022Close(UConverter *converter) {
374ca955
A
658 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659 UConverterSharedData **array = myData->myConverterArray;
660 int32_t i;
b75a7d8f
A
661
662 if (converter->extraInfo != NULL) {
663 /*close the array of converter pointers and free the memory*/
374ca955
A
664 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665 if(array[i]!=NULL) {
666 ucnv_unloadSharedDataIfReady(array[i]);
b75a7d8f 667 }
b75a7d8f
A
668 }
669
374ca955 670 ucnv_close(myData->currentConverter);
b75a7d8f
A
671
672 if(!converter->isExtraLocal){
673 uprv_free (converter->extraInfo);
374ca955 674 converter->extraInfo = NULL;
b75a7d8f
A
675 }
676 }
677}
678
f3c0d7a5 679static void U_CALLCONV
b75a7d8f
A
680_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
374ca955
A
682 if(choice<=UCNV_RESET_TO_UNICODE) {
683 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684 myConverterData->key = 0;
d5d484b0 685 myConverterData->isEmptySegment = FALSE;
374ca955
A
686 }
687 if(choice!=UCNV_RESET_TO_UNICODE) {
688 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689 }
690#ifdef U_ENABLE_GENERIC_ISO_2022
691 if(myConverterData->locale[0] == 0){
b75a7d8f
A
692 if(choice<=UCNV_RESET_TO_UNICODE) {
693 myConverterData->isFirstBuffer = TRUE;
374ca955 694 myConverterData->key = 0;
b75a7d8f
A
695 if (converter->mode == UCNV_SO){
696 ucnv_close (myConverterData->currentConverter);
697 myConverterData->currentConverter=NULL;
698 }
46f4442e 699 converter->mode = UCNV_SI;
b75a7d8f
A
700 }
701 if(choice!=UCNV_RESET_TO_UNICODE) {
702 /* re-append UTF-8 escape sequence */
703 converter->charErrorBufferLength = 3;
704 converter->charErrorBuffer[0] = 0x1b;
705 converter->charErrorBuffer[1] = 0x28;
706 converter->charErrorBuffer[2] = 0x42;
707 }
708 }
374ca955
A
709 else
710#endif
711 {
b75a7d8f 712 /* reset the state variables */
374ca955 713 if(myConverterData->locale[0] == 'k'){
b75a7d8f
A
714 if(choice<=UCNV_RESET_TO_UNICODE) {
715 setInitialStateToUnicodeKR(converter, myConverterData);
716 }
717 if(choice!=UCNV_RESET_TO_UNICODE) {
718 setInitialStateFromUnicodeKR(converter, myConverterData);
719 }
720 }
721 }
722}
723
f3c0d7a5
A
724U_CDECL_BEGIN
725
726static const char * U_CALLCONV
b75a7d8f
A
727_ISO2022getName(const UConverter* cnv){
728 if(cnv->extraInfo){
729 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730 return myData->name;
731 }
732 return NULL;
733}
734
f3c0d7a5
A
735U_CDECL_END
736
b75a7d8f 737
374ca955
A
738/*************** to unicode *******************/
739/****************************************************************************
740 * Recognized escape sequences are
741 * <ESC>(B ASCII
742 * <ESC>.A ISO-8859-1
743 * <ESC>.F ISO-8859-7
744 * <ESC>(J JISX-201
745 * <ESC>(I JISX-201
746 * <ESC>$B JISX-208
747 * <ESC>$@ JISX-208
748 * <ESC>$(D JISX-212
749 * <ESC>$A GB2312
750 * <ESC>$(C KSC5601
751 */
46f4442e 752static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
374ca955
A
753/* 0 1 2 3 4 5 6 7 8 9 */
754 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
755 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
756 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
757 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
760 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
761 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
762};
b75a7d8f 763
b331163b 764#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955 765/*************** to unicode *******************/
46f4442e 766static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
374ca955
A
767/* 0 1 2 3 4 5 6 7 8 9 */
768 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
769 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
770 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
771 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
772 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
773 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
774 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
775 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
776};
b331163b 777#endif
b75a7d8f 778
b75a7d8f 779
46f4442e 780static UCNV_TableStates_2022
374ca955
A
781getKey_2022(char c,int32_t* key,int32_t* offset){
782 int32_t togo;
783 int32_t low = 0;
784 int32_t hi = MAX_STATES_2022;
785 int32_t oldmid=0;
b75a7d8f 786
374ca955
A
787 togo = normalize_esq_chars_2022[(uint8_t)c];
788 if(togo == 0) {
789 /* not a valid character anywhere in an escape sequence */
790 *key = 0;
791 *offset = 0;
792 return INVALID_2022;
793 }
794 togo = (*key << 5) + togo;
b75a7d8f 795
374ca955 796 while (hi != low) /*binary search*/{
b75a7d8f 797
57a6839d 798 int32_t mid = (hi+low) >> 1; /*Finds median*/
374ca955 799
46f4442e 800 if (mid == oldmid)
374ca955
A
801 break;
802
803 if (escSeqStateTable_Key_2022[mid] > togo){
804 hi = mid;
805 }
806 else if (escSeqStateTable_Key_2022[mid] < togo){
807 low = mid;
808 }
809 else /*we found it*/{
810 *key = togo;
811 *offset = mid;
46f4442e 812 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
374ca955
A
813 }
814 oldmid = mid;
b75a7d8f 815
b75a7d8f 816 }
b75a7d8f 817
374ca955
A
818 *key = 0;
819 *offset = 0;
820 return INVALID_2022;
b75a7d8f
A
821}
822
374ca955
A
823/*runs through a state machine to determine the escape sequence - codepage correspondence
824 */
/*
 * Consume the bytes of one escape sequence from *source and apply its effect.
 *
 * Each byte read is appended to _this->toUBytes and passed to getKey_2022()
 * together with the key saved in the converter data, until getKey_2022()
 * reports a terminal or invalid result or the input runs out.
 *
 * Outcomes:
 * - input ran out mid-sequence: the key is saved (non-zero) and the function
 *   returns without touching *err;
 * - INVALID_2022: *err = U_ILLEGAL_ESCAPE_SEQUENCE; only the first buffered
 *   byte stays in toUBytes, the others are backed out of *source or queued
 *   in preToU for replay;
 * - complete sequence: the variant-specific switch updates toU2022State
 *   (cs[] designations or the SS2/SS3 g value) or sets
 *   U_UNSUPPORTED_ESCAPE_SEQUENCE / U_ILLEGAL_ESCAPE_SEQUENCE;
 * - success: toULength is reset to 0.
 *
 * @param _this       converter whose extraInfo is a UConverterDataISO2022
 * @param source      in/out read position
 * @param sourceLimit end of the available input
 * @param var         ISO-2022 variant that selects the handling below
 * @param err         receives the error code, if any
 */
46f4442e 825static void
374ca955 826changeState_2022(UConverter* _this,
46f4442e 827 const char** source,
374ca955
A
828 const char* sourceLimit,
829 Variant2022 var,
830 UErrorCode* err){
831 UCNV_TableStates_2022 value;
832 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
833 uint32_t key = myData2022->key;
73c04bcf 834 int32_t offset = 0;
/* toULength on entry; used below to tell bytes read in this call from earlier ones */
fd0068a8 835 int8_t initialToULength = _this->toULength;
374ca955
A
836 char c;
837
838 value = VALID_NON_TERMINAL_2022;
839 while (*source < sourceLimit) {
840 c = *(*source)++;
841 _this->toUBytes[_this->toULength++]=(uint8_t)c;
842 value = getKey_2022(c,(int32_t *) &key, &offset);
46f4442e
374ca955 844 switch (value){
b75a7d8f 845
374ca955
A
846 case VALID_NON_TERMINAL_2022 :
847 /* continue with the loop */
848 break;
b75a7d8f 849
374ca955
A
850 case VALID_TERMINAL_2022:
851 key = 0;
852 goto DONE;
b75a7d8f 853
374ca955
A
854 case INVALID_2022:
855 goto DONE;
b75a7d8f 856
374ca955
A
857 case VALID_MAYBE_TERMINAL_2022:
858#ifdef U_ENABLE_GENERIC_ISO_2022
859 /* ESC ( B is ambiguous only for ISO_2022 itself */
860 if(var == ISO_2022) {
861 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862 _this->toULength = 0;
b75a7d8f 863
374ca955
A
864 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866 /* continue with the loop */
867 value = VALID_NON_TERMINAL_2022;
868 break;
869 } else
870#endif
871 {
872 /* not ISO_2022 itself, finish here */
873 value = VALID_TERMINAL_2022;
874 key = 0;
875 goto DONE;
b75a7d8f
A
876 }
877 }
b75a7d8f 878 }
b75a7d8f 879
374ca955
A
880DONE:
/* persist the key: non-zero only when the sequence is still incomplete */
881 myData2022->key = key;
b75a7d8f 882
374ca955
A
883 if (value == VALID_NON_TERMINAL_2022) {
884 /* indicate that the escape sequence is incomplete: key!=0 */
885 return;
886 } else if (value == INVALID_2022 ) {
887 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
374ca955
A
888 } else /* value == VALID_TERMINAL_2022 */ {
889 switch(var){
890#ifdef U_ENABLE_GENERIC_ISO_2022
891 case ISO_2022:
892 {
893 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894 if(chosenConverterName == NULL) {
895 /* SS2 or SS3 */
896 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
46f4442e 897 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955 898 return;
b75a7d8f 899 }
374ca955
A
900
901 _this->mode = UCNV_SI;
902 ucnv_close(myData2022->currentConverter);
/* NOTE(review): myUConverter is not declared anywhere in this function - confirm before enabling U_ENABLE_GENERIC_ISO_2022 */
903 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904 if(U_SUCCESS(*err)) {
905 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906 _this->mode = UCNV_SO;
907 }
908 break;
909 }
910#endif
911 case ISO_2022_JP:
912 {
/* offset is the table index that getKey_2022() reported for the matched sequence */
46f4442e 913 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
374ca955
A
914 switch(tempState) {
915 case INVALID_STATE:
916 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917 break;
918 case SS2_STATE:
919 if(myData2022->toU2022State.cs[2]!=0) {
/* remember the current g in prevG, but only when it is below 2 */
920 if(myData2022->toU2022State.g<2) {
921 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922 }
923 myData2022->toU2022State.g=2;
924 } else {
925 /* illegal to have SS2 before a matching designator */
926 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927 }
928 break;
929 /* case SS3_STATE: not used in ISO-2022-JP-x */
930 case ISO8859_1:
931 case ISO8859_7:
/* the charset must be enabled in the mask for this converter version */
932 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934 } else {
935 /* G2 charset for SS2 */
936 myData2022->toU2022State.cs[2]=(int8_t)tempState;
937 }
938 break;
939 default:
940 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942 } else {
943 /* G0 charset */
944 myData2022->toU2022State.cs[0]=(int8_t)tempState;
945 }
946 break;
947 }
948 }
949 break;
b331163b 950#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
951 case ISO_2022_CN:
952 {
46f4442e 953 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
374ca955
A
954 switch(tempState) {
955 case INVALID_STATE:
956 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957 break;
958 case SS2_STATE:
959 if(myData2022->toU2022State.cs[2]!=0) {
960 if(myData2022->toU2022State.g<2) {
961 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962 }
963 myData2022->toU2022State.g=2;
964 } else {
965 /* illegal to have SS2 before a matching designator */
966 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967 }
968 break;
969 case SS3_STATE:
970 if(myData2022->toU2022State.cs[3]!=0) {
971 if(myData2022->toU2022State.g<2) {
972 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973 }
974 myData2022->toU2022State.g=3;
975 } else {
976 /* illegal to have SS3 before a matching designator */
977 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978 }
979 break;
980 case ISO_IR_165:
/* ISO_IR_165 is rejected for version 0; otherwise it is stored in cs[1] like GB2312_1 and CNS_11643_1 */
981 if(myData2022->version==0) {
982 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983 break;
984 }
2ca993e8 985 U_FALLTHROUGH;
374ca955 986 case GB2312_1:
2ca993e8 987 U_FALLTHROUGH;
374ca955
A
988 case CNS_11643_1:
989 myData2022->toU2022State.cs[1]=(int8_t)tempState;
990 break;
991 case CNS_11643_2:
992 myData2022->toU2022State.cs[2]=(int8_t)tempState;
993 break;
994 default:
995 /* other CNS 11643 planes */
996 if(myData2022->version==0) {
997 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998 } else {
999 myData2022->toU2022State.cs[3]=(int8_t)tempState;
1000 }
1001 break;
1002 }
1003 }
1004 break;
1005 case ISO_2022_KR:
/* only the table entry at offset 0x30 is accepted for this variant */
1006 if(offset==0x30){
1007 /* nothing to be done, just accept this one escape sequence */
1008 } else {
1009 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010 }
1011 break;
b331163b 1012#endif // !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
1013
1014 default:
1015 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016 break;
1017 }
1018 }
/* final bookkeeping: clear the buffered bytes on success, trim them on an illegal sequence */
1019 if(U_SUCCESS(*err)) {
1020 _this->toULength = 0;
fd0068a8
A
1021 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022 if(_this->toULength>1) {
1023 /*
1024 * Ticket 5691: consistent illegal sequences:
1025 * - We include at least the first byte (ESC) in the illegal sequence.
1026 * - If any of the non-initial bytes could be the start of a character,
1027 * we stop the illegal sequence before the first one of those.
1028 * In escape sequences, all following bytes are "printable", that is,
1029 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030 * they are valid single/lead bytes.
1031 * For simplicity, we always only report the initial ESC byte as the
1032 * illegal sequence and back out all other bytes we looked at.
1033 */
1034 /* Back out some bytes. */
1035 int8_t backOutDistance=_this->toULength-1;
1036 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037 if(backOutDistance<=bytesFromThisBuffer) {
1038 /* same as initialToULength<=1 */
1039 *source-=backOutDistance;
1040 } else {
1041 /* Back out bytes from the previous buffer: Need to replay them. */
1042 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043 /* same as -(initialToULength-1) */
1044 /* preToULength is negative! */
1045 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046 *source-=bytesFromThisBuffer;
1047 }
1048 _this->toULength=1;
1049 }
46f4442e
A
1050 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955
A
1052 }
1053}
1054
b331163b 1055#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
1056/*Checks the characters of the buffer against valid 2022 escape sequences
1057*if they match we return a pointer to the initial start of the sequence otherwise
1058*we return sourceLimit
1059*/
1060/*for 2022 looks ahead in the stream
1061 *to determine the longest possible convertible
1062 *data stream
1063 */
4388f060 1064static inline const char*
374ca955
A
1065getEndOfBuffer_2022(const char** source,
1066 const char* sourceLimit,
4388f060 1067 UBool /*flush*/){
374ca955
A
1068
1069 const char* mySource = *source;
1070
1071#ifdef U_ENABLE_GENERIC_ISO_2022
46f4442e 1072 if (*source >= sourceLimit)
374ca955
A
1073 return sourceLimit;
1074
1075 do{
1076
1077 if (*mySource == ESC_2022){
1078 int8_t i;
1079 int32_t key = 0;
1080 int32_t offset;
1081 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083 /* Kludge: I could not
1084 * figure out the reason for validating an escape sequence
1085 * twice - once here and once in changeState_2022().
1086 * is it possible to have an ESC character in a ISO2022
1087 * byte stream which is valid in a code page? Is it legal?
1088 */
46f4442e 1089 for (i=0;
374ca955
A
1090 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091 i++) {
1092 value = getKey_2022(*(mySource+i), &key, &offset);
1093 }
46f4442e 1094 if (value > 0 || *mySource==ESC_2022)
374ca955
A
1095 return mySource;
1096
46f4442e 1097 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
374ca955
A
1098 return sourceLimit;
1099 }
1100 }while (++mySource < sourceLimit);
1101
1102 return sourceLimit;
1103#else
1104 while(mySource < sourceLimit && *mySource != ESC_2022) {
1105 ++mySource;
1106 }
1107 return mySource;
1108#endif
1109}
b331163b 1110#endif
374ca955
A
1111
1112/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
46f4442e
A
1113 * any future change in _MBCSFromUChar32() function should be reflected here.
1114 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
374ca955 1115 */
4388f060 1116static inline int32_t
374ca955 1117MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
46f4442e
A
1118 UChar32 c,
1119 uint32_t* value,
1120 UBool useFallback,
374ca955
A
1121 int outputType)
1122{
1123 const int32_t *cx;
1124 const uint16_t *table;
1125 uint32_t stage2Entry;
1126 uint32_t myValue;
46f4442e 1127 int32_t length;
374ca955 1128 const uint8_t *p;
46f4442e
A
1129 /*
1130 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131 * Use internal version of ucnv_open() that verifies that the new structures are available,
1132 * else U_INTERNAL_PROGRAM_ERROR.
1133 */
374ca955
A
1134 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136 table=sharedData->mbcs.fromUnicodeTable;
1137 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138 /* get the bytes and the length for the output */
1139 if(outputType==MBCS_OUTPUT_2){
1140 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141 if(myValue<=0xff) {
46f4442e 1142 length=1;
374ca955 1143 } else {
46f4442e 1144 length=2;
374ca955
A
1145 }
1146 } else /* outputType==MBCS_OUTPUT_3 */ {
1147 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1149 if(myValue<=0xff) {
46f4442e 1150 length=1;
374ca955 1151 } else if(myValue<=0xffff) {
46f4442e 1152 length=2;
374ca955 1153 } else {
46f4442e 1154 length=3;
b75a7d8f
A
1155 }
1156 }
1157 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1158 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159 /* assigned */
1160 *value=myValue;
1161 return length;
1162 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
b75a7d8f 1163 /*
374ca955 1164 * We allow a 0 byte output if the "assigned" bit is set for this entry.
b75a7d8f 1165 * There is no way with this data structure for fallback output
374ca955 1166 * to be a zero byte.
b75a7d8f 1167 */
b75a7d8f 1168 *value=myValue;
46f4442e 1169 return -length;
b75a7d8f 1170 }
b75a7d8f 1171 }
374ca955
A
1172
1173 cx=sharedData->mbcs.extIndexes;
1174 if(cx!=NULL) {
46f4442e 1175 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
374ca955
A
1176 }
1177
1178 /* unassigned */
46f4442e 1179 return 0;
b75a7d8f
A
1180}
1181
1182/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
46f4442e
A
1183 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184 * @param retval pointer to output byte
1185 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
b75a7d8f 1186 */
4388f060 1187static inline int32_t
b75a7d8f 1188MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
46f4442e
A
1189 UChar32 c,
1190 uint32_t* retval,
b75a7d8f
A
1191 UBool useFallback)
1192{
46f4442e 1193 const uint16_t *table;
b75a7d8f
A
1194 int32_t value;
1195 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955 1196 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
46f4442e 1197 return 0;
b75a7d8f
A
1198 }
1199 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
374ca955 1200 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 1201 /* get the byte for the output */
374ca955 1202 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
b75a7d8f 1203 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1204 *retval=(uint32_t)(value&0xff);
1205 if(value>=0xf00) {
1206 return 1; /* roundtrip */
1207 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208 return -1; /* fallback taken */
b75a7d8f 1209 } else {
46f4442e 1210 return 0; /* no mapping */
b75a7d8f 1211 }
b75a7d8f
A
1212}
1213
46f4442e
A
1214/*
1215 * Check that the result is a 2-byte value with each byte in the range A1..FE
1216 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
1217 * to move it to the ISO 2022 range 21..7E.
1218 * Return 0 if out of range.
1219 */
/*
 * Shift an EUC-style A1..FE byte pair down by 0x8080 into the 21..7E range.
 * Both range tests work on truncated differences (16 bits for the pair,
 * 8 bits for the trail byte), exactly as the unsigned-wraparound idiom requires.
 *
 * @param value packed byte pair
 * @return value - 0x8080 if both tests pass, otherwise 0
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairDelta = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(value - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1230
1231#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1232/*
1233 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1234 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1235 * unchanged.
1236 */
/*
 * Reverse of _2022FromGR94DBCS(): add 0x8080 to a 21..7E byte pair.
 *
 * @param value packed ISO 2022 byte pair
 * @return value + 0x8080 if that passes the A1..FE range tests for the pair
 *         and the trail byte, otherwise value unchanged
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;

    if((uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1)) {
        return value;
    }
    if((uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1247#endif
1248
374ca955
A
1249#ifdef U_ENABLE_GENERIC_ISO_2022
1250
b75a7d8f
A
1251/**********************************************************************************
1252* ISO-2022 Converter
1253*
1254*
1255*/
1256
/*
 * To-Unicode entry point of the generic ISO-2022 converter.
 *
 * Loops until args->source reaches args->sourceLimit. When no escape sequence
 * is in progress (myData->key == 0), getEndOfBuffer_2022() delimits the next
 * run of bytes, which is converted with ucnv_toUnicode() on
 * myData->currentConverter (an "ASCII" converter is opened if none is set).
 * Overflow, error and partial-input state of that inner converter is copied
 * back into the outer converter before returning. Otherwise, or once the run
 * is consumed, changeState_2022() is called with the ISO_2022 variant.
 *
 * @param args conversion arguments; source/target are advanced in place
 * @param err  receives the error code, if any
 */
f3c0d7a5 1257static void U_CALLCONV
b75a7d8f
A
1258T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259 UErrorCode* err){
374ca955
A
1260 const char* mySourceLimit, *realSourceLimit;
1261 const char* sourceStart;
1262 const UChar* myTargetStart;
b75a7d8f 1263 UConverter* saveThis;
b75a7d8f 1264 UConverterDataISO2022* myData;
374ca955
A
1265 int8_t length;
1266
/* keep the outer converter: args->converter is temporarily swapped below */
1267 saveThis = args->converter;
1268 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270 realSourceLimit = args->sourceLimit;
1271 while (args->source < realSourceLimit) {
1272 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276 if(args->source < mySourceLimit) {
1277 if(myData->currentConverter==NULL) {
1278 myData->currentConverter = ucnv_open("ASCII",err);
1279 if(U_FAILURE(*err)){
1280 return;
1281 }
b75a7d8f 1282
374ca955
A
1283 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284 saveThis->mode = UCNV_SO;
b75a7d8f 1285 }
b75a7d8f 1286
374ca955
A
1287 /* convert to before the ESC or until the end of the buffer */
1288 myData->isFirstBuffer=FALSE;
1289 sourceStart = args->source;
1290 myTargetStart = args->target;
1291 args->converter = myData->currentConverter;
/* flush is passed on only when this run extends to the real end of the input */
1292 ucnv_toUnicode(args->converter,
1293 &args->target,
1294 args->targetLimit,
1295 &args->source,
1296 mySourceLimit,
1297 args->offsets,
1298 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299 err);
1300 args->converter = saveThis;
1301
1302 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303 /* move the overflow buffer */
1304 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305 myData->currentConverter->UCharErrorBufferLength = 0;
1306 if(length > 0) {
1307 uprv_memcpy(saveThis->UCharErrorBuffer,
1308 myData->currentConverter->UCharErrorBuffer,
1309 length*U_SIZEOF_UCHAR);
1310 }
1311 return;
1312 }
b75a7d8f 1313
374ca955
A
1314 /*
1315 * At least one of:
1316 * -Error while converting
1317 * -Done with entire buffer
1318 * -Need to write offsets or update the current offset
1319 * (leave that up to the code in ucnv.c)
1320 *
1321 * or else we just stopped at an ESC byte and continue with changeState_2022()
1322 */
/* NOTE(review): as parenthesized, the third operand groups as (offsets-check && moved) || (partial input before the limit) - confirm this is intended */
1323 if (U_FAILURE(*err) ||
1324 (args->source == realSourceLimit) ||
1325 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1326 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327 ) {
1328 /* copy partial or error input for truncated detection and error handling */
1329 if(U_FAILURE(*err)) {
1330 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331 if(length > 0) {
1332 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333 }
1334 } else {
1335 length = saveThis->toULength = myData->currentConverter->toULength;
1336 if(length > 0) {
1337 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338 if(args->source < mySourceLimit) {
1339 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340 }
1341 }
1342 }
1343 return;
b75a7d8f 1344 }
b75a7d8f
A
1345 }
1346 }
b75a7d8f
A
1347
/* at an ESC byte or inside an unfinished escape sequence: let the state machine consume it */
1348 sourceStart = args->source;
1349 changeState_2022(args->converter,
46f4442e 1350 &(args->source),
374ca955 1351 realSourceLimit,
b75a7d8f 1352 ISO_2022,
b75a7d8f 1353 err);
374ca955
A
1354 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1355 /* let the ucnv.c code update its current offset */
1356 return;
b75a7d8f 1357 }
b75a7d8f 1358 }
b75a7d8f
A
1359}
1360
374ca955 1361#endif
b75a7d8f
A
1362
1363/*
1364 * To Unicode Callback helper function
1365 */
46f4442e 1366static void
374ca955
A
1367toUnicodeCallback(UConverter *cnv,
1368 const uint32_t sourceChar, const uint32_t targetUniChar,
1369 UErrorCode* err){
b75a7d8f 1370 if(sourceChar>0xff){
374ca955
A
1371 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1372 cnv->toUBytes[1] = (uint8_t)sourceChar;
1373 cnv->toULength = 2;
b75a7d8f
A
1374 }
1375 else{
374ca955 1376 cnv->toUBytes[0] =(char) sourceChar;
fd0068a8 1377 cnv->toULength = 1;
b75a7d8f
A
1378 }
1379
1380 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
b75a7d8f
A
1381 *err = U_INVALID_CHAR_FOUND;
1382 }
1383 else{
b75a7d8f
A
1384 *err = U_ILLEGAL_CHAR_FOUND;
1385 }
b75a7d8f
A
1386}
1387
1388/**************************************ISO-2022-JP*************************************************/
1389
1390/************************************** IMPORTANT **************************************************
1391* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
46f4442e
A
1393* The converter iterates over each Unicode codepoint
1394* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395* processed one char at a time it would make sense to reduce the extra processing a canned converter
b75a7d8f
A
1396* would do as far as possible.
1397*
46f4442e
A
1398* If the implementation of these macros or structure of sharedData struct change in the future, make
1399* sure that ISO-2022 is also changed.
b75a7d8f
A
1400***************************************************************************************************
1401*/
1402
1403/***************************************************************************************************
1404* Rules for ISO-2022-jp encoding
46f4442e 1405* (i) Escape sequences must be fully contained within a line they should not
b75a7d8f
A
1406* span new lines or CRs
1407* (ii) If the last character on a line is represented by two bytes then an ASCII or
1408* JIS-Roman character escape sequence should follow before the line terminates
46f4442e
A
1409* (iii) If the first character on the line is represented by two bytes then a two
1410* byte character escape sequence should precede it
b75a7d8f
A
1411* (iv) If no escape sequence is encountered then the characters are ASCII
1412* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413* and invoked with SS2 (ESC N).
1414* (vi) If there is any G0 designation in text, there must be a switch to
1415* ASCII or to JIS X 0201-Roman before a space character (but not
1416* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417* characters such as tab or CRLF.
1418* (vii) Supported encodings:
1419* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420*
1421* source : RFC-1554
1422*
1423* JISX201, JISX208,JISX212 : new .cnv data files created
1424* KSC5601 : alias to ibm-949 mapping table
1425* GB2312 : alias to ibm-1386 mapping table
1426* ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427* ISO-8859-7 : alias to ibm-9409 mapping table
1428*/
b75a7d8f 1429
374ca955
A
1430/* preference order of JP charsets */
1431static const StateEnum jpCharsetPref[]={
1432 ASCII,
1433 JISX201,
1434 ISO8859_1,
374ca955 1435 JISX208,
2ca993e8 1436 ISO8859_7,
374ca955
A
1437 JISX212,
1438 GB2312,
1439 KSC5601,
1440 HWKANA_7BIT
b75a7d8f
A
1441};
1442
73c04bcf
A
1443/*
1444 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1445 * not in order of jpCharsetPref[]!
1446 */
/*
 * Designator escape sequences, one row per charset.
 * Bytes are spelled as hex escapes; the leading comment is the row index.
 */
static const char escSeqChars[][6] ={
    /* 0: ASCII       <ESC>(B  */ "\x1B\x28\x42",
    /* 1: ISO-8859-1  <ESC>.A  */ "\x1B\x2E\x41",
    /* 2: ISO-8859-7  <ESC>.F  */ "\x1B\x2E\x46",
    /* 3: JISX-201    <ESC>(J  */ "\x1B\x28\x4A",
    /* 4: JISX-208    <ESC>$B  */ "\x1B\x24\x42",
    /* 5: JISX-212    <ESC>$(D */ "\x1B\x24\x28\x44",
    /* 6: GB2312      <ESC>$A  */ "\x1B\x24\x41",
    /* 7: KSC5601     <ESC>$(C */ "\x1B\x24\x28\x43",
    /* 8: HWKANA_7BIT <ESC>(I  */ "\x1B\x28\x49"
};
/* Byte length of each escape sequence; rows are in the same order as escSeqChars[]. */
static const int8_t escSeqCharsLen[] ={
    /* ASCII       <ESC>(B  */ 3,
    /* ISO-8859-1  <ESC>.A  */ 3,
    /* ISO-8859-7  <ESC>.F  */ 3,
    /* JISX-201    <ESC>(J  */ 3,
    /* JISX-208    <ESC>$B  */ 3,
    /* JISX-212    <ESC>$(D */ 4,
    /* GB2312      <ESC>$A  */ 3,
    /* KSC5601     <ESC>$(C */ 4,
    /* HWKANA_7BIT <ESC>(I  */ 3
};
1470
1471/*
1472* The iteration over various code pages works this way:
1473* i) Get the currentState from myConverterData->currentState
1474* ii) Check if the character is mapped to a valid character in the currentState
1475* Yes -> a) set the initIterState to currentState
1476* b) remain in this state until an invalid character is found
1477* No -> a) go to the next code page and find the character
46f4442e 1478* iii) Before changing the state increment the current state check if the current state
b75a7d8f
A
1479* is equal to the initIterState
1480* Yes -> A character that cannot be represented in any of the supported encodings
1481* break and return a U_INVALID_CHARACTER error
1482* No -> Continue and find the character in next code page
1483*
1484*
46f4442e 1485* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
b75a7d8f
A
1486*/
1487
46f4442e 1488/* Map 00..7F to Unicode according to JIS X 0201. */
/*
 * JIS X 0201 byte to Unicode.
 * Only two values differ from the identity mapping:
 * 0x5c becomes U+00A5 and 0x7e becomes U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
1502/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
/*
 * Unicode to JIS X 0201 byte.
 * U+00A5 and U+203E map to 0x5c and 0x7e; U+005C and U+007E themselves have
 * no mapping. Every other value up to 0x7f maps to itself.
 *
 * @return the byte, or 0xfffe when the code point is unmappable
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}
1516
1517/*
1518 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1519 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1520 * Return 0 if the byte pair is out of range.
1521 */
/*
 * Shift-JIS byte pair to a pair of 21..7E bytes.
 * The caller must pass a valid Shift-JIS pair; only the upper bound is checked.
 *
 * @param value packed Shift-JIS byte pair (lead byte in bits 8..15)
 * @return the converted pair, or 0 if value is above 0xEFFC
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    const uint8_t trail = (uint8_t)value;
    uint32_t lead = value & 0xff00;

    /* two lead-byte ranges: up to 0x9f00, and 0xe000..0xef00 */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    lead <<= 1;

    if(trail > 0x9e) {
        /* trail <= 0xfc */
        return lead | (trail - 0x7e);
    }

    lead -= 0x100;
    /* the trail subtrahend changes above 0x7e */
    return lead | (trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}
1552
1553/*
1554 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
1555 * If either byte is outside 21..7E make sure that the result is not valid
1556 * for Shift-JIS so that the converter catches it.
1557 * Some invalid byte values already turn into equally invalid Shift-JIS
1558 * byte values and need not be tested explicitly.
1559 */
/*
 * Pair of JIS X 0208 bytes to a Shift-JIS byte pair.
 * An out-of-range input byte yields a 0 output byte where it is tested here.
 *
 * @param c1    first (lead) byte
 * @param c2    second (trail) byte
 * @param bytes out: bytes[0] = Shift-JIS lead, bytes[1] = Shift-JIS trail
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1) == 0) {
        /* even lead byte */
        lead = c1;
        trail = ((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) ? (uint8_t)(c2 + 0x7e) : 0;
    } else {
        /* odd lead byte: round up; the trail offset changes above 0x5f */
        lead = (uint8_t)(c1 + 1);
        if(c2 > 0x7e) {
            trail = 0;  /* invalid */
        } else {
            trail = (uint8_t)(c2 + ((c2 <= 0x5f) ? 0x1f : 0x20));
        }
    }

    lead >>= 1;
    if(lead > 0x3f) {
        lead = 0;  /* invalid */
    } else {
        lead = (uint8_t)(lead + ((lead <= 0x2f) ? 0x70 : 0xb0));
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1589
1590/*
1591 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592 * Katakana.
1593 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595 * These were the only fallbacks in ICU's jisx-208.ucm file.
1596 */
1597static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1598 0x2123, /* U+FF61 */
1599 0x2156,
1600 0x2157,
1601 0x2122,
1602 0x2126,
1603 0x2572,
1604 0x2521,
1605 0x2523,
1606 0x2525,
1607 0x2527,
1608 0x2529,
1609 0x2563,
1610 0x2565,
1611 0x2567,
1612 0x2543,
1613 0x213C, /* U+FF70 */
1614 0x2522,
1615 0x2524,
1616 0x2526,
1617 0x2528,
1618 0x252A,
1619 0x252B,
1620 0x252D,
1621 0x252F,
1622 0x2531,
1623 0x2533,
1624 0x2535,
1625 0x2537,
1626 0x2539,
1627 0x253B,
1628 0x253D,
1629 0x253F, /* U+FF80 */
1630 0x2541,
1631 0x2544,
1632 0x2546,
1633 0x2548,
1634 0x254A,
1635 0x254B,
1636 0x254C,
1637 0x254D,
1638 0x254E,
1639 0x254F,
1640 0x2552,
1641 0x2555,
1642 0x2558,
1643 0x255B,
1644 0x255E,
1645 0x255F, /* U+FF90 */
1646 0x2560,
1647 0x2561,
1648 0x2562,
1649 0x2564,
1650 0x2566,
1651 0x2568,
1652 0x2569,
1653 0x256A,
1654 0x256B,
1655 0x256C,
1656 0x256D,
1657 0x256F,
1658 0x2573,
1659 0x212B,
1660 0x212C /* U+FF9F */
1661};
1662
/*
 * Converts UTF-16 text from args->source into ISO-2022-JP family bytes in
 * args->target, recording source offsets for the output bytes when
 * args->offsets is non-NULL.
 *
 * Conversion state is kept in the fromU2022State member of the
 * UConverterDataISO2022 that cnv->extraInfo points to: cs[] holds the
 * currently designated charsets and g the active shift state.
 * cnv->fromUChar32 carries a pending lead surrogate between calls, or the
 * code point that caused an error.
 *
 * Results reported through *err:
 *   U_ILLEGAL_CHAR_FOUND    - unmatched surrogate, or an SO/SI/ESC input character
 *   U_INVALID_CHAR_FOUND    - none of the candidate charsets maps the code point
 *   U_BUFFER_OVERFLOW_ERROR - input remains but the target buffer is full
 *
 * When flushing at the end of the input, SI and/or the ASCII designation are
 * appended so that the output ends in ASCII mode.
 * On return args->source and args->target reflect what was consumed/written.
 */
f3c0d7a5 1663static void U_CALLCONV
374ca955 1664UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
46f4442e 1665 UConverter *cnv = args->converter;
b75a7d8f 1666 UConverterDataISO2022 *converterData;
374ca955
A
1667 ISO2022State *pFromU2022State;
1668 uint8_t *target = (uint8_t *) args->target;
1669 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
1670 const UChar* source = args->source;
1671 const UChar* sourceLimit = args->sourceLimit;
1672 int32_t* offsets = args->offsets;
374ca955
A
1673 UChar32 sourceChar;
1674 char buffer[8];
1675 int32_t len, outLen;
1676 int8_t choices[10];
1677 int32_t choiceCount;
73c04bcf 1678 uint32_t targetValue = 0;
374ca955
A
1679 UBool useFallback;
1680
1681 int32_t i;
1682 int8_t cs, g;
1683
1684 /* set up the state */
46f4442e 1685 converterData = (UConverterDataISO2022*)cnv->extraInfo;
374ca955 1686 pFromU2022State = &converterData->fromU2022State;
374ca955
A
1687
1688 choiceCount = 0;
b75a7d8f 1689
b75a7d8f 1690 /* check if the last codepoint of previous buffer was a lead surrogate*/
46f4442e 1691 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
1692 goto getTrail;
1693 }
b75a7d8f 1694
374ca955
A
1695 while(source < sourceLimit) {
1696 if(target < targetLimit) {
b75a7d8f 1697
b75a7d8f 1698 sourceChar = *(source++);
374ca955 1699 /*check if the char is a First surrogate*/
4388f060
A
1700 if(U16_IS_SURROGATE(sourceChar)) {
1701 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
374ca955
A
1702getTrail:
1703 /*look ahead to find the trail surrogate*/
1704 if(source < sourceLimit) {
1705 /* test the following code unit */
1706 UChar trail=(UChar) *source;
4388f060 1707 if(U16_IS_TRAIL(trail)) {
374ca955 1708 source++;
4388f060 1709 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
46f4442e 1710 cnv->fromUChar32=0x00;
374ca955
A
1711 /* convert this supplementary code point */
1712 /* exit this condition tree */
1713 } else {
1714 /* this is an unmatched lead code unit (1st surrogate) */
1715 /* callback(illegal) */
1716 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1717 cnv->fromUChar32=sourceChar;
374ca955 1718 break;
b75a7d8f 1719 }
374ca955
A
1720 } else {
1721 /* no more input */
46f4442e 1722 cnv->fromUChar32=sourceChar;
b75a7d8f
A
1723 break;
1724 }
374ca955
A
1725 } else {
1726 /* this is an unmatched trail code unit (2nd surrogate) */
1727 /* callback(illegal) */
1728 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1729 cnv->fromUChar32=sourceChar;
374ca955
A
1730 break;
1731 }
b75a7d8f
A
1732 }
1733
73c04bcf
A
1734 /* do not convert SO/SI/ESC */
1735 if(IS_2022_CONTROL(sourceChar)) {
1736 /* callback(illegal) */
1737 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1738 cnv->fromUChar32=sourceChar;
73c04bcf
A
1739 break;
1740 }
1741
374ca955 1742 /* do the conversion */
b75a7d8f 1743
374ca955
A
1744 if(choiceCount == 0) {
1745 uint16_t csm;
b75a7d8f 1746
374ca955
A
1747 /*
1748 * The csm variable keeps track of which charsets are allowed
1749 * and not used yet while building the choices[].
1750 */
1751 csm = jpCharsetMasks[converterData->version];
1752 choiceCount = 0;
1753
1754 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1755 if(converterData->version == 3 || converterData->version == 4) {
46f4442e 1756 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
374ca955 1757 }
46f4442e
A
1758 /* Do not try single-byte half-width Katakana for other versions. */
1759 csm &= ~CSM(HWKANA_7BIT);
b75a7d8f 1760
374ca955
A
1761 /* try the current G0 charset */
1762 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1763 csm &= ~CSM(cs);
b75a7d8f 1764
374ca955
A
1765 /* try the current G2 charset */
1766 if((cs = pFromU2022State->cs[2]) != 0) {
1767 choices[choiceCount++] = cs;
1768 csm &= ~CSM(cs);
1769 }
1770
1771 /* try all the other possible charsets */
b331163b 1772 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
374ca955
A
1773 cs = (int8_t)jpCharsetPref[i];
1774 if(CSM(cs) & csm) {
1775 choices[choiceCount++] = cs;
1776 csm &= ~CSM(cs);
b75a7d8f
A
1777 }
1778 }
374ca955 1779 }
b75a7d8f 1780
374ca955 1781 cs = g = 0;
46f4442e
A
1782 /*
1783 * len==0: no mapping found yet
1784 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1785 * len>0: found a roundtrip result, done
1786 */
374ca955 1787 len = 0;
46f4442e
A
1788 /*
1789 * We will turn off useFallback after finding a fallback,
1790 * but we still get fallbacks from PUA code points as usual.
1791 * Therefore, we will also need to check that we don't overwrite
1792 * an early fallback with a later one.
1793 */
1794 useFallback = cnv->useFallback;
374ca955 1795
/* Walk the candidate charsets in choices[] order; the loop ends at the first roundtrip (len > 0). */
46f4442e
A
1796 for(i = 0; i < choiceCount && len <= 0; ++i) {
1797 uint32_t value;
1798 int32_t len2;
1799 int8_t cs0 = choices[i];
1800 switch(cs0) {
374ca955
A
1801 case ASCII:
1802 if(sourceChar <= 0x7f) {
1803 targetValue = (uint32_t)sourceChar;
1804 len = 1;
46f4442e
A
1805 cs = cs0;
1806 g = 0;
b75a7d8f 1807 }
374ca955
A
1808 break;
1809 case ISO8859_1:
46f4442e 1810 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
374ca955
A
1811 targetValue = (uint32_t)sourceChar - 0x80;
1812 len = 1;
46f4442e 1813 cs = cs0;
374ca955
A
1814 g = 2;
1815 }
1816 break;
1817 case HWKANA_7BIT:
46f4442e 1818 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
374ca955
A
1819 if(converterData->version==3) {
1820 /* JIS7: use G1 (SO) */
46f4442e
A
1821 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1822 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1823 len = 1;
1824 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
374ca955
A
1825 g = 1;
1826 } else if(converterData->version==4) {
1827 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
46f4442e
A
1828 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1829 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1830 len = 1;
374ca955 1831
46f4442e
A
1832 cs = pFromU2022State->cs[0];
1833 if(IS_JP_DBCS(cs)) {
374ca955
A
1834 /* switch from a DBCS charset to JISX201 */
1835 cs = (int8_t)JISX201;
b75a7d8f 1836 }
46f4442e
A
1837 /* else stay in the current G0 charset */
1838 g = 0;
b75a7d8f 1839 }
46f4442e 1840 /* else do not use HWKANA_7BIT with other versions */
b75a7d8f 1841 }
374ca955
A
1842 break;
1843 case JISX201:
1844 /* G0 SBCS */
46f4442e
A
1845 value = jisx201FromU(sourceChar);
1846 if(value <= 0x7f) {
1847 targetValue = value;
374ca955 1848 len = 1;
46f4442e
A
1849 cs = cs0;
1850 g = 0;
1851 useFallback = FALSE;
1852 }
1853 break;
1854 case JISX208:
1855 /* G0 DBCS from Shift-JIS table */
1856 len2 = MBCS_FROM_UCHAR32_ISO2022(
1857 converterData->myConverterArray[cs0],
1858 sourceChar, &value,
1859 useFallback, MBCS_OUTPUT_2);
1860 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1861 value = _2022FromSJIS(value);
1862 if(value != 0) {
1863 targetValue = value;
1864 len = len2;
1865 cs = cs0;
1866 g = 0;
1867 useFallback = FALSE;
1868 }
/* No DBCS result accepted: for a half-width Katakana code point, take the 2-byte value from hwkana_fb[] as a fallback result (len = -2). */
1869 } else if(len == 0 && useFallback &&
1870 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1871 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1872 len = -2;
1873 cs = cs0;
1874 g = 0;
1875 useFallback = FALSE;
374ca955
A
1876 }
1877 break;
1878 case ISO8859_7:
1879 /* G0 SBCS forced to 7-bit output */
46f4442e
A
1880 len2 = MBCS_SINGLE_FROM_UCHAR32(
1881 converterData->myConverterArray[cs0],
1882 sourceChar, &value,
1883 useFallback);
1884 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1885 targetValue = value - 0x80;
1886 len = len2;
1887 cs = cs0;
374ca955 1888 g = 2;
46f4442e 1889 useFallback = FALSE;
374ca955
A
1890 }
1891 break;
1892 default:
1893 /* G0 DBCS */
46f4442e
A
1894 len2 = MBCS_FROM_UCHAR32_ISO2022(
1895 converterData->myConverterArray[cs0],
1896 sourceChar, &value,
1897 useFallback, MBCS_OUTPUT_2);
1898 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1899 if(cs0 == KSC5601) {
1900 /*
1901 * Check for valid bytes for the encoding scheme.
1902 * This is necessary because the sub-converter (windows-949)
1903 * has a broader encoding scheme than is valid for 2022.
1904 */
1905 value = _2022FromGR94DBCS(value);
1906 if(value == 0) {
1907 break;
1908 }
1909 }
1910 targetValue = value;
1911 len = len2;
1912 cs = cs0;
1913 g = 0;
1914 useFallback = FALSE;
374ca955
A
1915 }
1916 break;
b75a7d8f
A
1917 }
1918 }
b75a7d8f 1919
46f4442e
A
1920 if(len != 0) {
1921 if(len < 0) {
1922 len = -len; /* fallback */
1923 }
374ca955
A
1924 outLen = 0; /* count output bytes */
1925
1926 /* write SI if necessary (only for JIS7) */
1927 if(pFromU2022State->g == 1 && g == 0) {
1928 buffer[outLen++] = UCNV_SI;
1929 pFromU2022State->g = 0;
1930 }
1931
1932 /* write the designation sequence if necessary */
1933 if(cs != pFromU2022State->cs[g]) {
1934 int32_t escLen = escSeqCharsLen[cs];
1935 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1936 outLen += escLen;
1937 pFromU2022State->cs[g] = cs;
1938
1939 /* invalidate the choices[] */
1940 choiceCount = 0;
1941 }
1942
1943 /* write the shift sequence if necessary */
1944 if(g != pFromU2022State->g) {
1945 switch(g) {
1946 /* case 0 handled before writing escapes */
1947 case 1:
1948 buffer[outLen++] = UCNV_SO;
1949 pFromU2022State->g = 1;
1950 break;
1951 default: /* case 2 */
1952 buffer[outLen++] = 0x1b;
1953 buffer[outLen++] = 0x4e;
1954 break;
1955 /* no case 3: no SS3 in ISO-2022-JP-x */
1956 }
1957 }
1958
1959 /* write the output bytes */
1960 if(len == 1) {
1961 buffer[outLen++] = (char)targetValue;
1962 } else /* len == 2 */ {
1963 buffer[outLen++] = (char)(targetValue >> 8);
1964 buffer[outLen++] = (char)targetValue;
1965 }
1966 } else {
1967 /*
46f4442e 1968 * if we cannot find the character after checking all codepages
b75a7d8f
A
1969 * then this is an error
1970 */
b75a7d8f 1971 *err = U_INVALID_CHAR_FOUND;
46f4442e 1972 cnv->fromUChar32=sourceChar;
374ca955
A
1973 break;
1974 }
1975
1976 if(sourceChar == CR || sourceChar == LF) {
1977 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1978 pFromU2022State->cs[2] = 0;
1979 choiceCount = 0;
1980 }
1981
1982 /* output outLen>0 bytes in buffer[] */
/* A single byte can be stored directly: target < targetLimit was verified at the top of this iteration. */
1983 if(outLen == 1) {
1984 *target++ = buffer[0];
1985 if(offsets) {
73c04bcf 1986 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
b75a7d8f 1987 }
374ca955
A
1988 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1989 *target++ = buffer[0];
1990 *target++ = buffer[1];
1991 if(offsets) {
1992 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1993 *offsets++ = sourceIndex;
1994 *offsets++ = sourceIndex;
1995 }
1996 } else {
/* NOTE(review): fromUWriteUInt8() is defined outside this view; it presumably stores bytes that do not fit in the converter's overflow buffer and sets *err - confirm. */
73c04bcf 1997 fromUWriteUInt8(
46f4442e 1998 cnv,
374ca955 1999 buffer, outLen,
73c04bcf 2000 &target, (const char *)targetLimit,
374ca955
A
2001 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2002 err);
2003 if(U_FAILURE(*err)) {
b75a7d8f
A
2004 break;
2005 }
2006 }
2007 } /* end if(myTargetIndex<myTargetLength) */
2008 else{
2009 *err =U_BUFFER_OVERFLOW_ERROR;
2010 break;
2011 }
2012
2013 }/* end while(mySourceIndex<mySourceLength) */
2014
374ca955
A
2015 /*
2016 * the end of the input stream and detection of truncated input
2017 * are handled by the framework, but for ISO-2022-JP conversion
2018 * we need to be in ASCII mode at the very end
2019 *
2020 * conditions:
2021 * successful
2022 * in SO mode or not in ASCII mode
2023 * end of input and no truncated input
b75a7d8f 2024 */
374ca955
A
2025 if( U_SUCCESS(*err) &&
2026 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
46f4442e 2027 args->flush && source>=sourceLimit && cnv->fromUChar32==0
374ca955
A
2028 ) {
2029 int32_t sourceIndex;
2030
2031 outLen = 0;
2032
2033 if(pFromU2022State->g != 0) {
2034 buffer[outLen++] = UCNV_SI;
2035 pFromU2022State->g = 0;
2036 }
2037
2038 if(pFromU2022State->cs[0] != ASCII) {
2039 int32_t escLen = escSeqCharsLen[ASCII];
2040 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2041 outLen += escLen;
2042 pFromU2022State->cs[0] = (int8_t)ASCII;
2043 }
2044
2045 /* get the source index of the last input character */
2046 /*
2047 * TODO this would be simpler and more reliable if we used a pair
2048 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2049 * so that we could simply use the prevSourceIndex here;
2050 * this code gives an incorrect result for the rare case of an unmatched
2051 * trail surrogate that is alone in the last buffer of the text stream
2052 */
2053 sourceIndex=(int32_t)(source-args->source);
2054 if(sourceIndex>0) {
2055 --sourceIndex;
2056 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2057 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2058 ) {
2059 --sourceIndex;
2060 }
2061 } else {
2062 sourceIndex=-1;
2063 }
2064
73c04bcf 2065 fromUWriteUInt8(
46f4442e 2066 cnv,
374ca955 2067 buffer, outLen,
73c04bcf 2068 &target, (const char *)targetLimit,
374ca955
A
2069 &offsets, sourceIndex,
2070 err);
b75a7d8f
A
2071 }
2072
2073 /*save the state and return */
2074 args->source = source;
2075 args->target = (char*)target;
2076}
2077
2078/*************** to unicode *******************/
2079
/*
 * Converts ISO-2022-JP family bytes from args->source into UTF-16 in
 * args->target, storing a source offset for each output UChar when
 * args->offsets is non-NULL.
 *
 * State is kept in the UConverterDataISO2022 that args->converter->extraInfo
 * points to: toU2022State (designations cs[], shift state g/prevG),
 * key (non-zero while an escape sequence is only partially consumed) and
 * isEmptySegment (set after a completed escape sequence, cleared once a
 * character is converted). A DBCS lead byte at the end of the input is
 * saved in converter->toUBytes[0] with toULength == 1 and picked up again
 * at getTrailByte on the next call.
 *
 * Unconvertible input is passed to toUnicodeCallback(); a full target buffer
 * with input remaining sets U_BUFFER_OVERFLOW_ERROR. On return args->source
 * and args->target reflect what was consumed/written.
 */
f3c0d7a5 2080static void U_CALLCONV
b75a7d8f 2081UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
374ca955 2082 UErrorCode* err){
46f4442e 2083 char tempBuf[2];
374ca955 2084 const char *mySource = (char *) args->source;
b75a7d8f
A
2085 UChar *myTarget = args->target;
2086 const char *mySourceLimit = args->sourceLimit;
2087 uint32_t targetUniChar = 0x0000;
2088 uint32_t mySourceChar = 0x0000;
46f4442e 2089 uint32_t tmpSourceChar = 0x0000;
b75a7d8f 2090 UConverterDataISO2022* myData;
374ca955
A
2091 ISO2022State *pToU2022State;
2092 StateEnum cs;
b75a7d8f 2093
b75a7d8f 2094 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
374ca955 2095 pToU2022State = &myData->toU2022State;
b75a7d8f 2096
/* Resume work left unfinished by the previous call, if any. */
374ca955
A
2097 if(myData->key != 0) {
2098 /* continue with a partial escape sequence */
2099 goto escape;
2100 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101 /* continue with a partial double-byte character */
2102 mySourceChar = args->converter->toUBytes[0];
2103 args->converter->toULength = 0;
2104 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
fd0068a8 2105 targetUniChar = missingCharMarker;
374ca955
A
2106 goto getTrailByte;
2107 }
2108
2109 while(mySource < mySourceLimit){
2110
2111 targetUniChar =missingCharMarker;
b75a7d8f
A
2112
2113 if(myTarget < args->targetLimit){
2114
2115 mySourceChar= (unsigned char) *mySource++;
374ca955
A
2116
2117 switch(mySourceChar) {
2118 case UCNV_SI:
2119 if(myData->version==3) {
2120 pToU2022State->g=0;
b75a7d8f 2121 continue;
374ca955
A
2122 } else {
2123 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2124 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2125 break;
b75a7d8f 2126 }
b75a7d8f 2127
374ca955
A
2128 case UCNV_SO:
2129 if(myData->version==3) {
2130 /* JIS7: switch to G1 half-width Katakana */
2131 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2132 pToU2022State->g=1;
b75a7d8f 2133 continue;
374ca955
A
2134 } else {
2135 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2136 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2137 break;
b75a7d8f 2138 }
b75a7d8f 2139
374ca955
A
2140 case ESC_2022:
2141 mySource--;
2142escape:
d5d484b0
A
2143 {
2144 const char * mySourceBefore = mySource;
2145 int8_t toULengthBefore = args->converter->toULength;
2146
46f4442e 2147 changeState_2022(args->converter,&(mySource),
d5d484b0
A
2148 mySourceLimit, ISO_2022_JP,err);
2149
2150 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
46f4442e
A
2151 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2152 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153 args->converter->toUCallbackReason = UCNV_IRREGULAR;
729e4ab9 2154 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
d5d484b0 2155 }
d5d484b0 2156 }
46f4442e 2157
374ca955
A
2158 /* invalid or illegal escape sequence */
2159 if(U_FAILURE(*err)){
2160 args->target = myTarget;
2161 args->source = mySource;
d5d484b0 2162 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
374ca955 2163 return;
b75a7d8f 2164 }
d5d484b0 2165 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
46f4442e 2166 if(myData->key==0) {
d5d484b0
A
2167 myData->isEmptySegment = TRUE;
2168 }
374ca955 2169 continue;
b75a7d8f 2170
374ca955 2171 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
b75a7d8f 2172
374ca955 2173 case CR:
374ca955
A
2174 case LF:
2175 /* automatically reset to single-byte mode */
2176 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2177 pToU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f 2178 }
374ca955
A
2179 pToU2022State->cs[2] = 0;
2180 pToU2022State->g = 0;
2ca993e8 2181 U_FALLTHROUGH;
b75a7d8f 2182 default:
374ca955 2183 /* convert one or two bytes */
d5d484b0 2184 myData->isEmptySegment = FALSE;
374ca955
A
2185 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2186 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2187 !IS_JP_DBCS(cs)
2188 ) {
2189 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
46f4442e 2190 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
374ca955
A
2191
2192 /* return from a single-shift state to the previous one */
2193 if(pToU2022State->g >= 2) {
2194 pToU2022State->g=pToU2022State->prevG;
2195 }
2196 } else switch(cs) {
2197 case ASCII:
2198 if(mySourceChar <= 0x7f) {
2199 targetUniChar = mySourceChar;
2200 }
2201 break;
2202 case ISO8859_1:
2203 if(mySourceChar <= 0x7f) {
2204 targetUniChar = mySourceChar + 0x80;
2205 }
2206 /* return from a single-shift state to the previous one */
2207 pToU2022State->g=pToU2022State->prevG;
2208 break;
2209 case ISO8859_7:
2210 if(mySourceChar <= 0x7f) {
2211 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2212 targetUniChar =
2213 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2214 myData->myConverterArray[cs],
2215 mySourceChar + 0x80);
2216 }
2217 /* return from a single-shift state to the previous one */
2218 pToU2022State->g=pToU2022State->prevG;
2219 break;
2220 case JISX201:
2221 if(mySourceChar <= 0x7f) {
46f4442e 2222 targetUniChar = jisx201ToU(mySourceChar);
374ca955
A
2223 }
2224 break;
2225 case HWKANA_7BIT:
2226 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2227 /* 7-bit halfwidth Katakana */
46f4442e 2228 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
374ca955
A
2229 }
2230 break;
2231 default:
2232 /* G0 DBCS */
2233 if(mySource < mySourceLimit) {
fd0068a8
A
2234 int leadIsOk, trailIsOk;
2235 uint8_t trailByte;
374ca955 2236getTrailByte:
fd0068a8 2237 trailByte = (uint8_t)*mySource;
fd0068a8
A
2238 /*
2239 * Ticket 5691: consistent illegal sequences:
2240 * - We include at least the first byte in the illegal sequence.
2241 * - If any of the non-initial bytes could be the start of a character,
46f4442e 2242 * we stop the illegal sequence before the first one of those.
fd0068a8
A
2243 *
2244 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2245 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2246 * Otherwise we convert or report the pair of bytes.
2247 */
2248 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2249 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2250 if (leadIsOk && trailIsOk) {
2251 ++mySource;
46f4442e
A
2252 tmpSourceChar = (mySourceChar << 8) | trailByte;
2253 if(cs == JISX208) {
2254 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2255 mySourceChar = tmpSourceChar;
2256 } else {
2257 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2258 mySourceChar = tmpSourceChar;
2259 if (cs == KSC5601) {
2260 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2261 }
2262 tempBuf[0] = (char)(tmpSourceChar >> 8);
2263 tempBuf[1] = (char)(tmpSourceChar);
2264 }
fd0068a8
A
2265 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2266 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2267 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2268 ++mySource;
2269 /* add another bit so that the code below writes 2 bytes in case of error */
2270 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2271 }
374ca955
A
2272 } else {
2273 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2274 args->converter->toULength = 1;
2275 goto endloop;
2276 }
46f4442e 2277 } /* End of inner switch */
b75a7d8f 2278 break;
46f4442e 2279 } /* End of outer switch */
/* Dispatch on the result: a BMP code unit, a supplementary code point (written as a surrogate pair), or an error value handed to toUnicodeCallback(). */
b75a7d8f
A
2280 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2281 if(args->offsets){
73c04bcf 2282 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
2283 }
2284 *(myTarget++)=(UChar)targetUniChar;
b75a7d8f 2285 }
374ca955
A
2286 else if(targetUniChar > missingCharMarker){
2287 /* disassemble the surrogate pair and write to output*/
2288 targetUniChar-=0x0010000;
2289 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2290 if(args->offsets){
73c04bcf 2291 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2292 }
2293 ++myTarget;
46f4442e 2294 if(myTarget< args->targetLimit){
374ca955
A
2295 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2296 if(args->offsets){
73c04bcf 2297 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2298 }
2299 ++myTarget;
2300 }else{
/* No room for the trail surrogate: park it in the converter's UCharErrorBuffer. */
2301 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2302 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2303 }
b75a7d8f 2304
374ca955
A
2305 }
2306 else{
b75a7d8f 2307 /* Call the callback function*/
374ca955
A
2308 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2309 break;
b75a7d8f
A
2310 }
2311 }
46f4442e 2312 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
b75a7d8f
A
2313 *err =U_BUFFER_OVERFLOW_ERROR;
2314 break;
2315 }
2316 }
374ca955 2317endloop:
b75a7d8f
A
2318 args->target = myTarget;
2319 args->source = mySource;
2320}
2321
2322
b331163b 2323#if !UCONFIG_ONLY_HTML_CONVERSION
b75a7d8f
A
2324/***************************************************************
2325* Rules for ISO-2022-KR encoding
46f4442e 2326* i) The KSC5601 designator sequence should appear only once in a file,
b75a7d8f
A
2327* at the beginning of a line before any KSC5601 characters. This usually
2328* means that it appears by itself on the first line of the file
2329* ii) There are only 2 shifting sequences SO to shift into double byte mode
2330* and SI to shift into single byte mode
2331*/
f3c0d7a5 2332static void U_CALLCONV
b75a7d8f
A
2333UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2334
374ca955
A
2335 UConverter* saveConv = args->converter;
2336 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2337 args->converter=myConverterData->currentConverter;
2338
2339 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2340 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2341 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2342
2343 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2344 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2345 uprv_memcpy(
2346 saveConv->charErrorBuffer,
2347 myConverterData->currentConverter->charErrorBuffer,
2348 myConverterData->currentConverter->charErrorBufferLength);
2349 }
2350 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2351 myConverterData->currentConverter->charErrorBufferLength = 0;
2352 }
2353 args->converter=saveConv;
b75a7d8f
A
2354}
2355
/*
 * Converts UTF-16 text from args->source into ISO-2022-KR bytes in
 * args->target, storing source offsets for the output bytes when
 * args->offsets is non-NULL.
 *
 * When the converter data's version is 1 the whole call is forwarded to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(). Otherwise each
 * code point is looked up with MBCS_FROM_UCHAR32_ISO2022(); single bytes up
 * to 0x7f are written as-is, double bytes are written with 0x80 subtracted
 * from each byte, and SO/SI is emitted whenever the output switches between
 * the two. The single/double-byte mode is kept in
 * args->converter->fromUnicodeStatus across calls.
 *
 * Results reported through *err:
 *   U_ILLEGAL_CHAR_FOUND    - SO/SI/ESC in the input, or an unmatched surrogate
 *   U_INVALID_CHAR_FOUND    - code point without an acceptable mapping
 *   U_BUFFER_OVERFLOW_ERROR - target buffer full
 * On a final flush in double-byte mode an SI is appended.
 */
f3c0d7a5 2356static void U_CALLCONV
b75a7d8f
A
2357UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2358
2359 const UChar *source = args->source;
2360 const UChar *sourceLimit = args->sourceLimit;
2361 unsigned char *target = (unsigned char *) args->target;
2362 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2363 int32_t* offsets = args->offsets;
2364 uint32_t targetByteUnit = 0x0000;
2365 UChar32 sourceChar = 0x0000;
2366 UBool isTargetByteDBCS;
2367 UBool oldIsTargetByteDBCS;
2368 UConverterDataISO2022 *converterData;
b75a7d8f
A
2369 UConverterSharedData* sharedData;
2370 UBool useFallback;
2371 int32_t length =0;
2372
b75a7d8f 2373 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
46f4442e
A
2374 /* if the version is 1 then the user is requesting
2375 * conversion with ibm-25546 pass the arguments to
b75a7d8f
A
2376 * MBCS converter and return
2377 */
2378 if(converterData->version==1){
2379 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2380 return;
2381 }
374ca955
A
2382
2383 /* initialize data */
2384 sharedData = converterData->currentConverter->sharedData;
2385 useFallback = args->converter->useFallback;
2386 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2387 oldIsTargetByteDBCS = isTargetByteDBCS;
46f4442e 2388
/* NOTE(review): this repeats the fromUnicodeStatus read made a few lines above; it looks redundant. */
b75a7d8f 2389 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
374ca955 2390 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
b75a7d8f
A
2391 goto getTrail;
2392 }
2393 while(source < sourceLimit){
46f4442e 2394
b75a7d8f
A
2395 targetByteUnit = missingCharMarker;
2396
2397 if(target < (unsigned char*) args->targetLimit){
2398 sourceChar = *source++;
73c04bcf
A
2399
2400 /* do not convert SO/SI/ESC */
2401 if(IS_2022_CONTROL(sourceChar)) {
2402 /* callback(illegal) */
2403 *err=U_ILLEGAL_CHAR_FOUND;
2404 args->converter->fromUChar32=sourceChar;
2405 break;
2406 }
2407
46f4442e
A
2408 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2409 if(length < 0) {
2410 length = -length; /* fallback */
2411 }
b75a7d8f 2412 /* only DBCS or SBCS characters are expected*/
374ca955 2413 /* DB characters with high bit set to 1 are expected */
fd0068a8
A
2414 if( length > 2 || length==0 ||
2415 (length == 1 && targetByteUnit > 0x7f) ||
2416 (length == 2 &&
2417 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2418 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2419 ) {
b75a7d8f
A
2420 targetByteUnit=missingCharMarker;
2421 }
2422 if (targetByteUnit != missingCharMarker){
2423
2424 oldIsTargetByteDBCS = isTargetByteDBCS;
2425 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2426 /* append the shift sequence */
2427 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
46f4442e
A
2428
2429 if (isTargetByteDBCS)
b75a7d8f 2430 *target++ = UCNV_SO;
46f4442e 2431 else
b75a7d8f
A
2432 *target++ = UCNV_SI;
2433 if(offsets)
73c04bcf 2434 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2435 }
2436 /* write the targetUniChar to target */
2437 if(targetByteUnit <= 0x00FF){
2438 if( target < targetLimit){
2439 *(target++) = (unsigned char) targetByteUnit;
2440 if(offsets){
73c04bcf 2441 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2442 }
2443
2444 }else{
/* Target is full (the shift byte took the last slot): buffer the byte and report overflow. */
2445 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2446 *err = U_BUFFER_OVERFLOW_ERROR;
2447 }
2448 }else{
2449 if(target < targetLimit){
2450 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2451 if(offsets){
73c04bcf 2452 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2453 }
2454 if(target < targetLimit){
2455 *(target++) =(unsigned char) (targetByteUnit -0x80);
2456 if(offsets){
73c04bcf 2457 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2458 }
2459 }else{
2460 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2461 *err = U_BUFFER_OVERFLOW_ERROR;
2462 }
2463 }else{
2464 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2465 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2466 *err = U_BUFFER_OVERFLOW_ERROR;
2467 }
2468 }
2469
2470 }
2471 else{
2472 /* oops.. the code point is unassigned
2473 * set the error and reason
2474 */
b75a7d8f
A
2475
2476 /*check if the char is a First surrogate*/
4388f060
A
2477 if(U16_IS_SURROGATE(sourceChar)) {
2478 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
b75a7d8f
A
2479getTrail:
2480 /*look ahead to find the trail surrogate*/
2481 if(source < sourceLimit) {
2482 /* test the following code unit */
2483 UChar trail=(UChar) *source;
4388f060 2484 if(U16_IS_TRAIL(trail)) {
b75a7d8f 2485 source++;
4388f060 2486 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
b75a7d8f 2487 *err = U_INVALID_CHAR_FOUND;
b75a7d8f
A
2488 /* convert this surrogate code point */
2489 /* exit this condition tree */
2490 } else {
2491 /* this is an unmatched lead code unit (1st surrogate) */
2492 /* callback(illegal) */
b75a7d8f
A
2493 *err=U_ILLEGAL_CHAR_FOUND;
2494 }
2495 } else {
2496 /* no more input */
2497 *err = U_ZERO_ERROR;
b75a7d8f
A
2498 }
2499 } else {
2500 /* this is an unmatched trail code unit (2nd surrogate) */
2501 /* callback(illegal) */
b75a7d8f
A
2502 *err=U_ILLEGAL_CHAR_FOUND;
2503 }
374ca955
A
2504 } else {
2505 /* callback(unassigned) for a BMP code point */
2506 *err = U_INVALID_CHAR_FOUND;
b75a7d8f 2507 }
b75a7d8f 2508
/* Remember the offending or pending code point and leave the loop; *err tells the caller what happened. */
374ca955 2509 args->converter->fromUChar32=sourceChar;
374ca955 2510 break;
b75a7d8f
A
2511 }
2512 } /* end if(myTargetIndex<myTargetLength) */
2513 else{
2514 *err =U_BUFFER_OVERFLOW_ERROR;
2515 break;
2516 }
2517
2518 }/* end while(mySourceIndex<mySourceLength) */
2519
374ca955
A
2520 /*
2521 * the end of the input stream and detection of truncated input
2522 * are handled by the framework, but for ISO-2022-KR conversion
2523 * we need to be in ASCII mode at the very end
2524 *
2525 * conditions:
2526 * successful
2527 * not in ASCII mode
2528 * end of input and no truncated input
b75a7d8f 2529 */
374ca955
A
2530 if( U_SUCCESS(*err) &&
2531 isTargetByteDBCS &&
2532 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2533 ) {
2534 int32_t sourceIndex;
2535
2536 /* we are switching to ASCII */
2537 isTargetByteDBCS=FALSE;
2538
2539 /* get the source index of the last input character */
2540 /*
2541 * TODO this would be simpler and more reliable if we used a pair
2542 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2543 * so that we could simply use the prevSourceIndex here;
2544 * this code gives an incorrect result for the rare case of an unmatched
2545 * trail surrogate that is alone in the last buffer of the text stream
2546 */
2547 sourceIndex=(int32_t)(source-args->source);
2548 if(sourceIndex>0) {
2549 --sourceIndex;
2550 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2551 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2552 ) {
2553 --sourceIndex;
2554 }
2555 } else {
2556 sourceIndex=-1;
2557 }
2558
73c04bcf 2559 fromUWriteUInt8(
374ca955
A
2560 args->converter,
2561 SHIFT_IN_STR, 1,
73c04bcf 2562 &target, (const char *)targetLimit,
374ca955
A
2563 &offsets, sourceIndex,
2564 err);
b75a7d8f
A
2565 }
2566
2567 /*save the state and return */
2568 args->source = source;
2569 args->target = (char*)target;
2570 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2571}
2572
2573/************************ To Unicode ***************************************/
2574
f3c0d7a5 2575static void U_CALLCONV
b75a7d8f
A
2576UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2577 UErrorCode* err){
b75a7d8f 2578 char const* sourceStart;
b75a7d8f 2579 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
b75a7d8f 2580
374ca955
A
2581 UConverterToUnicodeArgs subArgs;
2582 int32_t minArgsSize;
2583
2584 /* set up the subconverter arguments */
2585 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2586 minArgsSize = args->size;
2587 } else {
2588 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2589 }
2590
2591 uprv_memcpy(&subArgs, args, minArgsSize);
2592 subArgs.size = (uint16_t)minArgsSize;
2593 subArgs.converter = myData->currentConverter;
2594
2595 /* remember the original start of the input for offsets */
2596 sourceStart = args->source;
2597
2598 if(myData->key != 0) {
2599 /* continue with a partial escape sequence */
2600 goto escape;
2601 }
2602
2603 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
b75a7d8f 2604 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
374ca955
A
2605 subArgs.source = args->source;
2606 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2607 if(subArgs.source != subArgs.sourceLimit) {
2608 /*
2609 * get the current partial byte sequence
2610 *
2611 * it needs to be moved between the public and the subconverter
2612 * so that the conversion framework, which only sees the public
2613 * converter, can handle truncated and illegal input etc.
2614 */
2615 if(args->converter->toULength > 0) {
2616 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2617 }
2618 subArgs.converter->toULength = args->converter->toULength;
2619
2620 /*
2621 * Convert up to the end of the input, or to before the next escape character.
2622 * Does not handle conversion extensions because the preToU[] state etc.
2623 * is not copied.
2624 */
2625 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2626
2627 if(args->offsets != NULL && sourceStart != args->source) {
2628 /* update offsets to base them on the actual start of the input */
2629 int32_t *offsets = args->offsets;
2630 UChar *target = args->target;
2631 int32_t delta = (int32_t)(args->source - sourceStart);
2632 while(target < subArgs.target) {
2633 if(*offsets >= 0) {
2634 *offsets += delta;
2635 }
2636 ++offsets;
2637 ++target;
2638 }
2639 }
2640 args->source = subArgs.source;
2641 args->target = subArgs.target;
2642 args->offsets = subArgs.offsets;
2643
2644 /* copy input/error/overflow buffers */
2645 if(subArgs.converter->toULength > 0) {
2646 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2647 }
2648 args->converter->toULength = subArgs.converter->toULength;
2649
2650 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2651 if(subArgs.converter->UCharErrorBufferLength > 0) {
2652 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2653 subArgs.converter->UCharErrorBufferLength);
2654 }
2655 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2656 subArgs.converter->UCharErrorBufferLength = 0;
b75a7d8f 2657 }
b75a7d8f
A
2658 }
2659
374ca955 2660 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
b75a7d8f 2661 return;
374ca955 2662 }
b75a7d8f 2663
374ca955 2664escape:
b75a7d8f 2665 changeState_2022(args->converter,
46f4442e 2666 &(args->source),
b75a7d8f 2667 args->sourceLimit,
b75a7d8f 2668 ISO_2022_KR,
b75a7d8f 2669 err);
374ca955 2670 }
b75a7d8f
A
2671}
2672
f3c0d7a5 2673static void U_CALLCONV
b75a7d8f
A
2674UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2675 UErrorCode* err){
374ca955 2676 char tempBuf[2];
b75a7d8f
A
2677 const char *mySource = ( char *) args->source;
2678 UChar *myTarget = args->target;
2679 const char *mySourceLimit = args->sourceLimit;
2680 UChar32 targetUniChar = 0x0000;
2681 UChar mySourceChar = 0x0000;
2682 UConverterDataISO2022* myData;
b75a7d8f
A
2683 UConverterSharedData* sharedData ;
2684 UBool useFallback;
2685
374ca955
A
2686 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2687 if(myData->version==1){
2688 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
b75a7d8f
A
2689 return;
2690 }
374ca955 2691
b75a7d8f 2692 /* initialize state */
374ca955 2693 sharedData = myData->currentConverter->sharedData;
b75a7d8f 2694 useFallback = args->converter->useFallback;
46f4442e 2695
374ca955
A
2696 if(myData->key != 0) {
2697 /* continue with a partial escape sequence */
2698 goto escape;
2699 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2700 /* continue with a partial double-byte character */
2701 mySourceChar = args->converter->toUBytes[0];
2702 args->converter->toULength = 0;
2703 goto getTrailByte;
b75a7d8f 2704 }
b75a7d8f 2705
374ca955 2706 while(mySource< mySourceLimit){
b75a7d8f
A
2707
2708 if(myTarget < args->targetLimit){
2709
2710 mySourceChar= (unsigned char) *mySource++;
2711
2712 if(mySourceChar==UCNV_SI){
374ca955 2713 myData->toU2022State.g = 0;
d5d484b0
A
2714 if (myData->isEmptySegment) {
2715 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
46f4442e
A
2716 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2717 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2718 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
d5d484b0
A
2719 args->converter->toULength = 1;
2720 args->target = myTarget;
2721 args->source = mySource;
2722 return;
2723 }
b75a7d8f
A
2724 /*consume the source */
2725 continue;
2726 }else if(mySourceChar==UCNV_SO){
374ca955 2727 myData->toU2022State.g = 1;
d5d484b0 2728 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
b75a7d8f
A
2729 /*consume the source */
2730 continue;
374ca955
A
2731 }else if(mySourceChar==ESC_2022){
2732 mySource--;
2733escape:
d5d484b0 2734 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
46f4442e 2735 changeState_2022(args->converter,&(mySource),
374ca955 2736 mySourceLimit, ISO_2022_KR, err);
b75a7d8f
A
2737 if(U_FAILURE(*err)){
2738 args->target = myTarget;
2739 args->source = mySource;
2740 return;
2741 }
2742 continue;
46f4442e 2743 }
b75a7d8f 2744
d5d484b0 2745 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
374ca955
A
2746 if(myData->toU2022State.g == 1) {
2747 if(mySource < mySourceLimit) {
fd0068a8
A
2748 int leadIsOk, trailIsOk;
2749 uint8_t trailByte;
374ca955 2750getTrailByte:
fd0068a8
A
2751 targetUniChar = missingCharMarker;
2752 trailByte = (uint8_t)*mySource;
2753 /*
2754 * Ticket 5691: consistent illegal sequences:
2755 * - We include at least the first byte in the illegal sequence.
2756 * - If any of the non-initial bytes could be the start of a character,
2757 * we stop the illegal sequence before the first one of those.
2758 *
2759 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2760 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2761 * Otherwise we convert or report the pair of bytes.
2762 */
2763 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2764 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2765 if (leadIsOk && trailIsOk) {
2766 ++mySource;
2767 tempBuf[0] = (char)(mySourceChar + 0x80);
2768 tempBuf[1] = (char)(trailByte + 0x80);
2769 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2770 mySourceChar = (mySourceChar << 8) | trailByte;
2771 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2772 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2773 ++mySource;
2774 /* add another bit so that the code below writes 2 bytes in case of error */
2775 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
374ca955
A
2776 }
2777 } else {
2778 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2779 args->converter->toULength = 1;
2780 break;
b75a7d8f
A
2781 }
2782 }
fd0068a8 2783 else if(mySourceChar <= 0x7f) {
374ca955 2784 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
fd0068a8
A
2785 } else {
2786 targetUniChar = 0xffff;
b75a7d8f 2787 }
374ca955
A
2788 if(targetUniChar < 0xfffe){
2789 if(args->offsets) {
73c04bcf 2790 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955 2791 }
b75a7d8f
A
2792 *(myTarget++)=(UChar)targetUniChar;
2793 }
2794 else {
b75a7d8f 2795 /* Call the callback function*/
374ca955
A
2796 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2797 break;
b75a7d8f
A
2798 }
2799 }
2800 else{
2801 *err =U_BUFFER_OVERFLOW_ERROR;
2802 break;
2803 }
2804 }
b75a7d8f
A
2805 args->target = myTarget;
2806 args->source = mySource;
2807}
2808
2809/*************************** END ISO2022-KR *********************************/
2810
2811/*************************** ISO-2022-CN *********************************
2812*
2813* Rules for ISO-2022-CN Encoding:
374ca955 2814* i) The designator sequence must appear once on a line before any instance
b75a7d8f
A
2815* of character set it designates.
2816* ii) If two lines contain characters from the same character set, both lines
2817* must include the designator sequence.
374ca955 2818* iii) Once the designator sequence is known, a shifting sequence has to be found
b75a7d8f
A
2819* to invoke the shifting
2820* iv) All lines start in ASCII and end in ASCII.
2821* v) Four shifting sequences are employed for this purpose:
2822*
2823* Sequence ASCII Eq Charsets
2824* ---------- ------- ---------
374ca955
A
2825* SI <SI> US-ASCII
2826* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2827* SS2 <ESC>N CNS-11643-1992 Plane 2
2828* SS3 <ESC>O CNS-11643-1992 Planes 3-7
b75a7d8f
A
2829*
2830* vi)
2831* SOdesignator : ESC "$" ")" finalchar_for_SO
2832* SS2designator : ESC "$" "*" finalchar_for_SS2
2833* SS3designator : ESC "$" "+" finalchar_for_SS3
2834*
2835* ESC $ ) A Indicates the bytes following SO are Chinese
2836* characters as defined in GB 2312-80, until
2837* another SOdesignation appears
2838*
2839*
2840* ESC $ ) E Indicates the bytes following SO are as defined
2841* in ISO-IR-165 (for details, see section 2.1),
2842* until another SOdesignation appears
2843*
2844* ESC $ ) G Indicates the bytes following SO are as defined
2845* in CNS 11643-plane-1, until another
2846* SOdesignation appears
2847*
2848* ESC $ * H Indicates the two bytes immediately following
2849* SS2 is a Chinese character as defined in CNS
2850* 11643-plane-2, until another SS2designation
2851* appears
46f4442e 2852* (Meaning <ESC>N must precede every 2 byte
b75a7d8f
A
2853* sequence.)
2854*
2855* ESC $ + I Indicates the immediate two bytes following SS3
2856* is a Chinese character as defined in CNS
2857* 11643-plane-3, until another SS3designation
2858* appears
46f4442e 2859* (Meaning <ESC>O must precede every 2 byte
b75a7d8f
A
2860* sequence.)
2861*
2862* ESC $ + J Indicates the immediate two bytes following SS3
2863* is a Chinese character as defined in CNS
2864* 11643-plane-4, until another SS3designation
2865* appears
46f4442e 2866* (In English: <ESC>O must precede every 2 byte
b75a7d8f
A
2867* sequence.)
2868*
2869* ESC $ + K Indicates the immediate two bytes following SS3
2870* is a Chinese character as defined in CNS
2871* 11643-plane-5, until another SS3designation
2872* appears
2873*
2874* ESC $ + L Indicates the immediate two bytes following SS3
2875* is a Chinese character as defined in CNS
2876* 11643-plane-6, until another SS3designation
2877* appears
2878*
2879* ESC $ + M Indicates the immediate two bytes following SS3
2880* is a Chinese character as defined in CNS
2881* 11643-plane-7, until another SS3designation
2882* appears
2883*
2884* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2885* has its own designation information before any Chinese characters
2886* appear
2887*
2888*/
2889
4388f060 2890/* The following are defined this way to make the strings truly readonly */
b75a7d8f
A
2891static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2892static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2893static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2894static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2895static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2896static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2897static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2898static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2899static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2900
2901/********************** ISO2022-CN Data **************************/
2902static const char* const escSeqCharsCN[10] ={
4388f060
A
2903 SHIFT_IN_STR, /* 0 ASCII */
2904 GB_2312_80_STR, /* 1 GB2312_1 */
2905 ISO_IR_165_STR, /* 2 ISO_IR_165 */
b75a7d8f
A
2906 CNS_11643_1992_Plane_1_STR,
2907 CNS_11643_1992_Plane_2_STR,
2908 CNS_11643_1992_Plane_3_STR,
2909 CNS_11643_1992_Plane_4_STR,
2910 CNS_11643_1992_Plane_5_STR,
2911 CNS_11643_1992_Plane_6_STR,
2912 CNS_11643_1992_Plane_7_STR
2913};
b75a7d8f 2914
f3c0d7a5 2915static void U_CALLCONV
b75a7d8f 2916UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
46f4442e 2917 UConverter *cnv = args->converter;
b75a7d8f 2918 UConverterDataISO2022 *converterData;
374ca955
A
2919 ISO2022State *pFromU2022State;
2920 uint8_t *target = (uint8_t *) args->target;
2921 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
2922 const UChar* source = args->source;
2923 const UChar* sourceLimit = args->sourceLimit;
2924 int32_t* offsets = args->offsets;
374ca955
A
2925 UChar32 sourceChar;
2926 char buffer[8];
2927 int32_t len;
2928 int8_t choices[3];
2929 int32_t choiceCount;
73c04bcf 2930 uint32_t targetValue = 0;
b75a7d8f
A
2931 UBool useFallback;
2932
b75a7d8f 2933 /* set up the state */
46f4442e 2934 converterData = (UConverterDataISO2022*)cnv->extraInfo;
374ca955 2935 pFromU2022State = &converterData->fromU2022State;
374ca955
A
2936
2937 choiceCount = 0;
b75a7d8f
A
2938
2939 /* check if the last codepoint of previous buffer was a lead surrogate*/
46f4442e 2940 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
2941 goto getTrail;
2942 }
2943
b75a7d8f 2944 while( source < sourceLimit){
b75a7d8f
A
2945 if(target < targetLimit){
2946
2947 sourceChar = *(source++);
2948 /*check if the char is a First surrogate*/
4388f060
A
2949 if(U16_IS_SURROGATE(sourceChar)) {
2950 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
b75a7d8f
A
2951getTrail:
2952 /*look ahead to find the trail surrogate*/
2953 if(source < sourceLimit) {
2954 /* test the following code unit */
2955 UChar trail=(UChar) *source;
4388f060 2956 if(U16_IS_TRAIL(trail)) {
b75a7d8f 2957 source++;
4388f060 2958 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
46f4442e 2959 cnv->fromUChar32=0x00;
374ca955 2960 /* convert this supplementary code point */
b75a7d8f
A
2961 /* exit this condition tree */
2962 } else {
2963 /* this is an unmatched lead code unit (1st surrogate) */
2964 /* callback(illegal) */
b75a7d8f 2965 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2966 cnv->fromUChar32=sourceChar;
374ca955 2967 break;
b75a7d8f
A
2968 }
2969 } else {
2970 /* no more input */
46f4442e 2971 cnv->fromUChar32=sourceChar;
b75a7d8f
A
2972 break;
2973 }
2974 } else {
2975 /* this is an unmatched trail code unit (2nd surrogate) */
2976 /* callback(illegal) */
b75a7d8f 2977 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2978 cnv->fromUChar32=sourceChar;
374ca955 2979 break;
b75a7d8f
A
2980 }
2981 }
2982
2983 /* do the conversion */
374ca955 2984 if(sourceChar <= 0x007f ){
73c04bcf
A
2985 /* do not convert SO/SI/ESC */
2986 if(IS_2022_CONTROL(sourceChar)) {
2987 /* callback(illegal) */
2988 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 2989 cnv->fromUChar32=sourceChar;
73c04bcf
A
2990 break;
2991 }
2992
374ca955
A
2993 /* US-ASCII */
2994 if(pFromU2022State->g == 0) {
2995 buffer[0] = (char)sourceChar;
2996 len = 1;
2997 } else {
2998 buffer[0] = UCNV_SI;
2999 buffer[1] = (char)sourceChar;
3000 len = 2;
3001 pFromU2022State->g = 0;
3002 choiceCount = 0;
3003 }
3004 if(sourceChar == CR || sourceChar == LF) {
3005 /* reset the state at the end of a line */
3006 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3007 choiceCount = 0;
b75a7d8f 3008 }
b75a7d8f
A
3009 }
3010 else{
374ca955 3011 /* convert U+0080..U+10ffff */
374ca955
A
3012 int32_t i;
3013 int8_t cs, g;
3014
3015 if(choiceCount == 0) {
3016 /* try the current SO/G1 converter first */
3017 choices[0] = pFromU2022State->cs[1];
3018
3019 /* default to GB2312_1 if none is designated yet */
3020 if(choices[0] == 0) {
3021 choices[0] = GB2312_1;
3022 }
b75a7d8f 3023
374ca955
A
3024 if(converterData->version == 0) {
3025 /* ISO-2022-CN */
3026
3027 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3028 if(choices[0] == GB2312_1) {
3029 choices[1] = (int8_t)CNS_11643_1;
3030 } else {
3031 choices[1] = (int8_t)GB2312_1;
b75a7d8f 3032 }
374ca955
A
3033
3034 choiceCount = 2;
729e4ab9 3035 } else if (converterData->version == 1) {
374ca955
A
3036 /* ISO-2022-CN-EXT */
3037
3038 /* try one of the other converters */
3039 switch(choices[0]) {
3040 case GB2312_1:
3041 choices[1] = (int8_t)CNS_11643_1;
3042 choices[2] = (int8_t)ISO_IR_165;
3043 break;
3044 case ISO_IR_165:
3045 choices[1] = (int8_t)GB2312_1;
3046 choices[2] = (int8_t)CNS_11643_1;
3047 break;
3048 default: /* CNS_11643_x */
3049 choices[1] = (int8_t)GB2312_1;
3050 choices[2] = (int8_t)ISO_IR_165;
3051 break;
b75a7d8f 3052 }
b75a7d8f 3053
374ca955 3054 choiceCount = 3;
729e4ab9
A
3055 } else {
3056 choices[0] = (int8_t)CNS_11643_1;
3057 choices[1] = (int8_t)GB2312_1;
374ca955 3058 }
b75a7d8f
A
3059 }
3060
374ca955 3061 cs = g = 0;
46f4442e
A
3062 /*
3063 * len==0: no mapping found yet
3064 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3065 * len>0: found a roundtrip result, done
3066 */
374ca955 3067 len = 0;
46f4442e
A
3068 /*
3069 * We will turn off useFallback after finding a fallback,
3070 * but we still get fallbacks from PUA code points as usual.
3071 * Therefore, we will also need to check that we don't overwrite
3072 * an early fallback with a later one.
3073 */
3074 useFallback = cnv->useFallback;
3075
3076 for(i = 0; i < choiceCount && len <= 0; ++i) {
3077 int8_t cs0 = choices[i];
3078 if(cs0 > 0) {
3079 uint32_t value;
3080 int32_t len2;
3081 if(cs0 >= CNS_11643_0) {
3082 len2 = MBCS_FROM_UCHAR32_ISO2022(
3083 converterData->myConverterArray[CNS_11643],
3084 sourceChar,
3085 &value,
3086 useFallback,
3087 MBCS_OUTPUT_3);
3088 if(len2 == 3 || (len2 == -3 && len == 0)) {
3089 targetValue = value;
3090 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3091 if(len2 >= 0) {
3092 len = 2;
3093 } else {
3094 len = -2;
3095 useFallback = FALSE;
3096 }
374ca955
A
3097 if(cs == CNS_11643_1) {
3098 g = 1;
3099 } else if(cs == CNS_11643_2) {
3100 g = 2;
3101 } else /* plane 3..7 */ if(converterData->version == 1) {
3102 g = 3;
3103 } else {
3104 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3105 len = 0;
3106 }
3107 }
3108 } else {
3109 /* GB2312_1 or ISO-IR-165 */
4388f060 3110 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
46f4442e
A
3111 len2 = MBCS_FROM_UCHAR32_ISO2022(
3112 converterData->myConverterArray[cs0],
3113 sourceChar,
3114 &value,
3115 useFallback,
3116 MBCS_OUTPUT_2);
3117 if(len2 == 2 || (len2 == -2 && len == 0)) {
3118 targetValue = value;
3119 len = len2;
3120 cs = cs0;
3121 g = 1;
3122 useFallback = FALSE;
3123 }
374ca955 3124 }
b75a7d8f 3125 }
b75a7d8f
A
3126 }
3127
46f4442e
A
3128 if(len != 0) {
3129 len = 0; /* count output bytes; it must have been abs(len) == 2 */
b75a7d8f 3130
374ca955
A
3131 /* write the designation sequence if necessary */
3132 if(cs != pFromU2022State->cs[g]) {
3133 if(cs < CNS_11643) {
3134 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3135 } else {
4388f060 3136 U_ASSERT(cs >= CNS_11643_1);
374ca955 3137 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
b75a7d8f 3138 }
374ca955
A
3139 len = 4;
3140 pFromU2022State->cs[g] = cs;
3141 if(g == 1) {
3142 /* changing the SO/G1 charset invalidates the choices[] */
3143 choiceCount = 0;
b75a7d8f 3144 }
374ca955
A
3145 }
3146
3147 /* write the shift sequence if necessary */
3148 if(g != pFromU2022State->g) {
3149 switch(g) {
3150 case 1:
3151 buffer[len++] = UCNV_SO;
3152
3153 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3154 pFromU2022State->g = 1;
3155 break;
3156 case 2:
3157 buffer[len++] = 0x1b;
3158 buffer[len++] = 0x4e;
3159 break;
3160 default: /* case 3 */
3161 buffer[len++] = 0x1b;
3162 buffer[len++] = 0x4f;
3163 break;
b75a7d8f 3164 }
b75a7d8f 3165 }
b75a7d8f 3166
374ca955
A
3167 /* write the two output bytes */
3168 buffer[len++] = (char)(targetValue >> 8);
3169 buffer[len++] = (char)targetValue;
3170 } else {
46f4442e 3171 /* if we cannot find the character after checking all codepages
374ca955
A
3172 * then this is an error
3173 */
3174 *err = U_INVALID_CHAR_FOUND;
46f4442e 3175 cnv->fromUChar32=sourceChar;
374ca955
A
3176 break;
3177 }
b75a7d8f 3178 }
b75a7d8f 3179
374ca955
A
3180 /* output len>0 bytes in buffer[] */
3181 if(len == 1) {
3182 *target++ = buffer[0];
3183 if(offsets) {
73c04bcf 3184 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
374ca955
A
3185 }
3186 } else if(len == 2 && (target + 2) <= targetLimit) {
3187 *target++ = buffer[0];
3188 *target++ = buffer[1];
3189 if(offsets) {
3190 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3191 *offsets++ = sourceIndex;
3192 *offsets++ = sourceIndex;
3193 }
3194 } else {
73c04bcf 3195 fromUWriteUInt8(
46f4442e 3196 cnv,
374ca955 3197 buffer, len,
73c04bcf 3198 &target, (const char *)targetLimit,
374ca955
A
3199 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3200 err);
3201 if(U_FAILURE(*err)) {
b75a7d8f
A
3202 break;
3203 }
3204 }
3205 } /* end if(myTargetIndex<myTargetLength) */
3206 else{
3207 *err =U_BUFFER_OVERFLOW_ERROR;
3208 break;
3209 }
3210
3211 }/* end while(mySourceIndex<mySourceLength) */
3212
374ca955
A
3213 /*
3214 * the end of the input stream and detection of truncated input
3215 * are handled by the framework, but for ISO-2022-CN conversion
3216 * we need to be in ASCII mode at the very end
3217 *
3218 * conditions:
3219 * successful
3220 * not in ASCII mode
3221 * end of input and no truncated input
b75a7d8f 3222 */
374ca955
A
3223 if( U_SUCCESS(*err) &&
3224 pFromU2022State->g!=0 &&
46f4442e 3225 args->flush && source>=sourceLimit && cnv->fromUChar32==0
374ca955
A
3226 ) {
3227 int32_t sourceIndex;
3228
3229 /* we are switching to ASCII */
3230 pFromU2022State->g=0;
3231
3232 /* get the source index of the last input character */
3233 /*
3234 * TODO this would be simpler and more reliable if we used a pair
3235 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3236 * so that we could simply use the prevSourceIndex here;
3237 * this code gives an incorrect result for the rare case of an unmatched
3238 * trail surrogate that is alone in the last buffer of the text stream
3239 */
3240 sourceIndex=(int32_t)(source-args->source);
3241 if(sourceIndex>0) {
3242 --sourceIndex;
3243 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3244 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3245 ) {
3246 --sourceIndex;
b75a7d8f 3247 }
374ca955
A
3248 } else {
3249 sourceIndex=-1;
b75a7d8f 3250 }
b75a7d8f 3251
73c04bcf 3252 fromUWriteUInt8(
46f4442e 3253 cnv,
374ca955 3254 SHIFT_IN_STR, 1,
73c04bcf 3255 &target, (const char *)targetLimit,
374ca955
A
3256 &offsets, sourceIndex,
3257 err);
b75a7d8f 3258 }
b75a7d8f 3259
374ca955
A
3260 /*save the state and return */
3261 args->source = source;
3262 args->target = (char*)target;
b75a7d8f
A
3263}
3264
3265
f3c0d7a5 3266static void U_CALLCONV
b75a7d8f
A
3267UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3268 UErrorCode* err){
3269 char tempBuf[3];
374ca955 3270 const char *mySource = (char *) args->source;
b75a7d8f 3271 UChar *myTarget = args->target;
b75a7d8f
A
3272 const char *mySourceLimit = args->sourceLimit;
3273 uint32_t targetUniChar = 0x0000;
3274 uint32_t mySourceChar = 0x0000;
3275 UConverterDataISO2022* myData;
374ca955 3276 ISO2022State *pToU2022State;
b75a7d8f 3277
374ca955
A
3278 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3279 pToU2022State = &myData->toU2022State;
3280
3281 if(myData->key != 0) {
3282 /* continue with a partial escape sequence */
3283 goto escape;
3284 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3285 /* continue with a partial double-byte character */
3286 mySourceChar = args->converter->toUBytes[0];
3287 args->converter->toULength = 0;
fd0068a8 3288 targetUniChar = missingCharMarker;
374ca955 3289 goto getTrailByte;
b75a7d8f 3290 }
374ca955
A
3291
3292 while(mySource < mySourceLimit){
b75a7d8f
A
3293
3294 targetUniChar =missingCharMarker;
3295
3296 if(myTarget < args->targetLimit){
3297
3298 mySourceChar= (unsigned char) *mySource++;
3299
b75a7d8f
A
3300 switch(mySourceChar){
3301 case UCNV_SI:
374ca955 3302 pToU2022State->g=0;
d5d484b0
A
3303 if (myData->isEmptySegment) {
3304 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
46f4442e
A
3305 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3306 args->converter->toUCallbackReason = UCNV_IRREGULAR;
d5d484b0
A
3307 args->converter->toUBytes[0] = mySourceChar;
3308 args->converter->toULength = 1;
3309 args->target = myTarget;
3310 args->source = mySource;
3311 return;
3312 }
b75a7d8f
A
3313 continue;
3314
3315 case UCNV_SO:
374ca955
A
3316 if(pToU2022State->cs[1] != 0) {
3317 pToU2022State->g=1;
d5d484b0 3318 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
374ca955
A
3319 continue;
3320 } else {
3321 /* illegal to have SO before a matching designator */
d5d484b0 3322 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
b75a7d8f
A
3323 break;
3324 }
3325
b75a7d8f 3326 case ESC_2022:
b75a7d8f 3327 mySource--;
374ca955 3328escape:
d5d484b0
A
3329 {
3330 const char * mySourceBefore = mySource;
3331 int8_t toULengthBefore = args->converter->toULength;
3332
46f4442e 3333 changeState_2022(args->converter,&(mySource),
d5d484b0
A
3334 mySourceLimit, ISO_2022_CN,err);
3335
3336 /* After SO there must be at least one character before a designator (designator error handled separately) */
46f4442e
A
3337 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3338 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3339 args->converter->toUCallbackReason = UCNV_IRREGULAR;
729e4ab9 3340 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
d5d484b0
A
3341 }
3342 }
b75a7d8f
A
3343
3344 /* invalid or illegal escape sequence */
3345 if(U_FAILURE(*err)){
3346 args->target = myTarget;
3347 args->source = mySource;
d5d484b0 3348 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
b75a7d8f
A
3349 return;
3350 }
3351 continue;
3352
374ca955
A
3353 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3354
3355 case CR:
374ca955
A
3356 case LF:
3357 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
2ca993e8 3358 U_FALLTHROUGH;
374ca955
A
3359 default:
3360 /* convert one or two bytes */
d5d484b0 3361 myData->isEmptySegment = FALSE;
374ca955
A
3362 if(pToU2022State->g != 0) {
3363 if(mySource < mySourceLimit) {
3364 UConverterSharedData *cnv;
3365 StateEnum tempState;
3366 int32_t tempBufLen;
fd0068a8
A
3367 int leadIsOk, trailIsOk;
3368 uint8_t trailByte;
374ca955 3369getTrailByte:
fd0068a8
A
3370 trailByte = (uint8_t)*mySource;
3371 /*
3372 * Ticket 5691: consistent illegal sequences:
3373 * - We include at least the first byte in the illegal sequence.
3374 * - If any of the non-initial bytes could be the start of a character,
3375 * we stop the illegal sequence before the first one of those.
3376 *
3377 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379 * Otherwise we convert or report the pair of bytes.
3380 */
3381 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3382 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3383 if (leadIsOk && trailIsOk) {
3384 ++mySource;
3385 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3386 if(tempState >= CNS_11643_0) {
3387 cnv = myData->myConverterArray[CNS_11643];
3388 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3389 tempBuf[1] = (char) (mySourceChar);
3390 tempBuf[2] = (char) trailByte;
3391 tempBufLen = 3;
3392
3393 }else{
4388f060 3394 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
fd0068a8
A
3395 cnv = myData->myConverterArray[tempState];
3396 tempBuf[0] = (char) (mySourceChar);
3397 tempBuf[1] = (char) trailByte;
3398 tempBufLen = 2;
3399 }
3400 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3401 mySourceChar = (mySourceChar << 8) | trailByte;
3402 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3403 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3404 ++mySource;
3405 /* add another bit so that the code below writes 2 bytes in case of error */
3406 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
374ca955 3407 }
374ca955
A
3408 if(pToU2022State->g>=2) {
3409 /* return from a single-shift state to the previous one */
3410 pToU2022State->g=pToU2022State->prevG;
3411 }
374ca955
A
3412 } else {
3413 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3414 args->converter->toULength = 1;
3415 goto endloop;
3416 }
3417 }
3418 else{
3419 if(mySourceChar <= 0x7f) {
3420 targetUniChar = (UChar) mySourceChar;
3421 }
3422 }
3423 break;
b75a7d8f
A
3424 }
3425 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3426 if(args->offsets){
73c04bcf 3427 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
3428 }
3429 *(myTarget++)=(UChar)targetUniChar;
3430 }
3431 else if(targetUniChar > missingCharMarker){
3432 /* disassemble the surrogate pair and write to output*/
3433 targetUniChar-=0x0010000;
374ca955 3434 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
b75a7d8f 3435 if(args->offsets){
73c04bcf 3436 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 3437 }
374ca955 3438 ++myTarget;
46f4442e 3439 if(myTarget< args->targetLimit){
374ca955 3440 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
b75a7d8f 3441 if(args->offsets){
73c04bcf 3442 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 3443 }
374ca955 3444 ++myTarget;
b75a7d8f
A
3445 }else{
3446 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3447 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3448 }
3449
3450 }
3451 else{
3452 /* Call the callback function*/
374ca955
A
3453 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3454 break;
b75a7d8f
A
3455 }
3456 }
3457 else{
3458 *err =U_BUFFER_OVERFLOW_ERROR;
3459 break;
3460 }
3461 }
374ca955 3462endloop:
b75a7d8f
A
3463 args->target = myTarget;
3464 args->source = mySource;
3465}
b331163b 3466#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
b75a7d8f 3467
f3c0d7a5 3468static void U_CALLCONV
b75a7d8f
A
3469_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3470 UConverter *cnv = args->converter;
3471 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
374ca955
A
3472 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3473 char *p, *subchar;
3474 char buffer[8];
3475 int32_t length;
3476
73c04bcf 3477 subchar=(char *)cnv->subChars;
374ca955 3478 length=cnv->subCharLen; /* assume length==1 for most variants */
b75a7d8f
A
3479
3480 p = buffer;
3481 switch(myConverterData->locale[0]){
3482 case 'j':
374ca955
A
3483 {
3484 int8_t cs;
3485
3486 if(pFromU2022State->g == 1) {
3487 /* JIS7: switch from G1 to G0 */
3488 pFromU2022State->g = 0;
3489 *p++ = UCNV_SI;
3490 }
3491
3492 cs = pFromU2022State->cs[0];
3493 if(cs != ASCII && cs != JISX201) {
3494 /* not in ASCII or JIS X 0201: switch to ASCII */
3495 pFromU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f
A
3496 *p++ = '\x1b';
3497 *p++ = '\x28';
3498 *p++ = '\x42';
b75a7d8f 3499 }
374ca955
A
3500
3501 *p++ = subchar[0];
b75a7d8f 3502 break;
374ca955 3503 }
b75a7d8f 3504 case 'c':
374ca955
A
3505 if(pFromU2022State->g != 0) {
3506 /* not in ASCII mode: switch to ASCII */
3507 pFromU2022State->g = 0;
3508 *p++ = UCNV_SI;
3509 }
3510 *p++ = subchar[0];
b75a7d8f
A
3511 break;
3512 case 'k':
374ca955
A
3513 if(myConverterData->version == 0) {
3514 if(length == 1) {
3515 if((UBool)args->converter->fromUnicodeStatus) {
3516 /* in DBCS mode: switch to SBCS */
3517 args->converter->fromUnicodeStatus = 0;
3518 *p++ = UCNV_SI;
3519 }
3520 *p++ = subchar[0];
3521 } else /* length == 2*/ {
3522 if(!(UBool)args->converter->fromUnicodeStatus) {
3523 /* in SBCS mode: switch to DBCS */
3524 args->converter->fromUnicodeStatus = 1;
3525 *p++ = UCNV_SO;
3526 }
3527 *p++ = subchar[0];
3528 *p++ = subchar[1];
3529 }
3530 break;
3531 } else {
73c04bcf
A
3532 /* save the subconverter's substitution string */
3533 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3534 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3535
3536 /* set our substitution string into the subconverter */
3537 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
374ca955
A
3538 myConverterData->currentConverter->subCharLen = (int8_t)length;
3539
73c04bcf
A
3540 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3541 args->converter = myConverterData->currentConverter;
374ca955
A
3542 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3543 ucnv_cbFromUWriteSub(args, 0, err);
3544 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
73c04bcf
A
3545 args->converter = cnv;
3546
3547 /* restore the subconverter's substitution string */
3548 myConverterData->currentConverter->subChars = currentSubChars;
3549 myConverterData->currentConverter->subCharLen = currentSubCharLen;
374ca955
A
3550
3551 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3552 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3553 uprv_memcpy(
3554 cnv->charErrorBuffer,
3555 myConverterData->currentConverter->charErrorBuffer,
3556 myConverterData->currentConverter->charErrorBufferLength);
3557 }
3558 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3559 myConverterData->currentConverter->charErrorBufferLength = 0;
3560 }
374ca955 3561 return;
b75a7d8f 3562 }
b75a7d8f
A
3563 default:
3564 /* not expected */
3565 break;
3566 }
3567 ucnv_cbFromUWriteBytes(args,
3568 buffer, (int32_t)(p - buffer),
3569 offsetIndex, err);
3570}
3571
73c04bcf
A
3572/*
3573 * Structure for cloning an ISO 2022 converter into a single memory block.
3574 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3575 * and then ucnv_safeClone() of the sub-converter may additionally align
3576 * currentConverter inside the cloneStruct, for which we need the deadSpace
3577 * after currentConverter.
3578 * This is because UAlignedMemory may be larger than the actually
3579 * necessary alignment size for the platform.
3580 * The other cloneStruct fields will not be moved around,
3581 * and are aligned properly with cloneStruct's alignment.
3582 */
b75a7d8f
A
3583struct cloneStruct
3584{
3585 UConverter cnv;
374ca955 3586 UConverter currentConverter;
73c04bcf
A
3587 UAlignedMemory deadSpace;
3588 UConverterDataISO2022 mydata;
b75a7d8f
A
3589};
3590
3591
f3c0d7a5
A
3592U_CDECL_BEGIN
3593
3594static UConverter * U_CALLCONV
b75a7d8f 3595_ISO_2022_SafeClone(
46f4442e
A
3596 const UConverter *cnv,
3597 void *stackBuffer,
3598 int32_t *pBufferSize,
b75a7d8f
A
3599 UErrorCode *status)
3600{
3601 struct cloneStruct * localClone;
374ca955
A
3602 UConverterDataISO2022 *cnvData;
3603 int32_t i, size;
b75a7d8f
A
3604
3605 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
374ca955
A
3606 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3607 return NULL;
b75a7d8f
A
3608 }
3609
374ca955 3610 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
b75a7d8f 3611 localClone = (struct cloneStruct *)stackBuffer;
b75a7d8f 3612
374ca955 3613 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
b75a7d8f 3614
374ca955 3615 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
73c04bcf
A
3616 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3617 localClone->cnv.isExtraLocal = TRUE;
b75a7d8f 3618
374ca955 3619 /* share the subconverters */
b75a7d8f 3620
374ca955 3621 if(cnvData->currentConverter != NULL) {
73c04bcf 3622 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
374ca955
A
3623 localClone->mydata.currentConverter =
3624 ucnv_safeClone(cnvData->currentConverter,
3625 &localClone->currentConverter,
3626 &size, status);
3627 if(U_FAILURE(*status)) {
3628 return NULL;
b75a7d8f 3629 }
b75a7d8f
A
3630 }
3631
374ca955
A
3632 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3633 if(cnvData->myConverterArray[i] != NULL) {
3634 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3635 }
b75a7d8f
A
3636 }
3637
b75a7d8f
A
3638 return &localClone->cnv;
3639}
3640
f3c0d7a5
A
3641U_CDECL_END
3642
3643static void U_CALLCONV
b75a7d8f 3644_ISO_2022_GetUnicodeSet(const UConverter *cnv,
73c04bcf 3645 const USetAdder *sa,
b75a7d8f
A
3646 UConverterUnicodeSet which,
3647 UErrorCode *pErrorCode)
3648{
3649 int32_t i;
b75a7d8f
A
3650 UConverterDataISO2022* cnvData;
3651
3652 if (U_FAILURE(*pErrorCode)) {
3653 return;
3654 }
374ca955 3655#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
3656 if (cnv->sharedData == &_ISO2022Data) {
3657 /* We use UTF-8 in this case */
374ca955
A
3658 sa->addRange(sa->set, 0, 0xd7FF);
3659 sa->addRange(sa->set, 0xE000, 0x10FFFF);
b75a7d8f
A
3660 return;
3661 }
374ca955 3662#endif
b75a7d8f
A
3663
3664 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
b75a7d8f 3665
374ca955
A
3666 /* open a set and initialize it with code points that are algorithmically round-tripped */
3667 switch(cnvData->locale[0]){
3668 case 'j':
46f4442e
A
3669 /* include JIS X 0201 which is hardcoded */
3670 sa->add(sa->set, 0xa5);
3671 sa->add(sa->set, 0x203e);
374ca955
A
3672 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3673 /* include Latin-1 for some variants of JP */
3674 sa->addRange(sa->set, 0, 0xff);
3675 } else {
3676 /* include ASCII for JP */
3677 sa->addRange(sa->set, 0, 0x7f);
3678 }
46f4442e
A
3679 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3680 /*
3681 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3682 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3683 * use half-width Katakana.
3684 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3685 * half-width Katakana via the ESC ( I sequence.
3686 * However, we only emit (fromUnicode) half-width Katakana according to the
3687 * definition of each variant.
3688 *
3689 * When including fallbacks,
3690 * we need to include half-width Katakana Unicode code points for all JP variants because
3691 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3692 */
374ca955 3693 /* include half-width Katakana for JP */
46f4442e 3694 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
374ca955
A
3695 }
3696 break;
b331163b 3697#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
3698 case 'c':
3699 case 'z':
3700 /* include ASCII for CN */
3701 sa->addRange(sa->set, 0, 0x7f);
3702 break;
3703 case 'k':
3704 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3705 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3706 cnvData->currentConverter, sa, which, pErrorCode);
73c04bcf
A
3707 /* the loop over myConverterArray[] will simply not find another converter */
3708 break;
b331163b 3709#endif
374ca955
A
3710 default:
3711 break;
b75a7d8f
A
3712 }
3713
46f4442e 3714#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
374ca955
A
3715 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3716 cnvData->version==0 && i==CNS_11643
3717 ) {
3718 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3719 ucnv_MBCSGetUnicodeSetForBytes(
3720 cnvData->myConverterArray[i],
3721 sa, UCNV_ROUNDTRIP_SET,
3722 0, 0x81, 0x82,
3723 pErrorCode);
46f4442e
A
3724 }
3725#endif
3726
3727 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3728 UConverterSetFilter filter;
3729 if(cnvData->myConverterArray[i]!=NULL) {
b331163b
A
3730 if(cnvData->locale[0]=='j' && i==JISX208) {
3731 /*
3732 * Only add code points that map to Shift-JIS codes
3733 * corresponding to JIS X 0208.
3734 */
3735 filter=UCNV_SET_FILTER_SJIS;
3736#if !UCONFIG_ONLY_HTML_CONVERSION
3737 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3738 cnvData->version==0 && i==CNS_11643) {
46f4442e
A
3739 /*
3740 * Version-specific for CN:
3741 * CN version 0 does not map CNS planes 3..7 although
3742 * they are all available in the CNS conversion table;
3743 * CN version 1 (-EXT) does map them all.
3744 * The two versions create different Unicode sets.
3745 */
3746 filter=UCNV_SET_FILTER_2022_CN;
46f4442e
A
3747 } else if(i==KSC5601) {
3748 /*
3749 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3750 * are broader than GR94.
3751 */
3752 filter=UCNV_SET_FILTER_GR94DBCS;
b331163b 3753#endif
374ca955 3754 } else {
46f4442e 3755 filter=UCNV_SET_FILTER_NONE;
374ca955 3756 }
46f4442e 3757 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
374ca955 3758 }
b75a7d8f 3759 }
73c04bcf
A
3760
3761 /*
3762 * ISO 2022 converters must not convert SO/SI/ESC despite what
3763 * sub-converters do by themselves.
3764 * Remove these characters from the set.
3765 */
3766 sa->remove(sa->set, 0x0e);
3767 sa->remove(sa->set, 0x0f);
3768 sa->remove(sa->set, 0x1b);
46f4442e
A
3769
3770 /* ISO 2022 converters do not convert C1 controls either */
3771 sa->removeRange(sa->set, 0x80, 0x9f);
b75a7d8f
A
3772}
3773
374ca955
A
3774static const UConverterImpl _ISO2022Impl={
3775 UCNV_ISO_2022,
3776
3777 NULL,
3778 NULL,
3779
3780 _ISO2022Open,
3781 _ISO2022Close,
3782 _ISO2022Reset,
3783
3784#ifdef U_ENABLE_GENERIC_ISO_2022
3785 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3786 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3787 ucnv_fromUnicode_UTF8,
3788 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3789#else
3790 NULL,
3791 NULL,
3792 NULL,
3793 NULL,
3794#endif
3795 NULL,
3796
3797 NULL,
3798 _ISO2022getName,
3799 _ISO_2022_WriteSub,
3800 _ISO_2022_SafeClone,
4388f060
A
3801 _ISO_2022_GetUnicodeSet,
3802
3803 NULL,
3804 NULL
374ca955
A
3805};
3806static const UConverterStaticData _ISO2022StaticData={
3807 sizeof(UConverterStaticData),
3808 "ISO_2022",
3809 2022,
3810 UCNV_IBM,
3811 UCNV_ISO_2022,
3812 1,
3813 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3814 { 0x1a, 0, 0, 0 },
3815 1,
3816 FALSE,
3817 FALSE,
3818 0,
3819 0,
3820 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3821};
2ca993e8
A
3822const UConverterSharedData _ISO2022Data=
3823 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
374ca955
A
3824
3825/*************JP****************/
3826static const UConverterImpl _ISO2022JPImpl={
3827 UCNV_ISO_2022,
3828
3829 NULL,
3830 NULL,
3831
3832 _ISO2022Open,
3833 _ISO2022Close,
3834 _ISO2022Reset,
3835
3836 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3837 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3838 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3839 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3840 NULL,
3841
3842 NULL,
3843 _ISO2022getName,
3844 _ISO_2022_WriteSub,
3845 _ISO_2022_SafeClone,
4388f060
A
3846 _ISO_2022_GetUnicodeSet,
3847
3848 NULL,
3849 NULL
374ca955
A
3850};
3851static const UConverterStaticData _ISO2022JPStaticData={
3852 sizeof(UConverterStaticData),
3853 "ISO_2022_JP",
3854 0,
3855 UCNV_IBM,
3856 UCNV_ISO_2022,
3857 1,
3858 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3859 { 0x1a, 0, 0, 0 },
3860 1,
3861 FALSE,
3862 FALSE,
3863 0,
3864 0,
3865 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3866};
4388f060
A
3867
3868namespace {
3869
2ca993e8
A
3870const UConverterSharedData _ISO2022JPData=
3871 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
374ca955 3872
4388f060
A
3873} // namespace
3874
b331163b 3875#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
3876/************* KR ***************/
3877static const UConverterImpl _ISO2022KRImpl={
3878 UCNV_ISO_2022,
3879
3880 NULL,
3881 NULL,
3882
3883 _ISO2022Open,
3884 _ISO2022Close,
3885 _ISO2022Reset,
3886
3887 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3888 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3889 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3890 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3891 NULL,
3892
3893 NULL,
3894 _ISO2022getName,
3895 _ISO_2022_WriteSub,
3896 _ISO_2022_SafeClone,
4388f060
A
3897 _ISO_2022_GetUnicodeSet,
3898
3899 NULL,
3900 NULL
374ca955
A
3901};
3902static const UConverterStaticData _ISO2022KRStaticData={
3903 sizeof(UConverterStaticData),
3904 "ISO_2022_KR",
3905 0,
3906 UCNV_IBM,
3907 UCNV_ISO_2022,
3908 1,
2ca993e8 3909 8, /* max 8 bytes per UChar */
374ca955
A
3910 { 0x1a, 0, 0, 0 },
3911 1,
3912 FALSE,
3913 FALSE,
3914 0,
3915 0,
3916 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3917};
4388f060
A
3918
3919namespace {
3920
2ca993e8
A
3921const UConverterSharedData _ISO2022KRData=
3922 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
374ca955 3923
4388f060
A
3924} // namespace
3925
374ca955
A
3926/*************** CN ***************/
3927static const UConverterImpl _ISO2022CNImpl={
3928
3929 UCNV_ISO_2022,
3930
3931 NULL,
3932 NULL,
3933
3934 _ISO2022Open,
3935 _ISO2022Close,
3936 _ISO2022Reset,
3937
3938 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3939 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3940 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3941 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3942 NULL,
3943
3944 NULL,
3945 _ISO2022getName,
3946 _ISO_2022_WriteSub,
3947 _ISO_2022_SafeClone,
4388f060
A
3948 _ISO_2022_GetUnicodeSet,
3949
3950 NULL,
3951 NULL
374ca955
A
3952};
3953static const UConverterStaticData _ISO2022CNStaticData={
3954 sizeof(UConverterStaticData),
3955 "ISO_2022_CN",
3956 0,
3957 UCNV_IBM,
3958 UCNV_ISO_2022,
73c04bcf 3959 1,
374ca955
A
3960 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3961 { 0x1a, 0, 0, 0 },
3962 1,
3963 FALSE,
3964 FALSE,
3965 0,
3966 0,
3967 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3968};
4388f060
A
3969
3970namespace {
3971
2ca993e8
A
3972const UConverterSharedData _ISO2022CNData=
3973 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
374ca955 3974
4388f060 3975} // namespace
b331163b 3976#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
374ca955 3977
b75a7d8f 3978#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */