]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv2022.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
b331163b 3* Copyright (C) 2000-2015, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
4388f060 6* file name: ucnv2022.cpp
b75a7d8f
A
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2000feb03
12* created by: Markus W. Scherer
13*
14* Change history:
15*
16* 06/29/2000 helena Major rewrite of the callback APIs.
17* 08/08/2000 Ram Included support for ISO-2022-JP-2
18* Changed implementation of toUnicode
19* function
20* 08/21/2000 Ram Added support for ISO-2022-KR
21* 08/29/2000 Ram Separated implementation of EBCDIC to
22* ucnvebdc.c
23* 09/20/2000 Ram Added support for ISO-2022-CN
24* Added implementations for getNextUChar()
25* for specific 2022 country variants.
26* 10/31/2000 Ram Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
374ca955 31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
4388f060 37#include "unicode/utf16.h"
374ca955 38#include "ucnv_imp.h"
b75a7d8f
A
39#include "ucnv_bld.h"
40#include "ucnv_cnv.h"
41#include "ucnvmbcs.h"
42#include "cstring.h"
43#include "cmemory.h"
4388f060 44#include "uassert.h"
b75a7d8f 45
374ca955
A
46#ifdef U_ENABLE_GENERIC_ISO_2022
47/*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76#endif
77
b331163b 78#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955 79static const char SHIFT_IN_STR[] = "\x0F";
51004dcb 80// static const char SHIFT_OUT_STR[] = "\x0E";
b331163b 81#endif
b75a7d8f
A
82
83#define CR 0x0D
84#define LF 0x0A
85#define H_TAB 0x09
86#define V_TAB 0x0B
87#define SPACE 0x20
88
46f4442e
A
89enum {
90 HWKANA_START=0xff61,
91 HWKANA_END=0xff9f
92};
93
94/*
95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
96 * as bytes 21..7E. (Subtract 0x80.)
97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
98 * as bytes 20..7F. (Subtract 0x80.)
99 * Do not encode C1 control codes with native bytes 80..9F
100 * as bytes 00..1F (C0 control codes).
101 */
102enum {
103 GR94_START=0xa1,
104 GR94_END=0xfe,
105 GR96_START=0xa0,
106 GR96_END=0xff
107};
108
73c04bcf
A
109/*
110 * ISO 2022 control codes must not be converted from Unicode
111 * because they would mess up the byte stream.
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
113 * corresponding to SO, SI, and ESC.
114 */
115#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
116
/*
 * Charset/state identifiers shared by the ISO-2022-JP and ISO-2022-CN
 * implementations. The JP and CN groups deliberately reuse the same small
 * integer values; which group applies depends on the converter variant.
 */
typedef enum {
    /* values common to both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    /* single-shift pseudo states */
    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* ISO-2022-JP charsets */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,  /* Halfwidth Katakana 7 bit */

    /*
     * ISO-2022-CN charsets.
     * These first constants must keep their values because they
     * correspond to positions in myConverterArray[].
     */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * CNS 11643 plane identifiers: these appear in StateEnum and
     * ISO2022State variables, but CNS_11643 (above) is the value that
     * must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
155
374ca955 156/* is the StateEnum charset value for a DBCS charset? */
b331163b
A
157#if UCONFIG_ONLY_HTML_CONVERSION
158#define IS_JP_DBCS(cs) (JISX208==(cs))
159#else
374ca955 160#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
b331163b 161#endif
374ca955
A
162
163#define CSM(cs) ((uint16_t)1<<(cs))
b75a7d8f 164
374ca955
A
165/*
166 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
167 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
168 *
169 * Note: The converter uses some leniency:
170 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
171 * all versions, not just JIS7 and JIS8.
172 * - ICU does not distinguish between different versions of JIS X 0208.
173 */
b331163b
A
174#if UCONFIG_ONLY_HTML_CONVERSION
175enum { MAX_JA_VERSION=0 };
176#else
729e4ab9 177enum { MAX_JA_VERSION=4 };
b331163b 178#endif
729e4ab9 179static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
374ca955 180 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
b331163b 181#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
183 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
b331163b 186#endif
374ca955 187};
b75a7d8f
A
188
/*
 * Converter-type tag; this is the type of the currentType member of
 * UConverterDataISO2022.
 */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
}Cnv2022Type;
197
374ca955
A
/* Shift/designation state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number designated to each of SI (G0), SO (G1), SS2 (G2), SS3 (G3) */
    int8_t cs[4];
    /* currently invoked set: 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
203
b75a7d8f
A
204#define UCNV_OPTIONS_VERSION_MASK 0xf
205#define UCNV_2022_MAX_CONVERTERS 10
206
207typedef struct{
73c04bcf 208 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
b75a7d8f 209 UConverter *currentConverter;
b75a7d8f 210 Cnv2022Type currentType;
374ca955 211 ISO2022State toU2022State, fromU2022State;
b75a7d8f
A
212 uint32_t key;
213 uint32_t version;
73c04bcf
A
214#ifdef U_ENABLE_GENERIC_ISO_2022
215 UBool isFirstBuffer;
216#endif
d5d484b0 217 UBool isEmptySegment;
b75a7d8f 218 char name[30];
73c04bcf 219 char locale[3];
b75a7d8f
A
220}UConverterDataISO2022;
221
374ca955 222/* Protos */
b75a7d8f
A
223/* ISO-2022 ----------------------------------------------------------------- */
224
225/*Forward declaration */
46f4442e 226U_CFUNC void
374ca955
A
227ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
228 UErrorCode * err);
46f4442e 229U_CFUNC void
374ca955
A
230ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
231 UErrorCode * err);
b75a7d8f
A
232
233#define ESC_2022 0x1B /*ESC*/
234
/* Result of matching the bytes seen so far against the escape sequence table. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /*
     * so far matches one ISO 2022 escape sequence, but adding more
     * characters might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
242
243/*
244* The way these state transition arrays work is:
245* ex : ESC$B is the sequence for JISX208
246* a) First Iteration: char is ESC
247* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
248* int x = normalize_esq_chars_2022[27] which is equal to 1
249* ii) Search for this value in escSeqStateTable_Key_2022[]
250* value of x is stored at escSeqStateTable_Key_2022[0]
251* iii) Save this index as offset
252* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
253* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
254* b) Switch on this state and continue to next char
255* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
256* which is normalize_esq_chars_2022[36] == 4
257* ii) x is currently 1(from above)
258* x<<=5 -- x is now 32
259* x+=normalize_esq_chars_2022[36]
260* now x is 36
261* iii) Search for this value in escSeqStateTable_Key_2022[]
262* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
263* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
264* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
265* c) Switch on this state and continue to next char
266* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
267* ii) x is currently 36 (from above)
268* x<<=5 -- x is now 1152
269* x+=normalize_esq_chars_2022[66]
270* now x is 1161
271* iii) Search for this value in escSeqStateTable_Key_2022[]
272* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
273* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
274* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
275* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
276*/
277
278
279/*Below are the 3 arrays depicting a state transition table*/
280static const int8_t normalize_esq_chars_2022[256] = {
281/* 0 1 2 3 4 5 6 7 8 9 */
282
283 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
287 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
290 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
291 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
304 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
305 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
306 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
307 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
308 ,0 ,0 ,0 ,0 ,0 ,0
309};
310
374ca955
A
311#ifdef U_ENABLE_GENERIC_ISO_2022
312/*
313 * When the generic ISO-2022 converter is completely removed, not just disabled
314 * per #ifdef, then the following state table and the associated tables that are
315 * dimensioned with MAX_STATES_2022 should be trimmed.
316 *
317 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
318 * the associated escape sequences starting with ESC ( B should be removed.
319 * This includes the ones with key values 1097 and all of the ones above 1000000.
320 *
321 * For the latter, the tables can simply be truncated.
322 * For the former, since the tables must be kept parallel, it is probably best
323 * to simply duplicate an adjacent table cell, parallel in all tables.
324 *
325 * It may make sense to restructure the tables, especially by using small search
326 * tables for the variants instead of indexing them parallel to the table here.
327 */
328#endif
329
b75a7d8f
A
330#define MAX_STATES_2022 74
331static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
332/* 0 1 2 3 4 5 6 7 8 9 */
333
334 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
335 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
336 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
337 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
338 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
339 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
340 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
341 ,35947631 ,35947635 ,35947636 ,35947638
342};
343
374ca955 344#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
345
346static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
347 /* 0 1 2 3 4 5 6 7 8 9 */
348
349 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
374ca955 350 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
b75a7d8f
A
351 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
352 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
353 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
354 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
355 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
356 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
357};
358
374ca955
A
359#endif
360
46f4442e 361static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
b75a7d8f 362/* 0 1 2 3 4 5 6 7 8 9 */
374ca955 363 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
b75a7d8f
A
364 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
365 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
366 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371};
372
b75a7d8f
A
/*
 * Identifies which ISO-2022 variant is being processed
 * (introduced for refactoring the changeState_2022 code).
 * Enumerators are compiled in only for the variants that are enabled.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
384
b75a7d8f 385/*********** ISO 2022 Converter Protos ***********/
46f4442e 386static void
729e4ab9 387_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
b75a7d8f
A
388
389static void
390 _ISO2022Close(UConverter *converter);
391
46f4442e 392static void
b75a7d8f
A
393_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
394
46f4442e 395static const char*
b75a7d8f
A
396_ISO2022getName(const UConverter* cnv);
397
46f4442e 398static void
b75a7d8f
A
399_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
400
46f4442e 401static UConverter *
b75a7d8f
A
402_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
403
374ca955 404#ifdef U_ENABLE_GENERIC_ISO_2022
46f4442e 405static void
374ca955
A
406T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
407#endif
b75a7d8f 408
4388f060
A
409namespace {
410
374ca955 411/*const UConverterSharedData _ISO2022Data;*/
4388f060 412extern const UConverterSharedData _ISO2022JPData;
b331163b
A
413
414#if !UCONFIG_ONLY_HTML_CONVERSION
4388f060
A
415extern const UConverterSharedData _ISO2022KRData;
416extern const UConverterSharedData _ISO2022CNData;
b331163b 417#endif
4388f060
A
418
419} // namespace
b75a7d8f 420
374ca955 421/*************** Converter implementations ******************/
b75a7d8f 422
73c04bcf 423/* The purpose of this function is to get around gcc compiler warnings. */
4388f060 424static inline void
73c04bcf
A
425fromUWriteUInt8(UConverter *cnv,
426 const char *bytes, int32_t length,
427 uint8_t **target, const char *targetLimit,
428 int32_t **offsets,
429 int32_t sourceIndex,
430 UErrorCode *pErrorCode)
431{
432 char *targetChars = (char *)*target;
433 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
434 offsets, sourceIndex, pErrorCode);
435 *target = (uint8_t*)targetChars;
436
437}
438
4388f060
A
439static inline void
440setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
374ca955
A
441 if(myConverterData->version == 1) {
442 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 443
374ca955
A
444 cnv->toUnicodeStatus=0; /* offset */
445 cnv->mode=0; /* state */
446 cnv->toULength=0; /* byteIndex */
447 }
448}
b75a7d8f 449
4388f060 450static inline void
374ca955
A
451setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
452 /* in ISO-2022-KR the designator sequence appears only once
453 * in a file so we append it only once
454 */
455 if( converter->charErrorBufferLength==0){
b75a7d8f 456
374ca955
A
457 converter->charErrorBufferLength = 4;
458 converter->charErrorBuffer[0] = 0x1b;
459 converter->charErrorBuffer[1] = 0x24;
460 converter->charErrorBuffer[2] = 0x29;
461 converter->charErrorBuffer[3] = 0x43;
462 }
463 if(myConverterData->version == 1) {
464 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 465
374ca955
A
466 cnv->fromUChar32=0;
467 cnv->fromUnicodeStatus=1; /* prevLength */
468 }
469}
b75a7d8f 470
46f4442e 471static void
729e4ab9 472_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
b75a7d8f 473
374ca955 474 char myLocale[6]={' ',' ',' ',' ',' ',' '};
b75a7d8f 475
374ca955
A
476 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
477 if(cnv->extraInfo != NULL) {
729e4ab9 478 UConverterNamePieces stackPieces;
4388f060 479 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
374ca955
A
480 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
481 uint32_t version;
b75a7d8f 482
729e4ab9
A
483 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
484
374ca955 485 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
374ca955 486 myConverterData->currentType = ASCII1;
374ca955 487 cnv->fromUnicodeStatus =FALSE;
729e4ab9
A
488 if(pArgs->locale){
489 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
374ca955 490 }
729e4ab9 491 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
73c04bcf 492 myConverterData->version = version;
46f4442e 493 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
73c04bcf
A
494 (myLocale[2]=='_' || myLocale[2]=='\0'))
495 {
374ca955 496 /* open the required converters and cache them */
729e4ab9 497 if(version>MAX_JA_VERSION) {
b331163b
A
498 // ICU 55 fails to open a converter for an unsupported version.
499 // Previously, it fell back to version 0, but that would yield
500 // unexpected behavior.
501 *errorCode = U_MISSING_RESOURCE_ERROR;
502 return;
729e4ab9 503 }
374ca955 504 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
729e4ab9
A
505 myConverterData->myConverterArray[ISO8859_7] =
506 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
374ca955 507 }
729e4ab9
A
508 myConverterData->myConverterArray[JISX208] =
509 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
374ca955 510 if(jpCharsetMasks[version]&CSM(JISX212)) {
729e4ab9
A
511 myConverterData->myConverterArray[JISX212] =
512 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
374ca955
A
513 }
514 if(jpCharsetMasks[version]&CSM(GB2312)) {
729e4ab9
A
515 myConverterData->myConverterArray[GB2312] =
516 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
374ca955
A
517 }
518 if(jpCharsetMasks[version]&CSM(KSC5601)) {
729e4ab9
A
519 myConverterData->myConverterArray[KSC5601] =
520 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
374ca955 521 }
b75a7d8f 522
374ca955
A
523 /* set the function pointers to appropriate funtions */
524 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
525 uprv_strcpy(myConverterData->locale,"ja");
b75a7d8f 526
46f4442e 527 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
b331163b 528 size_t len = uprv_strlen(myConverterData->name);
374ca955
A
529 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
530 myConverterData->name[len+1]='\0';
531 }
b331163b 532#if !UCONFIG_ONLY_HTML_CONVERSION
46f4442e 533 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
73c04bcf
A
534 (myLocale[2]=='_' || myLocale[2]=='\0'))
535 {
b331163b
A
536 if(version>1) {
537 // ICU 55 fails to open a converter for an unsupported version.
538 // Previously, it fell back to version 0, but that would yield
539 // unexpected behavior.
540 *errorCode = U_MISSING_RESOURCE_ERROR;
541 return;
542 }
729e4ab9
A
543 const char *cnvName;
544 if(version==1) {
545 cnvName="icu-internal-25546";
546 } else {
547 cnvName="ibm-949";
548 myConverterData->version=version=0;
549 }
550 if(pArgs->onlyTestIsLoadable) {
551 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
552 uprv_free(cnv->extraInfo);
553 cnv->extraInfo=NULL;
554 return;
555 } else {
556 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
73c04bcf
A
557 if (U_FAILURE(*errorCode)) {
558 _ISO2022Close(cnv);
559 return;
560 }
b75a7d8f 561
729e4ab9
A
562 if(version==1) {
563 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
564 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
565 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
566 }else{
567 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
73c04bcf 568 }
b75a7d8f 569
729e4ab9
A
570 /* initialize the state variables */
571 setInitialStateToUnicodeKR(cnv, myConverterData);
572 setInitialStateFromUnicodeKR(cnv, myConverterData);
b75a7d8f 573
729e4ab9
A
574 /* set the function pointers to appropriate funtions */
575 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
576 uprv_strcpy(myConverterData->locale,"ko");
577 }
b75a7d8f 578 }
46f4442e 579 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
73c04bcf
A
580 (myLocale[2]=='_' || myLocale[2]=='\0'))
581 {
b331163b
A
582 if(version>2) {
583 // ICU 55 fails to open a converter for an unsupported version.
584 // Previously, it fell back to version 0, but that would yield
585 // unexpected behavior.
586 *errorCode = U_MISSING_RESOURCE_ERROR;
587 return;
588 }
b75a7d8f
A
589
590 /* open the required converters and cache them */
729e4ab9
A
591 myConverterData->myConverterArray[GB2312_1] =
592 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
374ca955 593 if(version==1) {
729e4ab9
A
594 myConverterData->myConverterArray[ISO_IR_165] =
595 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
374ca955 596 }
729e4ab9
A
597 myConverterData->myConverterArray[CNS_11643] =
598 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
b75a7d8f 599
b75a7d8f
A
600
601 /* set the function pointers to appropriate funtions */
602 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
603 uprv_strcpy(myConverterData->locale,"cn");
604
729e4ab9 605 if (version==0){
b75a7d8f 606 myConverterData->version = 0;
46f4442e 607 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
729e4ab9
A
608 }else if (version==1){
609 myConverterData->version = 1;
610 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
611 }else {
612 myConverterData->version = 2;
613 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
b75a7d8f
A
614 }
615 }
b331163b 616#endif // !UCONFIG_ONLY_HTML_CONVERSION
b75a7d8f 617 else{
374ca955 618#ifdef U_ENABLE_GENERIC_ISO_2022
73c04bcf
A
619 myConverterData->isFirstBuffer = TRUE;
620
b75a7d8f
A
621 /* append the UTF-8 escape sequence */
622 cnv->charErrorBufferLength = 3;
623 cnv->charErrorBuffer[0] = 0x1b;
624 cnv->charErrorBuffer[1] = 0x25;
625 cnv->charErrorBuffer[2] = 0x42;
626
627 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
628 /* initialize the state variables */
b75a7d8f 629 uprv_strcpy(myConverterData->name,"ISO_2022");
374ca955 630#else
b331163b
A
631 *errorCode = U_MISSING_RESOURCE_ERROR;
632 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
633 // data loading error code.
374ca955
A
634 return;
635#endif
b75a7d8f
A
636 }
637
374ca955
A
638 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
639
729e4ab9 640 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
374ca955
A
641 _ISO2022Close(cnv);
642 }
b75a7d8f
A
643 } else {
644 *errorCode = U_MEMORY_ALLOCATION_ERROR;
645 }
b75a7d8f
A
646}
647
648
649static void
650_ISO2022Close(UConverter *converter) {
374ca955
A
651 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
652 UConverterSharedData **array = myData->myConverterArray;
653 int32_t i;
b75a7d8f
A
654
655 if (converter->extraInfo != NULL) {
656 /*close the array of converter pointers and free the memory*/
374ca955
A
657 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
658 if(array[i]!=NULL) {
659 ucnv_unloadSharedDataIfReady(array[i]);
b75a7d8f 660 }
b75a7d8f
A
661 }
662
374ca955 663 ucnv_close(myData->currentConverter);
b75a7d8f
A
664
665 if(!converter->isExtraLocal){
666 uprv_free (converter->extraInfo);
374ca955 667 converter->extraInfo = NULL;
b75a7d8f
A
668 }
669 }
670}
671
672static void
673_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
674 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
374ca955
A
675 if(choice<=UCNV_RESET_TO_UNICODE) {
676 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
677 myConverterData->key = 0;
d5d484b0 678 myConverterData->isEmptySegment = FALSE;
374ca955
A
679 }
680 if(choice!=UCNV_RESET_TO_UNICODE) {
681 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
682 }
683#ifdef U_ENABLE_GENERIC_ISO_2022
684 if(myConverterData->locale[0] == 0){
b75a7d8f
A
685 if(choice<=UCNV_RESET_TO_UNICODE) {
686 myConverterData->isFirstBuffer = TRUE;
374ca955 687 myConverterData->key = 0;
b75a7d8f
A
688 if (converter->mode == UCNV_SO){
689 ucnv_close (myConverterData->currentConverter);
690 myConverterData->currentConverter=NULL;
691 }
46f4442e 692 converter->mode = UCNV_SI;
b75a7d8f
A
693 }
694 if(choice!=UCNV_RESET_TO_UNICODE) {
695 /* re-append UTF-8 escape sequence */
696 converter->charErrorBufferLength = 3;
697 converter->charErrorBuffer[0] = 0x1b;
698 converter->charErrorBuffer[1] = 0x28;
699 converter->charErrorBuffer[2] = 0x42;
700 }
701 }
374ca955
A
702 else
703#endif
704 {
b75a7d8f 705 /* reset the state variables */
374ca955 706 if(myConverterData->locale[0] == 'k'){
b75a7d8f
A
707 if(choice<=UCNV_RESET_TO_UNICODE) {
708 setInitialStateToUnicodeKR(converter, myConverterData);
709 }
710 if(choice!=UCNV_RESET_TO_UNICODE) {
711 setInitialStateFromUnicodeKR(converter, myConverterData);
712 }
713 }
714 }
715}
716
46f4442e 717static const char*
b75a7d8f
A
718_ISO2022getName(const UConverter* cnv){
719 if(cnv->extraInfo){
720 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
721 return myData->name;
722 }
723 return NULL;
724}
725
b75a7d8f 726
374ca955
A
727/*************** to unicode *******************/
728/****************************************************************************
729 * Recognized escape sequences are
730 * <ESC>(B ASCII
731 * <ESC>.A ISO-8859-1
732 * <ESC>.F ISO-8859-7
733 * <ESC>(J JISX-201
734 * <ESC>(I JISX-201
735 * <ESC>$B JISX-208
736 * <ESC>$@ JISX-208
737 * <ESC>$(D JISX-212
738 * <ESC>$A GB2312
739 * <ESC>$(C KSC5601
740 */
46f4442e 741static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
374ca955
A
742/* 0 1 2 3 4 5 6 7 8 9 */
743 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
744 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
745 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
746 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
747 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
748 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
749 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
750 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
751};
b75a7d8f 752
b331163b 753#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955 754/*************** to unicode *******************/
46f4442e 755static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
374ca955
A
756/* 0 1 2 3 4 5 6 7 8 9 */
757 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
760 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
761 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
762 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
763 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
764 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
765};
b331163b 766#endif
b75a7d8f 767
b75a7d8f 768
46f4442e 769static UCNV_TableStates_2022
374ca955
A
770getKey_2022(char c,int32_t* key,int32_t* offset){
771 int32_t togo;
772 int32_t low = 0;
773 int32_t hi = MAX_STATES_2022;
774 int32_t oldmid=0;
b75a7d8f 775
374ca955
A
776 togo = normalize_esq_chars_2022[(uint8_t)c];
777 if(togo == 0) {
778 /* not a valid character anywhere in an escape sequence */
779 *key = 0;
780 *offset = 0;
781 return INVALID_2022;
782 }
783 togo = (*key << 5) + togo;
b75a7d8f 784
374ca955 785 while (hi != low) /*binary search*/{
b75a7d8f 786
57a6839d 787 int32_t mid = (hi+low) >> 1; /*Finds median*/
374ca955 788
46f4442e 789 if (mid == oldmid)
374ca955
A
790 break;
791
792 if (escSeqStateTable_Key_2022[mid] > togo){
793 hi = mid;
794 }
795 else if (escSeqStateTable_Key_2022[mid] < togo){
796 low = mid;
797 }
798 else /*we found it*/{
799 *key = togo;
800 *offset = mid;
46f4442e 801 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
374ca955
A
802 }
803 oldmid = mid;
b75a7d8f 804
b75a7d8f 805 }
b75a7d8f 806
374ca955
A
807 *key = 0;
808 *offset = 0;
809 return INVALID_2022;
b75a7d8f
A
810}
811
374ca955
A
812/*runs through a state machine to determine the escape sequence - codepage correspondance
813 */
46f4442e 814static void
374ca955 815changeState_2022(UConverter* _this,
46f4442e 816 const char** source,
374ca955
A
817 const char* sourceLimit,
818 Variant2022 var,
819 UErrorCode* err){
820 UCNV_TableStates_2022 value;
821 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
822 uint32_t key = myData2022->key;
73c04bcf 823 int32_t offset = 0;
fd0068a8 824 int8_t initialToULength = _this->toULength;
374ca955
A
825 char c;
826
827 value = VALID_NON_TERMINAL_2022;
828 while (*source < sourceLimit) {
829 c = *(*source)++;
830 _this->toUBytes[_this->toULength++]=(uint8_t)c;
831 value = getKey_2022(c,(int32_t *) &key, &offset);
46f4442e 832
374ca955 833 switch (value){
b75a7d8f 834
374ca955
A
835 case VALID_NON_TERMINAL_2022 :
836 /* continue with the loop */
837 break;
b75a7d8f 838
374ca955
A
839 case VALID_TERMINAL_2022:
840 key = 0;
841 goto DONE;
b75a7d8f 842
374ca955
A
843 case INVALID_2022:
844 goto DONE;
b75a7d8f 845
374ca955
A
846 case VALID_MAYBE_TERMINAL_2022:
847#ifdef U_ENABLE_GENERIC_ISO_2022
848 /* ESC ( B is ambiguous only for ISO_2022 itself */
849 if(var == ISO_2022) {
850 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
851 _this->toULength = 0;
b75a7d8f 852
374ca955
A
853 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
854
855 /* continue with the loop */
856 value = VALID_NON_TERMINAL_2022;
857 break;
858 } else
859#endif
860 {
861 /* not ISO_2022 itself, finish here */
862 value = VALID_TERMINAL_2022;
863 key = 0;
864 goto DONE;
b75a7d8f
A
865 }
866 }
b75a7d8f 867 }
b75a7d8f 868
374ca955
A
869DONE:
870 myData2022->key = key;
b75a7d8f 871
374ca955
A
872 if (value == VALID_NON_TERMINAL_2022) {
873 /* indicate that the escape sequence is incomplete: key!=0 */
874 return;
875 } else if (value == INVALID_2022 ) {
876 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
374ca955
A
877 } else /* value == VALID_TERMINAL_2022 */ {
878 switch(var){
879#ifdef U_ENABLE_GENERIC_ISO_2022
880 case ISO_2022:
881 {
882 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
883 if(chosenConverterName == NULL) {
884 /* SS2 or SS3 */
885 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
46f4442e 886 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955 887 return;
b75a7d8f 888 }
374ca955
A
889
890 _this->mode = UCNV_SI;
891 ucnv_close(myData2022->currentConverter);
892 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
893 if(U_SUCCESS(*err)) {
894 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
895 _this->mode = UCNV_SO;
896 }
897 break;
898 }
899#endif
900 case ISO_2022_JP:
901 {
46f4442e 902 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
374ca955
A
903 switch(tempState) {
904 case INVALID_STATE:
905 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
906 break;
907 case SS2_STATE:
908 if(myData2022->toU2022State.cs[2]!=0) {
909 if(myData2022->toU2022State.g<2) {
910 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
911 }
912 myData2022->toU2022State.g=2;
913 } else {
914 /* illegal to have SS2 before a matching designator */
915 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
916 }
917 break;
918 /* case SS3_STATE: not used in ISO-2022-JP-x */
919 case ISO8859_1:
920 case ISO8859_7:
921 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
922 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
923 } else {
924 /* G2 charset for SS2 */
925 myData2022->toU2022State.cs[2]=(int8_t)tempState;
926 }
927 break;
928 default:
929 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
930 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
931 } else {
932 /* G0 charset */
933 myData2022->toU2022State.cs[0]=(int8_t)tempState;
934 }
935 break;
936 }
937 }
938 break;
b331163b 939#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
940 case ISO_2022_CN:
941 {
46f4442e 942 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
374ca955
A
943 switch(tempState) {
944 case INVALID_STATE:
945 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
946 break;
947 case SS2_STATE:
948 if(myData2022->toU2022State.cs[2]!=0) {
949 if(myData2022->toU2022State.g<2) {
950 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
951 }
952 myData2022->toU2022State.g=2;
953 } else {
954 /* illegal to have SS2 before a matching designator */
955 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
956 }
957 break;
958 case SS3_STATE:
959 if(myData2022->toU2022State.cs[3]!=0) {
960 if(myData2022->toU2022State.g<2) {
961 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962 }
963 myData2022->toU2022State.g=3;
964 } else {
965 /* illegal to have SS3 before a matching designator */
966 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967 }
968 break;
969 case ISO_IR_165:
970 if(myData2022->version==0) {
971 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
972 break;
973 }
73c04bcf 974 /*fall through*/
374ca955 975 case GB2312_1:
73c04bcf 976 /*fall through*/
374ca955
A
977 case CNS_11643_1:
978 myData2022->toU2022State.cs[1]=(int8_t)tempState;
979 break;
980 case CNS_11643_2:
981 myData2022->toU2022State.cs[2]=(int8_t)tempState;
982 break;
983 default:
984 /* other CNS 11643 planes */
985 if(myData2022->version==0) {
986 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
987 } else {
988 myData2022->toU2022State.cs[3]=(int8_t)tempState;
989 }
990 break;
991 }
992 }
993 break;
994 case ISO_2022_KR:
995 if(offset==0x30){
996 /* nothing to be done, just accept this one escape sequence */
997 } else {
998 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
999 }
1000 break;
b331163b 1001#endif // !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
1002
1003 default:
1004 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1005 break;
1006 }
1007 }
1008 if(U_SUCCESS(*err)) {
1009 _this->toULength = 0;
fd0068a8
A
1010 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1011 if(_this->toULength>1) {
1012 /*
1013 * Ticket 5691: consistent illegal sequences:
1014 * - We include at least the first byte (ESC) in the illegal sequence.
1015 * - If any of the non-initial bytes could be the start of a character,
1016 * we stop the illegal sequence before the first one of those.
1017 * In escape sequences, all following bytes are "printable", that is,
1018 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1019 * they are valid single/lead bytes.
1020 * For simplicity, we always only report the initial ESC byte as the
1021 * illegal sequence and back out all other bytes we looked at.
1022 */
1023 /* Back out some bytes. */
1024 int8_t backOutDistance=_this->toULength-1;
1025 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1026 if(backOutDistance<=bytesFromThisBuffer) {
1027 /* same as initialToULength<=1 */
1028 *source-=backOutDistance;
1029 } else {
1030 /* Back out bytes from the previous buffer: Need to replay them. */
1031 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1032 /* same as -(initialToULength-1) */
1033 /* preToULength is negative! */
1034 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1035 *source-=bytesFromThisBuffer;
1036 }
1037 _this->toULength=1;
1038 }
46f4442e
A
1039 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1040 _this->toUCallbackReason = UCNV_UNASSIGNED;
374ca955
A
1041 }
1042}
1043
b331163b 1044#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
1045/*Checks the characters of the buffer against valid 2022 escape sequences
1046*if the match we return a pointer to the initial start of the sequence otherwise
1047*we return sourceLimit
1048*/
1049/*for 2022 looks ahead in the stream
1050 *to determine the longest possible convertible
1051 *data stream
1052 */
4388f060 1053static inline const char*
374ca955
A
1054getEndOfBuffer_2022(const char** source,
1055 const char* sourceLimit,
4388f060 1056 UBool /*flush*/){
374ca955
A
1057
1058 const char* mySource = *source;
1059
1060#ifdef U_ENABLE_GENERIC_ISO_2022
46f4442e 1061 if (*source >= sourceLimit)
374ca955
A
1062 return sourceLimit;
1063
1064 do{
1065
1066 if (*mySource == ESC_2022){
1067 int8_t i;
1068 int32_t key = 0;
1069 int32_t offset;
1070 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1071
1072 /* Kludge: I could not
1073 * figure out the reason for validating an escape sequence
1074 * twice - once here and once in changeState_2022().
1075 * is it possible to have an ESC character in a ISO2022
1076 * byte stream which is valid in a code page? Is it legal?
1077 */
46f4442e 1078 for (i=0;
374ca955
A
1079 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1080 i++) {
1081 value = getKey_2022(*(mySource+i), &key, &offset);
1082 }
46f4442e 1083 if (value > 0 || *mySource==ESC_2022)
374ca955
A
1084 return mySource;
1085
46f4442e 1086 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
374ca955
A
1087 return sourceLimit;
1088 }
1089 }while (++mySource < sourceLimit);
1090
1091 return sourceLimit;
1092#else
1093 while(mySource < sourceLimit && *mySource != ESC_2022) {
1094 ++mySource;
1095 }
1096 return mySource;
1097#endif
1098}
b331163b 1099#endif
374ca955
A
1100
1101/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
46f4442e
A
1102 * any future change in _MBCSFromUChar32() function should be reflected here.
1103 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
374ca955 1104 */
4388f060 1105static inline int32_t
374ca955 1106MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
46f4442e
A
1107 UChar32 c,
1108 uint32_t* value,
1109 UBool useFallback,
374ca955
A
1110 int outputType)
1111{
1112 const int32_t *cx;
1113 const uint16_t *table;
1114 uint32_t stage2Entry;
1115 uint32_t myValue;
46f4442e 1116 int32_t length;
374ca955 1117 const uint8_t *p;
46f4442e
A
1118 /*
1119 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1120 * Use internal version of ucnv_open() that verifies that the new structures are available,
1121 * else U_INTERNAL_PROGRAM_ERROR.
1122 */
374ca955
A
1123 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1124 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1125 table=sharedData->mbcs.fromUnicodeTable;
1126 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1127 /* get the bytes and the length for the output */
1128 if(outputType==MBCS_OUTPUT_2){
1129 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1130 if(myValue<=0xff) {
46f4442e 1131 length=1;
374ca955 1132 } else {
46f4442e 1133 length=2;
374ca955
A
1134 }
1135 } else /* outputType==MBCS_OUTPUT_3 */ {
1136 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1137 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1138 if(myValue<=0xff) {
46f4442e 1139 length=1;
374ca955 1140 } else if(myValue<=0xffff) {
46f4442e 1141 length=2;
374ca955 1142 } else {
46f4442e 1143 length=3;
b75a7d8f
A
1144 }
1145 }
1146 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1147 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1148 /* assigned */
1149 *value=myValue;
1150 return length;
1151 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
b75a7d8f 1152 /*
374ca955 1153 * We allow a 0 byte output if the "assigned" bit is set for this entry.
b75a7d8f 1154 * There is no way with this data structure for fallback output
374ca955 1155 * to be a zero byte.
b75a7d8f 1156 */
b75a7d8f 1157 *value=myValue;
46f4442e 1158 return -length;
b75a7d8f 1159 }
b75a7d8f 1160 }
374ca955
A
1161
1162 cx=sharedData->mbcs.extIndexes;
1163 if(cx!=NULL) {
46f4442e 1164 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
374ca955
A
1165 }
1166
1167 /* unassigned */
46f4442e 1168 return 0;
b75a7d8f
A
1169}
1170
1171/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
46f4442e
A
1172 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1173 * @param retval pointer to output byte
1174 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
b75a7d8f 1175 */
4388f060 1176static inline int32_t
b75a7d8f 1177MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
46f4442e
A
1178 UChar32 c,
1179 uint32_t* retval,
b75a7d8f
A
1180 UBool useFallback)
1181{
46f4442e 1182 const uint16_t *table;
b75a7d8f
A
1183 int32_t value;
1184 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955 1185 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
46f4442e 1186 return 0;
b75a7d8f
A
1187 }
1188 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
374ca955 1189 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 1190 /* get the byte for the output */
374ca955 1191 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
b75a7d8f 1192 /* is this code point assigned, or do we use fallbacks? */
46f4442e
A
1193 *retval=(uint32_t)(value&0xff);
1194 if(value>=0xf00) {
1195 return 1; /* roundtrip */
1196 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1197 return -1; /* fallback taken */
b75a7d8f 1198 } else {
46f4442e 1199 return 0; /* no mapping */
b75a7d8f 1200 }
b75a7d8f
A
1201}
1202
46f4442e
A
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range; this includes values that do not fit in two
 * bytes, which are rejected rather than truncated.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* everything above the trail byte; exceeds 0xfe for values wider than two bytes */
    const uint32_t lead = value >> 8;
    const uint32_t trail = value & 0xff;

    if(lead >= 0xa1 && lead <= 0xfe && trail >= 0xa1 && trail <= 0xfe) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    }
    return 0;  /* not valid for ISO 2022 */
}
1219
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;

    /* leave the input alone unless both shifted bytes land in A1..FE */
    if( (uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1) ||
        (uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
#endif
1237
374ca955
A
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * To-Unicode conversion for the generic ISO-2022 converter.
 *
 * Alternates between two steps until the input is used up:
 * 1. While no escape sequence is pending (key==0), convert the bytes up to
 *    the next ESC (or the end of the buffer) with the current sub-converter;
 *    an "ASCII" converter is opened if none has been selected yet.
 * 2. Pass the input to changeState_2022(), which interprets the escape
 *    sequence.
 *
 * The function returns early on errors, on target overflow, when the whole
 * buffer was consumed, and when offsets are requested and input or output
 * advanced.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    UConverter *const outerConverter = args->converter;
    UConverterDataISO2022 *const data = (UConverterDataISO2022*)(outerConverter->extraInfo);
    const char *const bufferLimit = args->sourceLimit;
    int8_t length;

    while (args->source < bufferLimit) {
        if(data->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            const char *const runLimit = getEndOfBuffer_2022(&(args->source), bufferLimit, args->flush);

            if(args->source < runLimit) {
                if(data->currentConverter==NULL) {
                    data->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    data->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    outerConverter->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                data->isFirstBuffer=FALSE;
                const char *const runStart = args->source;
                const UChar *const targetStart = args->target;
                UConverter *const inner = data->currentConverter;

                /* temporarily present the sub-converter as the args' converter */
                args->converter = inner;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    runLimit,
                    args->offsets,
                    (UBool)(args->flush && runLimit == bufferLimit),
                    err);
                args->converter = outerConverter;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    outerConverter->UCharErrorBufferLength = inner->UCharErrorBufferLength;
                    length = outerConverter->UCharErrorBufferLength;
                    inner->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(outerConverter->UCharErrorBuffer,
                                    inner->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                const UBool failed = (UBool)U_FAILURE(*err);
                const UBool bufferDone = (UBool)(args->source == bufferLimit);
                const UBool offsetsNeedUpdate = (UBool)(args->offsets != NULL &&
                    (args->target != targetStart || args->source != runStart));
                const UBool partialBeforeEsc = (UBool)(runLimit < bufferLimit && inner->toULength > 0);

                if (failed || bufferDone || offsetsNeedUpdate || partialBeforeEsc) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        outerConverter->invalidCharLength = inner->invalidCharLength;
                        length = outerConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(outerConverter->invalidCharBuffer, inner->invalidCharBuffer, length);
                        }
                    } else {
                        outerConverter->toULength = inner->toULength;
                        length = outerConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(outerConverter->toUBytes, inner->toUBytes, length);
                            if(args->source < runLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a pending escape sequence */
        const char *const escapeStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               bufferLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != escapeStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
b75a7d8f
A
1351
1352/*
1353 * To Unicode Callback helper function
1354 */
46f4442e 1355static void
374ca955
A
1356toUnicodeCallback(UConverter *cnv,
1357 const uint32_t sourceChar, const uint32_t targetUniChar,
1358 UErrorCode* err){
b75a7d8f 1359 if(sourceChar>0xff){
374ca955
A
1360 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1361 cnv->toUBytes[1] = (uint8_t)sourceChar;
1362 cnv->toULength = 2;
b75a7d8f
A
1363 }
1364 else{
374ca955 1365 cnv->toUBytes[0] =(char) sourceChar;
fd0068a8 1366 cnv->toULength = 1;
b75a7d8f
A
1367 }
1368
1369 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
b75a7d8f
A
1370 *err = U_INVALID_CHAR_FOUND;
1371 }
1372 else{
b75a7d8f
A
1373 *err = U_ILLEGAL_CHAR_FOUND;
1374 }
b75a7d8f
A
1375}
1376
1377/**************************************ISO-2022-JP*************************************************/
1378
1379/************************************** IMPORTANT **************************************************
1380* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1381* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
46f4442e
A
1382* The converter iterates over each Unicode codepoint
1383* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1384* processed one char at a time it would make sense to reduce the extra processing a canned converter
b75a7d8f
A
1385* would do as far as possible.
1386*
46f4442e
A
1387* If the implementation of these macros or structure of sharedData struct change in the future, make
1388* sure that ISO-2022 is also changed.
b75a7d8f
A
1389***************************************************************************************************
1390*/
1391
1392/***************************************************************************************************
1393* Rules for ISO-2022-jp encoding
46f4442e 1394* (i) Escape sequences must be fully contained within a line they should not
b75a7d8f
A
1395* span new lines or CRs
1396* (ii) If the last character on a line is represented by two bytes then an ASCII or
1397* JIS-Roman character escape sequence should follow before the line terminates
46f4442e
A
1398* (iii) If the first character on the line is represented by two bytes then a two
1399* byte character escape sequence should precede it
b75a7d8f
A
1400* (iv) If no escape sequence is encountered then the characters are ASCII
1401* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1402* and invoked with SS2 (ESC N).
1403* (vi) If there is any G0 designation in text, there must be a switch to
1404* ASCII or to JIS X 0201-Roman before a space character (but not
1405* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1406* characters such as tab or CRLF.
1407* (vii) Supported encodings:
1408* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1409*
1410* source : RFC-1554
1411*
1412* JISX201, JISX208,JISX212 : new .cnv data files created
1413* KSC5601 : alias to ibm-949 mapping table
1414* GB2312 : alias to ibm-1386 mapping table
1415* ISO-8859-1 : Algorithmic implemented as LATIN1 case
1416* ISO-8859-7 : alias to ibm-9409 mapping table
1417*/
b75a7d8f 1418
374ca955
A
1419/* preference order of JP charsets */
1420static const StateEnum jpCharsetPref[]={
1421 ASCII,
1422 JISX201,
1423 ISO8859_1,
1424 ISO8859_7,
1425 JISX208,
1426 JISX212,
1427 GB2312,
1428 KSC5601,
1429 HWKANA_7BIT
b75a7d8f
A
1430};
1431
73c04bcf
A
/*
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 *
 * Each literal is the ESC byte (0x1B) followed by the printable bytes of the
 * designation; the parts are separate string literals that the compiler joins.
 */
static const char escSeqChars[][6] ={
    "\x1B" "(B",    /* <ESC>(B  ASCII       */
    "\x1B" ".A",    /* <ESC>.A  ISO-8859-1  */
    "\x1B" ".F",    /* <ESC>.F  ISO-8859-7  */
    "\x1B" "(J",    /* <ESC>(J  JISX-201    */
    "\x1B" "$B",    /* <ESC>$B  JISX-208    */
    "\x1B" "$(D",   /* <ESC>$(D JISX-212    */
    "\x1B" "$A",    /* <ESC>$A  GB2312      */
    "\x1B" "$(C",   /* <ESC>$(C KSC5601     */
    "\x1B" "(I"     /* <ESC>(I  HWKANA_7BIT */
};
/* Byte lengths of the escape sequences, one entry per charset in the order annotated below. */
static const int8_t escSeqCharsLen[] ={
    /* ASCII       <ESC>(B  */ 3,
    /* ISO-8859-1  <ESC>.A  */ 3,
    /* ISO-8859-7  <ESC>.F  */ 3,
    /* JISX-201    <ESC>(J  */ 3,
    /* JISX-208    <ESC>$B  */ 3,
    /* JISX-212    <ESC>$(D */ 4,
    /* GB2312      <ESC>$A  */ 3,
    /* KSC5601     <ESC>$(C */ 4,
    /* HWKANA_7BIT <ESC>(I  */ 3
};
1459
1460/*
1461* The iteration over various code pages works this way:
1462* i) Get the currentState from myConverterData->currentState
1463* ii) Check if the character is mapped to a valid character in the currentState
1464* Yes -> a) set the initIterState to currentState
1465* b) remain in this state until an invalid character is found
1466* No -> a) go to the next code page and find the character
46f4442e 1467* iii) Before changing the state increment the current state check if the current state
b75a7d8f
A
1468* is equal to the initIterState
1469* Yes -> A character that cannot be represented in any of the supported encodings
1470* break and return a U_INVALID_CHARACTER error
1471* No -> Continue and find the character in next code page
1472*
1473*
46f4442e 1474* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
b75a7d8f
A
1475*/
1476
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only 0x5c (to U+00A5) and 0x7e (to U+203E) are remapped; every other
 * value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1490
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 and U+203E take the byte values 0x5c and 0x7e, so U+005C and U+007E
 * themselves are unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}
1505
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range (above 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;
    uint32_t lead;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* lead byte, kept in bits 8..15: rebase (81..9F or E0..EF), then double */
    lead = value & 0xff00;
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    lead <<= 1;

    if(trail > 0x9e) {
        /* trail <= 0xfc */
        return lead | (trail - 0x7e);
    }

    /* trail bytes up to 0x9e use the lead value one below; there are two trail ranges, split after 0x7e */
    lead -= 0x100;
    return lead | (trail - (trail <= 0x7e ? 0x1f : 0x20));
}
1541
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first (lead) JIS X 0208 byte
 * @param c2    second (trail) JIS X 0208 byte
 * @param bytes receives the Shift-JIS lead and trail byte; a byte set to 0
 *              marks the pair as invalid
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1 & 1) != 0) {
        /* odd lead byte: the trail byte maps into one of two ranges split after 0x5f */
        c1 = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* even lead byte: the trail byte maps into a single range */
        trail = (c2 >= 0x21 && c2 <= 0x7e) ? (uint8_t)(c2 + 0x7e) : 0;
    }

    lead = (uint8_t)(c1 >> 1);
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1578
1579/*
1580 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1581 * Katakana.
1582 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1583 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1584 * These were the only fallbacks in ICU's jisx-208.ucm file.
1585 */
1586static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1587 0x2123, /* U+FF61 */
1588 0x2156,
1589 0x2157,
1590 0x2122,
1591 0x2126,
1592 0x2572,
1593 0x2521,
1594 0x2523,
1595 0x2525,
1596 0x2527,
1597 0x2529,
1598 0x2563,
1599 0x2565,
1600 0x2567,
1601 0x2543,
1602 0x213C, /* U+FF70 */
1603 0x2522,
1604 0x2524,
1605 0x2526,
1606 0x2528,
1607 0x252A,
1608 0x252B,
1609 0x252D,
1610 0x252F,
1611 0x2531,
1612 0x2533,
1613 0x2535,
1614 0x2537,
1615 0x2539,
1616 0x253B,
1617 0x253D,
1618 0x253F, /* U+FF80 */
1619 0x2541,
1620 0x2544,
1621 0x2546,
1622 0x2548,
1623 0x254A,
1624 0x254B,
1625 0x254C,
1626 0x254D,
1627 0x254E,
1628 0x254F,
1629 0x2552,
1630 0x2555,
1631 0x2558,
1632 0x255B,
1633 0x255E,
1634 0x255F, /* U+FF90 */
1635 0x2560,
1636 0x2561,
1637 0x2562,
1638 0x2564,
1639 0x2566,
1640 0x2568,
1641 0x2569,
1642 0x256A,
1643 0x256B,
1644 0x256C,
1645 0x256D,
1646 0x256F,
1647 0x2573,
1648 0x212B,
1649 0x212C /* U+FF9F */
1650};
1651
1652static void
374ca955 1653UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
46f4442e 1654 UConverter *cnv = args->converter;
b75a7d8f 1655 UConverterDataISO2022 *converterData;
374ca955
A
1656 ISO2022State *pFromU2022State;
1657 uint8_t *target = (uint8_t *) args->target;
1658 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
1659 const UChar* source = args->source;
1660 const UChar* sourceLimit = args->sourceLimit;
1661 int32_t* offsets = args->offsets;
374ca955
A
1662 UChar32 sourceChar;
1663 char buffer[8];
1664 int32_t len, outLen;
1665 int8_t choices[10];
1666 int32_t choiceCount;
73c04bcf 1667 uint32_t targetValue = 0;
374ca955
A
1668 UBool useFallback;
1669
1670 int32_t i;
1671 int8_t cs, g;
1672
1673 /* set up the state */
46f4442e 1674 converterData = (UConverterDataISO2022*)cnv->extraInfo;
374ca955 1675 pFromU2022State = &converterData->fromU2022State;
374ca955
A
1676
1677 choiceCount = 0;
b75a7d8f 1678
b75a7d8f 1679 /* check if the last codepoint of previous buffer was a lead surrogate*/
46f4442e 1680 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
1681 goto getTrail;
1682 }
b75a7d8f 1683
374ca955
A
1684 while(source < sourceLimit) {
1685 if(target < targetLimit) {
b75a7d8f 1686
b75a7d8f 1687 sourceChar = *(source++);
374ca955 1688 /*check if the char is a First surrogate*/
4388f060
A
1689 if(U16_IS_SURROGATE(sourceChar)) {
1690 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
374ca955
A
1691getTrail:
1692 /*look ahead to find the trail surrogate*/
1693 if(source < sourceLimit) {
1694 /* test the following code unit */
1695 UChar trail=(UChar) *source;
4388f060 1696 if(U16_IS_TRAIL(trail)) {
374ca955 1697 source++;
4388f060 1698 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
46f4442e 1699 cnv->fromUChar32=0x00;
374ca955
A
1700 /* convert this supplementary code point */
1701 /* exit this condition tree */
1702 } else {
1703 /* this is an unmatched lead code unit (1st surrogate) */
1704 /* callback(illegal) */
1705 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1706 cnv->fromUChar32=sourceChar;
374ca955 1707 break;
b75a7d8f 1708 }
374ca955
A
1709 } else {
1710 /* no more input */
46f4442e 1711 cnv->fromUChar32=sourceChar;
b75a7d8f
A
1712 break;
1713 }
374ca955
A
1714 } else {
1715 /* this is an unmatched trail code unit (2nd surrogate) */
1716 /* callback(illegal) */
1717 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1718 cnv->fromUChar32=sourceChar;
374ca955
A
1719 break;
1720 }
b75a7d8f
A
1721 }
1722
73c04bcf
A
1723 /* do not convert SO/SI/ESC */
1724 if(IS_2022_CONTROL(sourceChar)) {
1725 /* callback(illegal) */
1726 *err=U_ILLEGAL_CHAR_FOUND;
46f4442e 1727 cnv->fromUChar32=sourceChar;
73c04bcf
A
1728 break;
1729 }
1730
374ca955 1731 /* do the conversion */
b75a7d8f 1732
374ca955
A
1733 if(choiceCount == 0) {
1734 uint16_t csm;
b75a7d8f 1735
374ca955
A
1736 /*
1737 * The csm variable keeps track of which charsets are allowed
1738 * and not used yet while building the choices[].
1739 */
1740 csm = jpCharsetMasks[converterData->version];
1741 choiceCount = 0;
1742
1743 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1744 if(converterData->version == 3 || converterData->version == 4) {
46f4442e 1745 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
374ca955 1746 }
46f4442e
A
1747 /* Do not try single-byte half-width Katakana for other versions. */
1748 csm &= ~CSM(HWKANA_7BIT);
b75a7d8f 1749
374ca955
A
1750 /* try the current G0 charset */
1751 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1752 csm &= ~CSM(cs);
b75a7d8f 1753
374ca955
A
1754 /* try the current G2 charset */
1755 if((cs = pFromU2022State->cs[2]) != 0) {
1756 choices[choiceCount++] = cs;
1757 csm &= ~CSM(cs);
1758 }
1759
1760 /* try all the other possible charsets */
b331163b 1761 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
374ca955
A
1762 cs = (int8_t)jpCharsetPref[i];
1763 if(CSM(cs) & csm) {
1764 choices[choiceCount++] = cs;
1765 csm &= ~CSM(cs);
b75a7d8f
A
1766 }
1767 }
374ca955 1768 }
b75a7d8f 1769
374ca955 1770 cs = g = 0;
46f4442e
A
1771 /*
1772 * len==0: no mapping found yet
1773 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1774 * len>0: found a roundtrip result, done
1775 */
374ca955 1776 len = 0;
46f4442e
A
1777 /*
1778 * We will turn off useFallback after finding a fallback,
1779 * but we still get fallbacks from PUA code points as usual.
1780 * Therefore, we will also need to check that we don't overwrite
1781 * an early fallback with a later one.
1782 */
1783 useFallback = cnv->useFallback;
374ca955 1784
46f4442e
A
1785 for(i = 0; i < choiceCount && len <= 0; ++i) {
1786 uint32_t value;
1787 int32_t len2;
1788 int8_t cs0 = choices[i];
1789 switch(cs0) {
374ca955
A
1790 case ASCII:
1791 if(sourceChar <= 0x7f) {
1792 targetValue = (uint32_t)sourceChar;
1793 len = 1;
46f4442e
A
1794 cs = cs0;
1795 g = 0;
b75a7d8f 1796 }
374ca955
A
1797 break;
1798 case ISO8859_1:
46f4442e 1799 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
374ca955
A
1800 targetValue = (uint32_t)sourceChar - 0x80;
1801 len = 1;
46f4442e 1802 cs = cs0;
374ca955
A
1803 g = 2;
1804 }
1805 break;
1806 case HWKANA_7BIT:
46f4442e 1807 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
374ca955
A
1808 if(converterData->version==3) {
1809 /* JIS7: use G1 (SO) */
46f4442e
A
1810 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1811 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1812 len = 1;
1813 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
374ca955
A
1814 g = 1;
1815 } else if(converterData->version==4) {
1816 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
46f4442e
A
1817 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1818 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1819 len = 1;
374ca955 1820
46f4442e
A
1821 cs = pFromU2022State->cs[0];
1822 if(IS_JP_DBCS(cs)) {
374ca955
A
1823 /* switch from a DBCS charset to JISX201 */
1824 cs = (int8_t)JISX201;
b75a7d8f 1825 }
46f4442e
A
1826 /* else stay in the current G0 charset */
1827 g = 0;
b75a7d8f 1828 }
46f4442e 1829 /* else do not use HWKANA_7BIT with other versions */
b75a7d8f 1830 }
374ca955
A
1831 break;
1832 case JISX201:
1833 /* G0 SBCS */
46f4442e
A
1834 value = jisx201FromU(sourceChar);
1835 if(value <= 0x7f) {
1836 targetValue = value;
374ca955 1837 len = 1;
46f4442e
A
1838 cs = cs0;
1839 g = 0;
1840 useFallback = FALSE;
1841 }
1842 break;
1843 case JISX208:
1844 /* G0 DBCS from Shift-JIS table */
1845 len2 = MBCS_FROM_UCHAR32_ISO2022(
1846 converterData->myConverterArray[cs0],
1847 sourceChar, &value,
1848 useFallback, MBCS_OUTPUT_2);
1849 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1850 value = _2022FromSJIS(value);
1851 if(value != 0) {
1852 targetValue = value;
1853 len = len2;
1854 cs = cs0;
1855 g = 0;
1856 useFallback = FALSE;
1857 }
1858 } else if(len == 0 && useFallback &&
1859 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1860 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1861 len = -2;
1862 cs = cs0;
1863 g = 0;
1864 useFallback = FALSE;
374ca955
A
1865 }
1866 break;
1867 case ISO8859_7:
1868 /* G0 SBCS forced to 7-bit output */
46f4442e
A
1869 len2 = MBCS_SINGLE_FROM_UCHAR32(
1870 converterData->myConverterArray[cs0],
1871 sourceChar, &value,
1872 useFallback);
1873 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1874 targetValue = value - 0x80;
1875 len = len2;
1876 cs = cs0;
374ca955 1877 g = 2;
46f4442e 1878 useFallback = FALSE;
374ca955
A
1879 }
1880 break;
1881 default:
1882 /* G0 DBCS */
46f4442e
A
1883 len2 = MBCS_FROM_UCHAR32_ISO2022(
1884 converterData->myConverterArray[cs0],
1885 sourceChar, &value,
1886 useFallback, MBCS_OUTPUT_2);
1887 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1888 if(cs0 == KSC5601) {
1889 /*
1890 * Check for valid bytes for the encoding scheme.
1891 * This is necessary because the sub-converter (windows-949)
1892 * has a broader encoding scheme than is valid for 2022.
1893 */
1894 value = _2022FromGR94DBCS(value);
1895 if(value == 0) {
1896 break;
1897 }
1898 }
1899 targetValue = value;
1900 len = len2;
1901 cs = cs0;
1902 g = 0;
1903 useFallback = FALSE;
374ca955
A
1904 }
1905 break;
b75a7d8f
A
1906 }
1907 }
b75a7d8f 1908
46f4442e
A
1909 if(len != 0) {
1910 if(len < 0) {
1911 len = -len; /* fallback */
1912 }
374ca955
A
1913 outLen = 0; /* count output bytes */
1914
1915 /* write SI if necessary (only for JIS7) */
1916 if(pFromU2022State->g == 1 && g == 0) {
1917 buffer[outLen++] = UCNV_SI;
1918 pFromU2022State->g = 0;
1919 }
1920
1921 /* write the designation sequence if necessary */
1922 if(cs != pFromU2022State->cs[g]) {
1923 int32_t escLen = escSeqCharsLen[cs];
1924 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1925 outLen += escLen;
1926 pFromU2022State->cs[g] = cs;
1927
1928 /* invalidate the choices[] */
1929 choiceCount = 0;
1930 }
1931
1932 /* write the shift sequence if necessary */
1933 if(g != pFromU2022State->g) {
1934 switch(g) {
1935 /* case 0 handled before writing escapes */
1936 case 1:
1937 buffer[outLen++] = UCNV_SO;
1938 pFromU2022State->g = 1;
1939 break;
1940 default: /* case 2 */
1941 buffer[outLen++] = 0x1b;
1942 buffer[outLen++] = 0x4e;
1943 break;
1944 /* no case 3: no SS3 in ISO-2022-JP-x */
1945 }
1946 }
1947
1948 /* write the output bytes */
1949 if(len == 1) {
1950 buffer[outLen++] = (char)targetValue;
1951 } else /* len == 2 */ {
1952 buffer[outLen++] = (char)(targetValue >> 8);
1953 buffer[outLen++] = (char)targetValue;
1954 }
1955 } else {
1956 /*
46f4442e 1957 * if we cannot find the character after checking all codepages
b75a7d8f
A
1958 * then this is an error
1959 */
b75a7d8f 1960 *err = U_INVALID_CHAR_FOUND;
46f4442e 1961 cnv->fromUChar32=sourceChar;
374ca955
A
1962 break;
1963 }
1964
1965 if(sourceChar == CR || sourceChar == LF) {
1966 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1967 pFromU2022State->cs[2] = 0;
1968 choiceCount = 0;
1969 }
1970
1971 /* output outLen>0 bytes in buffer[] */
1972 if(outLen == 1) {
1973 *target++ = buffer[0];
1974 if(offsets) {
73c04bcf 1975 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
b75a7d8f 1976 }
374ca955
A
1977 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1978 *target++ = buffer[0];
1979 *target++ = buffer[1];
1980 if(offsets) {
1981 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1982 *offsets++ = sourceIndex;
1983 *offsets++ = sourceIndex;
1984 }
1985 } else {
73c04bcf 1986 fromUWriteUInt8(
46f4442e 1987 cnv,
374ca955 1988 buffer, outLen,
73c04bcf 1989 &target, (const char *)targetLimit,
374ca955
A
1990 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1991 err);
1992 if(U_FAILURE(*err)) {
b75a7d8f
A
1993 break;
1994 }
1995 }
1996 } /* end if(myTargetIndex<myTargetLength) */
1997 else{
1998 *err =U_BUFFER_OVERFLOW_ERROR;
1999 break;
2000 }
2001
2002 }/* end while(mySourceIndex<mySourceLength) */
2003
374ca955
A
2004 /*
2005 * the end of the input stream and detection of truncated input
2006 * are handled by the framework, but for ISO-2022-JP conversion
2007 * we need to be in ASCII mode at the very end
2008 *
2009 * conditions:
2010 * successful
2011 * in SO mode or not in ASCII mode
2012 * end of input and no truncated input
b75a7d8f 2013 */
374ca955
A
2014 if( U_SUCCESS(*err) &&
2015 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
46f4442e 2016 args->flush && source>=sourceLimit && cnv->fromUChar32==0
374ca955
A
2017 ) {
2018 int32_t sourceIndex;
2019
2020 outLen = 0;
2021
2022 if(pFromU2022State->g != 0) {
2023 buffer[outLen++] = UCNV_SI;
2024 pFromU2022State->g = 0;
2025 }
2026
2027 if(pFromU2022State->cs[0] != ASCII) {
2028 int32_t escLen = escSeqCharsLen[ASCII];
2029 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2030 outLen += escLen;
2031 pFromU2022State->cs[0] = (int8_t)ASCII;
2032 }
2033
2034 /* get the source index of the last input character */
2035 /*
2036 * TODO this would be simpler and more reliable if we used a pair
2037 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2038 * so that we could simply use the prevSourceIndex here;
2039 * this code gives an incorrect result for the rare case of an unmatched
2040 * trail surrogate that is alone in the last buffer of the text stream
2041 */
2042 sourceIndex=(int32_t)(source-args->source);
2043 if(sourceIndex>0) {
2044 --sourceIndex;
2045 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2046 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2047 ) {
2048 --sourceIndex;
2049 }
2050 } else {
2051 sourceIndex=-1;
2052 }
2053
73c04bcf 2054 fromUWriteUInt8(
46f4442e 2055 cnv,
374ca955 2056 buffer, outLen,
73c04bcf 2057 &target, (const char *)targetLimit,
374ca955
A
2058 &offsets, sourceIndex,
2059 err);
b75a7d8f
A
2060 }
2061
2062 /*save the state and return */
2063 args->source = source;
2064 args->target = (char*)target;
2065}
2066
2067/*************** to unicode *******************/
2068
46f4442e 2069static void
b75a7d8f 2070UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
374ca955 2071 UErrorCode* err){
46f4442e 2072 char tempBuf[2];
374ca955 2073 const char *mySource = (char *) args->source;
b75a7d8f
A
2074 UChar *myTarget = args->target;
2075 const char *mySourceLimit = args->sourceLimit;
2076 uint32_t targetUniChar = 0x0000;
2077 uint32_t mySourceChar = 0x0000;
46f4442e 2078 uint32_t tmpSourceChar = 0x0000;
b75a7d8f 2079 UConverterDataISO2022* myData;
374ca955
A
2080 ISO2022State *pToU2022State;
2081 StateEnum cs;
b75a7d8f 2082
b75a7d8f 2083 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
374ca955 2084 pToU2022State = &myData->toU2022State;
b75a7d8f 2085
374ca955
A
2086 if(myData->key != 0) {
2087 /* continue with a partial escape sequence */
2088 goto escape;
2089 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2090 /* continue with a partial double-byte character */
2091 mySourceChar = args->converter->toUBytes[0];
2092 args->converter->toULength = 0;
2093 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
fd0068a8 2094 targetUniChar = missingCharMarker;
374ca955
A
2095 goto getTrailByte;
2096 }
2097
2098 while(mySource < mySourceLimit){
2099
2100 targetUniChar =missingCharMarker;
b75a7d8f
A
2101
2102 if(myTarget < args->targetLimit){
2103
2104 mySourceChar= (unsigned char) *mySource++;
374ca955
A
2105
2106 switch(mySourceChar) {
2107 case UCNV_SI:
2108 if(myData->version==3) {
2109 pToU2022State->g=0;
b75a7d8f 2110 continue;
374ca955
A
2111 } else {
2112 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2113 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2114 break;
b75a7d8f 2115 }
b75a7d8f 2116
374ca955
A
2117 case UCNV_SO:
2118 if(myData->version==3) {
2119 /* JIS7: switch to G1 half-width Katakana */
2120 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2121 pToU2022State->g=1;
b75a7d8f 2122 continue;
374ca955
A
2123 } else {
2124 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 2125 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 2126 break;
b75a7d8f 2127 }
b75a7d8f 2128
374ca955
A
2129 case ESC_2022:
2130 mySource--;
2131escape:
d5d484b0
A
2132 {
2133 const char * mySourceBefore = mySource;
2134 int8_t toULengthBefore = args->converter->toULength;
2135
46f4442e 2136 changeState_2022(args->converter,&(mySource),
d5d484b0
A
2137 mySourceLimit, ISO_2022_JP,err);
2138
2139 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
46f4442e
A
2140 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2141 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2142 args->converter->toUCallbackReason = UCNV_IRREGULAR;
729e4ab9 2143 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
d5d484b0 2144 }
d5d484b0 2145 }
46f4442e 2146
374ca955
A
2147 /* invalid or illegal escape sequence */
2148 if(U_FAILURE(*err)){
2149 args->target = myTarget;
2150 args->source = mySource;
d5d484b0 2151 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
374ca955 2152 return;
b75a7d8f 2153 }
d5d484b0 2154 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
46f4442e 2155 if(myData->key==0) {
d5d484b0
A
2156 myData->isEmptySegment = TRUE;
2157 }
374ca955 2158 continue;
b75a7d8f 2159
374ca955 2160 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
b75a7d8f 2161
374ca955
A
2162 case CR:
2163 /*falls through*/
2164 case LF:
2165 /* automatically reset to single-byte mode */
2166 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2167 pToU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f 2168 }
374ca955
A
2169 pToU2022State->cs[2] = 0;
2170 pToU2022State->g = 0;
2171 /* falls through */
b75a7d8f 2172 default:
374ca955 2173 /* convert one or two bytes */
d5d484b0 2174 myData->isEmptySegment = FALSE;
374ca955
A
2175 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2176 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2177 !IS_JP_DBCS(cs)
2178 ) {
2179 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
46f4442e 2180 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
374ca955
A
2181
2182 /* return from a single-shift state to the previous one */
2183 if(pToU2022State->g >= 2) {
2184 pToU2022State->g=pToU2022State->prevG;
2185 }
2186 } else switch(cs) {
2187 case ASCII:
2188 if(mySourceChar <= 0x7f) {
2189 targetUniChar = mySourceChar;
2190 }
2191 break;
2192 case ISO8859_1:
2193 if(mySourceChar <= 0x7f) {
2194 targetUniChar = mySourceChar + 0x80;
2195 }
2196 /* return from a single-shift state to the previous one */
2197 pToU2022State->g=pToU2022State->prevG;
2198 break;
2199 case ISO8859_7:
2200 if(mySourceChar <= 0x7f) {
2201 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2202 targetUniChar =
2203 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2204 myData->myConverterArray[cs],
2205 mySourceChar + 0x80);
2206 }
2207 /* return from a single-shift state to the previous one */
2208 pToU2022State->g=pToU2022State->prevG;
2209 break;
2210 case JISX201:
2211 if(mySourceChar <= 0x7f) {
46f4442e 2212 targetUniChar = jisx201ToU(mySourceChar);
374ca955
A
2213 }
2214 break;
2215 case HWKANA_7BIT:
2216 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2217 /* 7-bit halfwidth Katakana */
46f4442e 2218 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
374ca955
A
2219 }
2220 break;
2221 default:
2222 /* G0 DBCS */
2223 if(mySource < mySourceLimit) {
fd0068a8
A
2224 int leadIsOk, trailIsOk;
2225 uint8_t trailByte;
374ca955 2226getTrailByte:
fd0068a8 2227 trailByte = (uint8_t)*mySource;
fd0068a8
A
2228 /*
2229 * Ticket 5691: consistent illegal sequences:
2230 * - We include at least the first byte in the illegal sequence.
2231 * - If any of the non-initial bytes could be the start of a character,
46f4442e 2232 * we stop the illegal sequence before the first one of those.
fd0068a8
A
2233 *
2234 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2235 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2236 * Otherwise we convert or report the pair of bytes.
2237 */
2238 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2239 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2240 if (leadIsOk && trailIsOk) {
2241 ++mySource;
46f4442e
A
2242 tmpSourceChar = (mySourceChar << 8) | trailByte;
2243 if(cs == JISX208) {
2244 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2245 mySourceChar = tmpSourceChar;
2246 } else {
2247 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2248 mySourceChar = tmpSourceChar;
2249 if (cs == KSC5601) {
2250 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2251 }
2252 tempBuf[0] = (char)(tmpSourceChar >> 8);
2253 tempBuf[1] = (char)(tmpSourceChar);
2254 }
fd0068a8
A
2255 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2256 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2257 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2258 ++mySource;
2259 /* add another bit so that the code below writes 2 bytes in case of error */
2260 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2261 }
374ca955
A
2262 } else {
2263 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2264 args->converter->toULength = 1;
2265 goto endloop;
2266 }
46f4442e 2267 } /* End of inner switch */
b75a7d8f 2268 break;
46f4442e 2269 } /* End of outer switch */
b75a7d8f
A
2270 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2271 if(args->offsets){
73c04bcf 2272 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
2273 }
2274 *(myTarget++)=(UChar)targetUniChar;
b75a7d8f 2275 }
374ca955
A
2276 else if(targetUniChar > missingCharMarker){
2277 /* disassemble the surrogate pair and write to output*/
2278 targetUniChar-=0x0010000;
2279 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2280 if(args->offsets){
73c04bcf 2281 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2282 }
2283 ++myTarget;
46f4442e 2284 if(myTarget< args->targetLimit){
374ca955
A
2285 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2286 if(args->offsets){
73c04bcf 2287 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
2288 }
2289 ++myTarget;
2290 }else{
2291 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2292 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2293 }
b75a7d8f 2294
374ca955
A
2295 }
2296 else{
b75a7d8f 2297 /* Call the callback function*/
374ca955
A
2298 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2299 break;
b75a7d8f
A
2300 }
2301 }
46f4442e 2302 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
b75a7d8f
A
2303 *err =U_BUFFER_OVERFLOW_ERROR;
2304 break;
2305 }
2306 }
374ca955 2307endloop:
b75a7d8f
A
2308 args->target = myTarget;
2309 args->source = mySource;
2310}
2311
2312
b331163b 2313#if !UCONFIG_ONLY_HTML_CONVERSION
b75a7d8f
A
2314/***************************************************************
2315* Rules for ISO-2022-KR encoding
46f4442e 2316* i) The KSC5601 designator sequence should appear only once in a file,
b75a7d8f
A
2317* at the beginning of a line before any KSC5601 characters. This usually
2318* means that it appears by itself on the first line of the file
2319* ii) There are only 2 shifting sequences SO to shift into double byte mode
2320* and SI to shift into single byte mode
2321*/
46f4442e 2322static void
b75a7d8f
A
2323UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2324
374ca955
A
2325 UConverter* saveConv = args->converter;
2326 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2327 args->converter=myConverterData->currentConverter;
2328
2329 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2330 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2331 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2332
2333 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2334 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2335 uprv_memcpy(
2336 saveConv->charErrorBuffer,
2337 myConverterData->currentConverter->charErrorBuffer,
2338 myConverterData->currentConverter->charErrorBufferLength);
2339 }
2340 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2341 myConverterData->currentConverter->charErrorBufferLength = 0;
2342 }
2343 args->converter=saveConv;
b75a7d8f
A
2344}
2345
46f4442e 2346static void
b75a7d8f
A
2347UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2348
2349 const UChar *source = args->source;
2350 const UChar *sourceLimit = args->sourceLimit;
2351 unsigned char *target = (unsigned char *) args->target;
2352 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2353 int32_t* offsets = args->offsets;
2354 uint32_t targetByteUnit = 0x0000;
2355 UChar32 sourceChar = 0x0000;
2356 UBool isTargetByteDBCS;
2357 UBool oldIsTargetByteDBCS;
2358 UConverterDataISO2022 *converterData;
b75a7d8f
A
2359 UConverterSharedData* sharedData;
2360 UBool useFallback;
2361 int32_t length =0;
2362
b75a7d8f 2363 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
46f4442e
A
2364 /* if the version is 1 then the user is requesting
2365 * conversion with ibm-25546 pass the arguments to
b75a7d8f
A
2366 * MBCS converter and return
2367 */
2368 if(converterData->version==1){
2369 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2370 return;
2371 }
374ca955
A
2372
2373 /* initialize data */
2374 sharedData = converterData->currentConverter->sharedData;
2375 useFallback = args->converter->useFallback;
2376 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2377 oldIsTargetByteDBCS = isTargetByteDBCS;
46f4442e 2378
b75a7d8f 2379 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
374ca955 2380 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
b75a7d8f
A
2381 goto getTrail;
2382 }
2383 while(source < sourceLimit){
46f4442e 2384
b75a7d8f
A
2385 targetByteUnit = missingCharMarker;
2386
2387 if(target < (unsigned char*) args->targetLimit){
2388 sourceChar = *source++;
73c04bcf
A
2389
2390 /* do not convert SO/SI/ESC */
2391 if(IS_2022_CONTROL(sourceChar)) {
2392 /* callback(illegal) */
2393 *err=U_ILLEGAL_CHAR_FOUND;
2394 args->converter->fromUChar32=sourceChar;
2395 break;
2396 }
2397
46f4442e
A
2398 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2399 if(length < 0) {
2400 length = -length; /* fallback */
2401 }
b75a7d8f 2402 /* only DBCS or SBCS characters are expected*/
374ca955 2403 /* DB characters with high bit set to 1 are expected */
fd0068a8
A
2404 if( length > 2 || length==0 ||
2405 (length == 1 && targetByteUnit > 0x7f) ||
2406 (length == 2 &&
2407 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2408 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2409 ) {
b75a7d8f
A
2410 targetByteUnit=missingCharMarker;
2411 }
2412 if (targetByteUnit != missingCharMarker){
2413
2414 oldIsTargetByteDBCS = isTargetByteDBCS;
2415 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2416 /* append the shift sequence */
2417 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
46f4442e
A
2418
2419 if (isTargetByteDBCS)
b75a7d8f 2420 *target++ = UCNV_SO;
46f4442e 2421 else
b75a7d8f
A
2422 *target++ = UCNV_SI;
2423 if(offsets)
73c04bcf 2424 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2425 }
2426 /* write the targetUniChar to target */
2427 if(targetByteUnit <= 0x00FF){
2428 if( target < targetLimit){
2429 *(target++) = (unsigned char) targetByteUnit;
2430 if(offsets){
73c04bcf 2431 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2432 }
2433
2434 }else{
2435 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2436 *err = U_BUFFER_OVERFLOW_ERROR;
2437 }
2438 }else{
2439 if(target < targetLimit){
2440 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2441 if(offsets){
73c04bcf 2442 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2443 }
2444 if(target < targetLimit){
2445 *(target++) =(unsigned char) (targetByteUnit -0x80);
2446 if(offsets){
73c04bcf 2447 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2448 }
2449 }else{
2450 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2451 *err = U_BUFFER_OVERFLOW_ERROR;
2452 }
2453 }else{
2454 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2455 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2456 *err = U_BUFFER_OVERFLOW_ERROR;
2457 }
2458 }
2459
2460 }
2461 else{
2462 /* oops.. the code point is unassingned
2463 * set the error and reason
2464 */
b75a7d8f
A
2465
2466 /*check if the char is a First surrogate*/
4388f060
A
2467 if(U16_IS_SURROGATE(sourceChar)) {
2468 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
b75a7d8f
A
2469getTrail:
2470 /*look ahead to find the trail surrogate*/
2471 if(source < sourceLimit) {
2472 /* test the following code unit */
2473 UChar trail=(UChar) *source;
4388f060 2474 if(U16_IS_TRAIL(trail)) {
b75a7d8f 2475 source++;
4388f060 2476 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
b75a7d8f 2477 *err = U_INVALID_CHAR_FOUND;
b75a7d8f
A
2478 /* convert this surrogate code point */
2479 /* exit this condition tree */
2480 } else {
2481 /* this is an unmatched lead code unit (1st surrogate) */
2482 /* callback(illegal) */
b75a7d8f
A
2483 *err=U_ILLEGAL_CHAR_FOUND;
2484 }
2485 } else {
2486 /* no more input */
2487 *err = U_ZERO_ERROR;
b75a7d8f
A
2488 }
2489 } else {
2490 /* this is an unmatched trail code unit (2nd surrogate) */
2491 /* callback(illegal) */
b75a7d8f
A
2492 *err=U_ILLEGAL_CHAR_FOUND;
2493 }
374ca955
A
2494 } else {
2495 /* callback(unassigned) for a BMP code point */
2496 *err = U_INVALID_CHAR_FOUND;
b75a7d8f 2497 }
b75a7d8f 2498
374ca955 2499 args->converter->fromUChar32=sourceChar;
374ca955 2500 break;
b75a7d8f
A
2501 }
2502 } /* end if(myTargetIndex<myTargetLength) */
2503 else{
2504 *err =U_BUFFER_OVERFLOW_ERROR;
2505 break;
2506 }
2507
2508 }/* end while(mySourceIndex<mySourceLength) */
2509
374ca955
A
2510 /*
2511 * the end of the input stream and detection of truncated input
2512 * are handled by the framework, but for ISO-2022-KR conversion
2513 * we need to be in ASCII mode at the very end
2514 *
2515 * conditions:
2516 * successful
2517 * not in ASCII mode
2518 * end of input and no truncated input
b75a7d8f 2519 */
374ca955
A
2520 if( U_SUCCESS(*err) &&
2521 isTargetByteDBCS &&
2522 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2523 ) {
2524 int32_t sourceIndex;
2525
2526 /* we are switching to ASCII */
2527 isTargetByteDBCS=FALSE;
2528
2529 /* get the source index of the last input character */
2530 /*
2531 * TODO this would be simpler and more reliable if we used a pair
2532 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2533 * so that we could simply use the prevSourceIndex here;
2534 * this code gives an incorrect result for the rare case of an unmatched
2535 * trail surrogate that is alone in the last buffer of the text stream
2536 */
2537 sourceIndex=(int32_t)(source-args->source);
2538 if(sourceIndex>0) {
2539 --sourceIndex;
2540 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2541 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2542 ) {
2543 --sourceIndex;
2544 }
2545 } else {
2546 sourceIndex=-1;
2547 }
2548
73c04bcf 2549 fromUWriteUInt8(
374ca955
A
2550 args->converter,
2551 SHIFT_IN_STR, 1,
73c04bcf 2552 &target, (const char *)targetLimit,
374ca955
A
2553 &offsets, sourceIndex,
2554 err);
b75a7d8f
A
2555 }
2556
2557 /*save the state and return */
2558 args->source = source;
2559 args->target = (char*)target;
2560 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2561}
2562
2563/************************ To Unicode ***************************************/
2564
46f4442e 2565static void
b75a7d8f
A
2566UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2567 UErrorCode* err){
b75a7d8f 2568 char const* sourceStart;
b75a7d8f 2569 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
b75a7d8f 2570
374ca955
A
2571 UConverterToUnicodeArgs subArgs;
2572 int32_t minArgsSize;
2573
2574 /* set up the subconverter arguments */
2575 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2576 minArgsSize = args->size;
2577 } else {
2578 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2579 }
2580
2581 uprv_memcpy(&subArgs, args, minArgsSize);
2582 subArgs.size = (uint16_t)minArgsSize;
2583 subArgs.converter = myData->currentConverter;
2584
2585 /* remember the original start of the input for offsets */
2586 sourceStart = args->source;
2587
2588 if(myData->key != 0) {
2589 /* continue with a partial escape sequence */
2590 goto escape;
2591 }
2592
2593 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
b75a7d8f 2594 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
374ca955
A
2595 subArgs.source = args->source;
2596 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2597 if(subArgs.source != subArgs.sourceLimit) {
2598 /*
2599 * get the current partial byte sequence
2600 *
2601 * it needs to be moved between the public and the subconverter
2602 * so that the conversion framework, which only sees the public
2603 * converter, can handle truncated and illegal input etc.
2604 */
2605 if(args->converter->toULength > 0) {
2606 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2607 }
2608 subArgs.converter->toULength = args->converter->toULength;
2609
2610 /*
2611 * Convert up to the end of the input, or to before the next escape character.
2612 * Does not handle conversion extensions because the preToU[] state etc.
2613 * is not copied.
2614 */
2615 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2616
2617 if(args->offsets != NULL && sourceStart != args->source) {
2618 /* update offsets to base them on the actual start of the input */
2619 int32_t *offsets = args->offsets;
2620 UChar *target = args->target;
2621 int32_t delta = (int32_t)(args->source - sourceStart);
2622 while(target < subArgs.target) {
2623 if(*offsets >= 0) {
2624 *offsets += delta;
2625 }
2626 ++offsets;
2627 ++target;
2628 }
2629 }
2630 args->source = subArgs.source;
2631 args->target = subArgs.target;
2632 args->offsets = subArgs.offsets;
2633
2634 /* copy input/error/overflow buffers */
2635 if(subArgs.converter->toULength > 0) {
2636 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2637 }
2638 args->converter->toULength = subArgs.converter->toULength;
2639
2640 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2641 if(subArgs.converter->UCharErrorBufferLength > 0) {
2642 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2643 subArgs.converter->UCharErrorBufferLength);
2644 }
2645 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2646 subArgs.converter->UCharErrorBufferLength = 0;
b75a7d8f 2647 }
b75a7d8f
A
2648 }
2649
374ca955 2650 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
b75a7d8f 2651 return;
374ca955 2652 }
b75a7d8f 2653
374ca955 2654escape:
b75a7d8f 2655 changeState_2022(args->converter,
46f4442e 2656 &(args->source),
b75a7d8f 2657 args->sourceLimit,
b75a7d8f 2658 ISO_2022_KR,
b75a7d8f 2659 err);
374ca955 2660 }
b75a7d8f
A
2661}
2662
46f4442e 2663static void
b75a7d8f
A
2664UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2665 UErrorCode* err){
374ca955 2666 char tempBuf[2];
b75a7d8f
A
2667 const char *mySource = ( char *) args->source;
2668 UChar *myTarget = args->target;
2669 const char *mySourceLimit = args->sourceLimit;
2670 UChar32 targetUniChar = 0x0000;
2671 UChar mySourceChar = 0x0000;
2672 UConverterDataISO2022* myData;
b75a7d8f
A
2673 UConverterSharedData* sharedData ;
2674 UBool useFallback;
2675
374ca955
A
2676 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2677 if(myData->version==1){
2678 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
b75a7d8f
A
2679 return;
2680 }
374ca955 2681
b75a7d8f 2682 /* initialize state */
374ca955 2683 sharedData = myData->currentConverter->sharedData;
b75a7d8f 2684 useFallback = args->converter->useFallback;
46f4442e 2685
374ca955
A
2686 if(myData->key != 0) {
2687 /* continue with a partial escape sequence */
2688 goto escape;
2689 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2690 /* continue with a partial double-byte character */
2691 mySourceChar = args->converter->toUBytes[0];
2692 args->converter->toULength = 0;
2693 goto getTrailByte;
b75a7d8f 2694 }
b75a7d8f 2695
374ca955 2696 while(mySource< mySourceLimit){
b75a7d8f
A
2697
2698 if(myTarget < args->targetLimit){
2699
2700 mySourceChar= (unsigned char) *mySource++;
2701
2702 if(mySourceChar==UCNV_SI){
374ca955 2703 myData->toU2022State.g = 0;
d5d484b0
A
2704 if (myData->isEmptySegment) {
2705 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
46f4442e
A
2706 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2707 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2708 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
d5d484b0
A
2709 args->converter->toULength = 1;
2710 args->target = myTarget;
2711 args->source = mySource;
2712 return;
2713 }
b75a7d8f
A
2714 /*consume the source */
2715 continue;
2716 }else if(mySourceChar==UCNV_SO){
374ca955 2717 myData->toU2022State.g = 1;
d5d484b0 2718 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
b75a7d8f
A
2719 /*consume the source */
2720 continue;
374ca955
A
2721 }else if(mySourceChar==ESC_2022){
2722 mySource--;
2723escape:
d5d484b0 2724 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
46f4442e 2725 changeState_2022(args->converter,&(mySource),
374ca955 2726 mySourceLimit, ISO_2022_KR, err);
b75a7d8f
A
2727 if(U_FAILURE(*err)){
2728 args->target = myTarget;
2729 args->source = mySource;
2730 return;
2731 }
2732 continue;
46f4442e 2733 }
b75a7d8f 2734
d5d484b0 2735 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
374ca955
A
2736 if(myData->toU2022State.g == 1) {
2737 if(mySource < mySourceLimit) {
fd0068a8
A
2738 int leadIsOk, trailIsOk;
2739 uint8_t trailByte;
374ca955 2740getTrailByte:
fd0068a8
A
2741 targetUniChar = missingCharMarker;
2742 trailByte = (uint8_t)*mySource;
2743 /*
2744 * Ticket 5691: consistent illegal sequences:
2745 * - We include at least the first byte in the illegal sequence.
2746 * - If any of the non-initial bytes could be the start of a character,
2747 * we stop the illegal sequence before the first one of those.
2748 *
2749 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2750 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2751 * Otherwise we convert or report the pair of bytes.
2752 */
2753 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2754 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2755 if (leadIsOk && trailIsOk) {
2756 ++mySource;
2757 tempBuf[0] = (char)(mySourceChar + 0x80);
2758 tempBuf[1] = (char)(trailByte + 0x80);
2759 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2760 mySourceChar = (mySourceChar << 8) | trailByte;
2761 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2762 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2763 ++mySource;
2764 /* add another bit so that the code below writes 2 bytes in case of error */
2765 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
374ca955
A
2766 }
2767 } else {
2768 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2769 args->converter->toULength = 1;
2770 break;
b75a7d8f
A
2771 }
2772 }
fd0068a8 2773 else if(mySourceChar <= 0x7f) {
374ca955 2774 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
fd0068a8
A
2775 } else {
2776 targetUniChar = 0xffff;
b75a7d8f 2777 }
374ca955
A
2778 if(targetUniChar < 0xfffe){
2779 if(args->offsets) {
73c04bcf 2780 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955 2781 }
b75a7d8f
A
2782 *(myTarget++)=(UChar)targetUniChar;
2783 }
2784 else {
b75a7d8f 2785 /* Call the callback function*/
374ca955
A
2786 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2787 break;
b75a7d8f
A
2788 }
2789 }
2790 else{
2791 *err =U_BUFFER_OVERFLOW_ERROR;
2792 break;
2793 }
2794 }
b75a7d8f
A
2795 args->target = myTarget;
2796 args->source = mySource;
2797}
2798
2799/*************************** END ISO2022-KR *********************************/
2800
2801/*************************** ISO-2022-CN *********************************
2802*
2803* Rules for ISO-2022-CN Encoding:
374ca955 2804* i) The designator sequence must appear once on a line before any instance
b75a7d8f
A
2805* of character set it designates.
2806* ii) If two lines contain characters from the same character set, both lines
2807* must include the designator sequence.
374ca955 2808* iii) Once the designator sequence is known, a shifting sequence has to be found
b75a7d8f
A
2809* to invoke the shifting
2810* iv) All lines start in ASCII and end in ASCII.
2811* v) Four shifting sequences are employed for this purpose:
2812*
2813* Sequence ASCII Eq Charsets
2814* ---------- ------- ---------
374ca955
A
2815* SI <SI> US-ASCII
2816* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2817* SS2 <ESC>N CNS-11643-1992 Plane 2
2818* SS3 <ESC>O CNS-11643-1992 Planes 3-7
b75a7d8f
A
2819*
2820* vi)
2821* SOdesignator : ESC "$" ")" finalchar_for_SO
2822* SS2designator : ESC "$" "*" finalchar_for_SS2
2823* SS3designator : ESC "$" "+" finalchar_for_SS3
2824*
2825* ESC $ ) A Indicates the bytes following SO are Chinese
2826* characters as defined in GB 2312-80, until
2827* another SOdesignation appears
2828*
2829*
2830* ESC $ ) E Indicates the bytes following SO are as defined
2831* in ISO-IR-165 (for details, see section 2.1),
2832* until another SOdesignation appears
2833*
2834* ESC $ ) G Indicates the bytes following SO are as defined
2835* in CNS 11643-plane-1, until another
2836* SOdesignation appears
2837*
2838* ESC $ * H Indicates the two bytes immediately following
2839* SS2 is a Chinese character as defined in CNS
2840* 11643-plane-2, until another SS2designation
2841* appears
46f4442e 2842* (Meaning <ESC>N must precede every 2 byte
b75a7d8f
A
2843* sequence.)
2844*
2845* ESC $ + I Indicates the immediate two bytes following SS3
2846* is a Chinese character as defined in CNS
2847* 11643-plane-3, until another SS3designation
2848* appears
46f4442e 2849* (Meaning <ESC>O must precede every 2 byte
b75a7d8f
A
2850* sequence.)
2851*
2852* ESC $ + J Indicates the immediate two bytes following SS3
2853* is a Chinese character as defined in CNS
2854* 11643-plane-4, until another SS3designation
2855* appears
46f4442e 2856* (In English: <ESC>O must precede every 2 byte
b75a7d8f
A
2857* sequence.)
2858*
2859* ESC $ + K Indicates the immediate two bytes following SS3
2860* is a Chinese character as defined in CNS
2861* 11643-plane-5, until another SS3designation
2862* appears
2863*
2864* ESC $ + L Indicates the immediate two bytes following SS3
2865* is a Chinese character as defined in CNS
2866* 11643-plane-6, until another SS3designation
2867* appears
2868*
2869* ESC $ + M Indicates the immediate two bytes following SS3
2870* is a Chinese character as defined in CNS
2871* 11643-plane-7, until another SS3designation
2872* appears
2873*
2874* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2875* has its own designation information before any Chinese characters
2876* appear
2877*
2878*/
2879
/*
 * The following are defined this way to make the strings truly readonly.
 * Each one is a 4-byte designation sequence written with explicit byte
 * values: ESC (0x1B), 0x24 ('$' in ASCII), an intermediate byte
 * (0x29 ')' for SO, 0x2A '*' for SS2, 0x2B '+' for SS3) and a final byte
 * that names the character set.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */

/********************** ISO2022-CN Data **************************/
/*
 * Designation sequences for ISO-2022-CN, looked up by the from-Unicode code
 * when it has to announce a character set. Entries 0..2 are indexed directly
 * by the charset value; the CNS 11643 planes follow in plane order.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
};
b75a7d8f 2904
/*
 * Converts UTF-16 input to ISO-2022-CN bytes, optionally recording offsets.
 *
 * args: in/out conversion arguments; args->source and args->target are
 *       advanced on return, and args->offsets (when non-NULL) is filled in.
 * err:  in/out error code; set to U_ILLEGAL_CHAR_FOUND for unpaired
 *       surrogates and for SO/SI/ESC in the input, U_INVALID_CHAR_FOUND when
 *       no candidate charset maps the code point, U_BUFFER_OVERFLOW_ERROR
 *       when the target is full, or whatever fromUWriteUInt8() reports.
 *
 * For each code point the function assembles, in buffer[], an optional
 * 4-byte designation sequence, an optional shift (SO, or ESC N / ESC O),
 * and the character bytes, then writes buffer[] to the target.
 * A pending lead surrogate is carried between calls in cnv->fromUChar32.
 */
static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    /* worst case below: 4 (designation) + 2 (single shift) + 2 (character) bytes */
    char buffer[8];
    int32_t len;
    /* candidate charsets to try, in order; rebuilt whenever choiceCount is 0 */
    int8_t choices[3];
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    /* set up the state */
    converterData = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State = &converterData->fromU2022State;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate */
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while( source < sourceLimit){
        if(target < targetLimit){

            sourceChar = *(source++);
            /* check if the char is a First surrogate */
            if(U16_IS_SURROGATE(sourceChar)) {
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /* look ahead to find the trail surrogate */
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(U16_IS_TRAIL(trail)) {
                            source++;
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do the conversion */
            if(sourceChar <= 0x007f ){
                /* do not convert SO/SI/ESC */
                if(IS_2022_CONTROL(sourceChar)) {
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }

                /* US-ASCII */
                if(pFromU2022State->g == 0) {
                    buffer[0] = (char)sourceChar;
                    len = 1;
                } else {
                    /* currently shifted out: emit SI before the ASCII byte */
                    buffer[0] = UCNV_SI;
                    buffer[1] = (char)sourceChar;
                    len = 2;
                    pFromU2022State->g = 0;
                    choiceCount = 0;
                }
                if(sourceChar == CR || sourceChar == LF) {
                    /* reset the state at the end of a line */
                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
                    choiceCount = 0;
                }
            }
            else{
                /* convert U+0080..U+10ffff */
                int32_t i;
                int8_t cs, g;

                if(choiceCount == 0) {
                    /* try the current SO/G1 converter first */
                    choices[0] = pFromU2022State->cs[1];

                    /* default to GB2312_1 if none is designated yet */
                    if(choices[0] == 0) {
                        choices[0] = GB2312_1;
                    }

                    if(converterData->version == 0) {
                        /* ISO-2022-CN */

                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
                        if(choices[0] == GB2312_1) {
                            choices[1] = (int8_t)CNS_11643_1;
                        } else {
                            choices[1] = (int8_t)GB2312_1;
                        }

                        choiceCount = 2;
                    } else if (converterData->version == 1) {
                        /* ISO-2022-CN-EXT */

                        /* try one of the other converters */
                        switch(choices[0]) {
                        case GB2312_1:
                            choices[1] = (int8_t)CNS_11643_1;
                            choices[2] = (int8_t)ISO_IR_165;
                            break;
                        case ISO_IR_165:
                            choices[1] = (int8_t)GB2312_1;
                            choices[2] = (int8_t)CNS_11643_1;
                            break;
                        default: /* CNS_11643_x */
                            choices[1] = (int8_t)GB2312_1;
                            choices[2] = (int8_t)ISO_IR_165;
                            break;
                        }

                        choiceCount = 3;
                    } else {
                        /*
                         * NOTE(review): this branch fills choices[] but leaves
                         * choiceCount at 0, so the lookup loop below runs zero
                         * times and every non-ASCII code point ends in
                         * U_INVALID_CHAR_FOUND. Confirm whether choiceCount = 2
                         * was intended here.
                         */
                        choices[0] = (int8_t)CNS_11643_1;
                        choices[1] = (int8_t)GB2312_1;
                    }
                }

                cs = g = 0;
                /*
                 * len==0: no mapping found yet
                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
                 * len>0: found a roundtrip result, done
                 */
                len = 0;
                /*
                 * We will turn off useFallback after finding a fallback,
                 * but we still get fallbacks from PUA code points as usual.
                 * Therefore, we will also need to check that we don't overwrite
                 * an early fallback with a later one.
                 */
                useFallback = cnv->useFallback;

                for(i = 0; i < choiceCount && len <= 0; ++i) {
                    int8_t cs0 = choices[i];
                    if(cs0 > 0) {
                        uint32_t value;
                        int32_t len2;
                        if(cs0 >= CNS_11643_0) {
                            /* one 3-byte lookup covers all CNS planes; the top byte of value selects the plane */
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
                                        converterData->myConverterArray[CNS_11643],
                                        sourceChar,
                                        &value,
                                        useFallback,
                                        MBCS_OUTPUT_3);
                            if(len2 == 3 || (len2 == -3 && len == 0)) {
                                targetValue = value;
                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
                                if(len2 >= 0) {
                                    len = 2;
                                } else {
                                    len = -2;
                                    useFallback = FALSE;
                                }
                                if(cs == CNS_11643_1) {
                                    g = 1;
                                } else if(cs == CNS_11643_2) {
                                    g = 2;
                                } else /* plane 3..7 */ if(converterData->version == 1) {
                                    g = 3;
                                } else {
                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
                                    len = 0;
                                }
                            }
                        } else {
                            /* GB2312_1 or ISO-IR-165 */
                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
                                        converterData->myConverterArray[cs0],
                                        sourceChar,
                                        &value,
                                        useFallback,
                                        MBCS_OUTPUT_2);
                            if(len2 == 2 || (len2 == -2 && len == 0)) {
                                targetValue = value;
                                len = len2;
                                cs = cs0;
                                g = 1;
                                useFallback = FALSE;
                            }
                        }
                    }
                }

                if(len != 0) {
                    len = 0; /* count output bytes; it must have been abs(len) == 2 */

                    /* write the designation sequence if necessary */
                    if(cs != pFromU2022State->cs[g]) {
                        if(cs < CNS_11643) {
                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
                        } else {
                            U_ASSERT(cs >= CNS_11643_1);
                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
                        }
                        len = 4;
                        pFromU2022State->cs[g] = cs;
                        if(g == 1) {
                            /* changing the SO/G1 charset invalidates the choices[] */
                            choiceCount = 0;
                        }
                    }

                    /* write the shift sequence if necessary */
                    if(g != pFromU2022State->g) {
                        switch(g) {
                        case 1:
                            buffer[len++] = UCNV_SO;

                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
                            pFromU2022State->g = 1;
                            break;
                        case 2:
                            /* SS2: ESC N */
                            buffer[len++] = 0x1b;
                            buffer[len++] = 0x4e;
                            break;
                        default: /* case 3 */
                            /* SS3: ESC O */
                            buffer[len++] = 0x1b;
                            buffer[len++] = 0x4f;
                            break;
                        }
                    }

                    /* write the two output bytes */
                    buffer[len++] = (char)(targetValue >> 8);
                    buffer[len++] = (char)targetValue;
                } else {
                    /* if we cannot find the character after checking all codepages
                     * then this is an error
                     */
                    *err = U_INVALID_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* output len>0 bytes in buffer[] */
            if(len == 1) {
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(len == 2 && (target + 2) <= targetLimit) {
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* longer output, or not enough room for two bytes: use the general writer */
                fromUWriteUInt8(
                    cnv,
                    buffer, len,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-CN conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        pFromU2022State->g!=0 &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        pFromU2022State->g=0;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /* save the state and return */
    args->source = source;
    args->target = (char*)target;
}
3254
3255
/*
 * Converts ISO-2022-CN bytes to UTF-16, optionally recording source offsets.
 *
 * args: in/out conversion arguments; args->source and args->target are
 *       advanced on return, and args->offsets (when non-NULL) is filled in.
 * err:  in/out error code; set to U_BUFFER_OVERFLOW_ERROR when the target is
 *       full, to U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO segment, or to
 *       whatever changeState_2022()/toUnicodeCallback() report.
 *
 * State carried between calls: myData->key (partial escape sequence) and
 * toUBytes[0]/toULength==1 (a lead byte whose trail byte has not arrived yet).
 * Both are resumed via goto into the middle of the loop body.
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    /* up to 3 bytes: plane byte + lead + trail for the CNS 11643 lookup */
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
                    /* targetUniChar is still missingCharMarker, so the callback path below reports the SO */
                    break;
                }

            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        /* report the bytes consumed by this escape sequence as the offending input */
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /* falls through */
            case LF:
                /* end of line: forget all designations and shifts */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS 11643: prefix the pair with a plane byte (0x80 + plane offset) */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* lead byte at the end of the input: stash it for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output */
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function */
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}
b331163b 3457#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
b75a7d8f
A
3458
3459static void
3460_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3461 UConverter *cnv = args->converter;
3462 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
374ca955
A
3463 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3464 char *p, *subchar;
3465 char buffer[8];
3466 int32_t length;
3467
73c04bcf 3468 subchar=(char *)cnv->subChars;
374ca955 3469 length=cnv->subCharLen; /* assume length==1 for most variants */
b75a7d8f
A
3470
3471 p = buffer;
3472 switch(myConverterData->locale[0]){
3473 case 'j':
374ca955
A
3474 {
3475 int8_t cs;
3476
3477 if(pFromU2022State->g == 1) {
3478 /* JIS7: switch from G1 to G0 */
3479 pFromU2022State->g = 0;
3480 *p++ = UCNV_SI;
3481 }
3482
3483 cs = pFromU2022State->cs[0];
3484 if(cs != ASCII && cs != JISX201) {
3485 /* not in ASCII or JIS X 0201: switch to ASCII */
3486 pFromU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f
A
3487 *p++ = '\x1b';
3488 *p++ = '\x28';
3489 *p++ = '\x42';
b75a7d8f 3490 }
374ca955
A
3491
3492 *p++ = subchar[0];
b75a7d8f 3493 break;
374ca955 3494 }
b75a7d8f 3495 case 'c':
374ca955
A
3496 if(pFromU2022State->g != 0) {
3497 /* not in ASCII mode: switch to ASCII */
3498 pFromU2022State->g = 0;
3499 *p++ = UCNV_SI;
3500 }
3501 *p++ = subchar[0];
b75a7d8f
A
3502 break;
3503 case 'k':
374ca955
A
3504 if(myConverterData->version == 0) {
3505 if(length == 1) {
3506 if((UBool)args->converter->fromUnicodeStatus) {
3507 /* in DBCS mode: switch to SBCS */
3508 args->converter->fromUnicodeStatus = 0;
3509 *p++ = UCNV_SI;
3510 }
3511 *p++ = subchar[0];
3512 } else /* length == 2*/ {
3513 if(!(UBool)args->converter->fromUnicodeStatus) {
3514 /* in SBCS mode: switch to DBCS */
3515 args->converter->fromUnicodeStatus = 1;
3516 *p++ = UCNV_SO;
3517 }
3518 *p++ = subchar[0];
3519 *p++ = subchar[1];
3520 }
3521 break;
3522 } else {
73c04bcf
A
3523 /* save the subconverter's substitution string */
3524 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3525 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3526
3527 /* set our substitution string into the subconverter */
3528 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
374ca955
A
3529 myConverterData->currentConverter->subCharLen = (int8_t)length;
3530
73c04bcf
A
3531 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3532 args->converter = myConverterData->currentConverter;
374ca955
A
3533 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3534 ucnv_cbFromUWriteSub(args, 0, err);
3535 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
73c04bcf
A
3536 args->converter = cnv;
3537
3538 /* restore the subconverter's substitution string */
3539 myConverterData->currentConverter->subChars = currentSubChars;
3540 myConverterData->currentConverter->subCharLen = currentSubCharLen;
374ca955
A
3541
3542 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3543 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3544 uprv_memcpy(
3545 cnv->charErrorBuffer,
3546 myConverterData->currentConverter->charErrorBuffer,
3547 myConverterData->currentConverter->charErrorBufferLength);
3548 }
3549 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3550 myConverterData->currentConverter->charErrorBufferLength = 0;
3551 }
374ca955 3552 return;
b75a7d8f 3553 }
b75a7d8f
A
3554 default:
3555 /* not expected */
3556 break;
3557 }
3558 ucnv_cbFromUWriteBytes(args,
3559 buffer, (int32_t)(p - buffer),
3560 offsetIndex, err);
3561}
3562
73c04bcf
A
3563/*
3564 * Structure for cloning an ISO 2022 converter into a single memory block.
3565 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3566 * and then ucnv_safeClone() of the sub-converter may additionally align
3567 * currentConverter inside the cloneStruct, for which we need the deadSpace
3568 * after currentConverter.
3569 * This is because UAlignedMemory may be larger than the actually
3570 * necessary alignment size for the platform.
3571 * The other cloneStruct fields will not be moved around,
3572 * and are aligned properly with cloneStruct's alignment.
3573 */
b75a7d8f
A
3574struct cloneStruct
3575{
3576 UConverter cnv;
374ca955 3577 UConverter currentConverter;
73c04bcf
A
3578 UAlignedMemory deadSpace;
3579 UConverterDataISO2022 mydata;
b75a7d8f
A
3580};
3581
3582
46f4442e 3583static UConverter *
b75a7d8f 3584_ISO_2022_SafeClone(
46f4442e
A
3585 const UConverter *cnv,
3586 void *stackBuffer,
3587 int32_t *pBufferSize,
b75a7d8f
A
3588 UErrorCode *status)
3589{
3590 struct cloneStruct * localClone;
374ca955
A
3591 UConverterDataISO2022 *cnvData;
3592 int32_t i, size;
b75a7d8f
A
3593
3594 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
374ca955
A
3595 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3596 return NULL;
b75a7d8f
A
3597 }
3598
374ca955 3599 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
b75a7d8f 3600 localClone = (struct cloneStruct *)stackBuffer;
b75a7d8f 3601
374ca955 3602 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
b75a7d8f 3603
374ca955 3604 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
73c04bcf
A
3605 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3606 localClone->cnv.isExtraLocal = TRUE;
b75a7d8f 3607
374ca955 3608 /* share the subconverters */
b75a7d8f 3609
374ca955 3610 if(cnvData->currentConverter != NULL) {
73c04bcf 3611 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
374ca955
A
3612 localClone->mydata.currentConverter =
3613 ucnv_safeClone(cnvData->currentConverter,
3614 &localClone->currentConverter,
3615 &size, status);
3616 if(U_FAILURE(*status)) {
3617 return NULL;
b75a7d8f 3618 }
b75a7d8f
A
3619 }
3620
374ca955
A
3621 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3622 if(cnvData->myConverterArray[i] != NULL) {
3623 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3624 }
b75a7d8f
A
3625 }
3626
b75a7d8f
A
3627 return &localClone->cnv;
3628}
3629
3630static void
3631_ISO_2022_GetUnicodeSet(const UConverter *cnv,
73c04bcf 3632 const USetAdder *sa,
b75a7d8f
A
3633 UConverterUnicodeSet which,
3634 UErrorCode *pErrorCode)
3635{
3636 int32_t i;
b75a7d8f
A
3637 UConverterDataISO2022* cnvData;
3638
3639 if (U_FAILURE(*pErrorCode)) {
3640 return;
3641 }
374ca955 3642#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
3643 if (cnv->sharedData == &_ISO2022Data) {
3644 /* We use UTF-8 in this case */
374ca955
A
3645 sa->addRange(sa->set, 0, 0xd7FF);
3646 sa->addRange(sa->set, 0xE000, 0x10FFFF);
b75a7d8f
A
3647 return;
3648 }
374ca955 3649#endif
b75a7d8f
A
3650
3651 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
b75a7d8f 3652
374ca955
A
3653 /* open a set and initialize it with code points that are algorithmically round-tripped */
3654 switch(cnvData->locale[0]){
3655 case 'j':
46f4442e
A
3656 /* include JIS X 0201 which is hardcoded */
3657 sa->add(sa->set, 0xa5);
3658 sa->add(sa->set, 0x203e);
374ca955
A
3659 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3660 /* include Latin-1 for some variants of JP */
3661 sa->addRange(sa->set, 0, 0xff);
3662 } else {
3663 /* include ASCII for JP */
3664 sa->addRange(sa->set, 0, 0x7f);
3665 }
46f4442e
A
3666 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3667 /*
3668 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3669 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3670 * use half-width Katakana.
3671 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3672 * half-width Katakana via the ESC ( I sequence.
3673 * However, we only emit (fromUnicode) half-width Katakana according to the
3674 * definition of each variant.
3675 *
3676 * When including fallbacks,
3677 * we need to include half-width Katakana Unicode code points for all JP variants because
3678 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3679 */
374ca955 3680 /* include half-width Katakana for JP */
46f4442e 3681 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
374ca955
A
3682 }
3683 break;
b331163b 3684#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
3685 case 'c':
3686 case 'z':
3687 /* include ASCII for CN */
3688 sa->addRange(sa->set, 0, 0x7f);
3689 break;
3690 case 'k':
3691 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3692 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3693 cnvData->currentConverter, sa, which, pErrorCode);
73c04bcf
A
3694 /* the loop over myConverterArray[] will simply not find another converter */
3695 break;
b331163b 3696#endif
374ca955
A
3697 default:
3698 break;
b75a7d8f
A
3699 }
3700
46f4442e 3701#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
374ca955
A
3702 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3703 cnvData->version==0 && i==CNS_11643
3704 ) {
3705 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3706 ucnv_MBCSGetUnicodeSetForBytes(
3707 cnvData->myConverterArray[i],
3708 sa, UCNV_ROUNDTRIP_SET,
3709 0, 0x81, 0x82,
3710 pErrorCode);
46f4442e
A
3711 }
3712#endif
3713
3714 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3715 UConverterSetFilter filter;
3716 if(cnvData->myConverterArray[i]!=NULL) {
b331163b
A
3717 if(cnvData->locale[0]=='j' && i==JISX208) {
3718 /*
3719 * Only add code points that map to Shift-JIS codes
3720 * corresponding to JIS X 0208.
3721 */
3722 filter=UCNV_SET_FILTER_SJIS;
3723#if !UCONFIG_ONLY_HTML_CONVERSION
3724 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3725 cnvData->version==0 && i==CNS_11643) {
46f4442e
A
3726 /*
3727 * Version-specific for CN:
3728 * CN version 0 does not map CNS planes 3..7 although
3729 * they are all available in the CNS conversion table;
3730 * CN version 1 (-EXT) does map them all.
3731 * The two versions create different Unicode sets.
3732 */
3733 filter=UCNV_SET_FILTER_2022_CN;
46f4442e
A
3734 } else if(i==KSC5601) {
3735 /*
3736 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3737 * are broader than GR94.
3738 */
3739 filter=UCNV_SET_FILTER_GR94DBCS;
b331163b 3740#endif
374ca955 3741 } else {
46f4442e 3742 filter=UCNV_SET_FILTER_NONE;
374ca955 3743 }
46f4442e 3744 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
374ca955 3745 }
b75a7d8f 3746 }
73c04bcf
A
3747
3748 /*
3749 * ISO 2022 converters must not convert SO/SI/ESC despite what
3750 * sub-converters do by themselves.
3751 * Remove these characters from the set.
3752 */
3753 sa->remove(sa->set, 0x0e);
3754 sa->remove(sa->set, 0x0f);
3755 sa->remove(sa->set, 0x1b);
46f4442e
A
3756
3757 /* ISO 2022 converters do not convert C1 controls either */
3758 sa->removeRange(sa->set, 0x80, 0x9f);
b75a7d8f
A
3759}
3760
374ca955
A
3761static const UConverterImpl _ISO2022Impl={
3762 UCNV_ISO_2022,
3763
3764 NULL,
3765 NULL,
3766
3767 _ISO2022Open,
3768 _ISO2022Close,
3769 _ISO2022Reset,
3770
3771#ifdef U_ENABLE_GENERIC_ISO_2022
3772 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3773 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3774 ucnv_fromUnicode_UTF8,
3775 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3776#else
3777 NULL,
3778 NULL,
3779 NULL,
3780 NULL,
3781#endif
3782 NULL,
3783
3784 NULL,
3785 _ISO2022getName,
3786 _ISO_2022_WriteSub,
3787 _ISO_2022_SafeClone,
4388f060
A
3788 _ISO_2022_GetUnicodeSet,
3789
3790 NULL,
3791 NULL
374ca955
A
3792};
3793static const UConverterStaticData _ISO2022StaticData={
3794 sizeof(UConverterStaticData),
3795 "ISO_2022",
3796 2022,
3797 UCNV_IBM,
3798 UCNV_ISO_2022,
3799 1,
3800 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3801 { 0x1a, 0, 0, 0 },
3802 1,
3803 FALSE,
3804 FALSE,
3805 0,
3806 0,
3807 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3808};
3809const UConverterSharedData _ISO2022Data={
3810 sizeof(UConverterSharedData),
3811 ~((uint32_t) 0),
3812 NULL,
3813 NULL,
3814 &_ISO2022StaticData,
3815 FALSE,
3816 &_ISO2022Impl,
4388f060 3817 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3818};
3819
3820/*************JP****************/
3821static const UConverterImpl _ISO2022JPImpl={
3822 UCNV_ISO_2022,
3823
3824 NULL,
3825 NULL,
3826
3827 _ISO2022Open,
3828 _ISO2022Close,
3829 _ISO2022Reset,
3830
3831 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3832 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3833 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3834 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3835 NULL,
3836
3837 NULL,
3838 _ISO2022getName,
3839 _ISO_2022_WriteSub,
3840 _ISO_2022_SafeClone,
4388f060
A
3841 _ISO_2022_GetUnicodeSet,
3842
3843 NULL,
3844 NULL
374ca955
A
3845};
3846static const UConverterStaticData _ISO2022JPStaticData={
3847 sizeof(UConverterStaticData),
3848 "ISO_2022_JP",
3849 0,
3850 UCNV_IBM,
3851 UCNV_ISO_2022,
3852 1,
3853 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3854 { 0x1a, 0, 0, 0 },
3855 1,
3856 FALSE,
3857 FALSE,
3858 0,
3859 0,
3860 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3861};
4388f060
A
3862
3863namespace {
3864
3865const UConverterSharedData _ISO2022JPData={
374ca955
A
3866 sizeof(UConverterSharedData),
3867 ~((uint32_t) 0),
3868 NULL,
3869 NULL,
3870 &_ISO2022JPStaticData,
3871 FALSE,
3872 &_ISO2022JPImpl,
4388f060 3873 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3874};
3875
4388f060
A
3876} // namespace
3877
b331163b 3878#if !UCONFIG_ONLY_HTML_CONVERSION
374ca955
A
3879/************* KR ***************/
3880static const UConverterImpl _ISO2022KRImpl={
3881 UCNV_ISO_2022,
3882
3883 NULL,
3884 NULL,
3885
3886 _ISO2022Open,
3887 _ISO2022Close,
3888 _ISO2022Reset,
3889
3890 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3891 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3892 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3893 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3894 NULL,
3895
3896 NULL,
3897 _ISO2022getName,
3898 _ISO_2022_WriteSub,
3899 _ISO_2022_SafeClone,
4388f060
A
3900 _ISO_2022_GetUnicodeSet,
3901
3902 NULL,
3903 NULL
374ca955
A
3904};
3905static const UConverterStaticData _ISO2022KRStaticData={
3906 sizeof(UConverterStaticData),
3907 "ISO_2022_KR",
3908 0,
3909 UCNV_IBM,
3910 UCNV_ISO_2022,
3911 1,
3912 3, /* max 3 bytes per UChar: SO+DBCS */
3913 { 0x1a, 0, 0, 0 },
3914 1,
3915 FALSE,
3916 FALSE,
3917 0,
3918 0,
3919 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3920};
4388f060
A
3921
3922namespace {
3923
3924const UConverterSharedData _ISO2022KRData={
374ca955
A
3925 sizeof(UConverterSharedData),
3926 ~((uint32_t) 0),
3927 NULL,
3928 NULL,
3929 &_ISO2022KRStaticData,
3930 FALSE,
3931 &_ISO2022KRImpl,
4388f060 3932 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3933};
3934
4388f060
A
3935} // namespace
3936
374ca955
A
3937/*************** CN ***************/
3938static const UConverterImpl _ISO2022CNImpl={
3939
3940 UCNV_ISO_2022,
3941
3942 NULL,
3943 NULL,
3944
3945 _ISO2022Open,
3946 _ISO2022Close,
3947 _ISO2022Reset,
3948
3949 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3950 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3951 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3952 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3953 NULL,
3954
3955 NULL,
3956 _ISO2022getName,
3957 _ISO_2022_WriteSub,
3958 _ISO_2022_SafeClone,
4388f060
A
3959 _ISO_2022_GetUnicodeSet,
3960
3961 NULL,
3962 NULL
374ca955
A
3963};
3964static const UConverterStaticData _ISO2022CNStaticData={
3965 sizeof(UConverterStaticData),
3966 "ISO_2022_CN",
3967 0,
3968 UCNV_IBM,
3969 UCNV_ISO_2022,
73c04bcf 3970 1,
374ca955
A
3971 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3972 { 0x1a, 0, 0, 0 },
3973 1,
3974 FALSE,
3975 FALSE,
3976 0,
3977 0,
3978 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3979};
4388f060
A
3980
3981namespace {
3982
3983const UConverterSharedData _ISO2022CNData={
374ca955
A
3984 sizeof(UConverterSharedData),
3985 ~((uint32_t) 0),
3986 NULL,
3987 NULL,
3988 &_ISO2022CNStaticData,
3989 FALSE,
3990 &_ISO2022CNImpl,
4388f060 3991 0, UCNV_MBCS_TABLE_INITIALIZER
374ca955
A
3992};
3993
4388f060 3994} // namespace
b331163b 3995#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
374ca955 3996
b75a7d8f 3997#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */