]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv2022.c
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
d5d484b0 3* Copyright (C) 2000-2006,2008 International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv2022.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2000feb03
12* created by: Markus W. Scherer
13*
14* Change history:
15*
16* 06/29/2000 helena Major rewrite of the callback APIs.
17* 08/08/2000 Ram Included support for ISO-2022-JP-2
18* Changed implementation of toUnicode
19* function
20* 08/21/2000 Ram Added support for ISO-2022-KR
21* 08/29/2000 Ram Separated implementation of EBCDIC to
22* ucnvebdc.c
23* 09/20/2000 Ram Added support for ISO-2022-CN
24* Added implementations for getNextUChar()
25* for specific 2022 country variants.
26* 10/31/2000 Ram Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
374ca955 31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
374ca955 37#include "ucnv_imp.h"
b75a7d8f
A
38#include "ucnv_bld.h"
39#include "ucnv_cnv.h"
40#include "ucnvmbcs.h"
41#include "cstring.h"
42#include "cmemory.h"
43
374ca955
A
44#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
45
46#ifdef U_ENABLE_GENERIC_ISO_2022
47/*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76#endif
77
78static const char SHIFT_IN_STR[] = "\x0F";
79static const char SHIFT_OUT_STR[] = "\x0E";
b75a7d8f
A
80
81#define CR 0x0D
82#define LF 0x0A
83#define H_TAB 0x09
84#define V_TAB 0x0B
85#define SPACE 0x20
86
73c04bcf
A
87/*
88 * ISO 2022 control codes must not be converted from Unicode
89 * because they would mess up the byte stream.
90 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
91 * corresponding to SO, SI, and ESC.
92 */
93#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
94
/*
 * Charset/state identifiers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups deliberately reuse the small values 1..3 because those
 * values double as indexes into myConverterArray[].
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    /* single-shift pseudo states */
    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* ISO-2022-JP charsets */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /* ISO-2022-CN charsets */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * The per-plane values below are stored in StateEnum and ISO2022State
     * variables, but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
133
374ca955
A
134/* is the StateEnum charset value for a DBCS charset? */
135#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
136
137#define CSM(cs) ((uint16_t)1<<(cs))
b75a7d8f 138
374ca955
A
139/*
140 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
141 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
142 *
143 * Note: The converter uses some leniency:
144 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
145 * all versions, not just JIS7 and JIS8.
146 * - ICU does not distinguish between different versions of JIS X 0208.
147 */
148static const uint16_t jpCharsetMasks[5]={
149 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
150 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
151 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
152 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
153 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
154};
b75a7d8f
A
155
/* Kind of charset a 2022 sub-converter handles; stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
164
374ca955
A
/* Designation/shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number designated to each of G0 (SI), G1 (SO), G2 (SS2), G3 (SS3) */
    int8_t cs[4];
    /* currently invoked set: 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
170
b75a7d8f
A
171#define UCNV_OPTIONS_VERSION_MASK 0xf
172#define UCNV_2022_MAX_CONVERTERS 10
173
174typedef struct{
73c04bcf 175 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
b75a7d8f 176 UConverter *currentConverter;
b75a7d8f 177 Cnv2022Type currentType;
374ca955 178 ISO2022State toU2022State, fromU2022State;
b75a7d8f
A
179 uint32_t key;
180 uint32_t version;
73c04bcf
A
181#ifdef U_ENABLE_GENERIC_ISO_2022
182 UBool isFirstBuffer;
183#endif
d5d484b0 184 UBool isEmptySegment;
b75a7d8f 185 char name[30];
73c04bcf 186 char locale[3];
b75a7d8f
A
187}UConverterDataISO2022;
188
374ca955 189/* Protos */
b75a7d8f
A
190/* ISO-2022 ----------------------------------------------------------------- */
191
192/*Forward declaration */
193U_CFUNC void
374ca955
A
194ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
195 UErrorCode * err);
b75a7d8f 196U_CFUNC void
374ca955
A
197ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
198 UErrorCode * err);
b75a7d8f
A
199
200#define ESC_2022 0x1B /*ESC*/
201
/* Result of matching the bytes seen so far against the escape-sequence table. */
typedef enum {
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence; more bytes are needed */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* matches one escape sequence, but adding more characters might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
209
210/*
211* The way these state transition arrays work is:
212* ex : ESC$B is the sequence for JISX208
213* a) First Iteration: char is ESC
214* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
215* int x = normalize_esq_chars_2022[27] which is equal to 1
216* ii) Search for this value in escSeqStateTable_Key_2022[]
217* value of x is stored at escSeqStateTable_Key_2022[0]
218* iii) Save this index as offset
219* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
220* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
221* b) Switch on this state and continue to next char
222* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
223* which is normalize_esq_chars_2022[36] == 4
224* ii) x is currently 1(from above)
225* x<<=5 -- x is now 32
226* x+=normalize_esq_chars_2022[36]
227* now x is 36
228* iii) Search for this value in escSeqStateTable_Key_2022[]
229* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
230* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
231* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
232* c) Switch on this state and continue to next char
233* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
234* ii) x is currently 36 (from above)
235* x<<=5 -- x is now 1152
236* x+=normalize_esq_chars_2022[66]
237* now x is 1161
238* iii) Search for this value in escSeqStateTable_Key_2022[]
239* value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
240* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
241* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
242* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
243*/
244
245
246/*Below are the 3 arrays depicting a state transition table*/
247static const int8_t normalize_esq_chars_2022[256] = {
248/* 0 1 2 3 4 5 6 7 8 9 */
249
250 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
251 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
252 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
253 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
254 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
255 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
256 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
257 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
258 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
259 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
260 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
261 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
262 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
263 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
264 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
265 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
266 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
267 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
268 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
269 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
270 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
271 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
272 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
273 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
274 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
275 ,0 ,0 ,0 ,0 ,0 ,0
276};
277
374ca955
A
278#ifdef U_ENABLE_GENERIC_ISO_2022
279/*
280 * When the generic ISO-2022 converter is completely removed, not just disabled
281 * per #ifdef, then the following state table and the associated tables that are
282 * dimensioned with MAX_STATES_2022 should be trimmed.
283 *
284 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
285 * the associated escape sequences starting with ESC ( B should be removed.
286 * This includes the ones with key values 1097 and all of the ones above 1000000.
287 *
288 * For the latter, the tables can simply be truncated.
289 * For the former, since the tables must be kept parallel, it is probably best
290 * to simply duplicate an adjacent table cell, parallel in all tables.
291 *
292 * It may make sense to restructure the tables, especially by using small search
293 * tables for the variants instead of indexing them parallel to the table here.
294 */
295#endif
296
b75a7d8f
A
297#define MAX_STATES_2022 74
298static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
299/* 0 1 2 3 4 5 6 7 8 9 */
300
301 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
302 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
303 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
304 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
305 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
306 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
307 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
308 ,35947631 ,35947635 ,35947636 ,35947638
309};
310
374ca955 311#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
312
313static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
314 /* 0 1 2 3 4 5 6 7 8 9 */
315
316 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
374ca955 317 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
b75a7d8f
A
318 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
319 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
320 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
321 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
322 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
323 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
324};
325
374ca955
A
326#endif
327
b75a7d8f
A
328static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
329/* 0 1 2 3 4 5 6 7 8 9 */
374ca955 330 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
b75a7d8f
A
331 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
332 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
333 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
334 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
335 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
336 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
337 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
338};
339
340
b75a7d8f
A
/* Selects which ISO-2022 variant changeState_2022() is interpreting escape sequences for. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
350
b75a7d8f
A
351/*********** ISO 2022 Converter Protos ***********/
352static void
353_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
354
355static void
356 _ISO2022Close(UConverter *converter);
357
358static void
359_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
360
361static const char*
362_ISO2022getName(const UConverter* cnv);
363
364static void
365_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
366
367static UConverter *
368_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
369
374ca955 370#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f 371static void
374ca955
A
372T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
373#endif
b75a7d8f 374
374ca955
A
375/*const UConverterSharedData _ISO2022Data;*/
376static const UConverterSharedData _ISO2022JPData;
377static const UConverterSharedData _ISO2022KRData;
378static const UConverterSharedData _ISO2022CNData;
b75a7d8f 379
374ca955 380/*************** Converter implementations ******************/
b75a7d8f 381
73c04bcf
A
382/* The purpose of this function is to get around gcc compiler warnings. */
383static U_INLINE void
384fromUWriteUInt8(UConverter *cnv,
385 const char *bytes, int32_t length,
386 uint8_t **target, const char *targetLimit,
387 int32_t **offsets,
388 int32_t sourceIndex,
389 UErrorCode *pErrorCode)
390{
391 char *targetChars = (char *)*target;
392 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
393 offsets, sourceIndex, pErrorCode);
394 *target = (uint8_t*)targetChars;
395
396}
397
398static U_INLINE void
374ca955
A
399setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
400 if(myConverterData->version == 1) {
401 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 402
374ca955
A
403 cnv->toUnicodeStatus=0; /* offset */
404 cnv->mode=0; /* state */
405 cnv->toULength=0; /* byteIndex */
406 }
407}
b75a7d8f 408
73c04bcf 409static U_INLINE void
374ca955
A
410setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
411 /* in ISO-2022-KR the designator sequence appears only once
412 * in a file so we append it only once
413 */
414 if( converter->charErrorBufferLength==0){
b75a7d8f 415
374ca955
A
416 converter->charErrorBufferLength = 4;
417 converter->charErrorBuffer[0] = 0x1b;
418 converter->charErrorBuffer[1] = 0x24;
419 converter->charErrorBuffer[2] = 0x29;
420 converter->charErrorBuffer[3] = 0x43;
421 }
422 if(myConverterData->version == 1) {
423 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 424
374ca955
A
425 cnv->fromUChar32=0;
426 cnv->fromUnicodeStatus=1; /* prevLength */
427 }
428}
b75a7d8f 429
374ca955
A
430static void
431_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
b75a7d8f 432
374ca955 433 char myLocale[6]={' ',' ',' ',' ',' ',' '};
b75a7d8f 434
374ca955
A
435 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
436 if(cnv->extraInfo != NULL) {
437 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
438 uint32_t version;
b75a7d8f 439
374ca955 440 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
374ca955 441 myConverterData->currentType = ASCII1;
374ca955
A
442 cnv->fromUnicodeStatus =FALSE;
443 if(locale){
444 uprv_strncpy(myLocale, locale, sizeof(myLocale));
445 }
374ca955 446 version = options & UCNV_OPTIONS_VERSION_MASK;
73c04bcf 447 myConverterData->version = version;
374ca955 448 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
73c04bcf
A
449 (myLocale[2]=='_' || myLocale[2]=='\0'))
450 {
451 size_t len=0;
374ca955
A
452 /* open the required converters and cache them */
453 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
454 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
455 }
456 myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
457 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
458 if(jpCharsetMasks[version]&CSM(JISX212)) {
459 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
460 }
461 if(jpCharsetMasks[version]&CSM(GB2312)) {
462 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
463 }
464 if(jpCharsetMasks[version]&CSM(KSC5601)) {
465 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
466 }
b75a7d8f 467
374ca955
A
468 /* set the function pointers to appropriate funtions */
469 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
470 uprv_strcpy(myConverterData->locale,"ja");
b75a7d8f 471
374ca955
A
472 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
473 len = uprv_strlen(myConverterData->name);
474 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
475 myConverterData->name[len+1]='\0';
476 }
477 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
73c04bcf
A
478 (myLocale[2]=='_' || myLocale[2]=='\0'))
479 {
480 if (version==1){
481 myConverterData->currentConverter=
482 ucnv_open("icu-internal-25546",errorCode);
b75a7d8f 483
73c04bcf
A
484 if (U_FAILURE(*errorCode)) {
485 _ISO2022Close(cnv);
486 return;
487 }
b75a7d8f 488
73c04bcf
A
489 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
490 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
491 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
374ca955 492 }else{
73c04bcf 493 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
b75a7d8f 494
73c04bcf
A
495 if (U_FAILURE(*errorCode)) {
496 _ISO2022Close(cnv);
497 return;
498 }
b75a7d8f 499
73c04bcf
A
500 myConverterData->version = 0;
501 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
374ca955 502 }
b75a7d8f 503
374ca955
A
504 /* initialize the state variables */
505 setInitialStateToUnicodeKR(cnv, myConverterData);
73c04bcf 506 setInitialStateFromUnicodeKR(cnv, myConverterData);
b75a7d8f
A
507
508 /* set the function pointers to appropriate funtions */
509 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
b75a7d8f
A
510 uprv_strcpy(myConverterData->locale,"ko");
511 }
512 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
73c04bcf
A
513 (myLocale[2]=='_' || myLocale[2]=='\0'))
514 {
b75a7d8f
A
515
516 /* open the required converters and cache them */
374ca955
A
517 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
518 if(version==1) {
519 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
520 }
521 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
b75a7d8f 522
b75a7d8f
A
523
524 /* set the function pointers to appropriate funtions */
525 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
526 uprv_strcpy(myConverterData->locale,"cn");
527
73c04bcf 528 if (version==1){
b75a7d8f
A
529 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
530 }else{
b75a7d8f 531 myConverterData->version = 0;
73c04bcf 532 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
b75a7d8f
A
533 }
534 }
535 else{
374ca955 536#ifdef U_ENABLE_GENERIC_ISO_2022
73c04bcf
A
537 myConverterData->isFirstBuffer = TRUE;
538
b75a7d8f
A
539 /* append the UTF-8 escape sequence */
540 cnv->charErrorBufferLength = 3;
541 cnv->charErrorBuffer[0] = 0x1b;
542 cnv->charErrorBuffer[1] = 0x25;
543 cnv->charErrorBuffer[2] = 0x42;
544
545 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
546 /* initialize the state variables */
b75a7d8f 547 uprv_strcpy(myConverterData->name,"ISO_2022");
374ca955
A
548#else
549 *errorCode = U_UNSUPPORTED_ERROR;
550 return;
551#endif
b75a7d8f
A
552 }
553
374ca955
A
554 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
555
556 if(U_FAILURE(*errorCode)) {
557 _ISO2022Close(cnv);
558 }
b75a7d8f
A
559 } else {
560 *errorCode = U_MEMORY_ALLOCATION_ERROR;
561 }
b75a7d8f
A
562}
563
564
565static void
566_ISO2022Close(UConverter *converter) {
374ca955
A
567 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
568 UConverterSharedData **array = myData->myConverterArray;
569 int32_t i;
b75a7d8f
A
570
571 if (converter->extraInfo != NULL) {
572 /*close the array of converter pointers and free the memory*/
374ca955
A
573 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
574 if(array[i]!=NULL) {
575 ucnv_unloadSharedDataIfReady(array[i]);
b75a7d8f 576 }
b75a7d8f
A
577 }
578
374ca955 579 ucnv_close(myData->currentConverter);
b75a7d8f
A
580
581 if(!converter->isExtraLocal){
582 uprv_free (converter->extraInfo);
374ca955 583 converter->extraInfo = NULL;
b75a7d8f
A
584 }
585 }
586}
587
588static void
589_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
590 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
374ca955
A
591 if(choice<=UCNV_RESET_TO_UNICODE) {
592 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
593 myConverterData->key = 0;
d5d484b0 594 myConverterData->isEmptySegment = FALSE;
374ca955
A
595 }
596 if(choice!=UCNV_RESET_TO_UNICODE) {
597 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
598 }
599#ifdef U_ENABLE_GENERIC_ISO_2022
600 if(myConverterData->locale[0] == 0){
b75a7d8f
A
601 if(choice<=UCNV_RESET_TO_UNICODE) {
602 myConverterData->isFirstBuffer = TRUE;
374ca955 603 myConverterData->key = 0;
b75a7d8f
A
604 if (converter->mode == UCNV_SO){
605 ucnv_close (myConverterData->currentConverter);
606 myConverterData->currentConverter=NULL;
607 }
608 converter->mode = UCNV_SI;
609 }
610 if(choice!=UCNV_RESET_TO_UNICODE) {
611 /* re-append UTF-8 escape sequence */
612 converter->charErrorBufferLength = 3;
613 converter->charErrorBuffer[0] = 0x1b;
614 converter->charErrorBuffer[1] = 0x28;
615 converter->charErrorBuffer[2] = 0x42;
616 }
617 }
374ca955
A
618 else
619#endif
620 {
b75a7d8f 621 /* reset the state variables */
374ca955 622 if(myConverterData->locale[0] == 'k'){
b75a7d8f
A
623 if(choice<=UCNV_RESET_TO_UNICODE) {
624 setInitialStateToUnicodeKR(converter, myConverterData);
625 }
626 if(choice!=UCNV_RESET_TO_UNICODE) {
627 setInitialStateFromUnicodeKR(converter, myConverterData);
628 }
629 }
630 }
631}
632
633static const char*
634_ISO2022getName(const UConverter* cnv){
635 if(cnv->extraInfo){
636 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
637 return myData->name;
638 }
639 return NULL;
640}
641
b75a7d8f 642
374ca955
A
643/*************** to unicode *******************/
644/****************************************************************************
645 * Recognized escape sequences are
646 * <ESC>(B ASCII
647 * <ESC>.A ISO-8859-1
648 * <ESC>.F ISO-8859-7
649 * <ESC>(J JISX-201
650 * <ESC>(I JISX-201
651 * <ESC>$B JISX-208
652 * <ESC>$@ JISX-208
653 * <ESC>$(D JISX-212
654 * <ESC>$A GB2312
655 * <ESC>$(C KSC5601
656 */
657static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
658/* 0 1 2 3 4 5 6 7 8 9 */
659 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
660 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
661 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
662 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
663 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
664 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
665 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
666 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
667};
b75a7d8f 668
374ca955
A
669/*************** to unicode *******************/
670static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
671/* 0 1 2 3 4 5 6 7 8 9 */
672 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
673 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
674 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
675 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
676 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
677 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
678 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
679 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
680};
b75a7d8f 681
b75a7d8f 682
374ca955
A
683static UCNV_TableStates_2022
684getKey_2022(char c,int32_t* key,int32_t* offset){
685 int32_t togo;
686 int32_t low = 0;
687 int32_t hi = MAX_STATES_2022;
688 int32_t oldmid=0;
b75a7d8f 689
374ca955
A
690 togo = normalize_esq_chars_2022[(uint8_t)c];
691 if(togo == 0) {
692 /* not a valid character anywhere in an escape sequence */
693 *key = 0;
694 *offset = 0;
695 return INVALID_2022;
696 }
697 togo = (*key << 5) + togo;
b75a7d8f 698
374ca955 699 while (hi != low) /*binary search*/{
b75a7d8f 700
374ca955
A
701 register int32_t mid = (hi+low) >> 1; /*Finds median*/
702
703 if (mid == oldmid)
704 break;
705
706 if (escSeqStateTable_Key_2022[mid] > togo){
707 hi = mid;
708 }
709 else if (escSeqStateTable_Key_2022[mid] < togo){
710 low = mid;
711 }
712 else /*we found it*/{
713 *key = togo;
714 *offset = mid;
715 return escSeqStateTable_Value_2022[mid];
716 }
717 oldmid = mid;
b75a7d8f 718
b75a7d8f 719 }
b75a7d8f 720
374ca955
A
721 *key = 0;
722 *offset = 0;
723 return INVALID_2022;
b75a7d8f
A
724}
725
374ca955
A
726/*runs through a state machine to determine the escape sequence - codepage correspondence
727 */
728static void
729changeState_2022(UConverter* _this,
730 const char** source,
731 const char* sourceLimit,
732 Variant2022 var,
733 UErrorCode* err){
734 UCNV_TableStates_2022 value;
735 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
736 uint32_t key = myData2022->key;
73c04bcf 737 int32_t offset = 0;
fd0068a8 738 int8_t initialToULength = _this->toULength;
374ca955
A
739 char c;
740
741 value = VALID_NON_TERMINAL_2022;
742 while (*source < sourceLimit) {
743 c = *(*source)++;
744 _this->toUBytes[_this->toULength++]=(uint8_t)c;
745 value = getKey_2022(c,(int32_t *) &key, &offset);
746
747 switch (value){
b75a7d8f 748
374ca955
A
749 case VALID_NON_TERMINAL_2022 :
750 /* continue with the loop */
751 break;
b75a7d8f 752
374ca955
A
753 case VALID_TERMINAL_2022:
754 key = 0;
755 goto DONE;
b75a7d8f 756
374ca955
A
757 case INVALID_2022:
758 goto DONE;
b75a7d8f 759
374ca955
A
760 case VALID_MAYBE_TERMINAL_2022:
761#ifdef U_ENABLE_GENERIC_ISO_2022
762 /* ESC ( B is ambiguous only for ISO_2022 itself */
763 if(var == ISO_2022) {
764 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
765 _this->toULength = 0;
b75a7d8f 766
374ca955
A
767 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
768
769 /* continue with the loop */
770 value = VALID_NON_TERMINAL_2022;
771 break;
772 } else
773#endif
774 {
775 /* not ISO_2022 itself, finish here */
776 value = VALID_TERMINAL_2022;
777 key = 0;
778 goto DONE;
b75a7d8f
A
779 }
780 }
b75a7d8f 781 }
b75a7d8f 782
374ca955
A
783DONE:
784 myData2022->key = key;
b75a7d8f 785
374ca955
A
786 if (value == VALID_NON_TERMINAL_2022) {
787 /* indicate that the escape sequence is incomplete: key!=0 */
788 return;
789 } else if (value == INVALID_2022 ) {
790 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
374ca955
A
791 } else /* value == VALID_TERMINAL_2022 */ {
792 switch(var){
793#ifdef U_ENABLE_GENERIC_ISO_2022
794 case ISO_2022:
795 {
796 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
797 if(chosenConverterName == NULL) {
798 /* SS2 or SS3 */
799 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
800 return;
b75a7d8f 801 }
374ca955
A
802
803 _this->mode = UCNV_SI;
804 ucnv_close(myData2022->currentConverter);
805 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
806 if(U_SUCCESS(*err)) {
807 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
808 _this->mode = UCNV_SO;
809 }
810 break;
811 }
812#endif
813 case ISO_2022_JP:
814 {
815 StateEnum tempState=nextStateToUnicodeJP[offset];
816 switch(tempState) {
817 case INVALID_STATE:
818 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
819 break;
820 case SS2_STATE:
821 if(myData2022->toU2022State.cs[2]!=0) {
822 if(myData2022->toU2022State.g<2) {
823 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
824 }
825 myData2022->toU2022State.g=2;
826 } else {
827 /* illegal to have SS2 before a matching designator */
828 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
829 }
830 break;
831 /* case SS3_STATE: not used in ISO-2022-JP-x */
832 case ISO8859_1:
833 case ISO8859_7:
834 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
835 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
836 } else {
837 /* G2 charset for SS2 */
838 myData2022->toU2022State.cs[2]=(int8_t)tempState;
839 }
840 break;
841 default:
842 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
843 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
844 } else {
845 /* G0 charset */
846 myData2022->toU2022State.cs[0]=(int8_t)tempState;
847 }
848 break;
849 }
850 }
851 break;
852 case ISO_2022_CN:
853 {
854 StateEnum tempState=nextStateToUnicodeCN[offset];
855 switch(tempState) {
856 case INVALID_STATE:
857 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
858 break;
859 case SS2_STATE:
860 if(myData2022->toU2022State.cs[2]!=0) {
861 if(myData2022->toU2022State.g<2) {
862 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
863 }
864 myData2022->toU2022State.g=2;
865 } else {
866 /* illegal to have SS2 before a matching designator */
867 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
868 }
869 break;
870 case SS3_STATE:
871 if(myData2022->toU2022State.cs[3]!=0) {
872 if(myData2022->toU2022State.g<2) {
873 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
874 }
875 myData2022->toU2022State.g=3;
876 } else {
877 /* illegal to have SS3 before a matching designator */
878 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
879 }
880 break;
881 case ISO_IR_165:
882 if(myData2022->version==0) {
883 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
884 break;
885 }
73c04bcf 886 /*fall through*/
374ca955 887 case GB2312_1:
73c04bcf 888 /*fall through*/
374ca955
A
889 case CNS_11643_1:
890 myData2022->toU2022State.cs[1]=(int8_t)tempState;
891 break;
892 case CNS_11643_2:
893 myData2022->toU2022State.cs[2]=(int8_t)tempState;
894 break;
895 default:
896 /* other CNS 11643 planes */
897 if(myData2022->version==0) {
898 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
899 } else {
900 myData2022->toU2022State.cs[3]=(int8_t)tempState;
901 }
902 break;
903 }
904 }
905 break;
906 case ISO_2022_KR:
907 if(offset==0x30){
908 /* nothing to be done, just accept this one escape sequence */
909 } else {
910 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
911 }
912 break;
913
914 default:
915 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
916 break;
917 }
918 }
919 if(U_SUCCESS(*err)) {
920 _this->toULength = 0;
fd0068a8
A
921 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
922 if(_this->toULength>1) {
923 /*
924 * Ticket 5691: consistent illegal sequences:
925 * - We include at least the first byte (ESC) in the illegal sequence.
926 * - If any of the non-initial bytes could be the start of a character,
927 * we stop the illegal sequence before the first one of those.
928 * In escape sequences, all following bytes are "printable", that is,
929 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
930 * they are valid single/lead bytes.
931 * For simplicity, we always only report the initial ESC byte as the
932 * illegal sequence and back out all other bytes we looked at.
933 */
934 /* Back out some bytes. */
935 int8_t backOutDistance=_this->toULength-1;
936 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
937 if(backOutDistance<=bytesFromThisBuffer) {
938 /* same as initialToULength<=1 */
939 *source-=backOutDistance;
940 } else {
941 /* Back out bytes from the previous buffer: Need to replay them. */
942 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
943 /* same as -(initialToULength-1) */
944 /* preToULength is negative! */
945 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
946 *source-=bytesFromThisBuffer;
947 }
948 _this->toULength=1;
949 }
374ca955
A
950 }
951}
952
953/*Checks the characters of the buffer against valid 2022 escape sequences
954*if the match we return a pointer to the initial start of the sequence otherwise
955*we return sourceLimit
956*/
957/*for 2022 looks ahead in the stream
958 *to determine the longest possible convertible
959 *data stream
960 */
961static U_INLINE const char*
962getEndOfBuffer_2022(const char** source,
963 const char* sourceLimit,
964 UBool flush){
965
966 const char* mySource = *source;
967
968#ifdef U_ENABLE_GENERIC_ISO_2022
969 if (*source >= sourceLimit)
970 return sourceLimit;
971
972 do{
973
974 if (*mySource == ESC_2022){
975 int8_t i;
976 int32_t key = 0;
977 int32_t offset;
978 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
979
980 /* Kludge: I could not
981 * figure out the reason for validating an escape sequence
982 * twice - once here and once in changeState_2022().
983 * is it possible to have an ESC character in a ISO2022
984 * byte stream which is valid in a code page? Is it legal?
985 */
986 for (i=0;
987 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
988 i++) {
989 value = getKey_2022(*(mySource+i), &key, &offset);
990 }
991 if (value > 0 || *mySource==ESC_2022)
992 return mySource;
993
994 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
995 return sourceLimit;
996 }
997 }while (++mySource < sourceLimit);
998
999 return sourceLimit;
1000#else
1001 while(mySource < sourceLimit && *mySource != ESC_2022) {
1002 ++mySource;
1003 }
1004 return mySource;
1005#endif
1006}
1007
1008
1009/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1010 * any future change in _MBCSFromUChar32() function should be reflected in
1011 * this macro
1012 */
1013static U_INLINE void
1014MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1015 UChar32 c,
1016 uint32_t* value,
1017 UBool useFallback,
1018 int32_t *length,
1019 int outputType)
1020{
1021 const int32_t *cx;
1022 const uint16_t *table;
1023 uint32_t stage2Entry;
1024 uint32_t myValue;
1025 const uint8_t *p;
1026 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1027 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1028 table=sharedData->mbcs.fromUnicodeTable;
1029 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1030 /* get the bytes and the length for the output */
1031 if(outputType==MBCS_OUTPUT_2){
1032 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1033 if(myValue<=0xff) {
1034 *length=1;
1035 } else {
1036 *length=2;
1037 }
1038 } else /* outputType==MBCS_OUTPUT_3 */ {
1039 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1040 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1041 if(myValue<=0xff) {
1042 *length=1;
1043 } else if(myValue<=0xffff) {
1044 *length=2;
1045 } else {
1046 *length=3;
b75a7d8f
A
1047 }
1048 }
1049 /* is this code point assigned, or do we use fallbacks? */
1050 if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
374ca955 1051 (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
b75a7d8f
A
1052 ) {
1053 /*
374ca955 1054 * We allow a 0 byte output if the "assigned" bit is set for this entry.
b75a7d8f 1055 * There is no way with this data structure for fallback output
374ca955 1056 * to be a zero byte.
b75a7d8f
A
1057 */
1058 /* assigned */
1059 *value=myValue;
374ca955 1060 return;
b75a7d8f 1061 }
b75a7d8f 1062 }
374ca955
A
1063
1064 cx=sharedData->mbcs.extIndexes;
1065 if(cx!=NULL) {
1066 *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1067 return;
1068 }
1069
1070 /* unassigned */
1071 *length=0;
b75a7d8f
A
1072}
1073
1074/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1075 * any future change in _MBCSSingleFromUChar32() function should be reflected in
1076 * this macro
1077 */
1078static U_INLINE void
1079MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1080 UChar32 c,
1081 uint32_t* retval,
1082 UBool useFallback)
1083{
1084 const uint16_t *table;
1085 int32_t value;
1086 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955
A
1087 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1088 *retval=(uint16_t)-1;
1089 return;
b75a7d8f
A
1090 }
1091 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
374ca955 1092 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 1093 /* get the byte for the output */
374ca955 1094 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
b75a7d8f
A
1095 /* is this code point assigned, or do we use fallbacks? */
1096 if(useFallback ? value>=0x800 : value>=0xc00) {
1097 value &=0xff;
1098 } else {
1099 value= -1;
1100 }
1101 *retval=(uint16_t) value;
1102}
1103
374ca955
A
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * Generic ISO-2022 to-Unicode conversion.
 *
 * Alternates between two steps until the input is used up or something
 * needs to be reported to the caller:
 * 1. While no escape sequence is in progress (key==0), convert the bytes up to
 *    the next ESC (or the end of the buffer) with the current sub-converter,
 *    opening an "ASCII" converter first if there is none yet.
 * 2. Let changeState_2022() consume the escape sequence.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char *segmentLimit, *inputLimit;
    const char *segmentStart;
    const UChar *targetStart;
    UConverter *outerCnv;
    UConverter *innerCnv;
    UConverterDataISO2022 *data2022;
    int8_t count;

    outerCnv = args->converter;
    data2022 = ((UConverterDataISO2022*)(outerCnv->extraInfo));

    inputLimit = args->sourceLimit;
    while (args->source < inputLimit) {
        if(data2022->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            segmentLimit = getEndOfBuffer_2022(&(args->source), inputLimit, args->flush);

            if(args->source < segmentLimit) {
                if(data2022->currentConverter==NULL) {
                    data2022->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    data2022->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    outerCnv->mode = UCNV_SO;
                }
                /* the sub-converter does not change for the rest of this iteration */
                innerCnv = data2022->currentConverter;

                /* convert to before the ESC or until the end of the buffer */
                data2022->isFirstBuffer=FALSE;
                segmentStart = args->source;
                targetStart = args->target;
                args->converter = innerCnv;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    segmentLimit,
                    args->offsets,
                    (UBool)(args->flush && segmentLimit == inputLimit),
                    err);
                args->converter = outerCnv;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    count = outerCnv->UCharErrorBufferLength = innerCnv->UCharErrorBufferLength;
                    innerCnv->UCharErrorBufferLength = 0;
                    if(count > 0) {
                        uprv_memcpy(outerCnv->UCharErrorBuffer,
                                    innerCnv->UCharErrorBuffer,
                                    count*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == inputLimit) ||
                    (args->offsets != NULL &&
                        (args->target != targetStart || args->source != segmentStart)) ||
                    (segmentLimit < inputLimit && innerCnv->toULength > 0)
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        count = outerCnv->invalidCharLength = innerCnv->invalidCharLength;
                        if(count > 0) {
                            uprv_memcpy(outerCnv->invalidCharBuffer, innerCnv->invalidCharBuffer, count);
                        }
                    } else {
                        count = outerCnv->toULength = innerCnv->toULength;
                        if(count > 0) {
                            uprv_memcpy(outerCnv->toUBytes, innerCnv->toUBytes, count);
                            if(args->source < segmentLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* parse the escape sequence at the current position */
        segmentStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               inputLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != segmentStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
b75a7d8f
A
1217
1218/*
1219 * To Unicode Callback helper function
1220 */
1221static void
374ca955
A
1222toUnicodeCallback(UConverter *cnv,
1223 const uint32_t sourceChar, const uint32_t targetUniChar,
1224 UErrorCode* err){
b75a7d8f 1225 if(sourceChar>0xff){
374ca955
A
1226 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1227 cnv->toUBytes[1] = (uint8_t)sourceChar;
1228 cnv->toULength = 2;
b75a7d8f
A
1229 }
1230 else{
374ca955 1231 cnv->toUBytes[0] =(char) sourceChar;
fd0068a8 1232 cnv->toULength = 1;
b75a7d8f
A
1233 }
1234
1235 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
b75a7d8f
A
1236 *err = U_INVALID_CHAR_FOUND;
1237 }
1238 else{
b75a7d8f
A
1239 *err = U_ILLEGAL_CHAR_FOUND;
1240 }
b75a7d8f
A
1241}
1242
1243/**************************************ISO-2022-JP*************************************************/
1244
1245/************************************** IMPORTANT **************************************************
1246* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1247* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1248* The converter iterates over each Unicode codepoint
1249* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1250* processed one char at a time it would make sense to reduce the extra processing a canned converter
1251* would do as far as possible.
1252*
1253* If the implementation of these macros or structure of sharedData struct change in the future, make
1254* sure that ISO-2022 is also changed.
1255***************************************************************************************************
1256*/
1257
/***************************************************************************************************
* Rules for ISO-2022-JP encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*       source : RFC-1554
*
*       JISX201, JISX208,JISX212 : new .cnv data files created
*       KSC5601 : alias to ibm-949 mapping table
*       GB2312 : alias to ibm-1386 mapping table
*       ISO-8859-1 : Algorithmically implemented as LATIN1 case
*       ISO-8859-7 : alias to ibm-9409 mapping table
*/
b75a7d8f 1284
374ca955
A
1285/* preference order of JP charsets */
1286static const StateEnum jpCharsetPref[]={
1287 ASCII,
1288 JISX201,
1289 ISO8859_1,
1290 ISO8859_7,
1291 JISX208,
1292 JISX212,
1293 GB2312,
1294 KSC5601,
1295 HWKANA_7BIT
b75a7d8f
A
1296};
1297
73c04bcf
A
/*
 * Designation escape sequences, one per charset.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    /* ASCII:       <ESC>(B  */ "\x1B\x28\x42",
    /* ISO-8859-1:  <ESC>.A  */ "\x1B\x2E\x41",
    /* ISO-8859-7:  <ESC>.F  */ "\x1B\x2E\x46",
    /* JISX-201:    <ESC>(J  */ "\x1B\x28\x4A",
    /* JISX-208:    <ESC>$B  */ "\x1B\x24\x42",
    /* JISX-212:    <ESC>$(D */ "\x1B\x24\x28\x44",
    /* GB2312:      <ESC>$A  */ "\x1B\x24\x41",
    /* KSC5601:     <ESC>$(C */ "\x1B\x24\x28\x43",
    /* HWKANA_7BIT: <ESC>(I  */ "\x1B\x28\x49"
};
374ca955
A
/* Byte lengths of the escape sequences in escSeqChars[], in the same order. */
static const int32_t escSeqCharsLen[] ={
    /* ASCII:       <ESC>(B  */ 3,
    /* ISO-8859-1:  <ESC>.A  */ 3,
    /* ISO-8859-7:  <ESC>.F  */ 3,
    /* JISX-201:    <ESC>(J  */ 3,
    /* JISX-208:    <ESC>$B  */ 3,
    /* JISX-212:    <ESC>$(D */ 4,
    /* GB2312:      <ESC>$A  */ 3,
    /* KSC5601:     <ESC>$(C */ 4,
    /* HWKANA_7BIT: <ESC>(I  */ 3
};
1325
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check whether
*      it is equal to the initIterState
*      Yes ->  The character cannot be represented in any of the supported encodings:
*              break and return a U_INVALID_CHAR_FOUND error
*      No  ->  Continue and find the character in the next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1342
1343static void
374ca955 1344UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
b75a7d8f 1345 UConverterDataISO2022 *converterData;
374ca955
A
1346 ISO2022State *pFromU2022State;
1347 uint8_t *target = (uint8_t *) args->target;
1348 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
1349 const UChar* source = args->source;
1350 const UChar* sourceLimit = args->sourceLimit;
1351 int32_t* offsets = args->offsets;
374ca955
A
1352 UChar32 sourceChar;
1353 char buffer[8];
1354 int32_t len, outLen;
1355 int8_t choices[10];
1356 int32_t choiceCount;
73c04bcf 1357 uint32_t targetValue = 0;
374ca955
A
1358 UBool useFallback;
1359
1360 int32_t i;
1361 int8_t cs, g;
1362
1363 /* set up the state */
1364 converterData = (UConverterDataISO2022*)args->converter->extraInfo;
1365 pFromU2022State = &converterData->fromU2022State;
1366 useFallback = args->converter->useFallback;
1367
1368 choiceCount = 0;
b75a7d8f 1369
b75a7d8f 1370 /* check if the last codepoint of previous buffer was a lead surrogate*/
374ca955 1371 if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
1372 goto getTrail;
1373 }
b75a7d8f 1374
374ca955
A
1375 while(source < sourceLimit) {
1376 if(target < targetLimit) {
b75a7d8f 1377
b75a7d8f 1378 sourceChar = *(source++);
374ca955 1379 /*check if the char is a First surrogate*/
73c04bcf 1380 if(UTF_IS_SURROGATE(sourceChar)) {
374ca955
A
1381 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1382getTrail:
1383 /*look ahead to find the trail surrogate*/
1384 if(source < sourceLimit) {
1385 /* test the following code unit */
1386 UChar trail=(UChar) *source;
1387 if(UTF_IS_SECOND_SURROGATE(trail)) {
1388 source++;
1389 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1390 args->converter->fromUChar32=0x00;
1391 /* convert this supplementary code point */
1392 /* exit this condition tree */
1393 } else {
1394 /* this is an unmatched lead code unit (1st surrogate) */
1395 /* callback(illegal) */
1396 *err=U_ILLEGAL_CHAR_FOUND;
1397 args->converter->fromUChar32=sourceChar;
1398 break;
b75a7d8f 1399 }
374ca955
A
1400 } else {
1401 /* no more input */
1402 args->converter->fromUChar32=sourceChar;
b75a7d8f
A
1403 break;
1404 }
374ca955
A
1405 } else {
1406 /* this is an unmatched trail code unit (2nd surrogate) */
1407 /* callback(illegal) */
1408 *err=U_ILLEGAL_CHAR_FOUND;
1409 args->converter->fromUChar32=sourceChar;
1410 break;
1411 }
b75a7d8f
A
1412 }
1413
73c04bcf
A
1414 /* do not convert SO/SI/ESC */
1415 if(IS_2022_CONTROL(sourceChar)) {
1416 /* callback(illegal) */
1417 *err=U_ILLEGAL_CHAR_FOUND;
1418 args->converter->fromUChar32=sourceChar;
1419 break;
1420 }
1421
374ca955 1422 /* do the conversion */
b75a7d8f 1423
374ca955
A
1424 if(choiceCount == 0) {
1425 uint16_t csm;
b75a7d8f 1426
374ca955
A
1427 /*
1428 * The csm variable keeps track of which charsets are allowed
1429 * and not used yet while building the choices[].
1430 */
1431 csm = jpCharsetMasks[converterData->version];
1432 choiceCount = 0;
1433
1434 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1435 if(converterData->version == 3 || converterData->version == 4) {
1436 choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
1437 csm &= ~CSM(cs);
1438 }
b75a7d8f 1439
374ca955
A
1440 /* try the current G0 charset */
1441 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1442 csm &= ~CSM(cs);
b75a7d8f 1443
374ca955
A
1444 /* try the current G2 charset */
1445 if((cs = pFromU2022State->cs[2]) != 0) {
1446 choices[choiceCount++] = cs;
1447 csm &= ~CSM(cs);
1448 }
1449
1450 /* try all the other possible charsets */
1451 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1452 cs = (int8_t)jpCharsetPref[i];
1453 if(CSM(cs) & csm) {
1454 choices[choiceCount++] = cs;
1455 csm &= ~CSM(cs);
b75a7d8f
A
1456 }
1457 }
374ca955 1458 }
b75a7d8f 1459
374ca955
A
1460 cs = g = 0;
1461 len = 0;
1462
1463 for(i = 0; i < choiceCount && len == 0; ++i) {
1464 cs = choices[i];
1465 switch(cs) {
1466 case ASCII:
1467 if(sourceChar <= 0x7f) {
1468 targetValue = (uint32_t)sourceChar;
1469 len = 1;
b75a7d8f 1470 }
374ca955
A
1471 break;
1472 case ISO8859_1:
1473 if(0x80 <= sourceChar && sourceChar <= 0xff) {
1474 targetValue = (uint32_t)sourceChar - 0x80;
1475 len = 1;
1476 g = 2;
1477 }
1478 break;
1479 case HWKANA_7BIT:
1480 if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
1481 targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
1482 len = 1;
1483
1484 if(converterData->version==3) {
1485 /* JIS7: use G1 (SO) */
1486 pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
1487 g = 1;
1488 } else if(converterData->version==4) {
1489 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1490 int8_t cs0;
1491
1492 targetValue += 0x80;
1493
1494 cs0 = pFromU2022State->cs[0];
1495 if(IS_JP_DBCS(cs0)) {
1496 /* switch from a DBCS charset to JISX201 */
1497 cs = (int8_t)JISX201;
1498 } else {
1499 /* stay in the current G0 charset */
1500 cs = cs0;
b75a7d8f 1501 }
b75a7d8f 1502 }
b75a7d8f 1503 }
374ca955
A
1504 break;
1505 case JISX201:
1506 /* G0 SBCS */
1507 MBCS_SINGLE_FROM_UCHAR32(
1508 converterData->myConverterArray[cs],
1509 sourceChar, &targetValue,
1510 useFallback);
1511 if(targetValue <= 0x7f) {
1512 len = 1;
1513 }
1514 break;
1515 case ISO8859_7:
1516 /* G0 SBCS forced to 7-bit output */
1517 MBCS_SINGLE_FROM_UCHAR32(
1518 converterData->myConverterArray[cs],
1519 sourceChar, &targetValue,
1520 useFallback);
1521 if(0x80 <= targetValue && targetValue <= 0xff) {
1522 targetValue -= 0x80;
1523 len = 1;
1524 g = 2;
1525 }
1526 break;
1527 default:
1528 /* G0 DBCS */
1529 MBCS_FROM_UCHAR32_ISO2022(
1530 converterData->myConverterArray[cs],
1531 sourceChar, &targetValue,
1532 useFallback, &len, MBCS_OUTPUT_2);
1533 if(len != 2) {
1534 len = 0;
1535 }
1536 break;
b75a7d8f
A
1537 }
1538 }
b75a7d8f 1539
374ca955
A
1540 if(len > 0) {
1541 outLen = 0; /* count output bytes */
1542
1543 /* write SI if necessary (only for JIS7) */
1544 if(pFromU2022State->g == 1 && g == 0) {
1545 buffer[outLen++] = UCNV_SI;
1546 pFromU2022State->g = 0;
1547 }
1548
1549 /* write the designation sequence if necessary */
1550 if(cs != pFromU2022State->cs[g]) {
1551 int32_t escLen = escSeqCharsLen[cs];
1552 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1553 outLen += escLen;
1554 pFromU2022State->cs[g] = cs;
1555
1556 /* invalidate the choices[] */
1557 choiceCount = 0;
1558 }
1559
1560 /* write the shift sequence if necessary */
1561 if(g != pFromU2022State->g) {
1562 switch(g) {
1563 /* case 0 handled before writing escapes */
1564 case 1:
1565 buffer[outLen++] = UCNV_SO;
1566 pFromU2022State->g = 1;
1567 break;
1568 default: /* case 2 */
1569 buffer[outLen++] = 0x1b;
1570 buffer[outLen++] = 0x4e;
1571 break;
1572 /* no case 3: no SS3 in ISO-2022-JP-x */
1573 }
1574 }
1575
1576 /* write the output bytes */
1577 if(len == 1) {
1578 buffer[outLen++] = (char)targetValue;
1579 } else /* len == 2 */ {
1580 buffer[outLen++] = (char)(targetValue >> 8);
1581 buffer[outLen++] = (char)targetValue;
1582 }
1583 } else {
1584 /*
1585 * if we cannot find the character after checking all codepages
b75a7d8f
A
1586 * then this is an error
1587 */
b75a7d8f 1588 *err = U_INVALID_CHAR_FOUND;
374ca955
A
1589 args->converter->fromUChar32=sourceChar;
1590 break;
1591 }
1592
1593 if(sourceChar == CR || sourceChar == LF) {
1594 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1595 pFromU2022State->cs[2] = 0;
1596 choiceCount = 0;
1597 }
1598
1599 /* output outLen>0 bytes in buffer[] */
1600 if(outLen == 1) {
1601 *target++ = buffer[0];
1602 if(offsets) {
73c04bcf 1603 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
b75a7d8f 1604 }
374ca955
A
1605 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1606 *target++ = buffer[0];
1607 *target++ = buffer[1];
1608 if(offsets) {
1609 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1610 *offsets++ = sourceIndex;
1611 *offsets++ = sourceIndex;
1612 }
1613 } else {
73c04bcf 1614 fromUWriteUInt8(
374ca955
A
1615 args->converter,
1616 buffer, outLen,
73c04bcf 1617 &target, (const char *)targetLimit,
374ca955
A
1618 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1619 err);
1620 if(U_FAILURE(*err)) {
b75a7d8f
A
1621 break;
1622 }
1623 }
1624 } /* end if(myTargetIndex<myTargetLength) */
1625 else{
1626 *err =U_BUFFER_OVERFLOW_ERROR;
1627 break;
1628 }
1629
1630 }/* end while(mySourceIndex<mySourceLength) */
1631
374ca955
A
1632 /*
1633 * the end of the input stream and detection of truncated input
1634 * are handled by the framework, but for ISO-2022-JP conversion
1635 * we need to be in ASCII mode at the very end
1636 *
1637 * conditions:
1638 * successful
1639 * in SO mode or not in ASCII mode
1640 * end of input and no truncated input
b75a7d8f 1641 */
374ca955
A
1642 if( U_SUCCESS(*err) &&
1643 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1644 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
1645 ) {
1646 int32_t sourceIndex;
1647
1648 outLen = 0;
1649
1650 if(pFromU2022State->g != 0) {
1651 buffer[outLen++] = UCNV_SI;
1652 pFromU2022State->g = 0;
1653 }
1654
1655 if(pFromU2022State->cs[0] != ASCII) {
1656 int32_t escLen = escSeqCharsLen[ASCII];
1657 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1658 outLen += escLen;
1659 pFromU2022State->cs[0] = (int8_t)ASCII;
1660 }
1661
1662 /* get the source index of the last input character */
1663 /*
1664 * TODO this would be simpler and more reliable if we used a pair
1665 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1666 * so that we could simply use the prevSourceIndex here;
1667 * this code gives an incorrect result for the rare case of an unmatched
1668 * trail surrogate that is alone in the last buffer of the text stream
1669 */
1670 sourceIndex=(int32_t)(source-args->source);
1671 if(sourceIndex>0) {
1672 --sourceIndex;
1673 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1674 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1675 ) {
1676 --sourceIndex;
1677 }
1678 } else {
1679 sourceIndex=-1;
1680 }
1681
73c04bcf 1682 fromUWriteUInt8(
374ca955
A
1683 args->converter,
1684 buffer, outLen,
73c04bcf 1685 &target, (const char *)targetLimit,
374ca955
A
1686 &offsets, sourceIndex,
1687 err);
b75a7d8f
A
1688 }
1689
1690 /*save the state and return */
1691 args->source = source;
1692 args->target = (char*)target;
1693}
1694
1695/*************** to unicode *******************/
1696
b75a7d8f
A
1697static void
1698UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
374ca955
A
1699 UErrorCode* err){
1700 char tempBuf[3];
1701 const char *mySource = (char *) args->source;
b75a7d8f
A
1702 UChar *myTarget = args->target;
1703 const char *mySourceLimit = args->sourceLimit;
1704 uint32_t targetUniChar = 0x0000;
1705 uint32_t mySourceChar = 0x0000;
1706 UConverterDataISO2022* myData;
374ca955
A
1707 ISO2022State *pToU2022State;
1708 StateEnum cs;
b75a7d8f 1709
b75a7d8f 1710 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
374ca955 1711 pToU2022State = &myData->toU2022State;
b75a7d8f 1712
374ca955
A
1713 if(myData->key != 0) {
1714 /* continue with a partial escape sequence */
1715 goto escape;
1716 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1717 /* continue with a partial double-byte character */
1718 mySourceChar = args->converter->toUBytes[0];
1719 args->converter->toULength = 0;
1720 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
fd0068a8 1721 targetUniChar = missingCharMarker;
374ca955
A
1722 goto getTrailByte;
1723 }
1724
1725 while(mySource < mySourceLimit){
1726
1727 targetUniChar =missingCharMarker;
b75a7d8f
A
1728
1729 if(myTarget < args->targetLimit){
1730
1731 mySourceChar= (unsigned char) *mySource++;
374ca955
A
1732
1733 switch(mySourceChar) {
1734 case UCNV_SI:
1735 if(myData->version==3) {
1736 pToU2022State->g=0;
b75a7d8f 1737 continue;
374ca955
A
1738 } else {
1739 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 1740 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 1741 break;
b75a7d8f 1742 }
b75a7d8f 1743
374ca955
A
1744 case UCNV_SO:
1745 if(myData->version==3) {
1746 /* JIS7: switch to G1 half-width Katakana */
1747 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
1748 pToU2022State->g=1;
b75a7d8f 1749 continue;
374ca955
A
1750 } else {
1751 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 1752 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 1753 break;
b75a7d8f 1754 }
b75a7d8f 1755
374ca955
A
1756 case ESC_2022:
1757 mySource--;
1758escape:
d5d484b0
A
1759 {
1760 const char * mySourceBefore = mySource;
1761 int8_t toULengthBefore = args->converter->toULength;
1762
1763 changeState_2022(args->converter,&(mySource),
1764 mySourceLimit, ISO_2022_JP,err);
1765
1766 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
1767 if ( myData->version == 0 && myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
1768 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
1769 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
1770 }
b75a7d8f 1771
d5d484b0 1772 }
374ca955
A
1773 /* invalid or illegal escape sequence */
1774 if(U_FAILURE(*err)){
1775 args->target = myTarget;
1776 args->source = mySource;
d5d484b0 1777 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
374ca955 1778 return;
b75a7d8f 1779 }
d5d484b0
A
1780 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
1781 if (myData->key == 0) {
1782 myData->isEmptySegment = TRUE;
1783 }
374ca955 1784 continue;
b75a7d8f 1785
374ca955 1786 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
b75a7d8f 1787
374ca955
A
1788 case CR:
1789 /*falls through*/
1790 case LF:
1791 /* automatically reset to single-byte mode */
1792 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
1793 pToU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f 1794 }
374ca955
A
1795 pToU2022State->cs[2] = 0;
1796 pToU2022State->g = 0;
1797 /* falls through */
b75a7d8f 1798 default:
374ca955 1799 /* convert one or two bytes */
d5d484b0 1800 myData->isEmptySegment = FALSE;
374ca955
A
1801 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1802 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
1803 !IS_JP_DBCS(cs)
1804 ) {
1805 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1806 targetUniChar = mySourceChar + (0xff61 - 0xa1);
1807
1808 /* return from a single-shift state to the previous one */
1809 if(pToU2022State->g >= 2) {
1810 pToU2022State->g=pToU2022State->prevG;
1811 }
1812 } else switch(cs) {
1813 case ASCII:
1814 if(mySourceChar <= 0x7f) {
1815 targetUniChar = mySourceChar;
1816 }
1817 break;
1818 case ISO8859_1:
1819 if(mySourceChar <= 0x7f) {
1820 targetUniChar = mySourceChar + 0x80;
1821 }
1822 /* return from a single-shift state to the previous one */
1823 pToU2022State->g=pToU2022State->prevG;
1824 break;
1825 case ISO8859_7:
1826 if(mySourceChar <= 0x7f) {
1827 /* convert mySourceChar+0x80 to use a normal 8-bit table */
1828 targetUniChar =
1829 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1830 myData->myConverterArray[cs],
1831 mySourceChar + 0x80);
1832 }
1833 /* return from a single-shift state to the previous one */
1834 pToU2022State->g=pToU2022State->prevG;
1835 break;
1836 case JISX201:
1837 if(mySourceChar <= 0x7f) {
1838 targetUniChar =
1839 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1840 myData->myConverterArray[cs],
1841 mySourceChar);
1842 }
1843 break;
1844 case HWKANA_7BIT:
1845 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
1846 /* 7-bit halfwidth Katakana */
1847 targetUniChar = mySourceChar + (0xff61 - 0x21);
1848 }
1849 break;
1850 default:
1851 /* G0 DBCS */
1852 if(mySource < mySourceLimit) {
fd0068a8
A
1853 int leadIsOk, trailIsOk;
1854 uint8_t trailByte;
374ca955 1855getTrailByte:
fd0068a8
A
1856 trailByte = (uint8_t)*mySource;
1857 /* old
374ca955
A
1858 tempBuf[0] = (char) (mySourceChar);
1859 tempBuf[1] = trailByte = *mySource++;
1860 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
1861 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
fd0068a8
A
1862 */
1863 /*
1864 * Ticket 5691: consistent illegal sequences:
1865 * - We include at least the first byte in the illegal sequence.
1866 * - If any of the non-initial bytes could be the start of a character,
1867