]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnv2022.c
ICU-8.11.2.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.c
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
d5d484b0 3* Copyright (C) 2000-2006,2008 International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6* file name: ucnv2022.c
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2000feb03
12* created by: Markus W. Scherer
13*
14* Change history:
15*
16* 06/29/2000 helena Major rewrite of the callback APIs.
17* 08/08/2000 Ram Included support for ISO-2022-JP-2
18* Changed implementation of toUnicode
19* function
20* 08/21/2000 Ram Added support for ISO-2022-KR
21* 08/29/2000 Ram Separated implementation of EBCDIC to
22* ucnvebdc.c
23* 09/20/2000 Ram Added support for ISO-2022-CN
24* Added implementations for getNextUChar()
25* for specific 2022 country variants.
26* 10/31/2000 Ram Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
374ca955 31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
374ca955 37#include "ucnv_imp.h"
b75a7d8f
A
38#include "ucnv_bld.h"
39#include "ucnv_cnv.h"
40#include "ucnvmbcs.h"
41#include "cstring.h"
42#include "cmemory.h"
43
374ca955
A
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it composes safely with
 * any operator applied to the macro invocation.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46#ifdef U_ENABLE_GENERIC_ISO_2022
47/*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76#endif
77
/* SI and SO as one-byte C strings */
static const char SHIFT_IN_STR[]  = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* ASCII control and whitespace code points */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range test is made on the value converted to uint32_t: a negative
 * argument then fails the test instead of reaching the shift, where a
 * negative shift count would be undefined behavior.
 * Note that the argument is evaluated twice.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
94
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups deliberately reuse the small values 1..3; the values are
 * also used as indexes into myConverterArray[] and as bit positions for CSM().
 */
typedef enum {
    /* shared values */
    INVALID_STATE   = -1,
    ASCII           = 0,

    SS2_STATE       = 0x10,
    SS3_STATE       = 0x11,

    /* JP */
    ISO8859_1       = 1,
    ISO8859_7       = 2,
    JISX201         = 3,
    JISX208         = 4,
    JISX212         = 5,
    GB2312          = 6,
    KSC5601         = 7,
    HWKANA_7BIT     = 8,    /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1        = 1,
    ISO_IR_165      = 2,
    CNS_11643       = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0     = 0x20,
    CNS_11643_1     = 0x21,
    CNS_11643_2     = 0x22,
    CNS_11643_3     = 0x23,
    CNS_11643_4     = 0x24,
    CNS_11643_5     = 0x25,
    CNS_11643_6     = 0x26,
    CNS_11643_7     = 0x27
} StateEnum;

/* is the StateEnum charset value for a DBCS charset? (JISX208..KSC5601) */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/* charset mask bit for a StateEnum value; only meaningful for the small JP values 0..8 */
#define CSM(cs) ((uint16_t)1<<(cs))
b75a7d8f 138
374ca955
A
139/*
140 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
141 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
142 *
143 * Note: The converter uses some leniency:
144 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
145 * all versions, not just JIS7 and JIS8.
146 * - ICU does not distinguish between different versions of JIS X 0208.
147 */
148static const uint16_t jpCharsetMasks[5]={
149 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
150 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
151 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
152 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
153 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
154};
b75a7d8f
A
155
/* Values for UConverterDataISO2022.currentType */
typedef enum {
    ASCII1  = 0,
    LATIN1  = 1,
    SBCS    = 2,
    DBCS    = 3,
    MBCS    = 4,
    HWKANA  = 5
}Cnv2022Type;
164
374ca955
A
/* Designation and shift state for one conversion direction */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;
170
b75a7d8f
A
171#define UCNV_OPTIONS_VERSION_MASK 0xf
172#define UCNV_2022_MAX_CONVERTERS 10
173
174typedef struct{
73c04bcf 175 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
b75a7d8f 176 UConverter *currentConverter;
b75a7d8f 177 Cnv2022Type currentType;
374ca955 178 ISO2022State toU2022State, fromU2022State;
b75a7d8f
A
179 uint32_t key;
180 uint32_t version;
73c04bcf
A
181#ifdef U_ENABLE_GENERIC_ISO_2022
182 UBool isFirstBuffer;
183#endif
d5d484b0 184 UBool isEmptySegment;
b75a7d8f 185 char name[30];
73c04bcf 186 char locale[3];
b75a7d8f
A
187}UConverterDataISO2022;
188
374ca955 189/* Protos */
b75a7d8f
A
190/* ISO-2022 ----------------------------------------------------------------- */
191
192/*Forward declaration */
193U_CFUNC void
374ca955
A
194ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
195 UErrorCode * err);
b75a7d8f 196U_CFUNC void
374ca955
A
197ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
198 UErrorCode * err);
b75a7d8f
A
199
#define ESC_2022 0x1B /*ESC*/

/* Result of matching the bytes seen so far against the table of known escape sequences */
typedef enum
{
    INVALID_2022                = -1,   /* does not correspond to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022     = 0,    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022         = 1,    /* corresponds to a valid iso 2022 escape sequence */
    VALID_MAYBE_TERMINAL_2022   = 2     /* so far matches one iso 2022 escape sequence, but by adding
                                           more characters might match another escape sequence */
} UCNV_TableStates_2022;
209
210/*
211* The way these state transition arrays work is:
212* ex : ESC$B is the sequence for JISX208
213* a) First Iteration: char is ESC
214* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
215* int x = normalize_esq_chars_2022[27] which is equal to 1
216* ii) Search for this value in escSeqStateTable_Key_2022[]
217* value of x is stored at escSeqStateTable_Key_2022[0]
218* iii) Save this index as offset
219* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
220* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
221* b) Switch on this state and continue to next char
222* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
223* which is normalize_esq_chars_2022[36] == 4
224* ii) x is currently 1(from above)
225* x<<=5 -- x is now 32
226* x+=normalize_esq_chars_2022[36]
227* now x is 36
228* iii) Search for this value in escSeqStateTable_Key_2022[]
229* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
230* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
231* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
* c) Switch on this state and continue to next char
*     i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
*        which is normalize_esq_chars_2022[66] == 9
*    ii) x is currently 36 (from above)
*        x<<=5 -- x is now 1152
*        x+=normalize_esq_chars_2022[66]
*        now x is 1161
*   iii) Search for this value in escSeqStateTable_Key_2022[]
*        value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*    iv) Get state of this sequence from escSeqStateTable_Value_2022[]
*        escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*     v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
243*/
244
245
246/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte to its 5-bit code for building escape sequence keys (see getKey_2022()).
 * 0 means that the byte cannot occur anywhere in a recognized escape sequence.
 * The nonzero entries are ESC (0x1B), the intermediate characters $ % & ( ) * + - . /
 * and the final characters @ A..O R Z. Rows are 16 entries, indexed by the high nibble.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F */
/* 0x */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 1x */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
/* 2x */ 0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
/* 3x */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 4x */ 5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/* 5x */ 0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
/* 6x */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 7x */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 8x */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 9x */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ax */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Bx */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Cx */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Dx */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Ex */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* Fx */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
277
374ca955
A
278#ifdef U_ENABLE_GENERIC_ISO_2022
279/*
280 * When the generic ISO-2022 converter is completely removed, not just disabled
281 * per #ifdef, then the following state table and the associated tables that are
282 * dimensioned with MAX_STATES_2022 should be trimmed.
283 *
284 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
285 * the associated escape sequences starting with ESC ( B should be removed.
286 * This includes the ones with key values 1097 and all of the ones above 1000000.
287 *
288 * For the latter, the tables can simply be truncated.
289 * For the former, since the tables must be kept parallel, it is probably best
290 * to simply duplicate an adjacent table cell, parallel in all tables.
291 *
292 * It may make sense to restructure the tables, especially by using small search
293 * tables for the variants instead of indexing them parallel to the table here.
294 */
295#endif
296
b75a7d8f
A
/* number of entries in each of the parallel escape sequence tables */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape sequence prefixes and complete sequences.
 * A key is built in getKey_2022() as key=(key<<5)+normalize_esq_chars_2022[byte].
 * The table must stay sorted in ascending order because it is binary-searched,
 * and it must stay parallel to the other tables dimensioned with MAX_STATES_2022.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*  0 */ 1,        34,       36,       39,       55,
/*  5 */ 57,       60,       61,       1093,     1096,
/* 10 */ 1097,     1098,     1099,     1100,     1101,
/* 15 */ 1102,     1103,     1104,     1105,     1106,
/* 20 */ 1109,     1154,     1157,     1160,     1161,
/* 25 */ 1176,     1178,     1179,     1254,     1257,
/* 30 */ 1768,     1773,     1957,     35105,    36933,
/* 35 */ 36936,    36937,    36938,    36939,    36940,
/* 40 */ 36942,    36943,    36944,    36945,    36946,
/* 45 */ 36947,    36948,    37640,    37642,    37644,
/* 50 */ 37646,    37711,    37744,    37745,    37746,
/* 55 */ 37747,    37748,    40133,    40136,    40138,
/* 60 */ 40139,    40140,    40141,    1123363,  35947624,
/* 65 */ 35947625, 35947626, 35947627, 35947629, 35947630,
/* 70 */ 35947631, 35947635, 35947636, 35947638
};
310
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter names for the generic ISO-2022 converter, parallel to escSeqStateTable_Key_2022[].
 * NULL marks entries that do not name a converter.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
/*  0 */ NULL,                   NULL,                   NULL,                   NULL,                   NULL,
/*  5 */ NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
/* 10 */ "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
/* 15 */ "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
/* 20 */ "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
/* 25 */ NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
/* 30 */ "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
/* 35 */ "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
/* 40 */ "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
/* 45 */ "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
/* 50 */ "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
/* 55 */ "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
/* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
/* 65 */ "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
/* 70 */ "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
327
b75a7d8f
A
328static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
329/* 0 1 2 3 4 5 6 7 8 9 */
374ca955 330 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
b75a7d8f
A
331 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
332 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
333 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
334 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
335 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
336 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
337 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
338};
339
340
b75a7d8f
A
341/* Type def for refactoring changeState_2022 code*/
/* Selects the ISO-2022 variant in changeState_2022(); ISO_2022 exists only with the generic converter enabled */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;
350
b75a7d8f
A
351/*********** ISO 2022 Converter Protos ***********/
352static void
353_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
354
355static void
356 _ISO2022Close(UConverter *converter);
357
358static void
359_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
360
361static const char*
362_ISO2022getName(const UConverter* cnv);
363
364static void
365_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
366
367static UConverter *
368_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
369
374ca955 370#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f 371static void
374ca955
A
372T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
373#endif
b75a7d8f 374
374ca955
A
375/*const UConverterSharedData _ISO2022Data;*/
376static const UConverterSharedData _ISO2022JPData;
377static const UConverterSharedData _ISO2022KRData;
378static const UConverterSharedData _ISO2022CNData;
b75a7d8f 379
374ca955 380/*************** Converter implementations ******************/
b75a7d8f 381
73c04bcf
A
382/* The purpose of this function is to get around gcc compiler warnings. */
383static U_INLINE void
384fromUWriteUInt8(UConverter *cnv,
385 const char *bytes, int32_t length,
386 uint8_t **target, const char *targetLimit,
387 int32_t **offsets,
388 int32_t sourceIndex,
389 UErrorCode *pErrorCode)
390{
391 char *targetChars = (char *)*target;
392 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
393 offsets, sourceIndex, pErrorCode);
394 *target = (uint8_t*)targetChars;
395
396}
397
398static U_INLINE void
374ca955
A
399setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
400 if(myConverterData->version == 1) {
401 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 402
374ca955
A
403 cnv->toUnicodeStatus=0; /* offset */
404 cnv->mode=0; /* state */
405 cnv->toULength=0; /* byteIndex */
406 }
407}
b75a7d8f 408
73c04bcf 409static U_INLINE void
374ca955
A
410setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
411 /* in ISO-2022-KR the designator sequence appears only once
412 * in a file so we append it only once
413 */
414 if( converter->charErrorBufferLength==0){
b75a7d8f 415
374ca955
A
416 converter->charErrorBufferLength = 4;
417 converter->charErrorBuffer[0] = 0x1b;
418 converter->charErrorBuffer[1] = 0x24;
419 converter->charErrorBuffer[2] = 0x29;
420 converter->charErrorBuffer[3] = 0x43;
421 }
422 if(myConverterData->version == 1) {
423 UConverter *cnv = myConverterData->currentConverter;
b75a7d8f 424
374ca955
A
425 cnv->fromUChar32=0;
426 cnv->fromUnicodeStatus=1; /* prevLength */
427 }
428}
b75a7d8f 429
374ca955
A
430static void
431_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
b75a7d8f 432
374ca955 433 char myLocale[6]={' ',' ',' ',' ',' ',' '};
b75a7d8f 434
374ca955
A
435 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
436 if(cnv->extraInfo != NULL) {
437 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
438 uint32_t version;
b75a7d8f 439
374ca955 440 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
374ca955 441 myConverterData->currentType = ASCII1;
374ca955
A
442 cnv->fromUnicodeStatus =FALSE;
443 if(locale){
444 uprv_strncpy(myLocale, locale, sizeof(myLocale));
445 }
374ca955 446 version = options & UCNV_OPTIONS_VERSION_MASK;
73c04bcf 447 myConverterData->version = version;
374ca955 448 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
73c04bcf
A
449 (myLocale[2]=='_' || myLocale[2]=='\0'))
450 {
451 size_t len=0;
374ca955
A
452 /* open the required converters and cache them */
453 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
454 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
455 }
456 myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
457 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
458 if(jpCharsetMasks[version]&CSM(JISX212)) {
459 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
460 }
461 if(jpCharsetMasks[version]&CSM(GB2312)) {
462 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
463 }
464 if(jpCharsetMasks[version]&CSM(KSC5601)) {
465 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
466 }
b75a7d8f 467
374ca955
A
468 /* set the function pointers to appropriate funtions */
469 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
470 uprv_strcpy(myConverterData->locale,"ja");
b75a7d8f 471
374ca955
A
472 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
473 len = uprv_strlen(myConverterData->name);
474 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
475 myConverterData->name[len+1]='\0';
476 }
477 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
73c04bcf
A
478 (myLocale[2]=='_' || myLocale[2]=='\0'))
479 {
480 if (version==1){
481 myConverterData->currentConverter=
482 ucnv_open("icu-internal-25546",errorCode);
b75a7d8f 483
73c04bcf
A
484 if (U_FAILURE(*errorCode)) {
485 _ISO2022Close(cnv);
486 return;
487 }
b75a7d8f 488
73c04bcf
A
489 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
490 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
491 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
374ca955 492 }else{
73c04bcf 493 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
b75a7d8f 494
73c04bcf
A
495 if (U_FAILURE(*errorCode)) {
496 _ISO2022Close(cnv);
497 return;
498 }
b75a7d8f 499
73c04bcf
A
500 myConverterData->version = 0;
501 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
374ca955 502 }
b75a7d8f 503
374ca955
A
504 /* initialize the state variables */
505 setInitialStateToUnicodeKR(cnv, myConverterData);
73c04bcf 506 setInitialStateFromUnicodeKR(cnv, myConverterData);
b75a7d8f
A
507
508 /* set the function pointers to appropriate funtions */
509 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
b75a7d8f
A
510 uprv_strcpy(myConverterData->locale,"ko");
511 }
512 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
73c04bcf
A
513 (myLocale[2]=='_' || myLocale[2]=='\0'))
514 {
b75a7d8f
A
515
516 /* open the required converters and cache them */
374ca955
A
517 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
518 if(version==1) {
519 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
520 }
521 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
b75a7d8f 522
b75a7d8f
A
523
524 /* set the function pointers to appropriate funtions */
525 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
526 uprv_strcpy(myConverterData->locale,"cn");
527
73c04bcf 528 if (version==1){
b75a7d8f
A
529 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
530 }else{
b75a7d8f 531 myConverterData->version = 0;
73c04bcf 532 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
b75a7d8f
A
533 }
534 }
535 else{
374ca955 536#ifdef U_ENABLE_GENERIC_ISO_2022
73c04bcf
A
537 myConverterData->isFirstBuffer = TRUE;
538
b75a7d8f
A
539 /* append the UTF-8 escape sequence */
540 cnv->charErrorBufferLength = 3;
541 cnv->charErrorBuffer[0] = 0x1b;
542 cnv->charErrorBuffer[1] = 0x25;
543 cnv->charErrorBuffer[2] = 0x42;
544
545 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
546 /* initialize the state variables */
b75a7d8f 547 uprv_strcpy(myConverterData->name,"ISO_2022");
374ca955
A
548#else
549 *errorCode = U_UNSUPPORTED_ERROR;
550 return;
551#endif
b75a7d8f
A
552 }
553
374ca955
A
554 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
555
556 if(U_FAILURE(*errorCode)) {
557 _ISO2022Close(cnv);
558 }
b75a7d8f
A
559 } else {
560 *errorCode = U_MEMORY_ALLOCATION_ERROR;
561 }
b75a7d8f
A
562}
563
564
565static void
566_ISO2022Close(UConverter *converter) {
374ca955
A
567 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
568 UConverterSharedData **array = myData->myConverterArray;
569 int32_t i;
b75a7d8f
A
570
571 if (converter->extraInfo != NULL) {
572 /*close the array of converter pointers and free the memory*/
374ca955
A
573 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
574 if(array[i]!=NULL) {
575 ucnv_unloadSharedDataIfReady(array[i]);
b75a7d8f 576 }
b75a7d8f
A
577 }
578
374ca955 579 ucnv_close(myData->currentConverter);
b75a7d8f
A
580
581 if(!converter->isExtraLocal){
582 uprv_free (converter->extraInfo);
374ca955 583 converter->extraInfo = NULL;
b75a7d8f
A
584 }
585 }
586}
587
588static void
589_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
590 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
374ca955
A
591 if(choice<=UCNV_RESET_TO_UNICODE) {
592 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
593 myConverterData->key = 0;
d5d484b0 594 myConverterData->isEmptySegment = FALSE;
374ca955
A
595 }
596 if(choice!=UCNV_RESET_TO_UNICODE) {
597 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
598 }
599#ifdef U_ENABLE_GENERIC_ISO_2022
600 if(myConverterData->locale[0] == 0){
b75a7d8f
A
601 if(choice<=UCNV_RESET_TO_UNICODE) {
602 myConverterData->isFirstBuffer = TRUE;
374ca955 603 myConverterData->key = 0;
b75a7d8f
A
604 if (converter->mode == UCNV_SO){
605 ucnv_close (myConverterData->currentConverter);
606 myConverterData->currentConverter=NULL;
607 }
608 converter->mode = UCNV_SI;
609 }
610 if(choice!=UCNV_RESET_TO_UNICODE) {
611 /* re-append UTF-8 escape sequence */
612 converter->charErrorBufferLength = 3;
613 converter->charErrorBuffer[0] = 0x1b;
614 converter->charErrorBuffer[1] = 0x28;
615 converter->charErrorBuffer[2] = 0x42;
616 }
617 }
374ca955
A
618 else
619#endif
620 {
b75a7d8f 621 /* reset the state variables */
374ca955 622 if(myConverterData->locale[0] == 'k'){
b75a7d8f
A
623 if(choice<=UCNV_RESET_TO_UNICODE) {
624 setInitialStateToUnicodeKR(converter, myConverterData);
625 }
626 if(choice!=UCNV_RESET_TO_UNICODE) {
627 setInitialStateFromUnicodeKR(converter, myConverterData);
628 }
629 }
630 }
631}
632
633static const char*
634_ISO2022getName(const UConverter* cnv){
635 if(cnv->extraInfo){
636 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
637 return myData->name;
638 }
639 return NULL;
640}
641
b75a7d8f 642
374ca955
A
643/*************** to unicode *******************/
644/****************************************************************************
645 * Recognized escape sequences are
646 * <ESC>(B ASCII
647 * <ESC>.A ISO-8859-1
648 * <ESC>.F ISO-8859-7
649 * <ESC>(J JISX-201
650 * <ESC>(I JISX-201
651 * <ESC>$B JISX-208
652 * <ESC>$@ JISX-208
653 * <ESC>$(D JISX-212
654 * <ESC>$A GB2312
655 * <ESC>$(C KSC5601
656 */
657static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
658/* 0 1 2 3 4 5 6 7 8 9 */
659 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
660 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
661 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
662 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
663 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
664 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
665 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
666 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
667};
b75a7d8f 668
374ca955
A
669/*************** to unicode *******************/
670static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
671/* 0 1 2 3 4 5 6 7 8 9 */
672 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
673 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
674 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
675 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
676 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
677 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
678 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
679 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
680};
b75a7d8f 681
b75a7d8f 682
374ca955
A
683static UCNV_TableStates_2022
684getKey_2022(char c,int32_t* key,int32_t* offset){
685 int32_t togo;
686 int32_t low = 0;
687 int32_t hi = MAX_STATES_2022;
688 int32_t oldmid=0;
b75a7d8f 689
374ca955
A
690 togo = normalize_esq_chars_2022[(uint8_t)c];
691 if(togo == 0) {
692 /* not a valid character anywhere in an escape sequence */
693 *key = 0;
694 *offset = 0;
695 return INVALID_2022;
696 }
697 togo = (*key << 5) + togo;
b75a7d8f 698
374ca955 699 while (hi != low) /*binary search*/{
b75a7d8f 700
374ca955
A
701 register int32_t mid = (hi+low) >> 1; /*Finds median*/
702
703 if (mid == oldmid)
704 break;
705
706 if (escSeqStateTable_Key_2022[mid] > togo){
707 hi = mid;
708 }
709 else if (escSeqStateTable_Key_2022[mid] < togo){
710 low = mid;
711 }
712 else /*we found it*/{
713 *key = togo;
714 *offset = mid;
715 return escSeqStateTable_Value_2022[mid];
716 }
717 oldmid = mid;
b75a7d8f 718
b75a7d8f 719 }
b75a7d8f 720
374ca955
A
721 *key = 0;
722 *offset = 0;
723 return INVALID_2022;
b75a7d8f
A
724}
725
374ca955
A
/*runs through a state machine to determine the escape sequence - codepage correspondence
 */
728static void
729changeState_2022(UConverter* _this,
730 const char** source,
731 const char* sourceLimit,
732 Variant2022 var,
733 UErrorCode* err){
734 UCNV_TableStates_2022 value;
735 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
736 uint32_t key = myData2022->key;
73c04bcf 737 int32_t offset = 0;
374ca955
A
738 char c;
739
740 value = VALID_NON_TERMINAL_2022;
741 while (*source < sourceLimit) {
742 c = *(*source)++;
743 _this->toUBytes[_this->toULength++]=(uint8_t)c;
744 value = getKey_2022(c,(int32_t *) &key, &offset);
745
746 switch (value){
b75a7d8f 747
374ca955
A
748 case VALID_NON_TERMINAL_2022 :
749 /* continue with the loop */
750 break;
b75a7d8f 751
374ca955
A
752 case VALID_TERMINAL_2022:
753 key = 0;
754 goto DONE;
b75a7d8f 755
374ca955
A
756 case INVALID_2022:
757 goto DONE;
b75a7d8f 758
374ca955
A
759 case VALID_MAYBE_TERMINAL_2022:
760#ifdef U_ENABLE_GENERIC_ISO_2022
761 /* ESC ( B is ambiguous only for ISO_2022 itself */
762 if(var == ISO_2022) {
763 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
764 _this->toULength = 0;
b75a7d8f 765
374ca955
A
766 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
767
768 /* continue with the loop */
769 value = VALID_NON_TERMINAL_2022;
770 break;
771 } else
772#endif
773 {
774 /* not ISO_2022 itself, finish here */
775 value = VALID_TERMINAL_2022;
776 key = 0;
777 goto DONE;
b75a7d8f
A
778 }
779 }
b75a7d8f 780 }
b75a7d8f 781
374ca955
A
782DONE:
783 myData2022->key = key;
b75a7d8f 784
374ca955
A
785 if (value == VALID_NON_TERMINAL_2022) {
786 /* indicate that the escape sequence is incomplete: key!=0 */
787 return;
788 } else if (value == INVALID_2022 ) {
789 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
790 return;
791 } else /* value == VALID_TERMINAL_2022 */ {
792 switch(var){
793#ifdef U_ENABLE_GENERIC_ISO_2022
794 case ISO_2022:
795 {
796 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
797 if(chosenConverterName == NULL) {
798 /* SS2 or SS3 */
799 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
800 return;
b75a7d8f 801 }
374ca955
A
802
803 _this->mode = UCNV_SI;
804 ucnv_close(myData2022->currentConverter);
805 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
806 if(U_SUCCESS(*err)) {
807 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
808 _this->mode = UCNV_SO;
809 }
810 break;
811 }
812#endif
813 case ISO_2022_JP:
814 {
815 StateEnum tempState=nextStateToUnicodeJP[offset];
816 switch(tempState) {
817 case INVALID_STATE:
818 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
819 break;
820 case SS2_STATE:
821 if(myData2022->toU2022State.cs[2]!=0) {
822 if(myData2022->toU2022State.g<2) {
823 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
824 }
825 myData2022->toU2022State.g=2;
826 } else {
827 /* illegal to have SS2 before a matching designator */
828 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
829 }
830 break;
831 /* case SS3_STATE: not used in ISO-2022-JP-x */
832 case ISO8859_1:
833 case ISO8859_7:
834 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
835 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
836 } else {
837 /* G2 charset for SS2 */
838 myData2022->toU2022State.cs[2]=(int8_t)tempState;
839 }
840 break;
841 default:
842 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
843 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
844 } else {
845 /* G0 charset */
846 myData2022->toU2022State.cs[0]=(int8_t)tempState;
847 }
848 break;
849 }
850 }
851 break;
852 case ISO_2022_CN:
853 {
854 StateEnum tempState=nextStateToUnicodeCN[offset];
855 switch(tempState) {
856 case INVALID_STATE:
857 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
858 break;
859 case SS2_STATE:
860 if(myData2022->toU2022State.cs[2]!=0) {
861 if(myData2022->toU2022State.g<2) {
862 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
863 }
864 myData2022->toU2022State.g=2;
865 } else {
866 /* illegal to have SS2 before a matching designator */
867 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
868 }
869 break;
870 case SS3_STATE:
871 if(myData2022->toU2022State.cs[3]!=0) {
872 if(myData2022->toU2022State.g<2) {
873 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
874 }
875 myData2022->toU2022State.g=3;
876 } else {
877 /* illegal to have SS3 before a matching designator */
878 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
879 }
880 break;
881 case ISO_IR_165:
882 if(myData2022->version==0) {
883 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
884 break;
885 }
73c04bcf 886 /*fall through*/
374ca955 887 case GB2312_1:
73c04bcf 888 /*fall through*/
374ca955
A
889 case CNS_11643_1:
890 myData2022->toU2022State.cs[1]=(int8_t)tempState;
891 break;
892 case CNS_11643_2:
893 myData2022->toU2022State.cs[2]=(int8_t)tempState;
894 break;
895 default:
896 /* other CNS 11643 planes */
897 if(myData2022->version==0) {
898 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
899 } else {
900 myData2022->toU2022State.cs[3]=(int8_t)tempState;
901 }
902 break;
903 }
904 }
905 break;
906 case ISO_2022_KR:
907 if(offset==0x30){
908 /* nothing to be done, just accept this one escape sequence */
909 } else {
910 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
911 }
912 break;
913
914 default:
915 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
916 break;
917 }
918 }
919 if(U_SUCCESS(*err)) {
920 _this->toULength = 0;
921 }
922}
923
924/*Checks the characters of the buffer against valid 2022 escape sequences
925*if the match we return a pointer to the initial start of the sequence otherwise
926*we return sourceLimit
927*/
928/*for 2022 looks ahead in the stream
929 *to determine the longest possible convertible
930 *data stream
931 */
932static U_INLINE const char*
933getEndOfBuffer_2022(const char** source,
934 const char* sourceLimit,
935 UBool flush){
936
937 const char* mySource = *source;
938
939#ifdef U_ENABLE_GENERIC_ISO_2022
940 if (*source >= sourceLimit)
941 return sourceLimit;
942
943 do{
944
945 if (*mySource == ESC_2022){
946 int8_t i;
947 int32_t key = 0;
948 int32_t offset;
949 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
950
951 /* Kludge: I could not
952 * figure out the reason for validating an escape sequence
953 * twice - once here and once in changeState_2022().
954 * is it possible to have an ESC character in a ISO2022
955 * byte stream which is valid in a code page? Is it legal?
956 */
957 for (i=0;
958 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
959 i++) {
960 value = getKey_2022(*(mySource+i), &key, &offset);
961 }
962 if (value > 0 || *mySource==ESC_2022)
963 return mySource;
964
965 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
966 return sourceLimit;
967 }
968 }while (++mySource < sourceLimit);
969
970 return sourceLimit;
971#else
972 while(mySource < sourceLimit && *mySource != ESC_2022) {
973 ++mySource;
974 }
975 return mySource;
976#endif
977}
978
979
980/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
981 * any future change in _MBCSFromUChar32() function should be reflected in
982 * this macro
983 */
984static U_INLINE void
985MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
986 UChar32 c,
987 uint32_t* value,
988 UBool useFallback,
989 int32_t *length,
990 int outputType)
991{
992 const int32_t *cx;
993 const uint16_t *table;
994 uint32_t stage2Entry;
995 uint32_t myValue;
996 const uint8_t *p;
997 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
998 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
999 table=sharedData->mbcs.fromUnicodeTable;
1000 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1001 /* get the bytes and the length for the output */
1002 if(outputType==MBCS_OUTPUT_2){
1003 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1004 if(myValue<=0xff) {
1005 *length=1;
1006 } else {
1007 *length=2;
1008 }
1009 } else /* outputType==MBCS_OUTPUT_3 */ {
1010 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1011 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1012 if(myValue<=0xff) {
1013 *length=1;
1014 } else if(myValue<=0xffff) {
1015 *length=2;
1016 } else {
1017 *length=3;
b75a7d8f
A
1018 }
1019 }
1020 /* is this code point assigned, or do we use fallbacks? */
1021 if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
374ca955 1022 (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
b75a7d8f
A
1023 ) {
1024 /*
374ca955 1025 * We allow a 0 byte output if the "assigned" bit is set for this entry.
b75a7d8f 1026 * There is no way with this data structure for fallback output
374ca955 1027 * to be a zero byte.
b75a7d8f
A
1028 */
1029 /* assigned */
1030 *value=myValue;
374ca955 1031 return;
b75a7d8f 1032 }
b75a7d8f 1033 }
374ca955
A
1034
1035 cx=sharedData->mbcs.extIndexes;
1036 if(cx!=NULL) {
1037 *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1038 return;
1039 }
1040
1041 /* unassigned */
1042 *length=0;
b75a7d8f
A
1043}
1044
1045/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1046 * any future change in _MBCSSingleFromUChar32() function should be reflected in
1047 * this macro
1048 */
1049static U_INLINE void
1050MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1051 UChar32 c,
1052 uint32_t* retval,
1053 UBool useFallback)
1054{
1055 const uint16_t *table;
1056 int32_t value;
1057 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955
A
1058 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1059 *retval=(uint16_t)-1;
1060 return;
b75a7d8f
A
1061 }
1062 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
374ca955 1063 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 1064 /* get the byte for the output */
374ca955 1065 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
b75a7d8f
A
1066 /* is this code point assigned, or do we use fallbacks? */
1067 if(useFallback ? value>=0x800 : value>=0xc00) {
1068 value &=0xff;
1069 } else {
1070 value= -1;
1071 }
1072 *retval=(uint16_t) value;
1073}
1074
374ca955
A
1075#ifdef U_ENABLE_GENERIC_ISO_2022
1076
b75a7d8f
A
1077/**********************************************************************************
1078* ISO-2022 Converter
1079*
1080*
1081*/
1082
b75a7d8f
A
/*
 * toUnicode implementation for the generic ISO-2022 converter (only compiled
 * with U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two steps until the input is exhausted or a return
 * condition is hit:
 *  1. when no escape sequence is pending (myData->key == 0), the bytes up to
 *     the position reported by getEndOfBuffer_2022() are converted with
 *     myData->currentConverter via ucnv_toUnicode();
 *  2. changeState_2022() is called to process an escape sequence.
 *
 * Overflow output, invalid bytes and partial input are copied from the
 * sub-converter into the outer converter before returning.
 */
1083static void
1084T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1085 UErrorCode* err){
374ca955
A
1086 const char* mySourceLimit, *realSourceLimit;
1087 const char* sourceStart;
1088 const UChar* myTargetStart;
b75a7d8f 1089 UConverter* saveThis;
b75a7d8f 1090 UConverterDataISO2022* myData;
374ca955
A
1091 int8_t length;
1092
1093 saveThis = args->converter;
1094 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1095
1096 realSourceLimit = args->sourceLimit;
1097 while (args->source < realSourceLimit) {
1098 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1099 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1100 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1101
1102 if(args->source < mySourceLimit) {
/* no converter selected yet: open an "ASCII" converter for the bytes before the first escape sequence */
1103 if(myData->currentConverter==NULL) {
1104 myData->currentConverter = ucnv_open("ASCII",err);
1105 if(U_FAILURE(*err)){
1106 return;
1107 }
b75a7d8f 1108
374ca955
A
1109 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1110 saveThis->mode = UCNV_SO;
b75a7d8f 1111 }
b75a7d8f 1112
374ca955
A
1113 /* convert to before the ESC or until the end of the buffer */
1114 myData->isFirstBuffer=FALSE;
1115 sourceStart = args->source;
1116 myTargetStart = args->target;
/* temporarily substitute the sub-converter in args; the outer converter is restored right after the call */
1117 args->converter = myData->currentConverter;
1118 ucnv_toUnicode(args->converter,
1119 &args->target,
1120 args->targetLimit,
1121 &args->source,
1122 mySourceLimit,
1123 args->offsets,
1124 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1125 err);
1126 args->converter = saveThis;
1127
1128 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1129 /* move the overflow buffer */
1130 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1131 myData->currentConverter->UCharErrorBufferLength = 0;
1132 if(length > 0) {
1133 uprv_memcpy(saveThis->UCharErrorBuffer,
1134 myData->currentConverter->UCharErrorBuffer,
1135 length*U_SIZEOF_UCHAR);
1136 }
1137 return;
1138 }
b75a7d8f 1139
374ca955
A
1140 /*
1141 * At least one of:
1142 * -Error while converting
1143 * -Done with entire buffer
1144 * -Need to write offsets or update the current offset
1145 * (leave that up to the code in ucnv.c)
1146 *
1147 * or else we just stopped at an ESC byte and continue with changeState_2022()
1148 */
1149 if (U_FAILURE(*err) ||
1150 (args->source == realSourceLimit) ||
1151 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1152 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1153 ) {
1154 /* copy partial or error input for truncated detection and error handling */
1155 if(U_FAILURE(*err)) {
1156 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1157 if(length > 0) {
1158 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1159 }
1160 } else {
1161 length = saveThis->toULength = myData->currentConverter->toULength;
1162 if(length > 0) {
1163 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1164 if(args->source < mySourceLimit) {
1165 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1166 }
1167 }
1168 }
1169 return;
b75a7d8f 1170 }
b75a7d8f
A
1171 }
1172 }
b75a7d8f
A
1173
/* either an escape sequence was already pending (key != 0) or the conversion above stopped in front of one */
1174 sourceStart = args->source;
1175 changeState_2022(args->converter,
1176 &(args->source),
374ca955 1177 realSourceLimit,
b75a7d8f 1178 ISO_2022,
b75a7d8f 1179 err);
374ca955
A
1180 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1181 /* let the ucnv.c code update its current offset */
1182 return;
b75a7d8f 1183 }
b75a7d8f 1184 }
b75a7d8f
A
1185}
1186
374ca955 1187#endif
b75a7d8f
A
1188
1189/*
1190 * To Unicode Callback helper function
1191 */
1192static void
374ca955
A
1193toUnicodeCallback(UConverter *cnv,
1194 const uint32_t sourceChar, const uint32_t targetUniChar,
1195 UErrorCode* err){
b75a7d8f 1196 if(sourceChar>0xff){
374ca955
A
1197 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1198 cnv->toUBytes[1] = (uint8_t)sourceChar;
1199 cnv->toULength = 2;
b75a7d8f
A
1200 }
1201 else{
374ca955
A
1202 cnv->toUBytes[0] =(char) sourceChar;
1203 cnv->toULength = 2;
b75a7d8f
A
1204 }
1205
1206 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
b75a7d8f
A
1207 *err = U_INVALID_CHAR_FOUND;
1208 }
1209 else{
b75a7d8f
A
1210 *err = U_ILLEGAL_CHAR_FOUND;
1211 }
b75a7d8f
A
1212}
1213
1214/**************************************ISO-2022-JP*************************************************/
1215
1216/************************************** IMPORTANT **************************************************
1217* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1218* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1219* The converter iterates over each Unicode codepoint
1220* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1221* processed one char at a time it would make sense to reduce the extra processing a canned converter
1222* would do as far as possible.
1223*
1224* If the implementation of these macros or structure of sharedData struct change in the future, make
1225* sure that ISO-2022 is also changed.
1226***************************************************************************************************
1227*/
1228
1229/***************************************************************************************************
1230* Rules for ISO-2022-jp encoding
1231* (i) Escape sequences must be fully contained within a line they should not
1232* span new lines or CRs
1233* (ii) If the last character on a line is represented by two bytes then an ASCII or
1234* JIS-Roman character escape sequence should follow before the line terminates
1235* (iii) If the first character on the line is represented by two bytes then a two
1236* byte character escape sequence should precede it
1237* (iv) If no escape sequence is encountered then the characters are ASCII
1238* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1239* and invoked with SS2 (ESC N).
1240* (vi) If there is any G0 designation in text, there must be a switch to
1241* ASCII or to JIS X 0201-Roman before a space character (but not
1242* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1243* characters such as tab or CRLF.
1244* (vii) Supported encodings:
1245* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1246*
1247* source : RFC-1554
1248*
1249* JISX201, JISX208,JISX212 : new .cnv data files created
1250* KSC5601 : alias to ibm-949 mapping table
1251* GB2312 : alias to ibm-1386 mapping table
1252* ISO-8859-1 : Algorithmic implemented as LATIN1 case
1253* ISO-8859-7 : alias to ibm-9409 mapping table
1254*/
b75a7d8f 1255
374ca955
A
1256/* preference order of JP charsets */
1257static const StateEnum jpCharsetPref[]={
1258 ASCII,
1259 JISX201,
1260 ISO8859_1,
1261 ISO8859_7,
1262 JISX208,
1263 JISX212,
1264 GB2312,
1265 KSC5601,
1266 HWKANA_7BIT
b75a7d8f
A
1267};
1268
73c04bcf
A
/*
 * Designation escape sequences, one row per charset.
 * The rows must follow the order of the enum constants (e.g. JISX201 = 3),
 * NOT the order of jpCharsetPref[]!
 * Unused trailing bytes of a row are zero.
 */
static const char escSeqChars[][6] ={
    { 0x1B, 0x28, 0x42 },           /* <ESC>(B  ASCII       */
    { 0x1B, 0x2E, 0x41 },           /* <ESC>.A  ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },           /* <ESC>.F  ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },           /* <ESC>(J  JISX-201    */
    { 0x1B, 0x24, 0x42 },           /* <ESC>$B  JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 },     /* <ESC>$(D JISX-212    */
    { 0x1B, 0x24, 0x41 },           /* <ESC>$A  GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 },     /* <ESC>$(C KSC5601     */
    { 0x1B, 0x28, 0x49 }            /* <ESC>(I  HWKANA_7BIT */
};
374ca955
A
/* Byte length of each designation escape sequence, listed in the order ASCII ... HWKANA_7BIT. */
static const int32_t escSeqCharsLen[] ={
    /* <ESC>(B  ASCII       */ 3,
    /* <ESC>.A  ISO-8859-1  */ 3,
    /* <ESC>.F  ISO-8859-7  */ 3,
    /* <ESC>(J  JISX-201    */ 3,
    /* <ESC>$B  JISX-208    */ 3,
    /* <ESC>$(D JISX-212    */ 4,
    /* <ESC>$A  GB2312      */ 3,
    /* <ESC>$(C KSC5601     */ 4,
    /* <ESC>(I  HWKANA_7BIT */ 3
};
1296
1297/*
1298* The iteration over various code pages works this way:
1299* i) Get the currentState from myConverterData->currentState
1300* ii) Check if the character is mapped to a valid character in the currentState
1301* Yes -> a) set the initIterState to currentState
1302* b) remain in this state until an invalid character is found
1303* No -> a) go to the next code page and find the character
1304* iii) Before changing the state increment the current state check if the current state
1305* is equal to the initIteration state
1306* Yes -> A character that cannot be represented in any of the supported encodings
1307* break and return a U_INVALID_CHARACTER error
1308* No -> Continue and find the character in next code page
1309*
1310*
1311* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1312*/
1313
/*
 * fromUnicode implementation for the ISO-2022-JP family.
 *
 * Reads UTF-16 from args->source and writes bytes to args->target. For every
 * code point a list of candidate charsets (choices[]) is tried in order; the
 * first one that can encode the code point is used, and any SI, designation
 * escape sequence, SO or SS2 sequence needed to switch to it is written in
 * front of the character bytes.
 *
 * Loop exits with an error code:
 *  - unpaired surrogate or SO/SI/ESC in the input: U_ILLEGAL_CHAR_FOUND
 *  - no charset can encode the code point: U_INVALID_CHAR_FOUND
 *  - no room left in the target at the top of the loop: U_BUFFER_OVERFLOW_ERROR
 * In the first two cases, and for a lead surrogate at the end of the input,
 * the code unit/point is saved in args->converter->fromUChar32.
 *
 * args->source and args->target are updated on return.
 */
1314static void
374ca955 1315UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
b75a7d8f 1316 UConverterDataISO2022 *converterData;
374ca955
A
1317 ISO2022State *pFromU2022State;
1318 uint8_t *target = (uint8_t *) args->target;
1319 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
1320 const UChar* source = args->source;
1321 const UChar* sourceLimit = args->sourceLimit;
1322 int32_t* offsets = args->offsets;
374ca955
A
1323 UChar32 sourceChar;
1324 char buffer[8];
1325 int32_t len, outLen;
1326 int8_t choices[10];
1327 int32_t choiceCount;
73c04bcf 1328 uint32_t targetValue = 0;
374ca955
A
1329 UBool useFallback;
1330
1331 int32_t i;
1332 int8_t cs, g;
1333
1334 /* set up the state */
1335 converterData = (UConverterDataISO2022*)args->converter->extraInfo;
1336 pFromU2022State = &converterData->fromU2022State;
1337 useFallback = args->converter->useFallback;
1338
/* choiceCount==0 means that choices[] has to be (re)built before the next lookup */
1339 choiceCount = 0;
b75a7d8f 1340
b75a7d8f 1341 /* check if the last codepoint of previous buffer was a lead surrogate*/
374ca955 1342 if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
1343 goto getTrail;
1344 }
b75a7d8f 1345
374ca955
A
1346 while(source < sourceLimit) {
1347 if(target < targetLimit) {
b75a7d8f 1348
b75a7d8f 1349 sourceChar = *(source++);
374ca955 1350 /*check if the char is a First surrogate*/
73c04bcf 1351 if(UTF_IS_SURROGATE(sourceChar)) {
374ca955
A
1352 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1353getTrail:
1354 /*look ahead to find the trail surrogate*/
1355 if(source < sourceLimit) {
1356 /* test the following code unit */
1357 UChar trail=(UChar) *source;
1358 if(UTF_IS_SECOND_SURROGATE(trail)) {
1359 source++;
1360 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1361 args->converter->fromUChar32=0x00;
1362 /* convert this supplementary code point */
1363 /* exit this condition tree */
1364 } else {
1365 /* this is an unmatched lead code unit (1st surrogate) */
1366 /* callback(illegal) */
1367 *err=U_ILLEGAL_CHAR_FOUND;
1368 args->converter->fromUChar32=sourceChar;
1369 break;
b75a7d8f 1370 }
374ca955
A
1371 } else {
1372 /* no more input */
1373 args->converter->fromUChar32=sourceChar;
b75a7d8f
A
1374 break;
1375 }
374ca955
A
1376 } else {
1377 /* this is an unmatched trail code unit (2nd surrogate) */
1378 /* callback(illegal) */
1379 *err=U_ILLEGAL_CHAR_FOUND;
1380 args->converter->fromUChar32=sourceChar;
1381 break;
1382 }
b75a7d8f
A
1383 }
1384
73c04bcf
A
1385 /* do not convert SO/SI/ESC */
1386 if(IS_2022_CONTROL(sourceChar)) {
1387 /* callback(illegal) */
1388 *err=U_ILLEGAL_CHAR_FOUND;
1389 args->converter->fromUChar32=sourceChar;
1390 break;
1391 }
1392
374ca955 1393 /* do the conversion */
b75a7d8f 1394
374ca955
A
1395 if(choiceCount == 0) {
1396 uint16_t csm;
b75a7d8f 1397
374ca955
A
1398 /*
1399 * The csm variable keeps track of which charsets are allowed
1400 * and not used yet while building the choices[].
1401 */
1402 csm = jpCharsetMasks[converterData->version];
1403 choiceCount = 0;
1404
1405 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1406 if(converterData->version == 3 || converterData->version == 4) {
1407 choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
1408 csm &= ~CSM(cs);
1409 }
b75a7d8f 1410
374ca955
A
1411 /* try the current G0 charset */
1412 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1413 csm &= ~CSM(cs);
b75a7d8f 1414
374ca955
A
1415 /* try the current G2 charset */
1416 if((cs = pFromU2022State->cs[2]) != 0) {
1417 choices[choiceCount++] = cs;
1418 csm &= ~CSM(cs);
1419 }
1420
1421 /* try all the other possible charsets */
1422 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1423 cs = (int8_t)jpCharsetPref[i];
1424 if(CSM(cs) & csm) {
1425 choices[choiceCount++] = cs;
1426 csm &= ~CSM(cs);
b75a7d8f
A
1427 }
1428 }
374ca955 1429 }
b75a7d8f 1430
374ca955
A
1431 cs = g = 0;
1432 len = 0;
1433
/* try the candidates in order; the loop stops at the first one that sets len != 0 */
1434 for(i = 0; i < choiceCount && len == 0; ++i) {
1435 cs = choices[i];
1436 switch(cs) {
1437 case ASCII:
1438 if(sourceChar <= 0x7f) {
1439 targetValue = (uint32_t)sourceChar;
1440 len = 1;
b75a7d8f 1441 }
374ca955
A
1442 break;
1443 case ISO8859_1:
1444 if(0x80 <= sourceChar && sourceChar <= 0xff) {
1445 targetValue = (uint32_t)sourceChar - 0x80;
1446 len = 1;
1447 g = 2;
1448 }
1449 break;
1450 case HWKANA_7BIT:
/* single unsigned comparison that is true exactly for U+FF61..U+FF9F */
1451 if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
1452 targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
1453 len = 1;
1454
1455 if(converterData->version==3) {
1456 /* JIS7: use G1 (SO) */
1457 pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
1458 g = 1;
1459 } else if(converterData->version==4) {
1460 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1461 int8_t cs0;
1462
1463 targetValue += 0x80;
1464
1465 cs0 = pFromU2022State->cs[0];
1466 if(IS_JP_DBCS(cs0)) {
1467 /* switch from a DBCS charset to JISX201 */
1468 cs = (int8_t)JISX201;
1469 } else {
1470 /* stay in the current G0 charset */
1471 cs = cs0;
b75a7d8f 1472 }
b75a7d8f 1473 }
b75a7d8f 1474 }
374ca955
A
1475 break;
1476 case JISX201:
1477 /* G0 SBCS */
1478 MBCS_SINGLE_FROM_UCHAR32(
1479 converterData->myConverterArray[cs],
1480 sourceChar, &targetValue,
1481 useFallback);
1482 if(targetValue <= 0x7f) {
1483 len = 1;
1484 }
1485 break;
1486 case ISO8859_7:
1487 /* G0 SBCS forced to 7-bit output */
1488 MBCS_SINGLE_FROM_UCHAR32(
1489 converterData->myConverterArray[cs],
1490 sourceChar, &targetValue,
1491 useFallback);
1492 if(0x80 <= targetValue && targetValue <= 0xff) {
1493 targetValue -= 0x80;
1494 len = 1;
1495 g = 2;
1496 }
1497 break;
1498 default:
1499 /* G0 DBCS */
1500 MBCS_FROM_UCHAR32_ISO2022(
1501 converterData->myConverterArray[cs],
1502 sourceChar, &targetValue,
1503 useFallback, &len, MBCS_OUTPUT_2);
/* only a two-byte result counts as a match here */
1504 if(len != 2) {
1505 len = 0;
1506 }
1507 break;
b75a7d8f
A
1508 }
1509 }
b75a7d8f 1510
374ca955
A
/* here cs is the chosen charset, g the graphic set it goes through, len the number of character bytes */
1511 if(len > 0) {
1512 outLen = 0; /* count output bytes */
1513
1514 /* write SI if necessary (only for JIS7) */
1515 if(pFromU2022State->g == 1 && g == 0) {
1516 buffer[outLen++] = UCNV_SI;
1517 pFromU2022State->g = 0;
1518 }
1519
1520 /* write the designation sequence if necessary */
1521 if(cs != pFromU2022State->cs[g]) {
1522 int32_t escLen = escSeqCharsLen[cs];
1523 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1524 outLen += escLen;
1525 pFromU2022State->cs[g] = cs;
1526
1527 /* invalidate the choices[] */
1528 choiceCount = 0;
1529 }
1530
1531 /* write the shift sequence if necessary */
1532 if(g != pFromU2022State->g) {
1533 switch(g) {
1534 /* case 0 handled before writing escapes */
1535 case 1:
1536 buffer[outLen++] = UCNV_SO;
1537 pFromU2022State->g = 1;
1538 break;
1539 default: /* case 2 */
/* ESC N; pFromU2022State->g is left unchanged in this branch */
1540 buffer[outLen++] = 0x1b;
1541 buffer[outLen++] = 0x4e;
1542 break;
1543 /* no case 3: no SS3 in ISO-2022-JP-x */
1544 }
1545 }
1546
1547 /* write the output bytes */
1548 if(len == 1) {
1549 buffer[outLen++] = (char)targetValue;
1550 } else /* len == 2 */ {
1551 buffer[outLen++] = (char)(targetValue >> 8);
1552 buffer[outLen++] = (char)targetValue;
1553 }
1554 } else {
1555 /*
1556 * if we cannot find the character after checking all codepages
b75a7d8f
A
1557 * then this is an error
1558 */
b75a7d8f 1559 *err = U_INVALID_CHAR_FOUND;
374ca955
A
1560 args->converter->fromUChar32=sourceChar;
1561 break;
1562 }
1563
1564 if(sourceChar == CR || sourceChar == LF) {
1565 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1566 pFromU2022State->cs[2] = 0;
1567 choiceCount = 0;
1568 }
1569
1570 /* output outLen>0 bytes in buffer[] */
/* one and two bytes are written inline; everything else goes through fromUWriteUInt8() */
1571 if(outLen == 1) {
1572 *target++ = buffer[0];
1573 if(offsets) {
73c04bcf 1574 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
b75a7d8f 1575 }
374ca955
A
1576 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1577 *target++ = buffer[0];
1578 *target++ = buffer[1];
1579 if(offsets) {
1580 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1581 *offsets++ = sourceIndex;
1582 *offsets++ = sourceIndex;
1583 }
1584 } else {
/* NOTE(review): presumably fromUWriteUInt8() also deals with output that does not fit into the target - confirm against its definition */
73c04bcf 1585 fromUWriteUInt8(
374ca955
A
1586 args->converter,
1587 buffer, outLen,
73c04bcf 1588 &target, (const char *)targetLimit,
374ca955
A
1589 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1590 err);
1591 if(U_FAILURE(*err)) {
b75a7d8f
A
1592 break;
1593 }
1594 }
1595 } /* end if(myTargetIndex<myTargetLength) */
1596 else{
1597 *err =U_BUFFER_OVERFLOW_ERROR;
1598 break;
1599 }
1600
1601 }/* end while(mySourceIndex<mySourceLength) */
1602
374ca955
A
1603 /*
1604 * the end of the input stream and detection of truncated input
1605 * are handled by the framework, but for ISO-2022-JP conversion
1606 * we need to be in ASCII mode at the very end
1607 *
1608 * conditions:
1609 * successful
1610 * in SO mode or not in ASCII mode
1611 * end of input and no truncated input
b75a7d8f 1612 */
374ca955
A
1613 if( U_SUCCESS(*err) &&
1614 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1615 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
1616 ) {
1617 int32_t sourceIndex;
1618
1619 outLen = 0;
1620
/* leave SO mode first, then designate ASCII to G0 */
1621 if(pFromU2022State->g != 0) {
1622 buffer[outLen++] = UCNV_SI;
1623 pFromU2022State->g = 0;
1624 }
1625
1626 if(pFromU2022State->cs[0] != ASCII) {
1627 int32_t escLen = escSeqCharsLen[ASCII];
1628 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1629 outLen += escLen;
1630 pFromU2022State->cs[0] = (int8_t)ASCII;
1631 }
1632
1633 /* get the source index of the last input character */
1634 /*
1635 * TODO this would be simpler and more reliable if we used a pair
1636 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1637 * so that we could simply use the prevSourceIndex here;
1638 * this code gives an incorrect result for the rare case of an unmatched
1639 * trail surrogate that is alone in the last buffer of the text stream
1640 */
1641 sourceIndex=(int32_t)(source-args->source);
1642 if(sourceIndex>0) {
1643 --sourceIndex;
1644 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1645 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1646 ) {
1647 --sourceIndex;
1648 }
1649 } else {
1650 sourceIndex=-1;
1651 }
1652
73c04bcf 1653 fromUWriteUInt8(
374ca955
A
1654 args->converter,
1655 buffer, outLen,
73c04bcf 1656 &target, (const char *)targetLimit,
374ca955
A
1657 &offsets, sourceIndex,
1658 err);
b75a7d8f
A
1659 }
1660
1661 /*save the state and return */
1662 args->source = source;
1663 args->target = (char*)target;
1664}
1665
1666/*************** to unicode *******************/
1667
b75a7d8f
A
1668static void
1669UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
374ca955
A
1670 UErrorCode* err){
1671 char tempBuf[3];
1672 const char *mySource = (char *) args->source;
b75a7d8f
A
1673 UChar *myTarget = args->target;
1674 const char *mySourceLimit = args->sourceLimit;
1675 uint32_t targetUniChar = 0x0000;
1676 uint32_t mySourceChar = 0x0000;
1677 UConverterDataISO2022* myData;
374ca955
A
1678 ISO2022State *pToU2022State;
1679 StateEnum cs;
b75a7d8f 1680
b75a7d8f 1681 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
374ca955 1682 pToU2022State = &myData->toU2022State;
b75a7d8f 1683
374ca955
A
1684 if(myData->key != 0) {
1685 /* continue with a partial escape sequence */
1686 goto escape;
1687 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1688 /* continue with a partial double-byte character */
1689 mySourceChar = args->converter->toUBytes[0];
1690 args->converter->toULength = 0;
1691 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1692 goto getTrailByte;
1693 }
1694
1695 while(mySource < mySourceLimit){
1696
1697 targetUniChar =missingCharMarker;
b75a7d8f
A
1698
1699 if(myTarget < args->targetLimit){
1700
1701 mySourceChar= (unsigned char) *mySource++;
374ca955
A
1702
1703 switch(mySourceChar) {
1704 case UCNV_SI:
1705 if(myData->version==3) {
1706 pToU2022State->g=0;
b75a7d8f 1707 continue;
374ca955
A
1708 } else {
1709 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 1710 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 1711 break;
b75a7d8f 1712 }
b75a7d8f 1713
374ca955
A
1714 case UCNV_SO:
1715 if(myData->version==3) {
1716 /* JIS7: switch to G1 half-width Katakana */
1717 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
1718 pToU2022State->g=1;
b75a7d8f 1719 continue;
374ca955
A
1720 } else {
1721 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
d5d484b0 1722 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
374ca955 1723 break;
b75a7d8f 1724 }
b75a7d8f 1725
374ca955
A
1726 case ESC_2022:
1727 mySource--;
1728escape:
d5d484b0
A
1729 {
1730 const char * mySourceBefore = mySource;
1731 int8_t toULengthBefore = args->converter->toULength;
1732
1733 changeState_2022(args->converter,&(mySource),
1734 mySourceLimit, ISO_2022_JP,err);
1735
1736 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
1737 if ( myData->version == 0 && myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
1738 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
1739 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
1740 }
b75a7d8f 1741
d5d484b0 1742 }
374ca955
A
1743 /* invalid or illegal escape sequence */
1744 if(U_FAILURE(*err)){
1745 args->target = myTarget;
1746 args->source = mySource;
d5d484b0 1747 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
374ca955 1748 return;
b75a7d8f 1749 }
d5d484b0
A
1750 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
1751 if (myData->key == 0) {
1752 myData->isEmptySegment = TRUE;
1753 }
374ca955 1754 continue;
b75a7d8f 1755
374ca955 1756 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
b75a7d8f 1757
374ca955
A
1758 case CR:
1759 /*falls through*/
1760 case LF:
1761 /* automatically reset to single-byte mode */
1762 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
1763 pToU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f 1764 }
374ca955
A
1765 pToU2022State->cs[2] = 0;
1766 pToU2022State->g = 0;
1767 /* falls through */
b75a7d8f 1768 default:
374ca955 1769 /* convert one or two bytes */
d5d484b0 1770 myData->isEmptySegment = FALSE;
374ca955
A
1771 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1772 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
1773 !IS_JP_DBCS(cs)
1774 ) {
1775 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1776 targetUniChar = mySourceChar + (0xff61 - 0xa1);
1777
1778 /* return from a single-shift state to the previous one */
1779 if(pToU2022State->g >= 2) {
1780 pToU2022State->g=pToU2022State->prevG;
1781 }
1782 } else switch(cs) {
1783 case ASCII:
1784 if(mySourceChar <= 0x7f) {
1785 targetUniChar = mySourceChar;
1786 }
1787 break;
1788 case ISO8859_1:
1789 if(mySourceChar <= 0x7f) {
1790 targetUniChar = mySourceChar + 0x80;
1791 }
1792 /* return from a single-shift state to the previous one */
1793 pToU2022State->g=pToU2022State->prevG;
1794 break;
1795 case ISO8859_7:
1796 if(mySourceChar <= 0x7f) {
1797 /* convert mySourceChar+0x80 to use a normal 8-bit table */
1798 targetUniChar =
1799 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1800 myData->myConverterArray[cs],
1801 mySourceChar + 0x80);
1802 }
1803 /* return from a single-shift state to the previous one */
1804 pToU2022State->g=pToU2022State->prevG;
1805 break;
1806 case JISX201:
1807 if(mySourceChar <= 0x7f) {
1808 targetUniChar =
1809 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1810 myData->myConverterArray[cs],
1811 mySourceChar);
1812 }
1813 break;
1814 case HWKANA_7BIT:
1815 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
1816 /* 7-bit halfwidth Katakana */
1817 targetUniChar = mySourceChar + (0xff61 - 0x21);
1818 }
1819 break;
1820 default:
1821 /* G0 DBCS */
1822 if(mySource < mySourceLimit) {
1823 char trailByte;
1824getTrailByte:
1825 tempBuf[0] = (char) (mySourceChar);
1826 tempBuf[1] = trailByte = *mySource++;
1827 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
1828 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
1829 } else {
1830 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
1831 args->converter->toULength = 1;
1832 goto endloop;
1833 }
1834 }
b75a7d8f
A
1835 break;
1836 }
1837 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
1838 if(args->offsets){
73c04bcf 1839 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
1840 }
1841 *(myTarget++)=(UChar)targetUniChar;
b75a7d8f 1842 }
374ca955
A
1843 else if(targetUniChar > missingCharMarker){
1844 /* disassemble the surrogate pair and write to output*/
1845 targetUniChar-=0x0010000;
1846 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
1847 if(args->offsets){
73c04bcf 1848 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
1849 }
1850 ++myTarget;
1851 if(myTarget< args->targetLimit){
1852 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1853 if(args->offsets){
73c04bcf 1854 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955
A
1855 }
1856 ++myTarget;
1857 }else{
1858 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
1859 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1860 }
b75a7d8f 1861
374ca955
A
1862 }
1863 else{
b75a7d8f 1864 /* Call the callback function*/
374ca955
A
1865 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
1866 break;
b75a7d8f
A
1867 }
1868 }
1869 else{
1870 *err =U_BUFFER_OVERFLOW_ERROR;
1871 break;
1872 }
1873 }
374ca955 1874endloop:
b75a7d8f
A
1875 args->target = myTarget;
1876 args->source = mySource;
1877}
1878
1879
b75a7d8f
A
1880/***************************************************************
1881* Rules for ISO-2022-KR encoding
1882* i) The KSC5601 designator sequence should appear only once in a file,
1883* at the beginning of a line before any KSC5601 characters. This usually
1884* means that it appears by itself on the first line of the file
1885* ii) There are only 2 shifting sequences SO to shift into double byte mode
1886* and SI to shift into single byte mode
1887*/
1888static void
1889UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
1890
374ca955
A
1891 UConverter* saveConv = args->converter;
1892 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
1893 args->converter=myConverterData->currentConverter;
1894
1895 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
1896 ucnv_MBCSFromUnicodeWithOffsets(args,err);
1897 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
1898
1899 if(*err == U_BUFFER_OVERFLOW_ERROR) {
1900 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
1901 uprv_memcpy(
1902 saveConv->charErrorBuffer,
1903 myConverterData->currentConverter->charErrorBuffer,
1904 myConverterData->currentConverter->charErrorBufferLength);
1905 }
1906 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
1907 myConverterData->currentConverter->charErrorBufferLength = 0;
1908 }
1909 args->converter=saveConv;
b75a7d8f
A
1910}
1911
1912static void
1913UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
1914
1915 const UChar *source = args->source;
1916 const UChar *sourceLimit = args->sourceLimit;
1917 unsigned char *target = (unsigned char *) args->target;
1918 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
1919 int32_t* offsets = args->offsets;
1920 uint32_t targetByteUnit = 0x0000;
1921 UChar32 sourceChar = 0x0000;
1922 UBool isTargetByteDBCS;
1923 UBool oldIsTargetByteDBCS;
1924 UConverterDataISO2022 *converterData;
b75a7d8f
A
1925 UConverterSharedData* sharedData;
1926 UBool useFallback;
1927 int32_t length =0;
1928
b75a7d8f 1929 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
b75a7d8f
A
1930 /* if the version is 1 then the user is requesting
1931 * conversion with ibm-25546 pass the arguments to
1932 * MBCS converter and return
1933 */
1934 if(converterData->version==1){
1935 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
1936 return;
1937 }
374ca955
A
1938
1939 /* initialize data */
1940 sharedData = converterData->currentConverter->sharedData;
1941 useFallback = args->converter->useFallback;
1942 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
1943 oldIsTargetByteDBCS = isTargetByteDBCS;
b75a7d8f
A
1944
1945 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
374ca955 1946 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
b75a7d8f
A
1947 goto getTrail;
1948 }
1949 while(source < sourceLimit){
1950
1951 targetByteUnit = missingCharMarker;
1952
1953 if(target < (unsigned char*) args->targetLimit){
1954 sourceChar = *source++;
73c04bcf
A
1955
1956 /* do not convert SO/SI/ESC */
1957 if(IS_2022_CONTROL(sourceChar)) {
1958 /* callback(illegal) */
1959 *err=U_ILLEGAL_CHAR_FOUND;
1960 args->converter->fromUChar32=sourceChar;
1961 break;
1962 }
1963
374ca955 1964 /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
b75a7d8f 1965 sourceChar,&targetByteUnit,args->converter->useFallback);*/
374ca955 1966 MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
b75a7d8f 1967 /* only DBCS or SBCS characters are expected*/
374ca955 1968 /* DB characters with high bit set to 1 are expected */
b75a7d8f
A
1969 if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
1970 targetByteUnit=missingCharMarker;
1971 }
1972 if (targetByteUnit != missingCharMarker){
1973
1974 oldIsTargetByteDBCS = isTargetByteDBCS;
1975 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
1976 /* append the shift sequence */
1977 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
1978
1979 if (isTargetByteDBCS)
1980 *target++ = UCNV_SO;
1981 else
1982 *target++ = UCNV_SI;
1983 if(offsets)
73c04bcf 1984 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
1985 }
1986 /* write the targetUniChar to target */
1987 if(targetByteUnit <= 0x00FF){
1988 if( target < targetLimit){
1989 *(target++) = (unsigned char) targetByteUnit;
1990 if(offsets){
73c04bcf 1991 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
1992 }
1993
1994 }else{
1995 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
1996 *err = U_BUFFER_OVERFLOW_ERROR;
1997 }
1998 }else{
1999 if(target < targetLimit){
2000 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2001 if(offsets){
73c04bcf 2002 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2003 }
2004 if(target < targetLimit){
2005 *(target++) =(unsigned char) (targetByteUnit -0x80);
2006 if(offsets){
73c04bcf 2007 *(offsets++) = (int32_t)(source - args->source-1);
b75a7d8f
A
2008 }
2009 }else{
2010 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2011 *err = U_BUFFER_OVERFLOW_ERROR;
2012 }
2013 }else{
2014 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2015 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2016 *err = U_BUFFER_OVERFLOW_ERROR;
2017 }
2018 }
2019
2020 }
2021 else{
2022 /* oops.. the code point is unassingned
2023 * set the error and reason
2024 */
b75a7d8f
A
2025
2026 /*check if the char is a First surrogate*/
2027 if(UTF_IS_SURROGATE(sourceChar)) {
2028 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
b75a7d8f
A
2029getTrail:
2030 /*look ahead to find the trail surrogate*/
2031 if(source < sourceLimit) {
2032 /* test the following code unit */
2033 UChar trail=(UChar) *source;
2034 if(UTF_IS_SECOND_SURROGATE(trail)) {
2035 source++;
374ca955 2036 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
b75a7d8f 2037 *err = U_INVALID_CHAR_FOUND;
b75a7d8f
A
2038 /* convert this surrogate code point */
2039 /* exit this condition tree */
2040 } else {
2041 /* this is an unmatched lead code unit (1st surrogate) */
2042 /* callback(illegal) */
b75a7d8f
A
2043 *err=U_ILLEGAL_CHAR_FOUND;
2044 }
2045 } else {
2046 /* no more input */
2047 *err = U_ZERO_ERROR;
b75a7d8f
A
2048 }
2049 } else {
2050 /* this is an unmatched trail code unit (2nd surrogate) */
2051 /* callback(illegal) */
b75a7d8f
A
2052 *err=U_ILLEGAL_CHAR_FOUND;
2053 }
374ca955
A
2054 } else {
2055 /* callback(unassigned) for a BMP code point */
2056 *err = U_INVALID_CHAR_FOUND;
b75a7d8f 2057 }
b75a7d8f 2058
374ca955 2059 args->converter->fromUChar32=sourceChar;
374ca955 2060 break;
b75a7d8f
A
2061 }
2062 } /* end if(myTargetIndex<myTargetLength) */
2063 else{
2064 *err =U_BUFFER_OVERFLOW_ERROR;
2065 break;
2066 }
2067
2068 }/* end while(mySourceIndex<mySourceLength) */
2069
374ca955
A
2070 /*
2071 * the end of the input stream and detection of truncated input
2072 * are handled by the framework, but for ISO-2022-KR conversion
2073 * we need to be in ASCII mode at the very end
2074 *
2075 * conditions:
2076 * successful
2077 * not in ASCII mode
2078 * end of input and no truncated input
b75a7d8f 2079 */
374ca955
A
2080 if( U_SUCCESS(*err) &&
2081 isTargetByteDBCS &&
2082 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2083 ) {
2084 int32_t sourceIndex;
2085
2086 /* we are switching to ASCII */
2087 isTargetByteDBCS=FALSE;
2088
2089 /* get the source index of the last input character */
2090 /*
2091 * TODO this would be simpler and more reliable if we used a pair
2092 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2093 * so that we could simply use the prevSourceIndex here;
2094 * this code gives an incorrect result for the rare case of an unmatched
2095 * trail surrogate that is alone in the last buffer of the text stream
2096 */
2097 sourceIndex=(int32_t)(source-args->source);
2098 if(sourceIndex>0) {
2099 --sourceIndex;
2100 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2101 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2102 ) {
2103 --sourceIndex;
2104 }
2105 } else {
2106 sourceIndex=-1;
2107 }
2108
73c04bcf 2109 fromUWriteUInt8(
374ca955
A
2110 args->converter,
2111 SHIFT_IN_STR, 1,
73c04bcf 2112 &target, (const char *)targetLimit,
374ca955
A
2113 &offsets, sourceIndex,
2114 err);
b75a7d8f
A
2115 }
2116
2117 /*save the state and return */
2118 args->source = source;
2119 args->target = (char*)target;
2120 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2121}
2122
2123/************************ To Unicode ***************************************/
2124
2125static void
2126UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2127 UErrorCode* err){
b75a7d8f 2128 char const* sourceStart;
b75a7d8f 2129 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
b75a7d8f 2130
374ca955
A
2131 UConverterToUnicodeArgs subArgs;
2132 int32_t minArgsSize;
2133
2134 /* set up the subconverter arguments */
2135 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2136 minArgsSize = args->size;
2137 } else {
2138 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2139 }
2140
2141 uprv_memcpy(&subArgs, args, minArgsSize);
2142 subArgs.size = (uint16_t)minArgsSize;
2143 subArgs.converter = myData->currentConverter;
2144
2145 /* remember the original start of the input for offsets */
2146 sourceStart = args->source;
2147
2148 if(myData->key != 0) {
2149 /* continue with a partial escape sequence */
2150 goto escape;
2151 }
2152
2153 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
b75a7d8f 2154 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
374ca955
A
2155 subArgs.source = args->source;
2156 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2157 if(subArgs.source != subArgs.sourceLimit) {
2158 /*
2159 * get the current partial byte sequence
2160 *
2161 * it needs to be moved between the public and the subconverter
2162 * so that the conversion framework, which only sees the public
2163 * converter, can handle truncated and illegal input etc.
2164 */
2165 if(args->converter->toULength > 0) {
2166 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2167 }
2168 subArgs.converter->toULength = args->converter->toULength;
2169
2170 /*
2171 * Convert up to the end of the input, or to before the next escape character.
2172 * Does not handle conversion extensions because the preToU[] state etc.
2173 * is not copied.
2174 */
2175 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2176
2177 if(args->offsets != NULL && sourceStart != args->source) {
2178 /* update offsets to base them on the actual start of the input */
2179 int32_t *offsets = args->offsets;
2180 UChar *target = args->target;
2181 int32_t delta = (int32_t)(args->source - sourceStart);
2182 while(target < subArgs.target) {
2183 if(*offsets >= 0) {
2184 *offsets += delta;
2185 }
2186 ++offsets;
2187 ++target;
2188 }
2189 }
2190 args->source = subArgs.source;
2191 args->target = subArgs.target;
2192 args->offsets = subArgs.offsets;
2193
2194 /* copy input/error/overflow buffers */
2195 if(subArgs.converter->toULength > 0) {
2196 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2197 }
2198 args->converter->toULength = subArgs.converter->toULength;
2199
2200 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2201 if(subArgs.converter->UCharErrorBufferLength > 0) {
2202 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2203 subArgs.converter->UCharErrorBufferLength);
2204 }
2205 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2206 subArgs.converter->UCharErrorBufferLength = 0;
b75a7d8f 2207 }
b75a7d8f
A
2208 }
2209
374ca955 2210 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
b75a7d8f 2211 return;
374ca955 2212 }
b75a7d8f 2213
374ca955 2214escape:
b75a7d8f
A
2215 changeState_2022(args->converter,
2216 &(args->source),
2217 args->sourceLimit,
b75a7d8f 2218 ISO_2022_KR,
b75a7d8f 2219 err);
374ca955 2220 }
b75a7d8f
A
2221}
2222
2223static void
2224UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2225 UErrorCode* err){
374ca955 2226 char tempBuf[2];
b75a7d8f
A
2227 const char *mySource = ( char *) args->source;
2228 UChar *myTarget = args->target;
2229 const char *mySourceLimit = args->sourceLimit;
2230 UChar32 targetUniChar = 0x0000;
2231 UChar mySourceChar = 0x0000;
2232 UConverterDataISO2022* myData;
b75a7d8f
A
2233 UConverterSharedData* sharedData ;
2234 UBool useFallback;
2235
374ca955
A
2236 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2237 if(myData->version==1){
2238 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
b75a7d8f
A
2239 return;
2240 }
374ca955 2241
b75a7d8f 2242 /* initialize state */
374ca955 2243 sharedData = myData->currentConverter->sharedData;
b75a7d8f
A
2244 useFallback = args->converter->useFallback;
2245
374ca955
A
2246 if(myData->key != 0) {
2247 /* continue with a partial escape sequence */
2248 goto escape;
2249 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2250 /* continue with a partial double-byte character */
2251 mySourceChar = args->converter->toUBytes[0];
2252 args->converter->toULength = 0;
2253 goto getTrailByte;
b75a7d8f 2254 }
b75a7d8f 2255
374ca955 2256 while(mySource< mySourceLimit){
b75a7d8f
A
2257
2258 if(myTarget < args->targetLimit){
2259
2260 mySourceChar= (unsigned char) *mySource++;
2261
2262 if(mySourceChar==UCNV_SI){
374ca955 2263 myData->toU2022State.g = 0;
d5d484b0
A
2264 if (myData->isEmptySegment) {
2265 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2266 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2267 args->converter->toUBytes[0] = mySourceChar;
2268 args->converter->toULength = 1;
2269 args->target = myTarget;
2270 args->source = mySource;
2271 return;
2272 }
b75a7d8f
A
2273 /*consume the source */
2274 continue;
2275 }else if(mySourceChar==UCNV_SO){
374ca955 2276 myData->toU2022State.g = 1;
d5d484b0 2277 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
b75a7d8f
A
2278 /*consume the source */
2279 continue;
374ca955
A
2280 }else if(mySourceChar==ESC_2022){
2281 mySource--;
2282escape:
d5d484b0 2283 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
374ca955
A
2284 changeState_2022(args->converter,&(mySource),
2285 mySourceLimit, ISO_2022_KR, err);
b75a7d8f
A
2286 if(U_FAILURE(*err)){
2287 args->target = myTarget;
2288 args->source = mySource;
2289 return;
2290 }
2291 continue;
2292 }
2293
d5d484b0 2294 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
374ca955
A
2295 if(myData->toU2022State.g == 1) {
2296 if(mySource < mySourceLimit) {
2297 char trailByte;
2298getTrailByte:
2299 trailByte = *mySource++;
2300 tempBuf[0] = (char)(mySourceChar + 0x80);
2301 tempBuf[1] = (char)(trailByte + 0x80);
2302 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2303 if((mySourceChar & 0x8080) == 0) {
2304 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2305 } else {
2306 /* illegal bytes > 0x7f */
2307 targetUniChar = missingCharMarker;
2308 }
2309 } else {
2310 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2311 args->converter->toULength = 1;
2312 break;
b75a7d8f
A
2313 }
2314 }
2315 else{
374ca955 2316 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
b75a7d8f 2317 }
374ca955
A
2318 if(targetUniChar < 0xfffe){
2319 if(args->offsets) {
73c04bcf 2320 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
374ca955 2321 }
b75a7d8f
A
2322 *(myTarget++)=(UChar)targetUniChar;
2323 }
2324 else {
b75a7d8f 2325 /* Call the callback function*/
374ca955
A
2326 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2327 break;
b75a7d8f
A
2328 }
2329 }
2330 else{
2331 *err =U_BUFFER_OVERFLOW_ERROR;
2332 break;
2333 }
2334 }
b75a7d8f
A
2335 args->target = myTarget;
2336 args->source = mySource;
2337}
2338
2339/*************************** END ISO2022-KR *********************************/
2340
2341/*************************** ISO-2022-CN *********************************
2342*
2343* Rules for ISO-2022-CN Encoding:
374ca955 2344* i) The designator sequence must appear once on a line before any instance
b75a7d8f
A
2345* of character set it designates.
2346* ii) If two lines contain characters from the same character set, both lines
2347* must include the designator sequence.
374ca955 2348* iii) Once the designator sequence is known, a shifting sequence has to be found
b75a7d8f
A
2349* to invoke the shifting
2350* iv) All lines start in ASCII and end in ASCII.
2351* v) Four shifting sequences are employed for this purpose:
2352*
2353* Sequence ASCII Eq Charsets
2354* ---------- ------- ---------
374ca955
A
2355* SI <SI> US-ASCII
2356* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2357* SS2 <ESC>N CNS-11643-1992 Plane 2
2358* SS3 <ESC>O CNS-11643-1992 Planes 3-7
b75a7d8f
A
2359*
2360* vi)
2361* SOdesignator : ESC "$" ")" finalchar_for_SO
2362* SS2designator : ESC "$" "*" finalchar_for_SS2
2363* SS3designator : ESC "$" "+" finalchar_for_SS3
2364*
2365* ESC $ ) A Indicates the bytes following SO are Chinese
2366* characters as defined in GB 2312-80, until
2367* another SOdesignation appears
2368*
2369*
2370* ESC $ ) E Indicates the bytes following SO are as defined
2371* in ISO-IR-165 (for details, see section 2.1),
2372* until another SOdesignation appears
2373*
2374* ESC $ ) G Indicates the bytes following SO are as defined
2375* in CNS 11643-plane-1, until another
2376* SOdesignation appears
2377*
2378* ESC $ * H Indicates the two bytes immediately following
2379* SS2 is a Chinese character as defined in CNS
2380* 11643-plane-2, until another SS2designation
2381* appears
2382* (Meaning <ESC>N must precede every 2 byte
2383* sequence.)
2384*
2385* ESC $ + I Indicates the immediate two bytes following SS3
2386* is a Chinese character as defined in CNS
2387* 11643-plane-3, until another SS3designation
2388* appears
2389* (Meaning <ESC>O must precede every 2 byte
2390* sequence.)
2391*
2392* ESC $ + J Indicates the immediate two bytes following SS3
2393* is a Chinese character as defined in CNS
2394* 11643-plane-4, until another SS3designation
2395* appears
374ca955 2396* (In English: <ESC>O must precede every 2 byte
b75a7d8f
A
2397* sequence.)
2398*
2399* ESC $ + K Indicates the immediate two bytes following SS3
2400* is a Chinese character as defined in CNS
2401* 11643-plane-5, until another SS3designation
2402* appears
2403*
2404* ESC $ + L Indicates the immediate two bytes following SS3
2405* is a Chinese character as defined in CNS
2406* 11643-plane-6, until another SS3designation
2407* appears
2408*
2409* ESC $ + M Indicates the immediate two bytes following SS3
2410* is a Chinese character as defined in CNS
2411* 11643-plane-7, until another SS3designation
2412* appears
2413*
2414* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2415* has its own designation information before any Chinese characters
2416* appear
2417*
2418*/
2419
2420/* The following are defined this way to make the strings truly read-only */
b75a7d8f
A
2421static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2422static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2423static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2424static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2425static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2426static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2427static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2428static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2429static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2430
2431/********************** ISO2022-CN Data **************************/
2432static const char* const escSeqCharsCN[10] ={
2433 SHIFT_IN_STR, /* ASCII */
2434 GB_2312_80_STR,
2435 ISO_IR_165_STR,
2436 CNS_11643_1992_Plane_1_STR,
2437 CNS_11643_1992_Plane_2_STR,
2438 CNS_11643_1992_Plane_3_STR,
2439 CNS_11643_1992_Plane_4_STR,
2440 CNS_11643_1992_Plane_5_STR,
2441 CNS_11643_1992_Plane_6_STR,
2442 CNS_11643_1992_Plane_7_STR
2443};
b75a7d8f
A
2444
2445static void
2446UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2447
2448 UConverterDataISO2022 *converterData;
374ca955
A
2449 ISO2022State *pFromU2022State;
2450 uint8_t *target = (uint8_t *) args->target;
2451 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
b75a7d8f
A
2452 const UChar* source = args->source;
2453 const UChar* sourceLimit = args->sourceLimit;
2454 int32_t* offsets = args->offsets;
374ca955
A
2455 UChar32 sourceChar;
2456 char buffer[8];
2457 int32_t len;
2458 int8_t choices[3];
2459 int32_t choiceCount;
73c04bcf 2460 uint32_t targetValue = 0;
b75a7d8f
A
2461 UBool useFallback;
2462
b75a7d8f
A
2463 /* set up the state */
2464 converterData = (UConverterDataISO2022*)args->converter->extraInfo;
374ca955 2465 pFromU2022State = &converterData->fromU2022State;
b75a7d8f 2466 useFallback = args->converter->useFallback;
374ca955
A
2467
2468 choiceCount = 0;
b75a7d8f
A
2469
2470 /* check if the last codepoint of previous buffer was a lead surrogate*/
374ca955 2471 if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
b75a7d8f
A
2472 goto getTrail;
2473 }
2474
b75a7d8f 2475 while( source < sourceLimit){
b75a7d8f
A
2476 if(target < targetLimit){
2477
2478 sourceChar = *(source++);
2479 /*check if the char is a First surrogate*/
2480 if(UTF_IS_SURROGATE(sourceChar)) {
2481 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
b75a7d8f
A
2482getTrail:
2483 /*look ahead to find the trail surrogate*/
2484 if(source < sourceLimit) {
2485 /* test the following code unit */
2486 UChar trail=(UChar) *source;
2487 if(UTF_IS_SECOND_SURROGATE(trail)) {
2488 source++;
374ca955
A
2489 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2490 args->converter->fromUChar32=0x00;
2491 /* convert this supplementary code point */
b75a7d8f
A
2492 /* exit this condition tree */
2493 } else {
2494 /* this is an unmatched lead code unit (1st surrogate) */
2495 /* callback(illegal) */
b75a7d8f 2496 *err=U_ILLEGAL_CHAR_FOUND;
374ca955
A
2497 args->converter->fromUChar32=sourceChar;
2498 break;
b75a7d8f
A
2499 }
2500 } else {
2501 /* no more input */
374ca955 2502 args->converter->fromUChar32=sourceChar;
b75a7d8f
A
2503 break;
2504 }
2505 } else {
2506 /* this is an unmatched trail code unit (2nd surrogate) */
2507 /* callback(illegal) */
b75a7d8f 2508 *err=U_ILLEGAL_CHAR_FOUND;
374ca955
A
2509 args->converter->fromUChar32=sourceChar;
2510 break;
b75a7d8f
A
2511 }
2512 }
2513
2514 /* do the conversion */
374ca955 2515 if(sourceChar <= 0x007f ){
73c04bcf
A
2516 /* do not convert SO/SI/ESC */
2517 if(IS_2022_CONTROL(sourceChar)) {
2518 /* callback(illegal) */
2519 *err=U_ILLEGAL_CHAR_FOUND;
2520 args->converter->fromUChar32=sourceChar;
2521 break;
2522 }
2523
374ca955
A
2524 /* US-ASCII */
2525 if(pFromU2022State->g == 0) {
2526 buffer[0] = (char)sourceChar;
2527 len = 1;
2528 } else {
2529 buffer[0] = UCNV_SI;
2530 buffer[1] = (char)sourceChar;
2531 len = 2;
2532 pFromU2022State->g = 0;
2533 choiceCount = 0;
2534 }
2535 if(sourceChar == CR || sourceChar == LF) {
2536 /* reset the state at the end of a line */
2537 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2538 choiceCount = 0;
b75a7d8f 2539 }
b75a7d8f
A
2540 }
2541 else{
374ca955
A
2542 /* convert U+0080..U+10ffff */
2543 UConverterSharedData *cnv;
2544 int32_t i;
2545 int8_t cs, g;
2546
2547 if(choiceCount == 0) {
2548 /* try the current SO/G1 converter first */
2549 choices[0] = pFromU2022State->cs[1];
2550
2551 /* default to GB2312_1 if none is designated yet */
2552 if(choices[0] == 0) {
2553 choices[0] = GB2312_1;
2554 }
b75a7d8f 2555
374ca955
A
2556 if(converterData->version == 0) {
2557 /* ISO-2022-CN */
2558
2559 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2560 if(choices[0] == GB2312_1) {
2561 choices[1] = (int8_t)CNS_11643_1;
2562 } else {
2563 choices[1] = (int8_t)GB2312_1;
b75a7d8f 2564 }
374ca955
A
2565
2566 choiceCount = 2;
2567 } else {
2568 /* ISO-2022-CN-EXT */
2569
2570 /* try one of the other converters */
2571 switch(choices[0]) {
2572 case GB2312_1:
2573 choices[1] = (int8_t)CNS_11643_1;
2574 choices[2] = (int8_t)ISO_IR_165;
2575 break;
2576 case ISO_IR_165:
2577 choices[1] = (int8_t)GB2312_1;
2578 choices[2] = (int8_t)CNS_11643_1;
2579 break;
2580 default: /* CNS_11643_x */
2581 choices[1] = (int8_t)GB2312_1;
2582 choices[2] = (int8_t)ISO_IR_165;
2583 break;
b75a7d8f 2584 }
b75a7d8f 2585
374ca955
A
2586 choiceCount = 3;
2587 }
b75a7d8f
A
2588 }
2589
374ca955
A
2590 cs = g = 0;
2591 len = 0;
2592
2593 for(i = 0; i < choiceCount && len == 0; ++i) {
2594 cs = choices[i];
2595 if(cs > 0) {
2596 if(cs > CNS_11643_0) {
2597 cnv = converterData->myConverterArray[CNS_11643];
2598 MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
2599 if(len==3) {
2600 cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
2601 len = 2;
2602 if(cs == CNS_11643_1) {
2603 g = 1;
2604 } else if(cs == CNS_11643_2) {
2605 g = 2;
2606 } else /* plane 3..7 */ if(converterData->version == 1) {
2607 g = 3;
2608 } else {
2609 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2610 len = 0;
2611 }
2612 }
2613 } else {
2614 /* GB2312_1 or ISO-IR-165 */
2615 cnv = converterData->myConverterArray[cs];
2616 MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
2617 g = 1; /* used if len == 2 */
2618 }
b75a7d8f 2619 }
b75a7d8f
A
2620 }
2621
374ca955
A
2622 if(len > 0) {
2623 len = 0; /* count output bytes; it must have been len == 2 */
b75a7d8f 2624
374ca955
A
2625 /* write the designation sequence if necessary */
2626 if(cs != pFromU2022State->cs[g]) {
2627 if(cs < CNS_11643) {
2628 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2629 } else {
2630 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
b75a7d8f 2631 }
374ca955
A
2632 len = 4;
2633 pFromU2022State->cs[g] = cs;
2634 if(g == 1) {
2635 /* changing the SO/G1 charset invalidates the choices[] */
2636 choiceCount = 0;
b75a7d8f 2637 }
374ca955
A
2638 }
2639
2640 /* write the shift sequence if necessary */
2641 if(g != pFromU2022State->g) {
2642 switch(g) {
2643 case 1:
2644 buffer[len++] = UCNV_SO;
2645
2646 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2647 pFromU2022State->g = 1;
2648 break;
2649 case 2:
2650 buffer[len++] = 0x1b;
2651 buffer[len++] = 0x4e;
2652 break;
2653 default: /* case 3 */
2654 buffer[len++] = 0x1b;
2655 buffer[len++] = 0x4f;
2656 break;
b75a7d8f 2657 }
b75a7d8f 2658 }
b75a7d8f 2659
374ca955
A
2660 /* write the two output bytes */
2661 buffer[len++] = (char)(targetValue >> 8);
2662 buffer[len++] = (char)targetValue;
2663 } else {
2664 /* if we cannot find the character after checking all codepages
2665 * then this is an error
2666 */
2667 *err = U_INVALID_CHAR_FOUND;
2668 args->converter->fromUChar32=sourceChar;
2669 break;
2670 }
b75a7d8f 2671 }
b75a7d8f 2672
374ca955
A
2673 /* output len>0 bytes in buffer[] */
2674 if(len == 1) {
2675 *target++ = buffer[0];
2676 if(offsets) {
73c04bcf 2677 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
374ca955
A
2678 }
2679 } else if(len == 2 && (target + 2) <= targetLimit) {
2680 *target++ = buffer[0];
2681 *target++ = buffer[1];
2682 if(offsets) {
2683 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2684 *offsets++ = sourceIndex;
2685 *offsets++ = sourceIndex;
2686 }
2687 } else {
73c04bcf 2688 fromUWriteUInt8(
374ca955
A
2689 args->converter,
2690 buffer, len,
73c04bcf 2691 &target, (const char *)targetLimit,
374ca955
A
2692 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2693 err);
2694 if(U_FAILURE(*err)) {
b75a7d8f
A
2695 break;
2696 }
2697 }
2698 } /* end if(myTargetIndex<myTargetLength) */
2699 else{
2700 *err =U_BUFFER_OVERFLOW_ERROR;
2701 break;
2702 }
2703
2704 }/* end while(mySourceIndex<mySourceLength) */
2705
374ca955
A
2706 /*
2707 * the end of the input stream and detection of truncated input
2708 * are handled by the framework, but for ISO-2022-CN conversion
2709 * we need to be in ASCII mode at the very end
2710 *
2711 * conditions:
2712 * successful
2713 * not in ASCII mode
2714 * end of input and no truncated input
b75a7d8f 2715 */
374ca955
A
2716 if( U_SUCCESS(*err) &&
2717 pFromU2022State->g!=0 &&
2718 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2719 ) {
2720 int32_t sourceIndex;
2721
2722 /* we are switching to ASCII */
2723 pFromU2022State->g=0;
2724
2725 /* get the source index of the last input character */
2726 /*
2727 * TODO this would be simpler and more reliable if we used a pair
2728 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2729 * so that we could simply use the prevSourceIndex here;
2730 * this code gives an incorrect result for the rare case of an unmatched
2731 * trail surrogate that is alone in the last buffer of the text stream
2732 */
2733 sourceIndex=(int32_t)(source-args->source);
2734 if(sourceIndex>0) {
2735 --sourceIndex;
2736 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2737 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2738 ) {
2739 --sourceIndex;
b75a7d8f 2740 }
374ca955
A
2741 } else {
2742 sourceIndex=-1;
b75a7d8f 2743 }
b75a7d8f 2744
73c04bcf 2745 fromUWriteUInt8(
374ca955
A
2746 args->converter,
2747 SHIFT_IN_STR, 1,
73c04bcf 2748 &target, (const char *)targetLimit,
374ca955
A
2749 &offsets, sourceIndex,
2750 err);
b75a7d8f 2751 }
b75a7d8f 2752
374ca955
A
2753 /*save the state and return */
2754 args->source = source;
2755 args->target = (char*)target;
b75a7d8f
A
2756}
2757
2758
2759static void
2760UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2761 UErrorCode* err){
2762 char tempBuf[3];
374ca955 2763 const char *mySource = (char *) args->source;
b75a7d8f 2764 UChar *myTarget = args->target;
b75a7d8f
A
2765 const char *mySourceLimit = args->sourceLimit;
2766 uint32_t targetUniChar = 0x0000;
2767 uint32_t mySourceChar = 0x0000;
2768 UConverterDataISO2022* myData;
374ca955 2769 ISO2022State *pToU2022State;
b75a7d8f 2770
374ca955
A
2771 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2772 pToU2022State = &myData->toU2022State;
2773
2774 if(myData->key != 0) {
2775 /* continue with a partial escape sequence */
2776 goto escape;
2777 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2778 /* continue with a partial double-byte character */
2779 mySourceChar = args->converter->toUBytes[0];
2780 args->converter->toULength = 0;
2781 goto getTrailByte;
b75a7d8f 2782 }
374ca955
A
2783
2784 while(mySource < mySourceLimit){
b75a7d8f
A
2785
2786 targetUniChar =missingCharMarker;
2787
2788 if(myTarget < args->targetLimit){
2789
2790 mySourceChar= (unsigned char) *mySource++;
2791
b75a7d8f
A
2792 switch(mySourceChar){
2793 case UCNV_SI:
374ca955 2794 pToU2022State->g=0;
d5d484b0
A
2795 if (myData->isEmptySegment) {
2796 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2797 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2798 args->converter->toUBytes[0] = mySourceChar;
2799 args->converter->toULength = 1;
2800 args->target = myTarget;
2801 args->source = mySource;
2802 return;
2803 }
b75a7d8f
A
2804 continue;
2805
2806 case UCNV_SO:
374ca955
A
2807 if(pToU2022State->cs[1] != 0) {
2808 pToU2022State->g=1;
d5d484b0 2809 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
374ca955
A
2810 continue;
2811 } else {
2812 /* illegal to have SO before a matching designator */
d5d484b0 2813 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
b75a7d8f
A
2814 break;
2815 }
2816
b75a7d8f 2817 case ESC_2022:
b75a7d8f 2818 mySource--;
374ca955 2819escape:
d5d484b0
A
2820 {
2821 const char * mySourceBefore = mySource;
2822 int8_t toULengthBefore = args->converter->toULength;
2823
2824 changeState_2022(args->converter,&(mySource),
2825 mySourceLimit, ISO_2022_CN,err);
2826
2827 /* After SO there must be at least one character before a designator (designator error handled separately) */
2828 if ( myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
2829 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2830 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
2831 }
2832 }
b75a7d8f
A
2833
2834 /* invalid or illegal escape sequence */
2835 if(U_FAILURE(*err)){
2836 args->target = myTarget;
2837 args->source = mySource;
d5d484b0 2838 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
b75a7d8f
A
2839 return;
2840 }
2841 continue;
2842
374ca955
A
2843 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2844
2845 case CR:
2846 /*falls through*/
2847 case LF:
2848 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
2849 /* falls through */
2850 default:
2851 /* convert one or two bytes */
d5d484b0 2852 myData->isEmptySegment = FALSE;
374ca955
A
2853 if(pToU2022State->g != 0) {
2854 if(mySource < mySourceLimit) {
2855 UConverterSharedData *cnv;
2856 StateEnum tempState;
2857 int32_t tempBufLen;
2858 char trailByte;
2859getTrailByte:
2860 trailByte = *mySource++;
2861 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2862 if(tempState > CNS_11643_0) {
2863 cnv = myData->myConverterArray[CNS_11643];
2864 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2865 tempBuf[1] = (char) (mySourceChar);
2866 tempBuf[2] = trailByte;
2867 tempBufLen = 3;
2868
2869 }else{
2870 cnv = myData->myConverterArray[tempState];
2871 tempBuf[0] = (char) (mySourceChar);
2872 tempBuf[1] = trailByte;
2873 tempBufLen = 2;
2874 }
2875 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2876 if(pToU2022State->g>=2) {
2877 /* return from a single-shift state to the previous one */
2878 pToU2022State->g=pToU2022State->prevG;
2879 }
2880 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
2881 } else {
2882 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2883 args->converter->toULength = 1;
2884 goto endloop;
2885 }
2886 }
2887 else{
2888 if(mySourceChar <= 0x7f) {
2889 targetUniChar = (UChar) mySourceChar;
2890 }
2891 }
2892 break;
b75a7d8f
A
2893 }
2894 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2895 if(args->offsets){
73c04bcf 2896 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f
A
2897 }
2898 *(myTarget++)=(UChar)targetUniChar;
2899 }
2900 else if(targetUniChar > missingCharMarker){
2901 /* disassemble the surrogate pair and write to output*/
2902 targetUniChar-=0x0010000;
374ca955 2903 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
b75a7d8f 2904 if(args->offsets){
73c04bcf 2905 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 2906 }
374ca955 2907 ++myTarget;
b75a7d8f 2908 if(myTarget< args->targetLimit){
374ca955 2909 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
b75a7d8f 2910 if(args->offsets){
73c04bcf 2911 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
b75a7d8f 2912 }
374ca955 2913 ++myTarget;
b75a7d8f
A
2914 }else{
2915 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2916 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2917 }
2918
2919 }
2920 else{
2921 /* Call the callback function*/
374ca955
A
2922 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2923 break;
b75a7d8f
A
2924 }
2925 }
2926 else{
2927 *err =U_BUFFER_OVERFLOW_ERROR;
2928 break;
2929 }
2930 }
374ca955 2931endloop:
b75a7d8f
A
2932 args->target = myTarget;
2933 args->source = mySource;
2934}
2935
2936static void
2937_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
2938 UConverter *cnv = args->converter;
2939 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
374ca955
A
2940 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
2941 char *p, *subchar;
2942 char buffer[8];
2943 int32_t length;
2944
73c04bcf 2945 subchar=(char *)cnv->subChars;
374ca955 2946 length=cnv->subCharLen; /* assume length==1 for most variants */
b75a7d8f
A
2947
2948 p = buffer;
2949 switch(myConverterData->locale[0]){
2950 case 'j':
374ca955
A
2951 {
2952 int8_t cs;
2953
2954 if(pFromU2022State->g == 1) {
2955 /* JIS7: switch from G1 to G0 */
2956 pFromU2022State->g = 0;
2957 *p++ = UCNV_SI;
2958 }
2959
2960 cs = pFromU2022State->cs[0];
2961 if(cs != ASCII && cs != JISX201) {
2962 /* not in ASCII or JIS X 0201: switch to ASCII */
2963 pFromU2022State->cs[0] = (int8_t)ASCII;
b75a7d8f
A
2964 *p++ = '\x1b';
2965 *p++ = '\x28';
2966 *p++ = '\x42';
b75a7d8f 2967 }
374ca955
A
2968
2969 *p++ = subchar[0];
b75a7d8f 2970 break;
374ca955 2971 }
b75a7d8f 2972 case 'c':
374ca955
A
2973 if(pFromU2022State->g != 0) {
2974 /* not in ASCII mode: switch to ASCII */
2975 pFromU2022State->g = 0;
2976 *p++ = UCNV_SI;
2977 }
2978 *p++ = subchar[0];
b75a7d8f
A
2979 break;
2980 case 'k':
374ca955
A
2981 if(myConverterData->version == 0) {
2982 if(length == 1) {
2983 if((UBool)args->converter->fromUnicodeStatus) {
2984 /* in DBCS mode: switch to SBCS */
2985 args->converter->fromUnicodeStatus = 0;
2986 *p++ = UCNV_SI;
2987 }
2988 *p++ = subchar[0];
2989 } else /* length == 2*/ {
2990 if(!(UBool)args->converter->fromUnicodeStatus) {
2991 /* in SBCS mode: switch to DBCS */
2992 args->converter->fromUnicodeStatus = 1;
2993 *p++ = UCNV_SO;
2994 }
2995 *p++ = subchar[0];
2996 *p++ = subchar[1];
2997 }
2998 break;
2999 } else {
73c04bcf
A
3000 /* save the subconverter's substitution string */
3001 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3002 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3003
3004 /* set our substitution string into the subconverter */
3005 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
374ca955
A
3006 myConverterData->currentConverter->subCharLen = (int8_t)length;
3007
73c04bcf
A
3008 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3009 args->converter = myConverterData->currentConverter;
374ca955
A
3010 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3011 ucnv_cbFromUWriteSub(args, 0, err);
3012 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
73c04bcf
A
3013 args->converter = cnv;
3014
3015 /* restore the subconverter's substitution string */
3016 myConverterData->currentConverter->subChars = currentSubChars;
3017 myConverterData->currentConverter->subCharLen = currentSubCharLen;
374ca955
A
3018
3019 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3020 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3021 uprv_memcpy(
3022 cnv->charErrorBuffer,
3023 myConverterData->currentConverter->charErrorBuffer,
3024 myConverterData->currentConverter->charErrorBufferLength);
3025 }
3026 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3027 myConverterData->currentConverter->charErrorBufferLength = 0;
3028 }
374ca955 3029 return;
b75a7d8f 3030 }
b75a7d8f
A
3031 default:
3032 /* not expected */
3033 break;
3034 }
3035 ucnv_cbFromUWriteBytes(args,
3036 buffer, (int32_t)(p - buffer),
3037 offsetIndex, err);
3038}
3039
73c04bcf
A
3040/*
3041 * Structure for cloning an ISO 2022 converter into a single memory block.
3042 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3043 * and then ucnv_safeClone() of the sub-converter may additionally align
3044 * currentConverter inside the cloneStruct, for which we need the deadSpace
3045 * after currentConverter.
3046 * This is because UAlignedMemory may be larger than the actually
3047 * necessary alignment size for the platform.
3048 * The other cloneStruct fields will not be moved around,
3049 * and are aligned properly with cloneStruct's alignment.
3050 */
b75a7d8f
A
3051struct cloneStruct
3052{
3053 UConverter cnv;
374ca955 3054 UConverter currentConverter;
73c04bcf
A
3055 UAlignedMemory deadSpace;
3056 UConverterDataISO2022 mydata;
b75a7d8f
A
3057};
3058
3059
3060static UConverter *
3061_ISO_2022_SafeClone(
3062 const UConverter *cnv,
3063 void *stackBuffer,
3064 int32_t *pBufferSize,
3065 UErrorCode *status)
3066{
3067 struct cloneStruct * localClone;
374ca955
A
3068 UConverterDataISO2022 *cnvData;
3069 int32_t i, size;
b75a7d8f
A
3070
3071 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
374ca955
A
3072 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3073 return NULL;
b75a7d8f
A
3074 }
3075
374ca955 3076 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
b75a7d8f 3077 localClone = (struct cloneStruct *)stackBuffer;
b75a7d8f 3078
374ca955 3079 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
b75a7d8f 3080
374ca955 3081 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
73c04bcf
A
3082 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3083 localClone->cnv.isExtraLocal = TRUE;
b75a7d8f 3084
374ca955 3085 /* share the subconverters */
b75a7d8f 3086
374ca955 3087 if(cnvData->currentConverter != NULL) {
73c04bcf 3088 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
374ca955
A
3089 localClone->mydata.currentConverter =
3090 ucnv_safeClone(cnvData->currentConverter,
3091 &localClone->currentConverter,
3092 &size, status);
3093 if(U_FAILURE(*status)) {
3094 return NULL;
b75a7d8f 3095 }
b75a7d8f
A
3096 }
3097
374ca955
A
3098 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3099 if(cnvData->myConverterArray[i] != NULL) {
3100 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3101 }
b75a7d8f
A
3102 }
3103
b75a7d8f
A
3104 return &localClone->cnv;
3105}
3106
3107static void
3108_ISO_2022_GetUnicodeSet(const UConverter *cnv,
73c04bcf 3109 const USetAdder *sa,
b75a7d8f
A
3110 UConverterUnicodeSet which,
3111 UErrorCode *pErrorCode)
3112{
3113 int32_t i;
b75a7d8f
A
3114 UConverterDataISO2022* cnvData;
3115
3116 if (U_FAILURE(*pErrorCode)) {
3117 return;
3118 }
374ca955 3119#ifdef U_ENABLE_GENERIC_ISO_2022
b75a7d8f
A
3120 if (cnv->sharedData == &_ISO2022Data) {
3121 /* We use UTF-8 in this case */
374ca955
A
3122 sa->addRange(sa->set, 0, 0xd7FF);
3123 sa->addRange(sa->set, 0xE000, 0x10FFFF);
b75a7d8f
A
3124 return;
3125 }
374ca955 3126#endif
b75a7d8f
A
3127
3128 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
b75a7d8f 3129
374ca955
A
3130 /* open a set and initialize it with code points that are algorithmically round-tripped */
3131 switch(cnvData->locale[0]){
3132 case 'j':
3133 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3134 /* include Latin-1 for some variants of JP */
3135 sa->addRange(sa->set, 0, 0xff);
3136 } else {
3137 /* include ASCII for JP */
3138 sa->addRange(sa->set, 0, 0x7f);
3139 }
3140 if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
3141 /* include half-width Katakana for JP */
3142 sa->addRange(sa->set, 0xff61, 0xff9f);
3143 }
3144 break;
3145 case 'c':
3146 case 'z':
3147 /* include ASCII for CN */
3148 sa->addRange(sa->set, 0, 0x7f);
3149 break;
3150 case 'k':
3151 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3152 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3153 cnvData->currentConverter, sa, which, pErrorCode);
73c04bcf
A
3154 /* the loop over myConverterArray[] will simply not find another converter */
3155 break;
374ca955
A
3156 default:
3157 break;
b75a7d8f
A
3158 }
3159
374ca955 3160 /*
73c04bcf 3161 * Version-specific for CN:
374ca955
A
3162 * CN version 0 does not map CNS planes 3..7 although
3163 * they are all available in the CNS conversion table;
3164 * CN version 1 does map them all.
73c04bcf 3165 * The two versions create different Unicode sets.
374ca955
A
3166 */
3167 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3168 if(cnvData->myConverterArray[i]!=NULL) {
3169 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3170 cnvData->version==0 && i==CNS_11643
3171 ) {
3172 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3173 ucnv_MBCSGetUnicodeSetForBytes(
3174 cnvData->myConverterArray[i],
3175 sa, UCNV_ROUNDTRIP_SET,
3176 0, 0x81, 0x82,
3177 pErrorCode);
3178 } else {
3179 ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
3180 }
3181 }
b75a7d8f 3182 }
73c04bcf
A
3183
3184 /*
3185 * ISO 2022 converters must not convert SO/SI/ESC despite what
3186 * sub-converters do by themselves.
3187 * Remove these characters from the set.
3188 */
3189 sa->remove(sa->set, 0x0e);
3190 sa->remove(sa->set, 0x0f);
3191 sa->remove(sa->set, 0x1b);
b75a7d8f
A
3192}
3193
374ca955
A
3194static const UConverterImpl _ISO2022Impl={
3195 UCNV_ISO_2022,
3196
3197 NULL,
3198 NULL,
3199
3200 _ISO2022Open,
3201 _ISO2022Close,
3202 _ISO2022Reset,
3203
3204#ifdef U_ENABLE_GENERIC_ISO_2022
3205 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3206 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3207 ucnv_fromUnicode_UTF8,
3208 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3209#else
3210 NULL,
3211 NULL,
3212 NULL,
3213 NULL,
3214#endif
3215 NULL,
3216
3217 NULL,
3218 _ISO2022getName,
3219 _ISO_2022_WriteSub,
3220 _ISO_2022_SafeClone,
3221 _ISO_2022_GetUnicodeSet
3222};
3223static const UConverterStaticData _ISO2022StaticData={
3224 sizeof(UConverterStaticData),
3225 "ISO_2022",
3226 2022,
3227 UCNV_IBM,
3228 UCNV_ISO_2022,
3229 1,
3230 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3231 { 0x1a, 0, 0, 0 },
3232 1,
3233 FALSE,
3234 FALSE,
3235 0,
3236 0,
3237 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3238};
3239const UConverterSharedData _ISO2022Data={
3240 sizeof(UConverterSharedData),
3241 ~((uint32_t) 0),
3242 NULL,
3243 NULL,
3244 &_ISO2022StaticData,
3245 FALSE,
3246 &_ISO2022Impl,
3247 0
3248};
3249
3250/*************JP****************/
3251static const UConverterImpl _ISO2022JPImpl={
3252 UCNV_ISO_2022,
3253
3254 NULL,
3255 NULL,
3256
3257 _ISO2022Open,
3258 _ISO2022Close,
3259 _ISO2022Reset,
3260
3261 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3262 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3263 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3264 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3265 NULL,
3266
3267 NULL,
3268 _ISO2022getName,
3269 _ISO_2022_WriteSub,
3270 _ISO_2022_SafeClone,
3271 _ISO_2022_GetUnicodeSet
3272};
3273static const UConverterStaticData _ISO2022JPStaticData={
3274 sizeof(UConverterStaticData),
3275 "ISO_2022_JP",
3276 0,
3277 UCNV_IBM,
3278 UCNV_ISO_2022,
3279 1,
3280 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3281 { 0x1a, 0, 0, 0 },
3282 1,
3283 FALSE,
3284 FALSE,
3285 0,
3286 0,
3287 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3288};
3289static const UConverterSharedData _ISO2022JPData={
3290 sizeof(UConverterSharedData),
3291 ~((uint32_t) 0),
3292 NULL,
3293 NULL,
3294 &_ISO2022JPStaticData,
3295 FALSE,
3296 &_ISO2022JPImpl,
3297 0
3298};
3299
3300/************* KR ***************/
3301static const UConverterImpl _ISO2022KRImpl={
3302 UCNV_ISO_2022,
3303
3304 NULL,
3305 NULL,
3306
3307 _ISO2022Open,
3308 _ISO2022Close,
3309 _ISO2022Reset,
3310
3311 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3312 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3313 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3314 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3315 NULL,
3316
3317 NULL,
3318 _ISO2022getName,
3319 _ISO_2022_WriteSub,
3320 _ISO_2022_SafeClone,
3321 _ISO_2022_GetUnicodeSet
3322};
3323static const UConverterStaticData _ISO2022KRStaticData={
3324 sizeof(UConverterStaticData),
3325 "ISO_2022_KR",
3326 0,
3327 UCNV_IBM,
3328 UCNV_ISO_2022,
3329 1,
3330 3, /* max 3 bytes per UChar: SO+DBCS */
3331 { 0x1a, 0, 0, 0 },
3332 1,
3333 FALSE,
3334 FALSE,
3335 0,
3336 0,
3337 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3338};
3339static const UConverterSharedData _ISO2022KRData={
3340 sizeof(UConverterSharedData),
3341 ~((uint32_t) 0),
3342 NULL,
3343 NULL,
3344 &_ISO2022KRStaticData,
3345 FALSE,
3346 &_ISO2022KRImpl,
3347 0
3348};
3349
3350/*************** CN ***************/
3351static const UConverterImpl _ISO2022CNImpl={
3352
3353 UCNV_ISO_2022,
3354
3355 NULL,
3356 NULL,
3357
3358 _ISO2022Open,
3359 _ISO2022Close,
3360 _ISO2022Reset,
3361
3362 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3363 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3364 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3365 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3366 NULL,
3367
3368 NULL,
3369 _ISO2022getName,
3370 _ISO_2022_WriteSub,
3371 _ISO_2022_SafeClone,
3372 _ISO_2022_GetUnicodeSet
3373};
3374static const UConverterStaticData _ISO2022CNStaticData={
3375 sizeof(UConverterStaticData),
3376 "ISO_2022_CN",
3377 0,
3378 UCNV_IBM,
3379 UCNV_ISO_2022,
73c04bcf 3380 1,
374ca955
A
3381 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3382 { 0x1a, 0, 0, 0 },
3383 1,
3384 FALSE,
3385 FALSE,
3386 0,
3387 0,
3388 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3389};
3390static const UConverterSharedData _ISO2022CNData={
3391 sizeof(UConverterSharedData),
3392 ~((uint32_t) 0),
3393 NULL,
3394 NULL,
3395 &_ISO2022CNStaticData,
3396 FALSE,
3397 &_ISO2022CNImpl,
3398 0
3399};
3400
3401
3402
b75a7d8f 3403#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */