]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv2022.c
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2006,2008 International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression in every context (e.g. as the operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
/* The SI (0x0F) and SO (0x0E) shift control bytes, each stored as a one-character C string. */
static const char SHIFT_IN_STR[] = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Named values for the ASCII line-break, tab and space characters. */
#define CR 0x0D
#define LF 0x0A
#define H_TAB 0x09
#define V_TAB 0x0B
#define SPACE 0x20

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * Note: the argument is evaluated twice, so it must not have side effects.
 * It is also used as a shift count whenever it is below 0x20, so callers
 * are expected to pass non-negative values.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
94
/*
 * Charset/state identifiers used by the ISO-2022-JP and ISO-2022-CN code.
 * The JP group and the CN group intentionally reuse the same small integers;
 * which meaning applies depends on the converter variant.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    /* single-shift pseudo states */
    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* JP charsets */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /*
     * CN charsets.
     * These leading constants must keep their values because they
     * correspond to slots of myConverterArray[].
     */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * Per-plane CNS 11643 values. They appear in StateEnum and ISO2022State
     * variables, but CNS_11643 (not these) must be used to index
     * myConverterArray[].
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
133
/*
 * is the StateEnum charset value for a DBCS charset?
 * True for the contiguous JP values JISX208 through KSC5601.
 * The argument is evaluated twice.
 */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/* Charset mask bit for a StateEnum value: 1 shifted left by cs. */
#define CSM(cs) ((uint16_t)1<<(cs))
138
139 /*
140 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
141 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
142 *
143 * Note: The converter uses some leniency:
144 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
145 * all versions, not just JIS7 and JIS8.
146 * - ICU does not distinguish between different versions of JIS X 0208.
147 */
148 static const uint16_t jpCharsetMasks[5]={
149 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
150 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
151 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
152 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
153 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
154 };
155
/* Kind of charset currently selected; stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
164
/*
 * Designation/shift state for one conversion direction.
 * The whole struct is cleared with memset() on reset, so the all-zero state
 * (every cs[] entry 0, g 0, prevG 0) is the initial state.
 */
typedef struct ISO2022State {
    int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG; /* g before single shift (SS2 or SS3) */
} ISO2022State;
170
/* The low four bits of the open options carry the requested variant version. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10

/*
 * Per-converter state, allocated in _ISO2022Open() and stored in
 * UConverter.extraInfo. It is zero-filled right after allocation.
 */
typedef struct{
    /* shared data of the sub-charsets, indexed by StateEnum values (JP and CN variants) */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter opened for the KR variant; closed in _ISO2022Close() */
    UConverter *currentConverter;
    /* set to ASCII1 when the converter is opened */
    Cnv2022Type currentType;
    /* separate designation/shift state per direction */
    ISO2022State toU2022State, fromU2022State;
    /* accumulated escape-sequence key; non-zero while a sequence is incomplete */
    uint32_t key;
    /* options & UCNV_OPTIONS_VERSION_MASK, normalized by _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    /* cleared on toUnicode reset; NOTE(review): its use is outside the visible part of the file */
    UBool isEmptySegment;
    /* converter name returned by _ISO2022getName() */
    char name[30];
    /* "ja", "ko" or "cn"; left empty for the generic converter */
    char locale[3];
}UConverterDataISO2022;
188
/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */

/*Forward declaration */
/*
 * UTF-8 fromUnicode entry points, declared here and defined outside the
 * visible part of this file (presumably for the generic ISO-2022 converter,
 * which emits UTF-8; confirm against the callers).
 */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);

/* The escape byte that starts every ISO 2022 escape sequence. */
#define ESC_2022 0x1B /*ESC*/
201
/* Classification of an escape-sequence prefix, as stored in escSeqStateTable_Value_2022[]. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,

    /* so far corresponds to a valid ISO 2022 escape sequence; more bytes are needed */
    VALID_NON_TERMINAL_2022 = 0,

    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,

    /*
     * so far matches one ISO 2022 escape sequence, but by adding more
     * characters it might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
209
210 /*
211 * The way these state transition arrays work is:
212 * ex : ESC$B is the sequence for JISX208
213 * a) First Iteration: char is ESC
214 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
215 * int x = normalize_esq_chars_2022[27] which is equal to 1
216 * ii) Search for this value in escSeqStateTable_Key_2022[]
217 * value of x is stored at escSeqStateTable_Key_2022[0]
218 * iii) Save this index as offset
219 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
220 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
221 * b) Switch on this state and continue to next char
222 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
223 * which is normalize_esq_chars_2022[36] == 4
224 * ii) x is currently 1(from above)
225 * x<<=5 -- x is now 32
226 * x+=normalize_esq_chars_2022[36]
227 * now x is 36
228 * iii) Search for this value in escSeqStateTable_Key_2022[]
229 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
230 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
231 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
232 * c) Switch on this state and continue to next char
233 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
234 * ii) x is currently 36 (from above)
235 * x<<=5 -- x is now 1152
236 * x+=normalize_esq_chars_2022[66]
237 * now x is 1161
238 * iii) Search for this value in escSeqStateTable_Key_2022[]
239 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
241 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
242 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
243 */
244
245
/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to a small non-zero code (1..29) if the byte can occur
 * in a recognized escape sequence, or to 0 if it cannot. getKey_2022() folds
 * these codes into the search key, five bits per byte.
 * Rows are 16 entries wide; the row label is the byte value of the first column.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F */
/* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
/* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
/* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
/* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xA0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xB0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xC0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xD0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xE0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xF0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
277
278 #ifdef U_ENABLE_GENERIC_ISO_2022
279 /*
280 * When the generic ISO-2022 converter is completely removed, not just disabled
281 * per #ifdef, then the following state table and the associated tables that are
282 * dimensioned with MAX_STATES_2022 should be trimmed.
283 *
284 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
285 * the associated escape sequences starting with ESC ( B should be removed.
286 * This includes the ones with key values 1097 and all of the ones above 1000000.
287 *
288 * For the latter, the tables can simply be truncated.
289 * For the former, since the tables must be kept parallel, it is probably best
290 * to simply duplicate an adjacent table cell, parallel in all tables.
291 *
292 * It may make sense to restructure the tables, especially by using small search
293 * tables for the variants instead of indexing them parallel to the table here.
294 */
295 #endif
296
/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74

/*
 * Sorted (strictly ascending) search keys of all recognized escape-sequence
 * prefixes. A key is built by getKey_2022() as key = (key << 5) + code for
 * each byte, so the magnitude of a key tells how many bytes it covers.
 * The index of a key is the offset into the parallel tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /* [0] one byte */
    1,

    /* [1..7] two bytes */
    34, 36, 39, 55, 57, 60, 61,

    /* [8..32] three bytes */
    1093, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1109,
    1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257, 1768, 1773, 1957,

    /* [33..62] four bytes */
    35105, 36933, 36936, 36937, 36938, 36939, 36940, 36942, 36943, 36944,
    36945, 36946, 36947, 36948, 37640, 37642, 37644, 37646, 37711, 37744,
    37745, 37746, 37747, 37748, 40133, 40136, 40138, 40139, 40140, 40141,

    /* [63] five bytes */
    1123363,

    /* [64..73] six bytes */
    35947624, 35947625, 35947626, 35947627, 35947629,
    35947630, 35947631, 35947635, 35947636, 35947638
};
310
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Generic ISO-2022 converter only: name of the converter to open for the
 * escape sequence at the same index of escSeqStateTable_Key_2022[].
 * NULL means that no converter is associated with that entry.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0.. 4 */ NULL, NULL, NULL, NULL, NULL,
    /*  5.. 9 */ NULL, NULL, NULL, "latin1", "latin1",
    /* 10..14 */ "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865",
    /* 15..19 */ "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20..24 */ "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208",
    /* 25..29 */ NULL, NULL, NULL, NULL, "UTF8",
    /* 30..34 */ "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955",
    /* 35..39 */ "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40..44 */ "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964",
    /* 45..49 */ "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50..54 */ "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5",
    /* 55..59 */ "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60..64 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1",
    /* 65..69 */ "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70..73 */ "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
327
328 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
329 /* 0 1 2 3 4 5 6 7 8 9 */
330 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
331 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
332 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
333 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
334 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
335 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
336 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
337 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
338 };
339
340
/*
 * Selects which ISO-2022 variant changeState_2022() is decoding for.
 * ISO_2022 (the generic converter) exists only when
 * U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
350
/*********** ISO 2022 Converter Protos ***********/
/* Allocates and initializes the per-converter data for the requested locale/version. */
static void
_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);

/* Releases everything that _ISO2022Open() acquired. */
static void
_ISO2022Close(UConverter *converter);

/* Resets the toUnicode and/or fromUnicode state, depending on choice. */
static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

/* Returns the variant-specific converter name stored at open time. */
static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/* Tentative declarations of the per-variant shared data; _ISO2022Open() stores their addresses in cnv->sharedData. */
/*const UConverterSharedData _ISO2022Data;*/
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;

/*************** Converter implementations ******************/
379
380 /*************** Converter implementations ******************/
381
382 /* The purpose of this function is to get around gcc compiler warnings. */
383 static U_INLINE void
384 fromUWriteUInt8(UConverter *cnv,
385 const char *bytes, int32_t length,
386 uint8_t **target, const char *targetLimit,
387 int32_t **offsets,
388 int32_t sourceIndex,
389 UErrorCode *pErrorCode)
390 {
391 char *targetChars = (char *)*target;
392 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
393 offsets, sourceIndex, pErrorCode);
394 *target = (uint8_t*)targetChars;
395
396 }
397
398 static U_INLINE void
399 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
400 if(myConverterData->version == 1) {
401 UConverter *cnv = myConverterData->currentConverter;
402
403 cnv->toUnicodeStatus=0; /* offset */
404 cnv->mode=0; /* state */
405 cnv->toULength=0; /* byteIndex */
406 }
407 }
408
409 static U_INLINE void
410 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
411 /* in ISO-2022-KR the designator sequence appears only once
412 * in a file so we append it only once
413 */
414 if( converter->charErrorBufferLength==0){
415
416 converter->charErrorBufferLength = 4;
417 converter->charErrorBuffer[0] = 0x1b;
418 converter->charErrorBuffer[1] = 0x24;
419 converter->charErrorBuffer[2] = 0x29;
420 converter->charErrorBuffer[3] = 0x43;
421 }
422 if(myConverterData->version == 1) {
423 UConverter *cnv = myConverterData->currentConverter;
424
425 cnv->fromUChar32=0;
426 cnv->fromUnicodeStatus=1; /* prevLength */
427 }
428 }
429
430 static void
431 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
432
433 char myLocale[6]={' ',' ',' ',' ',' ',' '};
434
435 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
436 if(cnv->extraInfo != NULL) {
437 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
438 uint32_t version;
439
440 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
441 myConverterData->currentType = ASCII1;
442 cnv->fromUnicodeStatus =FALSE;
443 if(locale){
444 uprv_strncpy(myLocale, locale, sizeof(myLocale));
445 }
446 version = options & UCNV_OPTIONS_VERSION_MASK;
447 myConverterData->version = version;
448 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
449 (myLocale[2]=='_' || myLocale[2]=='\0'))
450 {
451 size_t len=0;
452 /* open the required converters and cache them */
453 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
454 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
455 }
456 myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
457 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
458 if(jpCharsetMasks[version]&CSM(JISX212)) {
459 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
460 }
461 if(jpCharsetMasks[version]&CSM(GB2312)) {
462 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
463 }
464 if(jpCharsetMasks[version]&CSM(KSC5601)) {
465 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
466 }
467
468 /* set the function pointers to appropriate funtions */
469 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
470 uprv_strcpy(myConverterData->locale,"ja");
471
472 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
473 len = uprv_strlen(myConverterData->name);
474 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
475 myConverterData->name[len+1]='\0';
476 }
477 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
478 (myLocale[2]=='_' || myLocale[2]=='\0'))
479 {
480 if (version==1){
481 myConverterData->currentConverter=
482 ucnv_open("icu-internal-25546",errorCode);
483
484 if (U_FAILURE(*errorCode)) {
485 _ISO2022Close(cnv);
486 return;
487 }
488
489 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
490 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
491 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
492 }else{
493 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
494
495 if (U_FAILURE(*errorCode)) {
496 _ISO2022Close(cnv);
497 return;
498 }
499
500 myConverterData->version = 0;
501 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
502 }
503
504 /* initialize the state variables */
505 setInitialStateToUnicodeKR(cnv, myConverterData);
506 setInitialStateFromUnicodeKR(cnv, myConverterData);
507
508 /* set the function pointers to appropriate funtions */
509 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
510 uprv_strcpy(myConverterData->locale,"ko");
511 }
512 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
513 (myLocale[2]=='_' || myLocale[2]=='\0'))
514 {
515
516 /* open the required converters and cache them */
517 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
518 if(version==1) {
519 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
520 }
521 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
522
523
524 /* set the function pointers to appropriate funtions */
525 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
526 uprv_strcpy(myConverterData->locale,"cn");
527
528 if (version==1){
529 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
530 }else{
531 myConverterData->version = 0;
532 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
533 }
534 }
535 else{
536 #ifdef U_ENABLE_GENERIC_ISO_2022
537 myConverterData->isFirstBuffer = TRUE;
538
539 /* append the UTF-8 escape sequence */
540 cnv->charErrorBufferLength = 3;
541 cnv->charErrorBuffer[0] = 0x1b;
542 cnv->charErrorBuffer[1] = 0x25;
543 cnv->charErrorBuffer[2] = 0x42;
544
545 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
546 /* initialize the state variables */
547 uprv_strcpy(myConverterData->name,"ISO_2022");
548 #else
549 *errorCode = U_UNSUPPORTED_ERROR;
550 return;
551 #endif
552 }
553
554 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
555
556 if(U_FAILURE(*errorCode)) {
557 _ISO2022Close(cnv);
558 }
559 } else {
560 *errorCode = U_MEMORY_ALLOCATION_ERROR;
561 }
562 }
563
564
565 static void
566 _ISO2022Close(UConverter *converter) {
567 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
568 UConverterSharedData **array = myData->myConverterArray;
569 int32_t i;
570
571 if (converter->extraInfo != NULL) {
572 /*close the array of converter pointers and free the memory*/
573 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
574 if(array[i]!=NULL) {
575 ucnv_unloadSharedDataIfReady(array[i]);
576 }
577 }
578
579 ucnv_close(myData->currentConverter);
580
581 if(!converter->isExtraLocal){
582 uprv_free (converter->extraInfo);
583 converter->extraInfo = NULL;
584 }
585 }
586 }
587
588 static void
589 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
590 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
591 if(choice<=UCNV_RESET_TO_UNICODE) {
592 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
593 myConverterData->key = 0;
594 myConverterData->isEmptySegment = FALSE;
595 }
596 if(choice!=UCNV_RESET_TO_UNICODE) {
597 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
598 }
599 #ifdef U_ENABLE_GENERIC_ISO_2022
600 if(myConverterData->locale[0] == 0){
601 if(choice<=UCNV_RESET_TO_UNICODE) {
602 myConverterData->isFirstBuffer = TRUE;
603 myConverterData->key = 0;
604 if (converter->mode == UCNV_SO){
605 ucnv_close (myConverterData->currentConverter);
606 myConverterData->currentConverter=NULL;
607 }
608 converter->mode = UCNV_SI;
609 }
610 if(choice!=UCNV_RESET_TO_UNICODE) {
611 /* re-append UTF-8 escape sequence */
612 converter->charErrorBufferLength = 3;
613 converter->charErrorBuffer[0] = 0x1b;
614 converter->charErrorBuffer[1] = 0x28;
615 converter->charErrorBuffer[2] = 0x42;
616 }
617 }
618 else
619 #endif
620 {
621 /* reset the state variables */
622 if(myConverterData->locale[0] == 'k'){
623 if(choice<=UCNV_RESET_TO_UNICODE) {
624 setInitialStateToUnicodeKR(converter, myConverterData);
625 }
626 if(choice!=UCNV_RESET_TO_UNICODE) {
627 setInitialStateFromUnicodeKR(converter, myConverterData);
628 }
629 }
630 }
631 }
632
633 static const char*
634 _ISO2022getName(const UConverter* cnv){
635 if(cnv->extraInfo){
636 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
637 return myData->name;
638 }
639 return NULL;
640 }
641
642
643 /*************** to unicode *******************/
644 /****************************************************************************
645 * Recognized escape sequences are
646 * <ESC>(B ASCII
647 * <ESC>.A ISO-8859-1
648 * <ESC>.F ISO-8859-7
649 * <ESC>(J JISX-201
650 * <ESC>(I JISX-201
651 * <ESC>$B JISX-208
652 * <ESC>$@ JISX-208
653 * <ESC>$(D JISX-212
654 * <ESC>$A GB2312
655 * <ESC>$(C KSC5601
656 */
657 static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
658 /* 0 1 2 3 4 5 6 7 8 9 */
659 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
660 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
661 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
662 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
663 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
664 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
665 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
666 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
667 };
668
669 /*************** to unicode *******************/
670 static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
671 /* 0 1 2 3 4 5 6 7 8 9 */
672 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
673 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
674 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
675 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
676 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
677 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
678 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
679 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
680 };
681
682
683 static UCNV_TableStates_2022
684 getKey_2022(char c,int32_t* key,int32_t* offset){
685 int32_t togo;
686 int32_t low = 0;
687 int32_t hi = MAX_STATES_2022;
688 int32_t oldmid=0;
689
690 togo = normalize_esq_chars_2022[(uint8_t)c];
691 if(togo == 0) {
692 /* not a valid character anywhere in an escape sequence */
693 *key = 0;
694 *offset = 0;
695 return INVALID_2022;
696 }
697 togo = (*key << 5) + togo;
698
699 while (hi != low) /*binary search*/{
700
701 register int32_t mid = (hi+low) >> 1; /*Finds median*/
702
703 if (mid == oldmid)
704 break;
705
706 if (escSeqStateTable_Key_2022[mid] > togo){
707 hi = mid;
708 }
709 else if (escSeqStateTable_Key_2022[mid] < togo){
710 low = mid;
711 }
712 else /*we found it*/{
713 *key = togo;
714 *offset = mid;
715 return escSeqStateTable_Value_2022[mid];
716 }
717 oldmid = mid;
718
719 }
720
721 *key = 0;
722 *offset = 0;
723 return INVALID_2022;
724 }
725
726 /*runs through a state machine to determine the escape sequence - codepage correspondence
727 */
728 static void
729 changeState_2022(UConverter* _this,
730 const char** source,
731 const char* sourceLimit,
732 Variant2022 var,
733 UErrorCode* err){
734 UCNV_TableStates_2022 value;
735 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
736 uint32_t key = myData2022->key;
737 int32_t offset = 0;
738 int8_t initialToULength = _this->toULength;
739 char c;
740
741 value = VALID_NON_TERMINAL_2022;
742 while (*source < sourceLimit) {
743 c = *(*source)++;
744 _this->toUBytes[_this->toULength++]=(uint8_t)c;
745 value = getKey_2022(c,(int32_t *) &key, &offset);
746
747 switch (value){
748
749 case VALID_NON_TERMINAL_2022 :
750 /* continue with the loop */
751 break;
752
753 case VALID_TERMINAL_2022:
754 key = 0;
755 goto DONE;
756
757 case INVALID_2022:
758 goto DONE;
759
760 case VALID_MAYBE_TERMINAL_2022:
761 #ifdef U_ENABLE_GENERIC_ISO_2022
762 /* ESC ( B is ambiguous only for ISO_2022 itself */
763 if(var == ISO_2022) {
764 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
765 _this->toULength = 0;
766
767 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
768
769 /* continue with the loop */
770 value = VALID_NON_TERMINAL_2022;
771 break;
772 } else
773 #endif
774 {
775 /* not ISO_2022 itself, finish here */
776 value = VALID_TERMINAL_2022;
777 key = 0;
778 goto DONE;
779 }
780 }
781 }
782
783 DONE:
784 myData2022->key = key;
785
786 if (value == VALID_NON_TERMINAL_2022) {
787 /* indicate that the escape sequence is incomplete: key!=0 */
788 return;
789 } else if (value == INVALID_2022 ) {
790 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
791 } else /* value == VALID_TERMINAL_2022 */ {
792 switch(var){
793 #ifdef U_ENABLE_GENERIC_ISO_2022
794 case ISO_2022:
795 {
796 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
797 if(chosenConverterName == NULL) {
798 /* SS2 or SS3 */
799 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
800 return;
801 }
802
803 _this->mode = UCNV_SI;
804 ucnv_close(myData2022->currentConverter);
805 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
806 if(U_SUCCESS(*err)) {
807 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
808 _this->mode = UCNV_SO;
809 }
810 break;
811 }
812 #endif
813 case ISO_2022_JP:
814 {
815 StateEnum tempState=nextStateToUnicodeJP[offset];
816 switch(tempState) {
817 case INVALID_STATE:
818 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
819 break;
820 case SS2_STATE:
821 if(myData2022->toU2022State.cs[2]!=0) {
822 if(myData2022->toU2022State.g<2) {
823 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
824 }
825 myData2022->toU2022State.g=2;
826 } else {
827 /* illegal to have SS2 before a matching designator */
828 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
829 }
830 break;
831 /* case SS3_STATE: not used in ISO-2022-JP-x */
832 case ISO8859_1:
833 case ISO8859_7:
834 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
835 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
836 } else {
837 /* G2 charset for SS2 */
838 myData2022->toU2022State.cs[2]=(int8_t)tempState;
839 }
840 break;
841 default:
842 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
843 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
844 } else {
845 /* G0 charset */
846 myData2022->toU2022State.cs[0]=(int8_t)tempState;
847 }
848 break;
849 }
850 }
851 break;
852 case ISO_2022_CN:
853 {
854 StateEnum tempState=nextStateToUnicodeCN[offset];
855 switch(tempState) {
856 case INVALID_STATE:
857 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
858 break;
859 case SS2_STATE:
860 if(myData2022->toU2022State.cs[2]!=0) {
861 if(myData2022->toU2022State.g<2) {
862 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
863 }
864 myData2022->toU2022State.g=2;
865 } else {
866 /* illegal to have SS2 before a matching designator */
867 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
868 }
869 break;
870 case SS3_STATE:
871 if(myData2022->toU2022State.cs[3]!=0) {
872 if(myData2022->toU2022State.g<2) {
873 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
874 }
875 myData2022->toU2022State.g=3;
876 } else {
877 /* illegal to have SS3 before a matching designator */
878 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
879 }
880 break;
881 case ISO_IR_165:
882 if(myData2022->version==0) {
883 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
884 break;
885 }
886 /*fall through*/
887 case GB2312_1:
888 /*fall through*/
889 case CNS_11643_1:
890 myData2022->toU2022State.cs[1]=(int8_t)tempState;
891 break;
892 case CNS_11643_2:
893 myData2022->toU2022State.cs[2]=(int8_t)tempState;
894 break;
895 default:
896 /* other CNS 11643 planes */
897 if(myData2022->version==0) {
898 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
899 } else {
900 myData2022->toU2022State.cs[3]=(int8_t)tempState;
901 }
902 break;
903 }
904 }
905 break;
906 case ISO_2022_KR:
907 if(offset==0x30){
908 /* nothing to be done, just accept this one escape sequence */
909 } else {
910 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
911 }
912 break;
913
914 default:
915 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
916 break;
917 }
918 }
919 if(U_SUCCESS(*err)) {
920 _this->toULength = 0;
921 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
922 if(_this->toULength>1) {
923 /*
924 * Ticket 5691: consistent illegal sequences:
925 * - We include at least the first byte (ESC) in the illegal sequence.
926 * - If any of the non-initial bytes could be the start of a character,
927 * we stop the illegal sequence before the first one of those.
928 * In escape sequences, all following bytes are "printable", that is,
929 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
930 * they are valid single/lead bytes.
931 * For simplicity, we always only report the initial ESC byte as the
932 * illegal sequence and back out all other bytes we looked at.
933 */
934 /* Back out some bytes. */
935 int8_t backOutDistance=_this->toULength-1;
936 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
937 if(backOutDistance<=bytesFromThisBuffer) {
938 /* same as initialToULength<=1 */
939 *source-=backOutDistance;
940 } else {
941 /* Back out bytes from the previous buffer: Need to replay them. */
942 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
943 /* same as -(initialToULength-1) */
944 /* preToULength is negative! */
945 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
946 *source-=bytesFromThisBuffer;
947 }
948 _this->toULength=1;
949 }
950 }
951 }
952
953 /*Checks the characters of the buffer against valid 2022 escape sequences
954 *if the match we return a pointer to the initial start of the sequence otherwise
955 *we return sourceLimit
956 */
957 /*for 2022 looks ahead in the stream
958 *to determine the longest possible convertible
959 *data stream
960 */
961 static U_INLINE const char*
962 getEndOfBuffer_2022(const char** source,
963 const char* sourceLimit,
964 UBool flush){
965
966 const char* mySource = *source;
967
968 #ifdef U_ENABLE_GENERIC_ISO_2022
969 if (*source >= sourceLimit)
970 return sourceLimit;
971
972 do{
973
974 if (*mySource == ESC_2022){
975 int8_t i;
976 int32_t key = 0;
977 int32_t offset;
978 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
979
980 /* Kludge: I could not
981 * figure out the reason for validating an escape sequence
982 * twice - once here and once in changeState_2022().
983 * is it possible to have an ESC character in a ISO2022
984 * byte stream which is valid in a code page? Is it legal?
985 */
986 for (i=0;
987 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
988 i++) {
989 value = getKey_2022(*(mySource+i), &key, &offset);
990 }
991 if (value > 0 || *mySource==ESC_2022)
992 return mySource;
993
994 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
995 return sourceLimit;
996 }
997 }while (++mySource < sourceLimit);
998
999 return sourceLimit;
1000 #else
1001 while(mySource < sourceLimit && *mySource != ESC_2022) {
1002 ++mySource;
1003 }
1004 return mySource;
1005 #endif
1006 }
1007
1008
1009 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1010 * any future change in _MBCSFromUChar32() function should be reflected in
1011 * this macro
1012 */
1013 static U_INLINE void
1014 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1015 UChar32 c,
1016 uint32_t* value,
1017 UBool useFallback,
1018 int32_t *length,
1019 int outputType)
1020 {
1021 const int32_t *cx;
1022 const uint16_t *table;
1023 uint32_t stage2Entry;
1024 uint32_t myValue;
1025 const uint8_t *p;
1026 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1027 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1028 table=sharedData->mbcs.fromUnicodeTable;
1029 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1030 /* get the bytes and the length for the output */
1031 if(outputType==MBCS_OUTPUT_2){
1032 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1033 if(myValue<=0xff) {
1034 *length=1;
1035 } else {
1036 *length=2;
1037 }
1038 } else /* outputType==MBCS_OUTPUT_3 */ {
1039 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1040 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1041 if(myValue<=0xff) {
1042 *length=1;
1043 } else if(myValue<=0xffff) {
1044 *length=2;
1045 } else {
1046 *length=3;
1047 }
1048 }
1049 /* is this code point assigned, or do we use fallbacks? */
1050 if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
1051 (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
1052 ) {
1053 /*
1054 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1055 * There is no way with this data structure for fallback output
1056 * to be a zero byte.
1057 */
1058 /* assigned */
1059 *value=myValue;
1060 return;
1061 }
1062 }
1063
1064 cx=sharedData->mbcs.extIndexes;
1065 if(cx!=NULL) {
1066 *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1067 return;
1068 }
1069
1070 /* unassigned */
1071 *length=0;
1072 }
1073
1074 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1075 * any future change in _MBCSSingleFromUChar32() function should be reflected in
1076 * this macro
1077 */
1078 static U_INLINE void
1079 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1080 UChar32 c,
1081 uint32_t* retval,
1082 UBool useFallback)
1083 {
1084 const uint16_t *table;
1085 int32_t value;
1086 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1087 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1088 *retval=(uint16_t)-1;
1089 return;
1090 }
1091 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1092 table=sharedData->mbcs.fromUnicodeTable;
1093 /* get the byte for the output */
1094 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1095 /* is this code point assigned, or do we use fallbacks? */
1096 if(useFallback ? value>=0x800 : value>=0xc00) {
1097 value &=0xff;
1098 } else {
1099 value= -1;
1100 }
1101 *retval=(uint16_t) value;
1102 }
1103
1104 #ifdef U_ENABLE_GENERIC_ISO_2022
1105
1106 /**********************************************************************************
1107 * ISO-2022 Converter
1108 *
1109 *
1110 */
1111
1112 static void
1113 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1114 UErrorCode* err){
1115 const char* mySourceLimit, *realSourceLimit;
1116 const char* sourceStart;
1117 const UChar* myTargetStart;
1118 UConverter* saveThis;
1119 UConverterDataISO2022* myData;
1120 int8_t length;
1121
1122 saveThis = args->converter;
1123 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1124
1125 realSourceLimit = args->sourceLimit;
1126 while (args->source < realSourceLimit) {
1127 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1128 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1129 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1130
1131 if(args->source < mySourceLimit) {
1132 if(myData->currentConverter==NULL) {
1133 myData->currentConverter = ucnv_open("ASCII",err);
1134 if(U_FAILURE(*err)){
1135 return;
1136 }
1137
1138 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1139 saveThis->mode = UCNV_SO;
1140 }
1141
1142 /* convert to before the ESC or until the end of the buffer */
1143 myData->isFirstBuffer=FALSE;
1144 sourceStart = args->source;
1145 myTargetStart = args->target;
1146 args->converter = myData->currentConverter;
1147 ucnv_toUnicode(args->converter,
1148 &args->target,
1149 args->targetLimit,
1150 &args->source,
1151 mySourceLimit,
1152 args->offsets,
1153 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1154 err);
1155 args->converter = saveThis;
1156
1157 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1158 /* move the overflow buffer */
1159 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1160 myData->currentConverter->UCharErrorBufferLength = 0;
1161 if(length > 0) {
1162 uprv_memcpy(saveThis->UCharErrorBuffer,
1163 myData->currentConverter->UCharErrorBuffer,
1164 length*U_SIZEOF_UCHAR);
1165 }
1166 return;
1167 }
1168
1169 /*
1170 * At least one of:
1171 * -Error while converting
1172 * -Done with entire buffer
1173 * -Need to write offsets or update the current offset
1174 * (leave that up to the code in ucnv.c)
1175 *
1176 * or else we just stopped at an ESC byte and continue with changeState_2022()
1177 */
1178 if (U_FAILURE(*err) ||
1179 (args->source == realSourceLimit) ||
1180 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1181 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1182 ) {
1183 /* copy partial or error input for truncated detection and error handling */
1184 if(U_FAILURE(*err)) {
1185 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1186 if(length > 0) {
1187 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1188 }
1189 } else {
1190 length = saveThis->toULength = myData->currentConverter->toULength;
1191 if(length > 0) {
1192 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1193 if(args->source < mySourceLimit) {
1194 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1195 }
1196 }
1197 }
1198 return;
1199 }
1200 }
1201 }
1202
1203 sourceStart = args->source;
1204 changeState_2022(args->converter,
1205 &(args->source),
1206 realSourceLimit,
1207 ISO_2022,
1208 err);
1209 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1210 /* let the ucnv.c code update its current offset */
1211 return;
1212 }
1213 }
1214 }
1215
1216 #endif
1217
1218 /*
1219 * To Unicode Callback helper function
1220 */
1221 static void
1222 toUnicodeCallback(UConverter *cnv,
1223 const uint32_t sourceChar, const uint32_t targetUniChar,
1224 UErrorCode* err){
1225 if(sourceChar>0xff){
1226 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1227 cnv->toUBytes[1] = (uint8_t)sourceChar;
1228 cnv->toULength = 2;
1229 }
1230 else{
1231 cnv->toUBytes[0] =(char) sourceChar;
1232 cnv->toULength = 1;
1233 }
1234
1235 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1236 *err = U_INVALID_CHAR_FOUND;
1237 }
1238 else{
1239 *err = U_ILLEGAL_CHAR_FOUND;
1240 }
1241 }
1242
1243 /**************************************ISO-2022-JP*************************************************/
1244
1245 /************************************** IMPORTANT **************************************************
1246 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1247 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1248 * The converter iterates over each Unicode codepoint
1249 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1250 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1251 * would do as far as possible.
1252 *
1253 * If the implementation of these macros or structure of sharedData struct change in the future, make
1254 * sure that ISO-2022 is also changed.
1255 ***************************************************************************************************
1256 */
1257
1258 /***************************************************************************************************
1259 * Rules for ISO-2022-jp encoding
1260 * (i) Escape sequences must be fully contained within a line they should not
1261 * span new lines or CRs
1262 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1263 * JIS-Roman character escape sequence should follow before the line terminates
1264 * (iii) If the first character on the line is represented by two bytes then a two
1265 * byte character escape sequence should precede it
1266 * (iv) If no escape sequence is encountered then the characters are ASCII
1267 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1268 * and invoked with SS2 (ESC N).
1269 * (vi) If there is any G0 designation in text, there must be a switch to
1270 * ASCII or to JIS X 0201-Roman before a space character (but not
1271 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1272 * characters such as tab or CRLF.
1273 * (vii) Supported encodings:
1274 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1275 *
1276 * source : RFC-1554
1277 *
1278 * JISX201, JISX208,JISX212 : new .cnv data files created
1279 * KSC5601 : alias to ibm-949 mapping table
1280 * GB2312 : alias to ibm-1386 mapping table
1281 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1282 * ISO-8859-7 : alias to ibm-9409 mapping table
1283 */
1284
1285 /* preference order of JP charsets */
1286 static const StateEnum jpCharsetPref[]={
1287 ASCII,
1288 JISX201,
1289 ISO8859_1,
1290 ISO8859_7,
1291 JISX208,
1292 JISX212,
1293 GB2312,
1294 KSC5601,
1295 HWKANA_7BIT
1296 };
1297
/*
 * Designation escape sequences, indexed by charset enum constant
 * (for example JISX201 = 3). The order must follow the enum constants,
 * not jpCharsetPref[]!
 * Each entry is ESC (0x1B) followed by the printable bytes shown.
 */
static const char escSeqChars[][6] ={
    "\x1B" "(B",    /* ASCII */
    "\x1B" ".A",    /* ISO-8859-1 */
    "\x1B" ".F",    /* ISO-8859-7 */
    "\x1B" "(J",    /* JISX-201 */
    "\x1B" "$B",    /* JISX-208 */
    "\x1B" "$(D",   /* JISX-212 */
    "\x1B" "$A",    /* GB2312 */
    "\x1B" "$(C",   /* KSC5601 */
    "\x1B" "(I"     /* HWKANA_7BIT */
};
/*
 * Byte lengths of the escape sequences in escSeqChars[], same index order:
 * ASCII, ISO-8859-1, ISO-8859-7, JISX-201, JISX-208,
 * JISX-212, GB2312, KSC5601, HWKANA_7BIT.
 * Only JISX-212 (<ESC>$(D) and KSC5601 (<ESC>$(C) take four bytes.
 */
static const int32_t escSeqCharsLen[] ={
    3, 3, 3, 3, 3,
    4, 3, 4, 3
};
1325
1326 /*
1327 * The iteration over various code pages works this way:
1328 * i) Get the currentState from myConverterData->currentState
1329 * ii) Check if the character is mapped to a valid character in the currentState
1330 * Yes -> a) set the initIterState to currentState
1331 * b) remain in this state until an invalid character is found
1332 * No -> a) go to the next code page and find the character
1333 * iii) Before changing the state, increment the current state and check whether it
1334 * is equal to the initIterState
1335 * Yes -> A character that cannot be represented in any of the supported encodings
1336 * break and return a U_INVALID_CHARACTER error
1337 * No -> Continue and find the character in next code page
1338 *
1339 *
1340 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1341 */
1342
/*
 * Converts UTF-16 text from args->source into ISO-2022-JP-family bytes in args->target.
 *
 * For each code point (surrogate pairs are combined; a lead surrogate saved in
 * converter->fromUChar32 by a previous call is resumed first), the charsets in
 * choices[] are tried in order until one of them yields output. SI, a designation
 * escape sequence and SO or ESC N are written as needed in front of the character
 * bytes, and the fromU2022State is updated to match.
 *
 * Results reported through *err:
 *   U_ILLEGAL_CHAR_FOUND    - unpaired surrogate, or a code point for which
 *                             IS_2022_CONTROL() is true
 *   U_INVALID_CHAR_FOUND    - none of the charsets in choices[] has a mapping
 *   U_BUFFER_OVERFLOW_ERROR - the target is full while input remains
 * For the first two, the offending code point is stored in converter->fromUChar32.
 * A lead surrogate at the very end of the input is stored there as well, without
 * setting an error.
 *
 * When flushing at the end of the input without error, SI and/or the ASCII
 * designation sequence are appended so that the output ends in ASCII mode.
 * On return, args->source and args->target are advanced.
 */
1343 static void
1344 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1345 UConverterDataISO2022 *converterData;
1346 ISO2022State *pFromU2022State;
1347 uint8_t *target = (uint8_t *) args->target;
1348 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1349 const UChar* source = args->source;
1350 const UChar* sourceLimit = args->sourceLimit;
1351 int32_t* offsets = args->offsets;
1352 UChar32 sourceChar;
1353 char buffer[8]; /* staging area for all bytes written for one character: SI, escape, shift, data */
1354 int32_t len, outLen; /* len: data bytes of the current character; outLen: bytes staged in buffer[] */
1355 int8_t choices[10]; /* charsets to try, in order; rebuilt whenever choiceCount is 0 */
1356 int32_t choiceCount;
1357 uint32_t targetValue = 0;
1358 UBool useFallback;
1359
1360 int32_t i;
1361 int8_t cs, g; /* charset that matched, and the G set (0, 1 or 2) it is output through */
1362
1363 /* set up the state */
1364 converterData = (UConverterDataISO2022*)args->converter->extraInfo;
1365 pFromU2022State = &converterData->fromU2022State;
1366 useFallback = args->converter->useFallback;
1367
1368 choiceCount = 0;
1369
1370 /* check if the last codepoint of previous buffer was a lead surrogate*/
1371 if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
/* jumps into the loop body below; the target<targetLimit test of the loop was done here */
1372 goto getTrail;
1373 }
1374
1375 while(source < sourceLimit) {
1376 if(target < targetLimit) {
1377
1378 sourceChar = *(source++);
1379 /*check if the char is a First surrogate*/
1380 if(UTF_IS_SURROGATE(sourceChar)) {
1381 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1382 getTrail:
1383 /*look ahead to find the trail surrogate*/
1384 if(source < sourceLimit) {
1385 /* test the following code unit */
1386 UChar trail=(UChar) *source;
1387 if(UTF_IS_SECOND_SURROGATE(trail)) {
1388 source++;
1389 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1390 args->converter->fromUChar32=0x00;
1391 /* convert this supplementary code point */
1392 /* exit this condition tree */
1393 } else {
1394 /* this is an unmatched lead code unit (1st surrogate) */
1395 /* callback(illegal) */
1396 *err=U_ILLEGAL_CHAR_FOUND;
1397 args->converter->fromUChar32=sourceChar;
1398 break;
1399 }
1400 } else {
1401 /* no more input */
/* remember the lead surrogate; no error is set here */
1402 args->converter->fromUChar32=sourceChar;
1403 break;
1404 }
1405 } else {
1406 /* this is an unmatched trail code unit (2nd surrogate) */
1407 /* callback(illegal) */
1408 *err=U_ILLEGAL_CHAR_FOUND;
1409 args->converter->fromUChar32=sourceChar;
1410 break;
1411 }
1412 }
1413
1414 /* do not convert SO/SI/ESC */
1415 if(IS_2022_CONTROL(sourceChar)) {
1416 /* callback(illegal) */
1417 *err=U_ILLEGAL_CHAR_FOUND;
1418 args->converter->fromUChar32=sourceChar;
1419 break;
1420 }
1421
1422 /* do the conversion */
1423
/* (re)build choices[]: done on first use and after every change of the designated charsets */
1424 if(choiceCount == 0) {
1425 uint16_t csm;
1426
1427 /*
1428 * The csm variable keeps track of which charsets are allowed
1429 * and not used yet while building the choices[].
1430 */
1431 csm = jpCharsetMasks[converterData->version];
1432 choiceCount = 0;
1433
1434 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1435 if(converterData->version == 3 || converterData->version == 4) {
1436 choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
1437 csm &= ~CSM(cs);
1438 }
1439
1440 /* try the current G0 charset */
1441 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1442 csm &= ~CSM(cs);
1443
1444 /* try the current G2 charset */
1445 if((cs = pFromU2022State->cs[2]) != 0) {
1446 choices[choiceCount++] = cs;
1447 csm &= ~CSM(cs);
1448 }
1449
1450 /* try all the other possible charsets */
1451 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1452 cs = (int8_t)jpCharsetPref[i];
1453 if(CSM(cs) & csm) {
1454 choices[choiceCount++] = cs;
1455 csm &= ~CSM(cs);
1456 }
1457 }
1458 }
1459
1460 cs = g = 0;
1461 len = 0;
1462
/* try the charsets in order; the first one that sets len != 0 ends the search */
1463 for(i = 0; i < choiceCount && len == 0; ++i) {
1464 cs = choices[i];
1465 switch(cs) {
1466 case ASCII:
1467 if(sourceChar <= 0x7f) {
1468 targetValue = (uint32_t)sourceChar;
1469 len = 1;
1470 }
1471 break;
1472 case ISO8859_1:
/* U+0080..U+00FF are written as 7-bit bytes through G2 */
1473 if(0x80 <= sourceChar && sourceChar <= 0xff) {
1474 targetValue = (uint32_t)sourceChar - 0x80;
1475 len = 1;
1476 g = 2;
1477 }
1478 break;
1479 case HWKANA_7BIT:
/* one unsigned comparison for 0xff61 <= sourceChar <= 0xff9f; the range maps to bytes 0x21..0x5f */
1480 if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
1481 targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
1482 len = 1;
1483
1484 if(converterData->version==3) {
1485 /* JIS7: use G1 (SO) */
1486 pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
1487 g = 1;
1488 } else if(converterData->version==4) {
1489 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1490 int8_t cs0;
1491
1492 targetValue += 0x80;
1493
1494 cs0 = pFromU2022State->cs[0];
1495 if(IS_JP_DBCS(cs0)) {
1496 /* switch from a DBCS charset to JISX201 */
1497 cs = (int8_t)JISX201;
1498 } else {
1499 /* stay in the current G0 charset */
1500 cs = cs0;
1501 }
1502 }
1503 }
1504 break;
1505 case JISX201:
1506 /* G0 SBCS */
1507 MBCS_SINGLE_FROM_UCHAR32(
1508 converterData->myConverterArray[cs],
1509 sourceChar, &targetValue,
1510 useFallback);
1511 if(targetValue <= 0x7f) {
1512 len = 1;
1513 }
1514 break;
1515 case ISO8859_7:
1516 /* G2 SBCS forced to 7-bit output (g is set to 2 below) */
1517 MBCS_SINGLE_FROM_UCHAR32(
1518 converterData->myConverterArray[cs],
1519 sourceChar, &targetValue,
1520 useFallback);
1521 if(0x80 <= targetValue && targetValue <= 0xff) {
1522 targetValue -= 0x80;
1523 len = 1;
1524 g = 2;
1525 }
1526 break;
1527 default:
1528 /* G0 DBCS */
1529 MBCS_FROM_UCHAR32_ISO2022(
1530 converterData->myConverterArray[cs],
1531 sourceChar, &targetValue,
1532 useFallback, &len, MBCS_OUTPUT_2);
/* only two-byte results are usable here; anything else counts as "no mapping" so the next choice is tried */
1533 if(len != 2) {
1534 len = 0;
1535 }
1536 break;
1537 }
1538 }
1539
1540 if(len > 0) {
1541 outLen = 0; /* count output bytes */
1542
1543 /* write SI if necessary (only for JIS7) */
1544 if(pFromU2022State->g == 1 && g == 0) {
1545 buffer[outLen++] = UCNV_SI;
1546 pFromU2022State->g = 0;
1547 }
1548
1549 /* write the designation sequence if necessary */
1550 if(cs != pFromU2022State->cs[g]) {
1551 int32_t escLen = escSeqCharsLen[cs];
1552 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1553 outLen += escLen;
1554 pFromU2022State->cs[g] = cs;
1555
1556 /* invalidate the choices[] */
1557 choiceCount = 0;
1558 }
1559
1560 /* write the shift sequence if necessary */
1561 if(g != pFromU2022State->g) {
1562 switch(g) {
1563 /* case 0 handled before writing escapes */
1564 case 1:
1565 buffer[outLen++] = UCNV_SO;
1566 pFromU2022State->g = 1;
1567 break;
1568 default: /* case 2 */
/* ESC N; the stored g state is not changed for this shift */
1569 buffer[outLen++] = 0x1b;
1570 buffer[outLen++] = 0x4e;
1571 break;
1572 /* no case 3: no SS3 in ISO-2022-JP-x */
1573 }
1574 }
1575
1576 /* write the output bytes */
1577 if(len == 1) {
1578 buffer[outLen++] = (char)targetValue;
1579 } else /* len == 2 */ {
1580 buffer[outLen++] = (char)(targetValue >> 8);
1581 buffer[outLen++] = (char)targetValue;
1582 }
1583 } else {
1584 /*
1585 * if we cannot find the character after checking all codepages
1586 * then this is an error
1587 */
1588 *err = U_INVALID_CHAR_FOUND;
1589 args->converter->fromUChar32=sourceChar;
1590 break;
1591 }
1592
1593 if(sourceChar == CR || sourceChar == LF) {
1594 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1595 pFromU2022State->cs[2] = 0;
1596 choiceCount = 0;
1597 }
1598
1599 /* output outLen>0 bytes in buffer[] */
1600 if(outLen == 1) {
/* one byte always fits: target < targetLimit was tested before getting here */
1601 *target++ = buffer[0];
1602 if(offsets) {
1603 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1604 }
1605 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1606 *target++ = buffer[0];
1607 *target++ = buffer[1];
1608 if(offsets) {
1609 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1610 *offsets++ = sourceIndex;
1611 *offsets++ = sourceIndex;
1612 }
1613 } else {
/* longer output, or two bytes that do not fit completely: handed to fromUWriteUInt8() */
1614 fromUWriteUInt8(
1615 args->converter,
1616 buffer, outLen,
1617 &target, (const char *)targetLimit,
1618 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1619 err);
1620 if(U_FAILURE(*err)) {
1621 break;
1622 }
1623 }
1624 } /* end if(target < targetLimit) */
1625 else{
1626 *err =U_BUFFER_OVERFLOW_ERROR;
1627 break;
1628 }
1629
1630 }/* end while(source < sourceLimit) */
1631
1632 /*
1633 * the end of the input stream and detection of truncated input
1634 * are handled by the framework, but for ISO-2022-JP conversion
1635 * we need to be in ASCII mode at the very end
1636 *
1637 * conditions:
1638 * successful
1639 * in SO mode or not in ASCII mode
1640 * end of input and no truncated input
1641 */
1642 if( U_SUCCESS(*err) &&
1643 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1644 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
1645 ) {
1646 int32_t sourceIndex;
1647
1648 outLen = 0;
1649
1650 if(pFromU2022State->g != 0) {
1651 buffer[outLen++] = UCNV_SI;
1652 pFromU2022State->g = 0;
1653 }
1654
1655 if(pFromU2022State->cs[0] != ASCII) {
1656 int32_t escLen = escSeqCharsLen[ASCII];
1657 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1658 outLen += escLen;
1659 pFromU2022State->cs[0] = (int8_t)ASCII;
1660 }
1661
1662 /* get the source index of the last input character */
1663 /*
1664 * TODO this would be simpler and more reliable if we used a pair
1665 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1666 * so that we could simply use the prevSourceIndex here;
1667 * this code gives an incorrect result for the rare case of an unmatched
1668 * trail surrogate that is alone in the last buffer of the text stream
1669 */
1670 sourceIndex=(int32_t)(source-args->source);
1671 if(sourceIndex>0) {
1672 --sourceIndex;
/* step back once more if the last unit is a trail surrogate at index 0 or preceded by a lead surrogate */
1673 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1674 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1675 ) {
1676 --sourceIndex;
1677 }
1678 } else {
/* nothing was consumed from this buffer */
1679 sourceIndex=-1;
1680 }
1681
1682 fromUWriteUInt8(
1683 args->converter,
1684 buffer, outLen,
1685 &target, (const char *)targetLimit,
1686 &offsets, sourceIndex,
1687 err);
1688 }
1689
1690 /*save the state and return */
1691 args->source = source;
1692 args->target = (char*)target;
1693 }
1694
1695 /*************** to unicode *******************/
1696
1697 static void
1698 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
1699 UErrorCode* err){
1700 char tempBuf[3];
1701 const char *mySource = (char *) args->source;
1702 UChar *myTarget = args->target;
1703 const char *mySourceLimit = args->sourceLimit;
1704 uint32_t targetUniChar = 0x0000;
1705 uint32_t mySourceChar = 0x0000;
1706 UConverterDataISO2022* myData;
1707 ISO2022State *pToU2022State;
1708 StateEnum cs;
1709
1710 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
1711 pToU2022State = &myData->toU2022State;
1712
1713 if(myData->key != 0) {
1714 /* continue with a partial escape sequence */
1715 goto escape;
1716 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1717 /* continue with a partial double-byte character */
1718 mySourceChar = args->converter->toUBytes[0];
1719 args->converter->toULength = 0;
1720 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1721 targetUniChar = missingCharMarker;
1722 goto getTrailByte;
1723 }
1724
1725 while(mySource < mySourceLimit){
1726
1727 targetUniChar =missingCharMarker;
1728
1729 if(myTarget < args->targetLimit){
1730
1731 mySourceChar= (unsigned char) *mySource++;
1732
1733 switch(mySourceChar) {
1734 case UCNV_SI:
1735 if(myData->version==3) {
1736 pToU2022State->g=0;
1737 continue;
1738 } else {
1739 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1740 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
1741 break;
1742 }
1743
1744 case UCNV_SO:
1745 if(myData->version==3) {
1746 /* JIS7: switch to G1 half-width Katakana */
1747 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
1748 pToU2022State->g=1;
1749 continue;
1750 } else {
1751 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1752 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
1753 break;
1754 }
1755
1756 case ESC_2022:
1757 mySource--;
1758 escape:
1759 {
1760 const char * mySourceBefore = mySource;
1761 int8_t toULengthBefore = args->converter->toULength;
1762
1763 changeState_2022(args->converter,&(mySource),
1764 mySourceLimit, ISO_2022_JP,err);
1765
1766 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
1767 if ( myData->version == 0 && myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
1768 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
1769 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
1770 }
1771
1772 }
1773 /* invalid or illegal escape sequence */
1774 if(U_FAILURE(*err)){
1775 args->target = myTarget;
1776 args->source = mySource;
1777 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
1778 return;
1779 }
1780 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
1781 if (myData->key == 0) {
1782 myData->isEmptySegment = TRUE;
1783 }
1784 continue;
1785
1786 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1787
1788 case CR:
1789 /*falls through*/
1790 case LF:
1791 /* automatically reset to single-byte mode */
1792 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
1793 pToU2022State->cs[0] = (int8_t)ASCII;
1794 }
1795 pToU2022State->cs[2] = 0;
1796 pToU2022State->g = 0;
1797 /* falls through */
1798 default:
1799 /* convert one or two bytes */
1800 myData->isEmptySegment = FALSE;
1801 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1802 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
1803 !IS_JP_DBCS(cs)
1804 ) {
1805 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1806 targetUniChar = mySourceChar + (0xff61 - 0xa1);
1807
1808 /* return from a single-shift state to the previous one */
1809 if(pToU2022State->g >= 2) {
1810 pToU2022State->g=pToU2022State->prevG;
1811 }
1812 } else switch(cs) {
1813 case ASCII:
1814 if(mySourceChar <= 0x7f) {
1815 targetUniChar = mySourceChar;
1816 }
1817 break;
1818 case ISO8859_1:
1819 if(mySourceChar <= 0x7f) {
1820 targetUniChar = mySourceChar + 0x80;
1821 }
1822 /* return from a single-shift state to the previous one */
1823 pToU2022State->g=pToU2022State->prevG;
1824 break;
1825 case ISO8859_7:
1826 if(mySourceChar <= 0x7f) {
1827 /* convert mySourceChar+0x80 to use a normal 8-bit table */
1828 targetUniChar =
1829 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1830 myData->myConverterArray[cs],
1831 mySourceChar + 0x80);
1832 }
1833 /* return from a single-shift state to the previous one */
1834 pToU2022State->g=pToU2022State->prevG;
1835 break;
1836 case JISX201:
1837 if(mySourceChar <= 0x7f) {
1838 targetUniChar =
1839 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1840 myData->myConverterArray[cs],
1841 mySourceChar);
1842 }
1843 break;
1844 case HWKANA_7BIT:
1845 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
1846 /* 7-bit halfwidth Katakana */
1847 targetUniChar = mySourceChar + (0xff61 - 0x21);
1848 }
1849 break;
1850 default:
1851 /* G0 DBCS */
1852 if(mySource < mySourceLimit) {
1853 int leadIsOk, trailIsOk;
1854 uint8_t trailByte;
1855 getTrailByte:
1856 trailByte = (uint8_t)*mySource;
1857 /* old
1858 tempBuf[0] = (char) (mySourceChar);
1859 tempBuf[1] = trailByte = *mySource++;
1860 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
1861 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
1862 */
1863 /*
1864 * Ticket 5691: consistent illegal sequences:
1865 * - We include at least the first byte in the illegal sequence.
1866 * - If any of the non-initial bytes could be the start of a character,
1867 *   we stop the illegal sequence before the first one of those.
1868 *
1869 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
1870 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
1871 * Otherwise we convert or report the pair of bytes.
1872 */
1873 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
1874 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
1875 if (leadIsOk && trailIsOk) {
1876 ++mySource;
1877 tempBuf[0] = (char) (mySourceChar);
1878 tempBuf[1] = trailByte;
1879 mySourceChar = (mySourceChar << 8) | trailByte;
1880 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
1881 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
1882 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
1883 ++mySource;
1884 /* add another bit so that the code below writes 2 bytes in case of error */
1885 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
1886 }
1887 } else {
1888 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
1889 args->converter->toULength = 1;
1890 goto endloop;
1891 }
1892 }
1893 break;
1894 }
1895 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
1896 if(args->offsets){
1897 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1898 }
1899 *(myTarget++)=(UChar)targetUniChar;
1900 }
1901 else if(targetUniChar > missingCharMarker){
1902 /* disassemble the surrogate pair and write to output*/
1903 targetUniChar-=0x0010000;
1904 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
1905 if(args->offsets){
1906 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1907 }
1908 ++myTarget;
1909 if(myTarget< args->targetLimit){
1910 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1911 if(args->offsets){
1912 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1913 }
1914 ++myTarget;
1915 }else{
1916 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
1917 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1918 }
1919
1920 }
1921 else{
1922 /* Call the callback function*/
1923 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
1924 break;
1925 }
1926 }
1927 else{
1928 *err =U_BUFFER_OVERFLOW_ERROR;
1929 break;
1930 }
1931 }
1932 endloop:
1933 args->target = myTarget;
1934 args->source = mySource;
1935 }
1936
1937
/***************************************************************
 * Rules for ISO-2022-KR encoding
 * i)  The KSC5601 designator sequence should appear only once in a file,
 *     at the beginning of a line before any KSC5601 characters. This usually
 *     means that it appears by itself on the first line of the file.
 * ii) There are only 2 shifting sequences: SO to shift into double-byte mode
 *     and SI to shift into single-byte mode.
 */
1946 static void
1947 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
1948
1949 UConverter* saveConv = args->converter;
1950 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
1951 args->converter=myConverterData->currentConverter;
1952
1953 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
1954 ucnv_MBCSFromUnicodeWithOffsets(args,err);
1955 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
1956
1957 if(*err == U_BUFFER_OVERFLOW_ERROR) {
1958 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
1959 uprv_memcpy(
1960 saveConv->charErrorBuffer,
1961 myConverterData->currentConverter->charErrorBuffer,
1962 myConverterData->currentConverter->charErrorBufferLength);
1963 }
1964 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
1965 myConverterData->currentConverter->charErrorBufferLength = 0;
1966 }
1967 args->converter=saveConv;
1968 }
1969
1970 static void
1971 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
1972
1973 const UChar *source = args->source;
1974 const UChar *sourceLimit = args->sourceLimit;
1975 unsigned char *target = (unsigned char *) args->target;
1976 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
1977 int32_t* offsets = args->offsets;
1978 uint32_t targetByteUnit = 0x0000;
1979 UChar32 sourceChar = 0x0000;
1980 UBool isTargetByteDBCS;
1981 UBool oldIsTargetByteDBCS;
1982 UConverterDataISO2022 *converterData;
1983 UConverterSharedData* sharedData;
1984 UBool useFallback;
1985 int32_t length =0;
1986
1987 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
1988 /* if the version is 1 then the user is requesting
1989 * conversion with ibm-25546 pass the arguments to
1990 * MBCS converter and return
1991 */
1992 if(converterData->version==1){
1993 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
1994 return;
1995 }
1996
1997 /* initialize data */
1998 sharedData = converterData->currentConverter->sharedData;
1999 useFallback = args->converter->useFallback;
2000 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2001 oldIsTargetByteDBCS = isTargetByteDBCS;
2002
2003 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2004 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2005 goto getTrail;
2006 }
2007 while(source < sourceLimit){
2008
2009 targetByteUnit = missingCharMarker;
2010
2011 if(target < (unsigned char*) args->targetLimit){
2012 sourceChar = *source++;
2013
2014 /* do not convert SO/SI/ESC */
2015 if(IS_2022_CONTROL(sourceChar)) {
2016 /* callback(illegal) */
2017 *err=U_ILLEGAL_CHAR_FOUND;
2018 args->converter->fromUChar32=sourceChar;
2019 break;
2020 }
2021
2022 /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
2023 sourceChar,&targetByteUnit,args->converter->useFallback);*/
2024 MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
2025 /* only DBCS or SBCS characters are expected*/
2026 /* DB characters with high bit set to 1 are expected */
2027 if( length > 2 || length==0 ||
2028 (length == 1 && targetByteUnit > 0x7f) ||
2029 (length == 2 &&
2030 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2031 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2032 ) {
2033 targetByteUnit=missingCharMarker;
2034 }
2035 if (targetByteUnit != missingCharMarker){
2036
2037 oldIsTargetByteDBCS = isTargetByteDBCS;
2038 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2039 /* append the shift sequence */
2040 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2041
2042 if (isTargetByteDBCS)
2043 *target++ = UCNV_SO;
2044 else
2045 *target++ = UCNV_SI;
2046 if(offsets)
2047 *(offsets++) = (int32_t)(source - args->source-1);
2048 }
2049 /* write the targetUniChar to target */
2050 if(targetByteUnit <= 0x00FF){
2051 if( target < targetLimit){
2052 *(target++) = (unsigned char) targetByteUnit;
2053 if(offsets){
2054 *(offsets++) = (int32_t)(source - args->source-1);
2055 }
2056
2057 }else{
2058 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2059 *err = U_BUFFER_OVERFLOW_ERROR;
2060 }
2061 }else{
2062 if(target < targetLimit){
2063 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2064 if(offsets){
2065 *(offsets++) = (int32_t)(source - args->source-1);
2066 }
2067 if(target < targetLimit){
2068 *(target++) =(unsigned char) (targetByteUnit -0x80);
2069 if(offsets){
2070 *(offsets++) = (int32_t)(source - args->source-1);
2071 }
2072 }else{
2073 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2074 *err = U_BUFFER_OVERFLOW_ERROR;
2075 }
2076 }else{
2077 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2078 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2079 *err = U_BUFFER_OVERFLOW_ERROR;
2080 }
2081 }
2082
2083 }
2084 else{
2085 /* oops.. the code point is unassingned
2086 * set the error and reason
2087 */
2088
2089 /*check if the char is a First surrogate*/
2090 if(UTF_IS_SURROGATE(sourceChar)) {
2091 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2092 getTrail:
2093 /*look ahead to find the trail surrogate*/
2094 if(source < sourceLimit) {
2095 /* test the following code unit */
2096 UChar trail=(UChar) *source;
2097 if(UTF_IS_SECOND_SURROGATE(trail)) {
2098 source++;
2099 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2100 *err = U_INVALID_CHAR_FOUND;
2101 /* convert this surrogate code point */
2102 /* exit this condition tree */
2103 } else {
2104 /* this is an unmatched lead code unit (1st surrogate) */
2105 /* callback(illegal) */
2106 *err=U_ILLEGAL_CHAR_FOUND;
2107 }
2108 } else {
2109 /* no more input */
2110 *err = U_ZERO_ERROR;
2111 }
2112 } else {
2113 /* this is an unmatched trail code unit (2nd surrogate) */
2114 /* callback(illegal) */
2115 *err=U_ILLEGAL_CHAR_FOUND;
2116 }
2117 } else {
2118 /* callback(unassigned) for a BMP code point */
2119 *err = U_INVALID_CHAR_FOUND;
2120 }
2121
2122 args->converter->fromUChar32=sourceChar;
2123 break;
2124 }
2125 } /* end if(myTargetIndex<myTargetLength) */
2126 else{
2127 *err =U_BUFFER_OVERFLOW_ERROR;
2128 break;
2129 }
2130
2131 }/* end while(mySourceIndex<mySourceLength) */
2132
2133 /*
2134 * the end of the input stream and detection of truncated input
2135 * are handled by the framework, but for ISO-2022-KR conversion
2136 * we need to be in ASCII mode at the very end
2137 *
2138 * conditions:
2139 * successful
2140 * not in ASCII mode
2141 * end of input and no truncated input
2142 */
2143 if( U_SUCCESS(*err) &&
2144 isTargetByteDBCS &&
2145 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2146 ) {
2147 int32_t sourceIndex;
2148
2149 /* we are switching to ASCII */
2150 isTargetByteDBCS=FALSE;
2151
2152 /* get the source index of the last input character */
2153 /*
2154 * TODO this would be simpler and more reliable if we used a pair
2155 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2156 * so that we could simply use the prevSourceIndex here;
2157 * this code gives an incorrect result for the rare case of an unmatched
2158 * trail surrogate that is alone in the last buffer of the text stream
2159 */
2160 sourceIndex=(int32_t)(source-args->source);
2161 if(sourceIndex>0) {
2162 --sourceIndex;
2163 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2164 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2165 ) {
2166 --sourceIndex;
2167 }
2168 } else {
2169 sourceIndex=-1;
2170 }
2171
2172 fromUWriteUInt8(
2173 args->converter,
2174 SHIFT_IN_STR, 1,
2175 &target, (const char *)targetLimit,
2176 &offsets, sourceIndex,
2177 err);
2178 }
2179
2180 /*save the state and return */
2181 args->source = source;
2182 args->target = (char*)target;
2183 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2184 }
2185
2186 /************************ To Unicode ***************************************/
2187
2188 static void
2189 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2190 UErrorCode* err){
2191 char const* sourceStart;
2192 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2193
2194 UConverterToUnicodeArgs subArgs;
2195 int32_t minArgsSize;
2196
2197 /* set up the subconverter arguments */
2198 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2199 minArgsSize = args->size;
2200 } else {
2201 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2202 }
2203
2204 uprv_memcpy(&subArgs, args, minArgsSize);
2205 subArgs.size = (uint16_t)minArgsSize;
2206 subArgs.converter = myData->currentConverter;
2207
2208 /* remember the original start of the input for offsets */
2209 sourceStart = args->source;
2210
2211 if(myData->key != 0) {
2212 /* continue with a partial escape sequence */
2213 goto escape;
2214 }
2215
2216 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2217 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2218 subArgs.source = args->source;
2219 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2220 if(subArgs.source != subArgs.sourceLimit) {
2221 /*
2222 * get the current partial byte sequence
2223 *
2224 * it needs to be moved between the public and the subconverter
2225 * so that the conversion framework, which only sees the public
2226 * converter, can handle truncated and illegal input etc.
2227 */
2228 if(args->converter->toULength > 0) {
2229 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2230 }
2231 subArgs.converter->toULength = args->converter->toULength;
2232
2233 /*
2234 * Convert up to the end of the input, or to before the next escape character.
2235 * Does not handle conversion extensions because the preToU[] state etc.
2236 * is not copied.
2237 */
2238 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2239
2240 if(args->offsets != NULL && sourceStart != args->source) {
2241 /* update offsets to base them on the actual start of the input */
2242 int32_t *offsets = args->offsets;
2243 UChar *target = args->target;
2244 int32_t delta = (int32_t)(args->source - sourceStart);
2245 while(target < subArgs.target) {
2246 if(*offsets >= 0) {
2247 *offsets += delta;
2248 }
2249 ++offsets;
2250 ++target;
2251 }
2252 }
2253 args->source = subArgs.source;
2254 args->target = subArgs.target;
2255 args->offsets = subArgs.offsets;
2256
2257 /* copy input/error/overflow buffers */
2258 if(subArgs.converter->toULength > 0) {
2259 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2260 }
2261 args->converter->toULength = subArgs.converter->toULength;
2262
2263 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2264 if(subArgs.converter->UCharErrorBufferLength > 0) {
2265 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2266 subArgs.converter->UCharErrorBufferLength);
2267 }
2268 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2269 subArgs.converter->UCharErrorBufferLength = 0;
2270 }
2271 }
2272
2273 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2274 return;
2275 }
2276
2277 escape:
2278 changeState_2022(args->converter,
2279 &(args->source),
2280 args->sourceLimit,
2281 ISO_2022_KR,
2282 err);
2283 }
2284 }
2285
2286 static void
2287 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2288 UErrorCode* err){
2289 char tempBuf[2];
2290 const char *mySource = ( char *) args->source;
2291 UChar *myTarget = args->target;
2292 const char *mySourceLimit = args->sourceLimit;
2293 UChar32 targetUniChar = 0x0000;
2294 UChar mySourceChar = 0x0000;
2295 UConverterDataISO2022* myData;
2296 UConverterSharedData* sharedData ;
2297 UBool useFallback;
2298
2299 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2300 if(myData->version==1){
2301 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2302 return;
2303 }
2304
2305 /* initialize state */
2306 sharedData = myData->currentConverter->sharedData;
2307 useFallback = args->converter->useFallback;
2308
2309 if(myData->key != 0) {
2310 /* continue with a partial escape sequence */
2311 goto escape;
2312 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2313 /* continue with a partial double-byte character */
2314 mySourceChar = args->converter->toUBytes[0];
2315 args->converter->toULength = 0;
2316 goto getTrailByte;
2317 }
2318
2319 while(mySource< mySourceLimit){
2320
2321 if(myTarget < args->targetLimit){
2322
2323 mySourceChar= (unsigned char) *mySource++;
2324
2325 if(mySourceChar==UCNV_SI){
2326 myData->toU2022State.g = 0;
2327 if (myData->isEmptySegment) {
2328 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2329 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2330 args->converter->toUBytes[0] = mySourceChar;
2331 args->converter->toULength = 1;
2332 args->target = myTarget;
2333 args->source = mySource;
2334 return;
2335 }
2336 /*consume the source */
2337 continue;
2338 }else if(mySourceChar==UCNV_SO){
2339 myData->toU2022State.g = 1;
2340 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2341 /*consume the source */
2342 continue;
2343 }else if(mySourceChar==ESC_2022){
2344 mySource--;
2345 escape:
2346 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2347 changeState_2022(args->converter,&(mySource),
2348 mySourceLimit, ISO_2022_KR, err);
2349 if(U_FAILURE(*err)){
2350 args->target = myTarget;
2351 args->source = mySource;
2352 return;
2353 }
2354 continue;
2355 }
2356
2357 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2358 if(myData->toU2022State.g == 1) {
2359 if(mySource < mySourceLimit) {
2360 int leadIsOk, trailIsOk;
2361 uint8_t trailByte;
2362 getTrailByte:
2363 /* old
2364 trailByte = *mySource++;
2365 tempBuf[0] = (char)(mySourceChar + 0x80);
2366 tempBuf[1] = (char)(trailByte + 0x80);
2367 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2368 if((mySourceChar & 0x8080) == 0) {
2369 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2370 */
2371 targetUniChar = missingCharMarker;
2372 trailByte = (uint8_t)*mySource;
2373 /*
2374 * Ticket 5691: consistent illegal sequences:
2375 * - We include at least the first byte in the illegal sequence.
2376 * - If any of the non-initial bytes could be the start of a character,
2377 * we stop the illegal sequence before the first one of those.
2378 *
2379 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2380 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2381 * Otherwise we convert or report the pair of bytes.
2382 */
2383 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2384 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2385 if (leadIsOk && trailIsOk) {
2386 ++mySource;
2387 tempBuf[0] = (char)(mySourceChar + 0x80);
2388 tempBuf[1] = (char)(trailByte + 0x80);
2389 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2390 mySourceChar = (mySourceChar << 8) | trailByte;
2391 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2392 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2393 ++mySource;
2394 /* add another bit so that the code below writes 2 bytes in case of error */
2395 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2396 }
2397 } else {
2398 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2399 args->converter->toULength = 1;
2400 break;
2401 }
2402 }
2403 else if(mySourceChar <= 0x7f) {
2404 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2405 } else {
2406 targetUniChar = 0xffff;
2407 }
2408 if(targetUniChar < 0xfffe){
2409 if(args->offsets) {
2410 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2411 }
2412 *(myTarget++)=(UChar)targetUniChar;
2413 }
2414 else {
2415 /* Call the callback function*/
2416 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2417 break;
2418 }
2419 }
2420 else{
2421 *err =U_BUFFER_OVERFLOW_ERROR;
2422 break;
2423 }
2424 }
2425 args->target = myTarget;
2426 args->source = mySource;
2427 }
2428
2429 /*************************** END ISO2022-KR *********************************/
2430
/*************************** ISO-2022-CN *********************************
 *
 * Rules for ISO-2022-CN Encoding:
 * i)   The designator sequence must appear once on a line before any instance
 *      of the character set it designates.
 * ii)  If two lines contain characters from the same character set, both lines
 *      must include the designator sequence.
 * iii) Once the designator sequence is known, a shifting sequence has to be found
 *      to invoke the shifting.
 * iv)  All lines start in ASCII and end in ASCII.
 * v)   Four shifting sequences are employed for this purpose:
 *
 *      Sequence    ASCII Eq    Charsets
 *      ----------  --------    ---------
 *      SI          <SI>        US-ASCII
 *      SO          <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
 *      SS2         <ESC>N      CNS-11643-1992 Plane 2
 *      SS3         <ESC>O      CNS-11643-1992 Planes 3-7
 *
 * vi)
 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
 *      SS2designator : ESC "$" "*" finalchar_for_SS2
 *      SS3designator : ESC "$" "+" finalchar_for_SS3
 *
 *      ESC $ ) A       Indicates the bytes following SO are Chinese
 *                      characters as defined in GB 2312-80, until
 *                      another SOdesignation appears
 *
 *
 *      ESC $ ) E       Indicates the bytes following SO are as defined
 *                      in ISO-IR-165 (for details, see section 2.1),
 *                      until another SOdesignation appears
 *
 *      ESC $ ) G       Indicates the bytes following SO are as defined
 *                      in CNS 11643-plane-1, until another
 *                      SOdesignation appears
 *
 *      ESC $ * H       Indicates the two bytes immediately following
 *                      SS2 is a Chinese character as defined in CNS
 *                      11643-plane-2, until another SS2designation
 *                      appears
 *                      (Meaning <ESC>N must precede every 2 byte
 *                      sequence.)
 *
 *      ESC $ + I       Indicates the immediate two bytes following SS3
 *                      is a Chinese character as defined in CNS
 *                      11643-plane-3, until another SS3designation
 *                      appears
 *                      (Meaning <ESC>O must precede every 2 byte
 *                      sequence.)
 *
 *      ESC $ + J       Indicates the immediate two bytes following SS3
 *                      is a Chinese character as defined in CNS
 *                      11643-plane-4, until another SS3designation
 *                      appears
 *                      (In English: <ESC>O must precede every 2 byte
 *                      sequence.)
 *
 *      ESC $ + K       Indicates the immediate two bytes following SS3
 *                      is a Chinese character as defined in CNS
 *                      11643-plane-5, until another SS3designation
 *                      appears
 *
 *      ESC $ + L       Indicates the immediate two bytes following SS3
 *                      is a Chinese character as defined in CNS
 *                      11643-plane-6, until another SS3designation
 *                      appears
 *
 *      ESC $ + M       Indicates the immediate two bytes following SS3
 *                      is a Chinese character as defined in CNS
 *                      11643-plane-7, until another SS3designation
 *                      appears
 *
 *      As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
 *      has its own designation information before any Chinese characters
 *      appear
 *
 */
2509
/*
 * Designation escape sequences for ISO-2022-CN and ISO-2022-CN-EXT.
 * They are defined as const arrays to make the strings truly read-only.
 *
 * Each sequence is four bytes plus a terminating NUL:
 *   1B 24 29 xx  (ESC $ ) final)  for the charsets invoked with SO
 *   1B 24 2A xx  (ESC $ * final)  for the charset invoked with SS2
 *   1B 24 2B xx  (ESC $ + final)  for the charsets invoked with SS3
 * The bytes are given numerically, not as character constants.
 */
static const char GB_2312_80_STR[]             = { 0x1B, 0x24, 0x29, 0x41, 0x00 };  /* ESC $ ) A */
static const char ISO_IR_165_STR[]             = { 0x1B, 0x24, 0x29, 0x45, 0x00 };  /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = { 0x1B, 0x24, 0x29, 0x47, 0x00 };  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = { 0x1B, 0x24, 0x2A, 0x48, 0x00 };  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = { 0x1B, 0x24, 0x2B, 0x49, 0x00 };  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = { 0x1B, 0x24, 0x2B, 0x4A, 0x00 };  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = { 0x1B, 0x24, 0x2B, 0x4B, 0x00 };  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = { 0x1B, 0x24, 0x2B, 0x4C, 0x00 };  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = { 0x1B, 0x24, 0x2B, 0x4D, 0x00 };  /* ESC $ + M */
2520
2521 /********************** ISO2022-CN Data **************************/
2522 static const char* const escSeqCharsCN[10] ={
2523 SHIFT_IN_STR, /* ASCII */
2524 GB_2312_80_STR,
2525 ISO_IR_165_STR,
2526 CNS_11643_1992_Plane_1_STR,
2527 CNS_11643_1992_Plane_2_STR,
2528 CNS_11643_1992_Plane_3_STR,
2529 CNS_11643_1992_Plane_4_STR,
2530 CNS_11643_1992_Plane_5_STR,
2531 CNS_11643_1992_Plane_6_STR,
2532 CNS_11643_1992_Plane_7_STR
2533 };
2534
2535 static void
2536 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2537
2538 UConverterDataISO2022 *converterData;
2539 ISO2022State *pFromU2022State;
2540 uint8_t *target = (uint8_t *) args->target;
2541 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2542 const UChar* source = args->source;
2543 const UChar* sourceLimit = args->sourceLimit;
2544 int32_t* offsets = args->offsets;
2545 UChar32 sourceChar;
2546 char buffer[8];
2547 int32_t len;
2548 int8_t choices[3];
2549 int32_t choiceCount;
2550 uint32_t targetValue = 0;
2551 UBool useFallback;
2552
2553 /* set up the state */
2554 converterData = (UConverterDataISO2022*)args->converter->extraInfo;
2555 pFromU2022State = &converterData->fromU2022State;
2556 useFallback = args->converter->useFallback;
2557
2558 choiceCount = 0;
2559
2560 /* check if the last codepoint of previous buffer was a lead surrogate*/
2561 if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
2562 goto getTrail;
2563 }
2564
2565 while( source < sourceLimit){
2566 if(target < targetLimit){
2567
2568 sourceChar = *(source++);
2569 /*check if the char is a First surrogate*/
2570 if(UTF_IS_SURROGATE(sourceChar)) {
2571 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2572 getTrail:
2573 /*look ahead to find the trail surrogate*/
2574 if(source < sourceLimit) {
2575 /* test the following code unit */
2576 UChar trail=(UChar) *source;
2577 if(UTF_IS_SECOND_SURROGATE(trail)) {
2578 source++;
2579 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2580 args->converter->fromUChar32=0x00;
2581 /* convert this supplementary code point */
2582 /* exit this condition tree */
2583 } else {
2584 /* this is an unmatched lead code unit (1st surrogate) */
2585 /* callback(illegal) */
2586 *err=U_ILLEGAL_CHAR_FOUND;
2587 args->converter->fromUChar32=sourceChar;
2588 break;
2589 }
2590 } else {
2591 /* no more input */
2592 args->converter->fromUChar32=sourceChar;
2593 break;
2594 }
2595 } else {
2596 /* this is an unmatched trail code unit (2nd surrogate) */
2597 /* callback(illegal) */
2598 *err=U_ILLEGAL_CHAR_FOUND;
2599 args->converter->fromUChar32=sourceChar;
2600 break;
2601 }
2602 }
2603
2604 /* do the conversion */
2605 if(sourceChar <= 0x007f ){
2606 /* do not convert SO/SI/ESC */
2607 if(IS_2022_CONTROL(sourceChar)) {
2608 /* callback(illegal) */
2609 *err=U_ILLEGAL_CHAR_FOUND;
2610 args->converter->fromUChar32=sourceChar;
2611 break;
2612 }
2613
2614 /* US-ASCII */
2615 if(pFromU2022State->g == 0) {
2616 buffer[0] = (char)sourceChar;
2617 len = 1;
2618 } else {
2619 buffer[0] = UCNV_SI;
2620 buffer[1] = (char)sourceChar;
2621 len = 2;
2622 pFromU2022State->g = 0;
2623 choiceCount = 0;
2624 }
2625 if(sourceChar == CR || sourceChar == LF) {
2626 /* reset the state at the end of a line */
2627 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2628 choiceCount = 0;
2629 }
2630 }
2631 else{
2632 /* convert U+0080..U+10ffff */
2633 UConverterSharedData *cnv;
2634 int32_t i;
2635 int8_t cs, g;
2636
2637 if(choiceCount == 0) {
2638 /* try the current SO/G1 converter first */
2639 choices[0] = pFromU2022State->cs[1];
2640
2641 /* default to GB2312_1 if none is designated yet */
2642 if(choices[0] == 0) {
2643 choices[0] = GB2312_1;
2644 }
2645
2646 if(converterData->version == 0) {
2647 /* ISO-2022-CN */
2648
2649 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2650 if(choices[0] == GB2312_1) {
2651 choices[1] = (int8_t)CNS_11643_1;
2652 } else {
2653 choices[1] = (int8_t)GB2312_1;
2654 }
2655
2656 choiceCount = 2;
2657 } else {
2658 /* ISO-2022-CN-EXT */
2659
2660 /* try one of the other converters */
2661 switch(choices[0]) {
2662 case GB2312_1:
2663 choices[1] = (int8_t)CNS_11643_1;
2664 choices[2] = (int8_t)ISO_IR_165;
2665 break;
2666 case ISO_IR_165:
2667 choices[1] = (int8_t)GB2312_1;
2668 choices[2] = (int8_t)CNS_11643_1;
2669 break;
2670 default: /* CNS_11643_x */
2671 choices[1] = (int8_t)GB2312_1;
2672 choices[2] = (int8_t)ISO_IR_165;
2673 break;
2674 }
2675
2676 choiceCount = 3;
2677 }
2678 }
2679
2680 cs = g = 0;
2681 len = 0;
2682
2683 for(i = 0; i < choiceCount && len == 0; ++i) {
2684 cs = choices[i];
2685 if(cs > 0) {
2686 if(cs > CNS_11643_0) {
2687 cnv = converterData->myConverterArray[CNS_11643];
2688 MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
2689 if(len==3) {
2690 cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
2691 len = 2;
2692 if(cs == CNS_11643_1) {
2693 g = 1;
2694 } else if(cs == CNS_11643_2) {
2695 g = 2;
2696 } else /* plane 3..7 */ if(converterData->version == 1) {
2697 g = 3;
2698 } else {
2699 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2700 len = 0;
2701 }
2702 }
2703 } else {
2704 /* GB2312_1 or ISO-IR-165 */
2705 cnv = converterData->myConverterArray[cs];
2706 MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
2707 g = 1; /* used if len == 2 */
2708 }
2709 }
2710 }
2711
2712 if(len > 0) {
2713 len = 0; /* count output bytes; it must have been len == 2 */
2714
2715 /* write the designation sequence if necessary */
2716 if(cs != pFromU2022State->cs[g]) {
2717 if(cs < CNS_11643) {
2718 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2719 } else {
2720 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2721 }
2722 len = 4;
2723 pFromU2022State->cs[g] = cs;
2724 if(g == 1) {
2725 /* changing the SO/G1 charset invalidates the choices[] */
2726 choiceCount = 0;
2727 }
2728 }
2729
2730 /* write the shift sequence if necessary */
2731 if(g != pFromU2022State->g) {
2732 switch(g) {
2733 case 1:
2734 buffer[len++] = UCNV_SO;
2735
2736 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2737 pFromU2022State->g = 1;
2738 break;
2739 case 2:
2740 buffer[len++] = 0x1b;
2741 buffer[len++] = 0x4e;
2742 break;
2743 default: /* case 3 */
2744 buffer[len++] = 0x1b;
2745 buffer[len++] = 0x4f;
2746 break;
2747 }
2748 }
2749
2750 /* write the two output bytes */
2751 buffer[len++] = (char)(targetValue >> 8);
2752 buffer[len++] = (char)targetValue;
2753 } else {
2754 /* if we cannot find the character after checking all codepages
2755 * then this is an error
2756 */
2757 *err = U_INVALID_CHAR_FOUND;
2758 args->converter->fromUChar32=sourceChar;
2759 break;
2760 }
2761 }
2762
2763 /* output len>0 bytes in buffer[] */
2764 if(len == 1) {
2765 *target++ = buffer[0];
2766 if(offsets) {
2767 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
2768 }
2769 } else if(len == 2 && (target + 2) <= targetLimit) {
2770 *target++ = buffer[0];
2771 *target++ = buffer[1];
2772 if(offsets) {
2773 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2774 *offsets++ = sourceIndex;
2775 *offsets++ = sourceIndex;
2776 }
2777 } else {
2778 fromUWriteUInt8(
2779 args->converter,
2780 buffer, len,
2781 &target, (const char *)targetLimit,
2782 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2783 err);
2784 if(U_FAILURE(*err)) {
2785 break;
2786 }
2787 }
2788 } /* end if(myTargetIndex<myTargetLength) */
2789 else{
2790 *err =U_BUFFER_OVERFLOW_ERROR;
2791 break;
2792 }
2793
2794 }/* end while(mySourceIndex<mySourceLength) */
2795
2796 /*
2797 * the end of the input stream and detection of truncated input
2798 * are handled by the framework, but for ISO-2022-CN conversion
2799 * we need to be in ASCII mode at the very end
2800 *
2801 * conditions:
2802 * successful
2803 * not in ASCII mode
2804 * end of input and no truncated input
2805 */
2806 if( U_SUCCESS(*err) &&
2807 pFromU2022State->g!=0 &&
2808 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2809 ) {
2810 int32_t sourceIndex;
2811
2812 /* we are switching to ASCII */
2813 pFromU2022State->g=0;
2814
2815 /* get the source index of the last input character */
2816 /*
2817 * TODO this would be simpler and more reliable if we used a pair
2818 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2819 * so that we could simply use the prevSourceIndex here;
2820 * this code gives an incorrect result for the rare case of an unmatched
2821 * trail surrogate that is alone in the last buffer of the text stream
2822 */
2823 sourceIndex=(int32_t)(source-args->source);
2824 if(sourceIndex>0) {
2825 --sourceIndex;
2826 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2827 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2828 ) {
2829 --sourceIndex;
2830 }
2831 } else {
2832 sourceIndex=-1;
2833 }
2834
2835 fromUWriteUInt8(
2836 args->converter,
2837 SHIFT_IN_STR, 1,
2838 &target, (const char *)targetLimit,
2839 &offsets, sourceIndex,
2840 err);
2841 }
2842
2843 /*save the state and return */
2844 args->source = source;
2845 args->target = (char*)target;
2846 }
2847
2848
/*
 * Convert ISO-2022-CN bytes to UTF-16, optionally recording source offsets.
 *
 * Input is read from args->source up to args->sourceLimit; output UChars are
 * written to args->target up to args->targetLimit. When args->offsets is not
 * NULL, one source offset is stored per output UChar.
 *
 * Decoder state lives in myData->toU2022State:
 *   - g selects the active graphic set; SI sets it to 0, SO sets it to 1,
 *     and a value >= 2 is treated as a single shift that reverts to prevG
 *     after one character;
 *   - cs[] holds the charset designated for each graphic set;
 *   - CR and LF zero the whole state.
 *
 * Resumption across buffers: a pending escape sequence (myData->key != 0)
 * re-enters at the "escape" label, and a pending lead byte
 * (toULength == 1, byte saved in toUBytes[0]) re-enters at "getTrailByte".
 *
 * Errors reported through *err:
 *   - U_BUFFER_OVERFLOW_ERROR when the target is full and input remains;
 *   - U_PARSE_ERROR as a temporary marker for an empty SO segment;
 *   - whatever changeState_2022() reports for a bad escape sequence;
 *   - unmappable or illegal bytes are handed to toUnicodeCallback().
 *
 * args->source and args->target are updated on every return path.
 */
2849 static void
2850 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2851 UErrorCode* err){
2852 char tempBuf[3];
2853 const char *mySource = (char *) args->source;
2854 UChar *myTarget = args->target;
2855 const char *mySourceLimit = args->sourceLimit;
2856 uint32_t targetUniChar = 0x0000;
2857 uint32_t mySourceChar = 0x0000;
2858 UConverterDataISO2022* myData;
2859 ISO2022State *pToU2022State;
2860
2861 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2862 pToU2022State = &myData->toU2022State;
2863
2864 if(myData->key != 0) {
2865 /* continue with a partial escape sequence */
2866 goto escape;
2867 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2868 /* continue with a partial double-byte character */
2869 mySourceChar = args->converter->toUBytes[0];
2870 args->converter->toULength = 0;
2871 targetUniChar = missingCharMarker;
2872 goto getTrailByte;
2873 }
2874
2875 while(mySource < mySourceLimit){
2876
2877 targetUniChar =missingCharMarker;
2878
2879 if(myTarget < args->targetLimit){
2880
2881 mySourceChar= (unsigned char) *mySource++;
2882
2883 switch(mySourceChar){
2884 case UCNV_SI:
2885 pToU2022State->g=0;
2886 if (myData->isEmptySegment) {
2887 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2888 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2889 args->converter->toUBytes[0] = mySourceChar;
2890 args->converter->toULength = 1;
2891 args->target = myTarget;
2892 args->source = mySource;
2893 return;
2894 }
2895 continue;
2896
2897 case UCNV_SO:
2898 if(pToU2022State->cs[1] != 0) {
2899 pToU2022State->g=1;
2900 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2901 continue;
2902 } else {
2903 /* illegal to have SO before a matching designator */
2904 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
2905 break;
2906 }
2907
2908 case ESC_2022:
/* un-read the ESC so that changeState_2022() sees the whole sequence */
2909 mySource--;
2910 escape:
2911 {
2912 const char * mySourceBefore = mySource;
2913 int8_t toULengthBefore = args->converter->toULength;
2914
2915 changeState_2022(args->converter,&(mySource),
2916 mySourceLimit, ISO_2022_CN,err);
2917
2918 /* After SO there must be at least one character before a designator (designator error handled separately) */
2919 if ( myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
2920 *err = U_PARSE_ERROR; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
/* toULength grows by the number of bytes changeState_2022() consumed in this call */
2921 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
2922 }
2923 }
2924
2925 /* invalid or illegal escape sequence */
2926 if(U_FAILURE(*err)){
2927 args->target = myTarget;
2928 args->source = mySource;
2929 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2930 return;
2931 }
2932 continue;
2933
2934 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2935
2936 case CR:
2937 /*falls through*/
2938 case LF:
2939 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); /* CR/LF clears every designation and the current shift state */
2940 /* falls through */
2941 default:
2942 /* convert one or two bytes */
2943 myData->isEmptySegment = FALSE;
2944 if(pToU2022State->g != 0) {
2945 if(mySource < mySourceLimit) {
2946 UConverterSharedData *cnv;
2947 StateEnum tempState;
2948 int32_t tempBufLen;
2949 int leadIsOk, trailIsOk;
2950 uint8_t trailByte;
/* also entered by goto from the top of the function when a previous call saved the lead byte in toUBytes[0] */
2951 getTrailByte:
2952 /* old
2953 trailByte = *mySource++;
2954 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2955 if(tempState > CNS_11643_0) {
2956 cnv = myData->myConverterArray[CNS_11643];
2957 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2958 tempBuf[1] = (char) (mySourceChar);
2959 tempBuf[2] = trailByte;
2960 tempBufLen = 3;
2961
2962 }else{
2963 cnv = myData->myConverterArray[tempState];
2964 tempBuf[0] = (char) (mySourceChar);
2965 tempBuf[1] = trailByte;
2966 tempBufLen = 2;
2967 */
2968 trailByte = (uint8_t)*mySource;
2969 /*
2970 * Ticket 5691: consistent illegal sequences:
2971 * - We include at least the first byte in the illegal sequence.
2972 * - If any of the non-initial bytes could be the start of a character,
2973 * we stop the illegal sequence before the first one of those.
2974 *
2975 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2976 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2977 * Otherwise we convert or report the pair of bytes.
2978 */
2979 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2980 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2981 if (leadIsOk && trailIsOk) {
2982 ++mySource;
2983 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2984 if(tempState >= CNS_11643_0) {
/* all CNS states share the single CNS_11643 converter; a prefix byte of 0x80 + (state - CNS_11643_0) is put in front of the two input bytes to form a 3-byte lookup key */
2985 cnv = myData->myConverterArray[CNS_11643];
2986 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2987 tempBuf[1] = (char) (mySourceChar);
2988 tempBuf[2] = (char) trailByte;
2989 tempBufLen = 3;
2990
2991 }else{
2992 cnv = myData->myConverterArray[tempState];
2993 tempBuf[0] = (char) (mySourceChar);
2994 tempBuf[1] = (char) trailByte;
2995 tempBufLen = 2;
2996 }
2997 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
/* keep both bytes in mySourceChar: the offset computation and the callback below use it to tell 1-byte from 2-byte input */
2998 mySourceChar = (mySourceChar << 8) | trailByte;
2999 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3000 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3001 ++mySource;
3002 /* add another bit so that the code below writes 2 bytes in case of error */
3003 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3004 }
3005 if(pToU2022State->g>=2) {
3006 /* return from a single-shift state to the previous one */
3007 pToU2022State->g=pToU2022State->prevG;
3008 }
3009 } else {
/* the lead byte is the last byte of this buffer: save it so the next call can resume at getTrailByte */
3010 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3011 args->converter->toULength = 1;
3012 goto endloop;
3013 }
3014 }
3015 else{
/* single-byte mode (g == 0): only 7-bit values are passed through; anything else stays missingCharMarker */
3016 if(mySourceChar <= 0x7f) {
3017 targetUniChar = (UChar) mySourceChar;
3018 }
3019 }
3020 break;
3021 }
/* result below 0xfffe: write it as a single UChar */
3022 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3023 if(args->offsets){
/* offset of the first source byte of this character: back up 1 byte for single-byte input, 2 for double-byte */
3024 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3025 }
3026 *(myTarget++)=(UChar)targetUniChar;
3027 }
3028 else if(targetUniChar > missingCharMarker){
3029 /* disassemble the surrogate pair and write to output*/
3030 targetUniChar-=0x0010000;
3031 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3032 if(args->offsets){
3033 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3034 }
3035 ++myTarget;
3036 if(myTarget< args->targetLimit){
3037 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3038 if(args->offsets){
3039 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3040 }
3041 ++myTarget;
3042 }else{
/* no room left for the trail surrogate: store it in the converter's UCharErrorBuffer instead */
3043 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3044 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3045 }
3046
3047 }
3048 else{
3049 /* Call the callback function*/
3050 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3051 break;
3052 }
3053 }
3054 else{
3055 *err =U_BUFFER_OVERFLOW_ERROR;
3056 break;
3057 }
3058 }
3059 endloop:
/* publish how far source and target were advanced */
3060 args->target = myTarget;
3061 args->source = mySource;
3062 }
3063
3064 static void
3065 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3066 UConverter *cnv = args->converter;
3067 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3068 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3069 char *p, *subchar;
3070 char buffer[8];
3071 int32_t length;
3072
3073 subchar=(char *)cnv->subChars;
3074 length=cnv->subCharLen; /* assume length==1 for most variants */
3075
3076 p = buffer;
3077 switch(myConverterData->locale[0]){
3078 case 'j':
3079 {
3080 int8_t cs;
3081
3082 if(pFromU2022State->g == 1) {
3083 /* JIS7: switch from G1 to G0 */
3084 pFromU2022State->g = 0;
3085 *p++ = UCNV_SI;
3086 }
3087
3088 cs = pFromU2022State->cs[0];
3089 if(cs != ASCII && cs != JISX201) {
3090 /* not in ASCII or JIS X 0201: switch to ASCII */
3091 pFromU2022State->cs[0] = (int8_t)ASCII;
3092 *p++ = '\x1b';
3093 *p++ = '\x28';
3094 *p++ = '\x42';
3095 }
3096
3097 *p++ = subchar[0];
3098 break;
3099 }
3100 case 'c':
3101 if(pFromU2022State->g != 0) {
3102 /* not in ASCII mode: switch to ASCII */
3103 pFromU2022State->g = 0;
3104 *p++ = UCNV_SI;
3105 }
3106 *p++ = subchar[0];
3107 break;
3108 case 'k':
3109 if(myConverterData->version == 0) {
3110 if(length == 1) {
3111 if((UBool)args->converter->fromUnicodeStatus) {
3112 /* in DBCS mode: switch to SBCS */
3113 args->converter->fromUnicodeStatus = 0;
3114 *p++ = UCNV_SI;
3115 }
3116 *p++ = subchar[0];
3117 } else /* length == 2*/ {
3118 if(!(UBool)args->converter->fromUnicodeStatus) {
3119 /* in SBCS mode: switch to DBCS */
3120 args->converter->fromUnicodeStatus = 1;
3121 *p++ = UCNV_SO;
3122 }
3123 *p++ = subchar[0];
3124 *p++ = subchar[1];
3125 }
3126 break;
3127 } else {
3128 /* save the subconverter's substitution string */
3129 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3130 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3131
3132 /* set our substitution string into the subconverter */
3133 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3134 myConverterData->currentConverter->subCharLen = (int8_t)length;
3135
3136 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3137 args->converter = myConverterData->currentConverter;
3138 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3139 ucnv_cbFromUWriteSub(args, 0, err);
3140 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3141 args->converter = cnv;
3142
3143 /* restore the subconverter's substitution string */
3144 myConverterData->currentConverter->subChars = currentSubChars;
3145 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3146
3147 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3148 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3149 uprv_memcpy(
3150 cnv->charErrorBuffer,
3151 myConverterData->currentConverter->charErrorBuffer,
3152 myConverterData->currentConverter->charErrorBufferLength);
3153 }
3154 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3155 myConverterData->currentConverter->charErrorBufferLength = 0;
3156 }
3157 return;
3158 }
3159 default:
3160 /* not expected */
3161 break;
3162 }
3163 ucnv_cbFromUWriteBytes(args,
3164 buffer, (int32_t)(p - buffer),
3165 offsetIndex, err);
3166 }
3167
3168 /*
3169 * Structure for cloning an ISO 2022 converter into a single memory block.
3170 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3171 * and then ucnv_safeClone() of the sub-converter may additionally align
3172 * currentConverter inside the cloneStruct, for which we need the deadSpace
3173 * after currentConverter.
3174 * This is because UAlignedMemory may be larger than the actually
3175 * necessary alignment size for the platform.
3176 * The other cloneStruct fields will not be moved around,
3177 * and are aligned properly with cloneStruct's alignment.
3178 */
3179 struct cloneStruct
3180 {
3181 UConverter cnv; /* the cloned top-level converter; _ISO_2022_SafeClone() returns its address */
3182 UConverter currentConverter; /* buffer handed to ucnv_safeClone() for the current sub-converter, when there is one */
3183 UAlignedMemory deadSpace; /* slack counted into the sub-converter clone's buffer size (see above) */
3184 UConverterDataISO2022 mydata; /* copy of the ISO-2022 data; cnv.extraInfo is pointed at it */
3185 };
3186
3187
3188 static UConverter *
3189 _ISO_2022_SafeClone(
3190 const UConverter *cnv,
3191 void *stackBuffer,
3192 int32_t *pBufferSize,
3193 UErrorCode *status)
3194 {
3195 struct cloneStruct * localClone;
3196 UConverterDataISO2022 *cnvData;
3197 int32_t i, size;
3198
3199 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3200 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3201 return NULL;
3202 }
3203
3204 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3205 localClone = (struct cloneStruct *)stackBuffer;
3206
3207 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3208
3209 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3210 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3211 localClone->cnv.isExtraLocal = TRUE;
3212
3213 /* share the subconverters */
3214
3215 if(cnvData->currentConverter != NULL) {
3216 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3217 localClone->mydata.currentConverter =
3218 ucnv_safeClone(cnvData->currentConverter,
3219 &localClone->currentConverter,
3220 &size, status);
3221 if(U_FAILURE(*status)) {
3222 return NULL;
3223 }
3224 }
3225
3226 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3227 if(cnvData->myConverterArray[i] != NULL) {
3228 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3229 }
3230 }
3231
3232 return &localClone->cnv;
3233 }
3234
3235 static void
3236 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3237 const USetAdder *sa,
3238 UConverterUnicodeSet which,
3239 UErrorCode *pErrorCode)
3240 {
3241 int32_t i;
3242 UConverterDataISO2022* cnvData;
3243
3244 if (U_FAILURE(*pErrorCode)) {
3245 return;
3246 }
3247 #ifdef U_ENABLE_GENERIC_ISO_2022
3248 if (cnv->sharedData == &_ISO2022Data) {
3249 /* We use UTF-8 in this case */
3250 sa->addRange(sa->set, 0, 0xd7FF);
3251 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3252 return;
3253 }
3254 #endif
3255
3256 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3257
3258 /* open a set and initialize it with code points that are algorithmically round-tripped */
3259 switch(cnvData->locale[0]){
3260 case 'j':
3261 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3262 /* include Latin-1 for some variants of JP */
3263 sa->addRange(sa->set, 0, 0xff);
3264 } else {
3265 /* include ASCII for JP */
3266 sa->addRange(sa->set, 0, 0x7f);
3267 }
3268 if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
3269 /* include half-width Katakana for JP */
3270 sa->addRange(sa->set, 0xff61, 0xff9f);
3271 }
3272 break;
3273 case 'c':
3274 case 'z':
3275 /* include ASCII for CN */
3276 sa->addRange(sa->set, 0, 0x7f);
3277 break;
3278 case 'k':
3279 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3280 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3281 cnvData->currentConverter, sa, which, pErrorCode);
3282 /* the loop over myConverterArray[] will simply not find another converter */
3283 break;
3284 default:
3285 break;
3286 }
3287
3288 /*
3289 * Version-specific for CN:
3290 * CN version 0 does not map CNS planes 3..7 although
3291 * they are all available in the CNS conversion table;
3292 * CN version 1 does map them all.
3293 * The two versions create different Unicode sets.
3294 */
3295 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3296 if(cnvData->myConverterArray[i]!=NULL) {
3297 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3298 cnvData->version==0 && i==CNS_11643
3299 ) {
3300 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3301 ucnv_MBCSGetUnicodeSetForBytes(
3302 cnvData->myConverterArray[i],
3303 sa, UCNV_ROUNDTRIP_SET,
3304 0, 0x81, 0x82,
3305 pErrorCode);
3306 } else {
3307 ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
3308 }
3309 }
3310 }
3311
3312 /*
3313 * ISO 2022 converters must not convert SO/SI/ESC despite what
3314 * sub-converters do by themselves.
3315 * Remove these characters from the set.
3316 */
3317 sa->remove(sa->set, 0x0e);
3318 sa->remove(sa->set, 0x0f);
3319 sa->remove(sa->set, 0x1b);
3320 }
3321
/*
 * Implementation (function) table for the generic ISO-2022 converter.
 * The four conversion entries are filled in only when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 * The open/close/reset, getName, writeSub, safeClone and getUnicodeSet
 * functions are the same ones used by the JP, KR and CN tables below.
 * NOTE(review): the initializer is positional; which slot each entry fills
 * is inferred from the function names - confirm against the UConverterImpl
 * declaration.
 */
3322 static const UConverterImpl _ISO2022Impl={
3323 UCNV_ISO_2022,
3324
3325 NULL,
3326 NULL,
3327
3328 _ISO2022Open,
3329 _ISO2022Close,
3330 _ISO2022Reset,
3331
3332 #ifdef U_ENABLE_GENERIC_ISO_2022
3333 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3334 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3335 ucnv_fromUnicode_UTF8,
3336 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3337 #else
3338 NULL,
3339 NULL,
3340 NULL,
3341 NULL,
3342 #endif
3343 NULL,
3344
3345 NULL,
3346 _ISO2022getName,
3347 _ISO_2022_WriteSub,
3348 _ISO_2022_SafeClone,
3349 _ISO_2022_GetUnicodeSet
3350 };
/*
 * Static descriptor for the generic ISO-2022 converter: name "ISO_2022",
 * type UCNV_ISO_2022, at most 3 bytes per UChar.
 * NOTE(review): the initializer is positional. 2022 is presumably the
 * codepage number, the 1 before the maximum presumably the minimum bytes
 * per character, and { 0x1a, 0, 0, 0 } followed by 1 presumably the default
 * substitution bytes and their length - confirm against the
 * UConverterStaticData declaration.
 */
3351 static const UConverterStaticData _ISO2022StaticData={
3352 sizeof(UConverterStaticData),
3353 "ISO_2022",
3354 2022,
3355 UCNV_IBM,
3356 UCNV_ISO_2022,
3357 1,
3358 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3359 { 0x1a, 0, 0, 0 },
3360 1,
3361 FALSE,
3362 FALSE,
3363 0,
3364 0,
3365 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3366 };
/*
 * Shared-data object for the generic ISO-2022 converter. It ties the static
 * descriptor (_ISO2022StaticData) to the implementation table (_ISO2022Impl).
 * Unlike the JP/KR/CN objects below it is not declared static.
 * NOTE(review): ~((uint32_t) 0) is presumably a reference-count value that
 * marks the object as statically allocated - confirm against the
 * UConverterSharedData declaration.
 */
3367 const UConverterSharedData _ISO2022Data={
3368 sizeof(UConverterSharedData),
3369 ~((uint32_t) 0),
3370 NULL,
3371 NULL,
3372 &_ISO2022StaticData,
3373 FALSE,
3374 &_ISO2022Impl,
3375 0
3376 };
3377
3378 /*************JP****************/
/*
 * Implementation (function) table for the ISO-2022-JP converter.
 * It reuses the common open/close/reset, getName, writeSub, safeClone and
 * getUnicodeSet functions and plugs in the JP-specific toUnicode and
 * fromUnicode functions, each of which is listed twice.
 * NOTE(review): the two slots per direction are presumably the
 * without-offsets and with-offsets entry points - confirm against the
 * UConverterImpl declaration.
 */
3379 static const UConverterImpl _ISO2022JPImpl={
3380 UCNV_ISO_2022,
3381
3382 NULL,
3383 NULL,
3384
3385 _ISO2022Open,
3386 _ISO2022Close,
3387 _ISO2022Reset,
3388
3389 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3390 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3391 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3392 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3393 NULL,
3394
3395 NULL,
3396 _ISO2022getName,
3397 _ISO_2022_WriteSub,
3398 _ISO_2022_SafeClone,
3399 _ISO_2022_GetUnicodeSet
3400 };
/*
 * Static descriptor for the ISO-2022-JP converter: name "ISO_2022_JP",
 * type UCNV_ISO_2022, at most 6 bytes per UChar (escape sequence + DBCS).
 * NOTE(review): the initializer is positional; { 0x1a, 0, 0, 0 } followed
 * by 1 is presumably the default substitution bytes and their length -
 * confirm against the UConverterStaticData declaration.
 */
3401 static const UConverterStaticData _ISO2022JPStaticData={
3402 sizeof(UConverterStaticData),
3403 "ISO_2022_JP",
3404 0,
3405 UCNV_IBM,
3406 UCNV_ISO_2022,
3407 1,
3408 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3409 { 0x1a, 0, 0, 0 },
3410 1,
3411 FALSE,
3412 FALSE,
3413 0,
3414 0,
3415 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3416 };
/*
 * Shared-data object for the ISO-2022-JP converter. It ties the static
 * descriptor (_ISO2022JPStaticData) to the implementation table
 * (_ISO2022JPImpl).
 * NOTE(review): ~((uint32_t) 0) is presumably a reference-count value that
 * marks the object as statically allocated - confirm against the
 * UConverterSharedData declaration.
 */
3417 static const UConverterSharedData _ISO2022JPData={
3418 sizeof(UConverterSharedData),
3419 ~((uint32_t) 0),
3420 NULL,
3421 NULL,
3422 &_ISO2022JPStaticData,
3423 FALSE,
3424 &_ISO2022JPImpl,
3425 0
3426 };
3427
3428 /************* KR ***************/
/*
 * Implementation (function) table for the ISO-2022-KR converter.
 * It reuses the common open/close/reset, getName, writeSub, safeClone and
 * getUnicodeSet functions and plugs in the KR-specific toUnicode and
 * fromUnicode functions, each of which is listed twice.
 * NOTE(review): the two slots per direction are presumably the
 * without-offsets and with-offsets entry points - confirm against the
 * UConverterImpl declaration.
 */
3429 static const UConverterImpl _ISO2022KRImpl={
3430 UCNV_ISO_2022,
3431
3432 NULL,
3433 NULL,
3434
3435 _ISO2022Open,
3436 _ISO2022Close,
3437 _ISO2022Reset,
3438
3439 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3440 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3441 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3442 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3443 NULL,
3444
3445 NULL,
3446 _ISO2022getName,
3447 _ISO_2022_WriteSub,
3448 _ISO_2022_SafeClone,
3449 _ISO_2022_GetUnicodeSet
3450 };
/*
 * Static descriptor for the ISO-2022-KR converter: name "ISO_2022_KR",
 * type UCNV_ISO_2022, at most 3 bytes per UChar (SO + DBCS).
 * NOTE(review): the initializer is positional; { 0x1a, 0, 0, 0 } followed
 * by 1 is presumably the default substitution bytes and their length -
 * confirm against the UConverterStaticData declaration.
 */
3451 static const UConverterStaticData _ISO2022KRStaticData={
3452 sizeof(UConverterStaticData),
3453 "ISO_2022_KR",
3454 0,
3455 UCNV_IBM,
3456 UCNV_ISO_2022,
3457 1,
3458 3, /* max 3 bytes per UChar: SO+DBCS */
3459 { 0x1a, 0, 0, 0 },
3460 1,
3461 FALSE,
3462 FALSE,
3463 0,
3464 0,
3465 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3466 };
/*
 * Shared-data object for the ISO-2022-KR converter. It ties the static
 * descriptor (_ISO2022KRStaticData) to the implementation table
 * (_ISO2022KRImpl).
 * NOTE(review): ~((uint32_t) 0) is presumably a reference-count value that
 * marks the object as statically allocated - confirm against the
 * UConverterSharedData declaration.
 */
3467 static const UConverterSharedData _ISO2022KRData={
3468 sizeof(UConverterSharedData),
3469 ~((uint32_t) 0),
3470 NULL,
3471 NULL,
3472 &_ISO2022KRStaticData,
3473 FALSE,
3474 &_ISO2022KRImpl,
3475 0
3476 };
3477
3478 /*************** CN ***************/
/*
 * Implementation (function) table for the ISO-2022-CN converter.
 * It reuses the common open/close/reset, getName, writeSub, safeClone and
 * getUnicodeSet functions and plugs in the CN-specific toUnicode and
 * fromUnicode functions, each of which is listed twice.
 * NOTE(review): the two slots per direction are presumably the
 * without-offsets and with-offsets entry points - confirm against the
 * UConverterImpl declaration.
 */
3479 static const UConverterImpl _ISO2022CNImpl={
3480
3481 UCNV_ISO_2022,
3482
3483 NULL,
3484 NULL,
3485
3486 _ISO2022Open,
3487 _ISO2022Close,
3488 _ISO2022Reset,
3489
3490 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3491 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3492 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3493 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3494 NULL,
3495
3496 NULL,
3497 _ISO2022getName,
3498 _ISO_2022_WriteSub,
3499 _ISO_2022_SafeClone,
3500 _ISO_2022_GetUnicodeSet
3501 };
/*
 * Static descriptor for the ISO-2022-CN converter: name "ISO_2022_CN",
 * type UCNV_ISO_2022, at most 8 bytes per UChar (designator + single shift
 * + DBCS).
 * NOTE(review): the initializer is positional; { 0x1a, 0, 0, 0 } followed
 * by 1 is presumably the default substitution bytes and their length -
 * confirm against the UConverterStaticData declaration.
 */
3502 static const UConverterStaticData _ISO2022CNStaticData={
3503 sizeof(UConverterStaticData),
3504 "ISO_2022_CN",
3505 0,
3506 UCNV_IBM,
3507 UCNV_ISO_2022,
3508 1,
3509 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3510 { 0x1a, 0, 0, 0 },
3511 1,
3512 FALSE,
3513 FALSE,
3514 0,
3515 0,
3516 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3517 };
/*
 * Shared-data object for the ISO-2022-CN converter. It ties the static
 * descriptor (_ISO2022CNStaticData) to the implementation table
 * (_ISO2022CNImpl).
 * NOTE(review): ~((uint32_t) 0) is presumably a reference-count value that
 * marks the object as statically allocated - confirm against the
 * UConverterSharedData declaration.
 */
3518 static const UConverterSharedData _ISO2022CNData={
3519 sizeof(UConverterSharedData),
3520 ~((uint32_t) 0),
3521 NULL,
3522 NULL,
3523 &_ISO2022CNStaticData,
3524 FALSE,
3525 &_ISO2022CNImpl,
3526 0
3527 };
3528
3529
3530
3531 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */