]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv2022.c
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/*
 * Number of elements of a true array (must not be used on a pointer).
 * The whole expansion is parenthesized so that the result behaves as a single
 * operand next to any neighbouring operator (e.g. sizeof or postfix operators).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
78 static const char SHIFT_IN_STR[] = "\x0F";
79 static const char SHIFT_OUT_STR[] = "\x0E";
80
81 #define CR 0x0D
82 #define LF 0x0A
83 #define H_TAB 0x09
84 #define V_TAB 0x0B
85 #define SPACE 0x20
86
87 enum {
88 HWKANA_START=0xff61,
89 HWKANA_END=0xff9f
90 };
91
92 /*
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
99 */
100 enum {
101 GR94_START=0xa1,
102 GR94_END=0xfe,
103 GR96_START=0xa0,
104 GR96_END=0xff
105 };
106
107 /*
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
112 */
113 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
114
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and the CN groups both use the small values 1..3.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* JP */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
153
154 /* is the StateEnum charset value for a DBCS charset? */
155 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
156
157 #define CSM(cs) ((uint16_t)1<<(cs))
158
159 /*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
168 static const uint16_t jpCharsetMasks[5]={
169 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
170 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
171 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
174 };
175
/* Kind of sub-converter; stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
184
/* Designation and shift state of one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
190
191 #define UCNV_OPTIONS_VERSION_MASK 0xf
192 #define UCNV_2022_MAX_CONVERTERS 10
193
194 typedef struct{
195 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
196 UConverter *currentConverter;
197 Cnv2022Type currentType;
198 ISO2022State toU2022State, fromU2022State;
199 uint32_t key;
200 uint32_t version;
201 #ifdef U_ENABLE_GENERIC_ISO_2022
202 UBool isFirstBuffer;
203 #endif
204 UBool isEmptySegment;
205 char name[30];
206 char locale[3];
207 }UConverterDataISO2022;
208
209 /* Protos */
210 /* ISO-2022 ----------------------------------------------------------------- */
211
212 /*Forward declaration */
213 U_CFUNC void
214 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
215 UErrorCode * err);
216 U_CFUNC void
217 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
218 UErrorCode * err);
219
220 #define ESC_2022 0x1B /*ESC*/
221
/* Result of matching the escape sequence bytes seen so far; see escSeqStateTable_Value_2022[]. */
typedef enum {
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
229
230 /*
231 * The way these state transition arrays work is:
232 * ex : ESC$B is the sequence for JISX208
233 * a) First Iteration: char is ESC
234 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
235 * int x = normalize_esq_chars_2022[27] which is equal to 1
236 * ii) Search for this value in escSeqStateTable_Key_2022[]
237 * value of x is stored at escSeqStateTable_Key_2022[0]
238 * iii) Save this index as offset
239 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
240 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
241 * b) Switch on this state and continue to next char
242 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
243 * which is normalize_esq_chars_2022[36] == 4
244 * ii) x is currently 1(from above)
245 * x<<=5 -- x is now 32
246 * x+=normalize_esq_chars_2022[36]
247 * now x is 36
248 * iii) Search for this value in escSeqStateTable_Key_2022[]
249 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
250 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
251 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
252 * c) Switch on this state and continue to next char
253 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
254 * ii) x is currently 36 (from above)
255 * x<<=5 -- x is now 1152
256 * x+=normalize_esq_chars_2022[66]
257 * now x is 1161
258 * iii) Search for this value in escSeqStateTable_Key_2022[]
259 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
260 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
261 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
262 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
263 */
264
265
266 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value to the small code (1..29) that getKey_2022() folds into
 * the escape sequence key; 0 means the byte cannot occur in an escape sequence.
 * Each row holds ten entries; the comment gives the index of the row's first entry.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /*   0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  20 */  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
    /*  30 */  0,  0,  0,  0,  0,  0,  4,  7, 29,  0,
    /*  40 */  2, 24, 26, 27,  0,  3, 23,  6,  0,  0,
    /*  50 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  60 */  0,  0,  0,  0,  5,  8,  9, 10, 11, 12,
    /*  70 */ 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /*  80 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0,
    /*  90 */ 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 100 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 110 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 120 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 130 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 140 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 150 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 160 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 170 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 180 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 190 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 200 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 210 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 220 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 230 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 240 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 250 */  0,  0,  0,  0,  0,  0
};
297
298 #ifdef U_ENABLE_GENERIC_ISO_2022
299 /*
300 * When the generic ISO-2022 converter is completely removed, not just disabled
301 * per #ifdef, then the following state table and the associated tables that are
302 * dimensioned with MAX_STATES_2022 should be trimmed.
303 *
304 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
305 * the associated escape sequences starting with ESC ( B should be removed.
306 * This includes the ones with key values 1097 and all of the ones above 1000000.
307 *
308 * For the latter, the tables can simply be truncated.
309 * For the former, since the tables must be kept parallel, it is probably best
310 * to simply duplicate an adjacent table cell, parallel in all tables.
311 *
312 * It may make sense to restructure the tables, especially by using small search
313 * tables for the variants instead of indexing them parallel to the table here.
314 */
315 #endif
316
/* Number of entries in each of the parallel escape sequence tables. */
#define MAX_STATES_2022 74

/*
 * Keys of all known (partial or complete) escape sequences in ascending order;
 * getKey_2022() binary-searches this table and uses the found index as the
 * offset into the parallel tables.
 * Each row holds ten entries; the comment gives the index of the row's first entry.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
    /* 10 */ 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
    /* 20 */ 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
    /* 30 */ 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
    /* 40 */ 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
    /* 50 */ 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
    /* 60 */ 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
330
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each escape sequence, parallel to escSeqStateTable_Key_2022[];
 * NULL where no converter name is associated with the entry.
 * Each row holds ten entries; the comment gives the index of the row's first entry.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "latin1", "latin1",
    /* 10 */ "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20 */ "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208", NULL, NULL, NULL, NULL, "UTF8",
    /* 30 */ "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955", "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40 */ "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5",
             "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1",
             "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70 */ "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
347
348 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
349 /* 0 1 2 3 4 5 6 7 8 9 */
350 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
351 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
352 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
353 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
354 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
358 };
359
360
/* Selects the ISO-2022 variant whose rules changeState_2022() applies. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
370
371 /*********** ISO 2022 Converter Protos ***********/
372 static void
373 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
374
375 static void
376 _ISO2022Close(UConverter *converter);
377
378 static void
379 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
380
381 static const char*
382 _ISO2022getName(const UConverter* cnv);
383
384 static void
385 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
386
387 static UConverter *
388 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
389
390 #ifdef U_ENABLE_GENERIC_ISO_2022
391 static void
392 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
393 #endif
394
395 /*const UConverterSharedData _ISO2022Data;*/
396 static const UConverterSharedData _ISO2022JPData;
397 static const UConverterSharedData _ISO2022KRData;
398 static const UConverterSharedData _ISO2022CNData;
399
400 /*************** Converter implementations ******************/
401
402 /* The purpose of this function is to get around gcc compiler warnings. */
403 static U_INLINE void
404 fromUWriteUInt8(UConverter *cnv,
405 const char *bytes, int32_t length,
406 uint8_t **target, const char *targetLimit,
407 int32_t **offsets,
408 int32_t sourceIndex,
409 UErrorCode *pErrorCode)
410 {
411 char *targetChars = (char *)*target;
412 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
413 offsets, sourceIndex, pErrorCode);
414 *target = (uint8_t*)targetChars;
415
416 }
417
418 static U_INLINE void
419 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
420 if(myConverterData->version == 1) {
421 UConverter *cnv = myConverterData->currentConverter;
422
423 cnv->toUnicodeStatus=0; /* offset */
424 cnv->mode=0; /* state */
425 cnv->toULength=0; /* byteIndex */
426 }
427 }
428
429 static U_INLINE void
430 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
431 /* in ISO-2022-KR the designator sequence appears only once
432 * in a file so we append it only once
433 */
434 if( converter->charErrorBufferLength==0){
435
436 converter->charErrorBufferLength = 4;
437 converter->charErrorBuffer[0] = 0x1b;
438 converter->charErrorBuffer[1] = 0x24;
439 converter->charErrorBuffer[2] = 0x29;
440 converter->charErrorBuffer[3] = 0x43;
441 }
442 if(myConverterData->version == 1) {
443 UConverter *cnv = myConverterData->currentConverter;
444
445 cnv->fromUChar32=0;
446 cnv->fromUnicodeStatus=1; /* prevLength */
447 }
448 }
449
450 static void
451 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
452
453 char myLocale[6]={' ',' ',' ',' ',' ',' '};
454
455 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
456 if(cnv->extraInfo != NULL) {
457 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
458 uint32_t version;
459
460 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
461 myConverterData->currentType = ASCII1;
462 cnv->fromUnicodeStatus =FALSE;
463 if(locale){
464 uprv_strncpy(myLocale, locale, sizeof(myLocale));
465 }
466 version = options & UCNV_OPTIONS_VERSION_MASK;
467 myConverterData->version = version;
468 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
469 (myLocale[2]=='_' || myLocale[2]=='\0'))
470 {
471 size_t len=0;
472 /* open the required converters and cache them */
473 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
474 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
475 }
476 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
477 if(jpCharsetMasks[version]&CSM(JISX212)) {
478 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
479 }
480 if(jpCharsetMasks[version]&CSM(GB2312)) {
481 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
482 }
483 if(jpCharsetMasks[version]&CSM(KSC5601)) {
484 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
485 }
486
487 /* set the function pointers to appropriate funtions */
488 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
489 uprv_strcpy(myConverterData->locale,"ja");
490
491 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
492 len = uprv_strlen(myConverterData->name);
493 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
494 myConverterData->name[len+1]='\0';
495 }
496 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
497 (myLocale[2]=='_' || myLocale[2]=='\0'))
498 {
499 if (version==1){
500 myConverterData->currentConverter=
501 ucnv_open("icu-internal-25546",errorCode);
502
503 if (U_FAILURE(*errorCode)) {
504 _ISO2022Close(cnv);
505 return;
506 }
507
508 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
509 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
510 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
511 }else{
512 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
513
514 if (U_FAILURE(*errorCode)) {
515 _ISO2022Close(cnv);
516 return;
517 }
518
519 myConverterData->version = 0;
520 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
521 }
522
523 /* initialize the state variables */
524 setInitialStateToUnicodeKR(cnv, myConverterData);
525 setInitialStateFromUnicodeKR(cnv, myConverterData);
526
527 /* set the function pointers to appropriate funtions */
528 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
529 uprv_strcpy(myConverterData->locale,"ko");
530 }
531 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
532 (myLocale[2]=='_' || myLocale[2]=='\0'))
533 {
534
535 /* open the required converters and cache them */
536 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
537 if(version==1) {
538 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
539 }
540 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
541
542
543 /* set the function pointers to appropriate funtions */
544 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
545 uprv_strcpy(myConverterData->locale,"cn");
546
547 if (version==1){
548 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
549 }else{
550 myConverterData->version = 0;
551 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
552 }
553 }
554 else{
555 #ifdef U_ENABLE_GENERIC_ISO_2022
556 myConverterData->isFirstBuffer = TRUE;
557
558 /* append the UTF-8 escape sequence */
559 cnv->charErrorBufferLength = 3;
560 cnv->charErrorBuffer[0] = 0x1b;
561 cnv->charErrorBuffer[1] = 0x25;
562 cnv->charErrorBuffer[2] = 0x42;
563
564 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
565 /* initialize the state variables */
566 uprv_strcpy(myConverterData->name,"ISO_2022");
567 #else
568 *errorCode = U_UNSUPPORTED_ERROR;
569 return;
570 #endif
571 }
572
573 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
574
575 if(U_FAILURE(*errorCode)) {
576 _ISO2022Close(cnv);
577 }
578 } else {
579 *errorCode = U_MEMORY_ALLOCATION_ERROR;
580 }
581 }
582
583
584 static void
585 _ISO2022Close(UConverter *converter) {
586 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
587 UConverterSharedData **array = myData->myConverterArray;
588 int32_t i;
589
590 if (converter->extraInfo != NULL) {
591 /*close the array of converter pointers and free the memory*/
592 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
593 if(array[i]!=NULL) {
594 ucnv_unloadSharedDataIfReady(array[i]);
595 }
596 }
597
598 ucnv_close(myData->currentConverter);
599
600 if(!converter->isExtraLocal){
601 uprv_free (converter->extraInfo);
602 converter->extraInfo = NULL;
603 }
604 }
605 }
606
607 static void
608 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
609 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
610 if(choice<=UCNV_RESET_TO_UNICODE) {
611 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
612 myConverterData->key = 0;
613 myConverterData->isEmptySegment = FALSE;
614 }
615 if(choice!=UCNV_RESET_TO_UNICODE) {
616 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
617 }
618 #ifdef U_ENABLE_GENERIC_ISO_2022
619 if(myConverterData->locale[0] == 0){
620 if(choice<=UCNV_RESET_TO_UNICODE) {
621 myConverterData->isFirstBuffer = TRUE;
622 myConverterData->key = 0;
623 if (converter->mode == UCNV_SO){
624 ucnv_close (myConverterData->currentConverter);
625 myConverterData->currentConverter=NULL;
626 }
627 converter->mode = UCNV_SI;
628 }
629 if(choice!=UCNV_RESET_TO_UNICODE) {
630 /* re-append UTF-8 escape sequence */
631 converter->charErrorBufferLength = 3;
632 converter->charErrorBuffer[0] = 0x1b;
633 converter->charErrorBuffer[1] = 0x28;
634 converter->charErrorBuffer[2] = 0x42;
635 }
636 }
637 else
638 #endif
639 {
640 /* reset the state variables */
641 if(myConverterData->locale[0] == 'k'){
642 if(choice<=UCNV_RESET_TO_UNICODE) {
643 setInitialStateToUnicodeKR(converter, myConverterData);
644 }
645 if(choice!=UCNV_RESET_TO_UNICODE) {
646 setInitialStateFromUnicodeKR(converter, myConverterData);
647 }
648 }
649 }
650 }
651
652 static const char*
653 _ISO2022getName(const UConverter* cnv){
654 if(cnv->extraInfo){
655 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
656 return myData->name;
657 }
658 return NULL;
659 }
660
661
662 /*************** to unicode *******************/
663 /****************************************************************************
664 * Recognized escape sequences are
665 * <ESC>(B ASCII
666 * <ESC>.A ISO-8859-1
667 * <ESC>.F ISO-8859-7
668 * <ESC>(J JISX-201
669 * <ESC>(I JISX-201
670 * <ESC>$B JISX-208
671 * <ESC>$@ JISX-208
672 * <ESC>$(D JISX-212
673 * <ESC>$A GB2312
674 * <ESC>$(C KSC5601
675 */
676 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
677 /* 0 1 2 3 4 5 6 7 8 9 */
678 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
679 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
680 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
681 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
682 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
683 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
684 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
685 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
686 };
687
688 /*************** to unicode *******************/
689 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
690 /* 0 1 2 3 4 5 6 7 8 9 */
691 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
692 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
693 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
694 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
695 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
696 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
697 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
698 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
699 };
700
701
702 static UCNV_TableStates_2022
703 getKey_2022(char c,int32_t* key,int32_t* offset){
704 int32_t togo;
705 int32_t low = 0;
706 int32_t hi = MAX_STATES_2022;
707 int32_t oldmid=0;
708
709 togo = normalize_esq_chars_2022[(uint8_t)c];
710 if(togo == 0) {
711 /* not a valid character anywhere in an escape sequence */
712 *key = 0;
713 *offset = 0;
714 return INVALID_2022;
715 }
716 togo = (*key << 5) + togo;
717
718 while (hi != low) /*binary search*/{
719
720 register int32_t mid = (hi+low) >> 1; /*Finds median*/
721
722 if (mid == oldmid)
723 break;
724
725 if (escSeqStateTable_Key_2022[mid] > togo){
726 hi = mid;
727 }
728 else if (escSeqStateTable_Key_2022[mid] < togo){
729 low = mid;
730 }
731 else /*we found it*/{
732 *key = togo;
733 *offset = mid;
734 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
735 }
736 oldmid = mid;
737
738 }
739
740 *key = 0;
741 *offset = 0;
742 return INVALID_2022;
743 }
744
745 /*runs through a state machine to determine the escape sequence - codepage correspondence
746 */
747 static void
748 changeState_2022(UConverter* _this,
749 const char** source,
750 const char* sourceLimit,
751 Variant2022 var,
752 UErrorCode* err){
753 UCNV_TableStates_2022 value;
754 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
755 uint32_t key = myData2022->key;
756 int32_t offset = 0;
757 int8_t initialToULength = _this->toULength;
758 char c;
759
760 value = VALID_NON_TERMINAL_2022;
761 while (*source < sourceLimit) {
762 c = *(*source)++;
763 _this->toUBytes[_this->toULength++]=(uint8_t)c;
764 value = getKey_2022(c,(int32_t *) &key, &offset);
765
766 switch (value){
767
768 case VALID_NON_TERMINAL_2022 :
769 /* continue with the loop */
770 break;
771
772 case VALID_TERMINAL_2022:
773 key = 0;
774 goto DONE;
775
776 case INVALID_2022:
777 goto DONE;
778
779 case VALID_MAYBE_TERMINAL_2022:
780 #ifdef U_ENABLE_GENERIC_ISO_2022
781 /* ESC ( B is ambiguous only for ISO_2022 itself */
782 if(var == ISO_2022) {
783 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
784 _this->toULength = 0;
785
786 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
787
788 /* continue with the loop */
789 value = VALID_NON_TERMINAL_2022;
790 break;
791 } else
792 #endif
793 {
794 /* not ISO_2022 itself, finish here */
795 value = VALID_TERMINAL_2022;
796 key = 0;
797 goto DONE;
798 }
799 }
800 }
801
802 DONE:
803 myData2022->key = key;
804
805 if (value == VALID_NON_TERMINAL_2022) {
806 /* indicate that the escape sequence is incomplete: key!=0 */
807 return;
808 } else if (value == INVALID_2022 ) {
809 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
810 } else /* value == VALID_TERMINAL_2022 */ {
811 switch(var){
812 #ifdef U_ENABLE_GENERIC_ISO_2022
813 case ISO_2022:
814 {
815 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
816 if(chosenConverterName == NULL) {
817 /* SS2 or SS3 */
818 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
819 _this->toUCallbackReason = UCNV_UNASSIGNED;
820 return;
821 }
822
823 _this->mode = UCNV_SI;
824 ucnv_close(myData2022->currentConverter);
825 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
826 if(U_SUCCESS(*err)) {
827 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
828 _this->mode = UCNV_SO;
829 }
830 break;
831 }
832 #endif
833 case ISO_2022_JP:
834 {
835 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
836 switch(tempState) {
837 case INVALID_STATE:
838 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
839 break;
840 case SS2_STATE:
841 if(myData2022->toU2022State.cs[2]!=0) {
842 if(myData2022->toU2022State.g<2) {
843 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
844 }
845 myData2022->toU2022State.g=2;
846 } else {
847 /* illegal to have SS2 before a matching designator */
848 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
849 }
850 break;
851 /* case SS3_STATE: not used in ISO-2022-JP-x */
852 case ISO8859_1:
853 case ISO8859_7:
854 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
855 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
856 } else {
857 /* G2 charset for SS2 */
858 myData2022->toU2022State.cs[2]=(int8_t)tempState;
859 }
860 break;
861 default:
862 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
863 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
864 } else {
865 /* G0 charset */
866 myData2022->toU2022State.cs[0]=(int8_t)tempState;
867 }
868 break;
869 }
870 }
871 break;
872 case ISO_2022_CN:
873 {
874 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
875 switch(tempState) {
876 case INVALID_STATE:
877 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
878 break;
879 case SS2_STATE:
880 if(myData2022->toU2022State.cs[2]!=0) {
881 if(myData2022->toU2022State.g<2) {
882 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
883 }
884 myData2022->toU2022State.g=2;
885 } else {
886 /* illegal to have SS2 before a matching designator */
887 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888 }
889 break;
890 case SS3_STATE:
891 if(myData2022->toU2022State.cs[3]!=0) {
892 if(myData2022->toU2022State.g<2) {
893 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
894 }
895 myData2022->toU2022State.g=3;
896 } else {
897 /* illegal to have SS3 before a matching designator */
898 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
899 }
900 break;
901 case ISO_IR_165:
902 if(myData2022->version==0) {
903 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
904 break;
905 }
906 /*fall through*/
907 case GB2312_1:
908 /*fall through*/
909 case CNS_11643_1:
910 myData2022->toU2022State.cs[1]=(int8_t)tempState;
911 break;
912 case CNS_11643_2:
913 myData2022->toU2022State.cs[2]=(int8_t)tempState;
914 break;
915 default:
916 /* other CNS 11643 planes */
917 if(myData2022->version==0) {
918 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
919 } else {
920 myData2022->toU2022State.cs[3]=(int8_t)tempState;
921 }
922 break;
923 }
924 }
925 break;
926 case ISO_2022_KR:
927 if(offset==0x30){
928 /* nothing to be done, just accept this one escape sequence */
929 } else {
930 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
931 }
932 break;
933
934 default:
935 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
936 break;
937 }
938 }
939 if(U_SUCCESS(*err)) {
940 _this->toULength = 0;
941 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
942 if(_this->toULength>1) {
943 /*
944 * Ticket 5691: consistent illegal sequences:
945 * - We include at least the first byte (ESC) in the illegal sequence.
946 * - If any of the non-initial bytes could be the start of a character,
947 * we stop the illegal sequence before the first one of those.
948 * In escape sequences, all following bytes are "printable", that is,
949 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
950 * they are valid single/lead bytes.
951 * For simplicity, we always only report the initial ESC byte as the
952 * illegal sequence and back out all other bytes we looked at.
953 */
954 /* Back out some bytes. */
955 int8_t backOutDistance=_this->toULength-1;
956 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
957 if(backOutDistance<=bytesFromThisBuffer) {
958 /* same as initialToULength<=1 */
959 *source-=backOutDistance;
960 } else {
961 /* Back out bytes from the previous buffer: Need to replay them. */
962 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
963 /* same as -(initialToULength-1) */
964 /* preToULength is negative! */
965 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
966 *source-=bytesFromThisBuffer;
967 }
968 _this->toULength=1;
969 }
970 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
971 _this->toUCallbackReason = UCNV_UNASSIGNED;
972 }
973 }
974
975 /*Checks the characters of the buffer against valid 2022 escape sequences
976 *if the match we return a pointer to the initial start of the sequence otherwise
977 *we return sourceLimit
978 */
979 /*for 2022 looks ahead in the stream
980 *to determine the longest possible convertible
981 *data stream
982 */
983 static U_INLINE const char*
984 getEndOfBuffer_2022(const char** source,
985 const char* sourceLimit,
986 UBool flush){
987
988 const char* mySource = *source;
989
990 #ifdef U_ENABLE_GENERIC_ISO_2022
991 if (*source >= sourceLimit)
992 return sourceLimit;
993
994 do{
995
996 if (*mySource == ESC_2022){
997 int8_t i;
998 int32_t key = 0;
999 int32_t offset;
1000 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1001
1002 /* Kludge: I could not
1003 * figure out the reason for validating an escape sequence
1004 * twice - once here and once in changeState_2022().
1005 * is it possible to have an ESC character in a ISO2022
1006 * byte stream which is valid in a code page? Is it legal?
1007 */
1008 for (i=0;
1009 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1010 i++) {
1011 value = getKey_2022(*(mySource+i), &key, &offset);
1012 }
1013 if (value > 0 || *mySource==ESC_2022)
1014 return mySource;
1015
1016 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1017 return sourceLimit;
1018 }
1019 }while (++mySource < sourceLimit);
1020
1021 return sourceLimit;
1022 #else
1023 while(mySource < sourceLimit && *mySource != ESC_2022) {
1024 ++mySource;
1025 }
1026 return mySource;
1027 #endif
1028 }
1029
1030
1031 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1032 * any future change in _MBCSFromUChar32() function should be reflected here.
1033 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1034 */
1035 static U_INLINE int32_t
1036 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1037 UChar32 c,
1038 uint32_t* value,
1039 UBool useFallback,
1040 int outputType)
1041 {
1042 const int32_t *cx;
1043 const uint16_t *table;
1044 uint32_t stage2Entry;
1045 uint32_t myValue;
1046 int32_t length;
1047 const uint8_t *p;
1048 /*
1049 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1050 * Use internal version of ucnv_open() that verifies that the new structures are available,
1051 * else U_INTERNAL_PROGRAM_ERROR.
1052 */
1053 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1054 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1055 table=sharedData->mbcs.fromUnicodeTable;
1056 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1057 /* get the bytes and the length for the output */
1058 if(outputType==MBCS_OUTPUT_2){
1059 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1060 if(myValue<=0xff) {
1061 length=1;
1062 } else {
1063 length=2;
1064 }
1065 } else /* outputType==MBCS_OUTPUT_3 */ {
1066 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1067 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1068 if(myValue<=0xff) {
1069 length=1;
1070 } else if(myValue<=0xffff) {
1071 length=2;
1072 } else {
1073 length=3;
1074 }
1075 }
1076 /* is this code point assigned, or do we use fallbacks? */
1077 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1078 /* assigned */
1079 *value=myValue;
1080 return length;
1081 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1082 /*
1083 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1084 * There is no way with this data structure for fallback output
1085 * to be a zero byte.
1086 */
1087 *value=myValue;
1088 return -length;
1089 }
1090 }
1091
1092 cx=sharedData->mbcs.extIndexes;
1093 if(cx!=NULL) {
1094 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1095 }
1096
1097 /* unassigned */
1098 return 0;
1099 }
1100
1101 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1102 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1103 * @param retval pointer to output byte
1104 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1105 */
1106 static U_INLINE int32_t
1107 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1108 UChar32 c,
1109 uint32_t* retval,
1110 UBool useFallback)
1111 {
1112 const uint16_t *table;
1113 int32_t value;
1114 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1115 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1116 return 0;
1117 }
1118 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1119 table=sharedData->mbcs.fromUnicodeTable;
1120 /* get the byte for the output */
1121 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1122 /* is this code point assigned, or do we use fallbacks? */
1123 *retval=(uint32_t)(value&0xff);
1124 if(value>=0xf00) {
1125 return 1; /* roundtrip */
1126 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1127 return -1; /* fallback taken */
1128 } else {
1129 return 0; /* no mapping */
1130 }
1131 }
1132
1133 /*
1134 * Check that the result is a 2-byte value with each byte in the range A1..FE
1135 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
1136 * to move it to the ISO 2022 range 21..7E.
1137 * Return 0 if out of range.
1138 */
1139 static U_INLINE uint32_t
1140 _2022FromGR94DBCS(uint32_t value) {
1141 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1142 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
1143 ) {
1144 return value - 0x8080; /* shift down to 21..7e byte range */
1145 } else {
1146 return 0; /* not valid for ISO 2022 */
1147 }
1148 }
1149
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): add 0x80 to each byte of the 2022 code
 * point and return the result if both bytes then lie in A1..FE.
 * Otherwise return the 2022 code point unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t shifted = value + 0x8080;
    uint8_t lead = (uint8_t)(shifted >> 8);
    uint8_t trail = (uint8_t)shifted;

    if(0xa1 <= lead && lead <= 0xfe && 0xa1 <= trail && trail <= 0xfe) {
        return shifted;
    }
    return value;
}
#endif
1167
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * To-Unicode conversion for the generic ISO-2022 converter.
 *
 * While no escape sequence is pending (key==0), the input up to the limit
 * reported by getEndOfBuffer_2022() is passed to the current sub-converter
 * (an "ASCII" converter is opened on first use). Afterwards
 * changeState_2022() is called to process the escape sequence.
 * The function returns early on errors, on target overflow, when the input
 * is used up, and whenever offsets are requested and progress was made.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    UConverter* const outerCnv = args->converter;
    UConverterDataISO2022* const data2022 = (UConverterDataISO2022*)(outerCnv->extraInfo);
    const char* const inputLimit = args->sourceLimit;
    const char* chunkLimit;
    const char* chunkStart;
    const UChar* targetStart;
    int8_t count;

    while (args->source < inputLimit) {
        if(data2022->key == 0) { /* not in the middle of an escape sequence */
            /* Find the end of the convertible run: next escape sequence or end of buffer */
            chunkLimit = getEndOfBuffer_2022(&(args->source), inputLimit, args->flush);

            if(args->source < chunkLimit) {
                UConverter* subCnv;

                if(data2022->currentConverter==NULL) {
                    /* no charset designated yet: start out with ASCII */
                    data2022->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    data2022->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    outerCnv->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                data2022->isFirstBuffer=FALSE;
                chunkStart = args->source;
                targetStart = args->target;
                args->converter = data2022->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    chunkLimit,
                    args->offsets,
                    (UBool)(args->flush && chunkLimit == inputLimit),
                    err);
                args->converter = outerCnv;

                subCnv = data2022->currentConverter;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer from the sub-converter to this converter */
                    outerCnv->UCharErrorBufferLength = subCnv->UCharErrorBufferLength;
                    count = outerCnv->UCharErrorBufferLength;
                    subCnv->UCharErrorBufferLength = 0;
                    if(count > 0) {
                        uprv_memcpy(outerCnv->UCharErrorBuffer,
                                    subCnv->UCharErrorBuffer,
                                    count*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == inputLimit) ||
                    (args->offsets != NULL &&
                        (args->target != targetStart || args->source != chunkStart)) ||
                    (chunkLimit < inputLimit && subCnv->toULength > 0)
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        outerCnv->invalidCharLength = subCnv->invalidCharLength;
                        count = outerCnv->invalidCharLength;
                        if(count > 0) {
                            uprv_memcpy(outerCnv->invalidCharBuffer, subCnv->invalidCharBuffer, count);
                        }
                    } else {
                        outerCnv->toULength = subCnv->toULength;
                        count = outerCnv->toULength;
                        if(count > 0) {
                            uprv_memcpy(outerCnv->toUBytes, subCnv->toUBytes, count);
                            if(args->source < chunkLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* process the (possibly partial) escape sequence at the current position */
        chunkStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               inputLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != chunkStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
1281
1282 /*
1283 * To Unicode Callback helper function
1284 */
1285 static void
1286 toUnicodeCallback(UConverter *cnv,
1287 const uint32_t sourceChar, const uint32_t targetUniChar,
1288 UErrorCode* err){
1289 if(sourceChar>0xff){
1290 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1291 cnv->toUBytes[1] = (uint8_t)sourceChar;
1292 cnv->toULength = 2;
1293 }
1294 else{
1295 cnv->toUBytes[0] =(char) sourceChar;
1296 cnv->toULength = 1;
1297 }
1298
1299 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1300 *err = U_INVALID_CHAR_FOUND;
1301 }
1302 else{
1303 *err = U_ILLEGAL_CHAR_FOUND;
1304 }
1305 }
1306
1307 /**************************************ISO-2022-JP*************************************************/
1308
1309 /************************************** IMPORTANT **************************************************
1310 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1311 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1312 * The converter iterates over each Unicode codepoint
1313 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1314 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1315 * would do as far as possible.
1316 *
1317 * If the implementation of these macros or structure of sharedData struct change in the future, make
1318 * sure that ISO-2022 is also changed.
1319 ***************************************************************************************************
1320 */
1321
1322 /***************************************************************************************************
1323 * Rules for ISO-2022-jp encoding
1324 * (i) Escape sequences must be fully contained within a line they should not
1325 * span new lines or CRs
1326 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1327 * JIS-Roman character escape sequence should follow before the line terminates
1328 * (iii) If the first character on the line is represented by two bytes then a two
1329 * byte character escape sequence should precede it
1330 * (iv) If no escape sequence is encountered then the characters are ASCII
1331 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1332 * and invoked with SS2 (ESC N).
1333 * (vi) If there is any G0 designation in text, there must be a switch to
1334 * ASCII or to JIS X 0201-Roman before a space character (but not
1335 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1336 * characters such as tab or CRLF.
1337 * (vii) Supported encodings:
1338 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1339 *
1340 * source : RFC-1554
1341 *
1342 * JISX201, JISX208,JISX212 : new .cnv data files created
1343 * KSC5601 : alias to ibm-949 mapping table
1344 * GB2312 : alias to ibm-1386 mapping table
1345 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1346 * ISO-8859-7 : alias to ibm-9409 mapping table
1347 */
1348
1349 /* preference order of JP charsets */
1350 static const StateEnum jpCharsetPref[]={
1351 ASCII,
1352 JISX201,
1353 ISO8859_1,
1354 ISO8859_7,
1355 JISX208,
1356 JISX212,
1357 GB2312,
1358 KSC5601,
1359 HWKANA_7BIT
1360 };
1361
/*
 * Designation escape sequences, one per charset.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * The bytes are spelled as hex escapes, which fixes their numeric values
 * independently of the compiler's execution character set.
 */
static const char escSeqChars[][6] ={
    /* [0] */ "\x1b\x28\x42",       /* <ESC>(B   ASCII       */
    /* [1] */ "\x1b\x2e\x41",       /* <ESC>.A   ISO-8859-1  */
    /* [2] */ "\x1b\x2e\x46",       /* <ESC>.F   ISO-8859-7  */
    /* [3] */ "\x1b\x28\x4a",       /* <ESC>(J   JISX-201    */
    /* [4] */ "\x1b\x24\x42",       /* <ESC>$B   JISX-208    */
    /* [5] */ "\x1b\x24\x28\x44",   /* <ESC>$(D  JISX-212    */
    /* [6] */ "\x1b\x24\x41",       /* <ESC>$A   GB2312      */
    /* [7] */ "\x1b\x24\x28\x43",   /* <ESC>$(C  KSC5601     */
    /* [8] */ "\x1b\x28\x49"        /* <ESC>(I   HWKANA_7BIT */
};
/* Byte lengths of the escape sequences, listed in the same order as the sequences. */
static const int8_t escSeqCharsLen[] ={
    3,  /* <ESC>(B   ASCII       */
    3,  /* <ESC>.A   ISO-8859-1  */
    3,  /* <ESC>.F   ISO-8859-7  */
    3,  /* <ESC>(J   JISX-201    */
    3,  /* <ESC>$B   JISX-208    */
    4,  /* <ESC>$(D  JISX-212    */
    3,  /* <ESC>$A   GB2312      */
    4,  /* <ESC>$(C  KSC5601     */
    3   /* <ESC>(I   HWKANA_7BIT */
};
1389
1390 /*
1391 * The iteration over various code pages works this way:
1392 * i) Get the currentState from myConverterData->currentState
1393 * ii) Check if the character is mapped to a valid character in the currentState
1394 * Yes -> a) set the initIterState to currentState
1395 * b) remain in this state until an invalid character is found
1396 * No -> a) go to the next code page and find the character
1397 * iii) Before advancing to the next state, check whether that next state
1398 * is equal to the initIterState
1399 * Yes -> The character cannot be represented in any of the supported encodings;
1400 * break and return a U_INVALID_CHAR_FOUND error
1401 * No -> Continue and find the character in next code page
1402 *
1403 *
1404 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1405 */
1406
1407 /* Map 00..7F to Unicode according to JIS X 0201. */
1408 static U_INLINE uint32_t
1409 jisx201ToU(uint32_t value) {
1410 if(value < 0x5c) {
1411 return value;
1412 } else if(value == 0x5c) {
1413 return 0xa5;
1414 } else if(value == 0x7e) {
1415 return 0x203e;
1416 } else /* value <= 0x7f */ {
1417 return value;
1418 }
1419 }
1420
1421 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
1422 static U_INLINE uint32_t
1423 jisx201FromU(uint32_t value) {
1424 if(value<=0x7f) {
1425 if(value!=0x5c && value!=0x7e) {
1426 return value;
1427 }
1428 } else if(value==0xa5) {
1429 return 0x5c;
1430 } else if(value==0x203e) {
1431 return 0x7e;
1432 }
1433 return 0xfffe;
1434 }
1435
1436 /*
1437 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1438 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1439 * Return 0 if the byte pair is out of range.
1440 */
1441 static U_INLINE uint32_t
1442 _2022FromSJIS(uint32_t value) {
1443 uint8_t trail;
1444
1445 if(value > 0xEFFC) {
1446 return 0; /* beyond JIS X 0208 */
1447 }
1448
1449 trail = (uint8_t)value;
1450
1451 value &= 0xff00; /* lead byte */
1452 if(value <= 0x9f00) {
1453 value -= 0x7000;
1454 } else /* 0xe000 <= value <= 0xef00 */ {
1455 value -= 0xb000;
1456 }
1457 value <<= 1;
1458
1459 if(trail <= 0x9e) {
1460 value -= 0x100;
1461 if(trail <= 0x7e) {
1462 value |= trail - 0x1f;
1463 } else {
1464 value |= trail - 0x20;
1465 }
1466 } else /* trail <= 0xfc */ {
1467 value |= trail - 0x7e;
1468 }
1469 return value;
1470 }
1471
1472 /*
1473 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
1474 * If either byte is outside 21..7E make sure that the result is not valid
1475 * for Shift-JIS so that the converter catches it.
1476 * Some invalid byte values already turn into equally invalid Shift-JIS
1477 * byte values and need not be tested explicitly.
1478 */
1479 static U_INLINE void
1480 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
1481 if(c1&1) {
1482 ++c1;
1483 if(c2 <= 0x5f) {
1484 c2 += 0x1f;
1485 } else if(c2 <= 0x7e) {
1486 c2 += 0x20;
1487 } else {
1488 c2 = 0; /* invalid */
1489 }
1490 } else {
1491 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
1492 c2 += 0x7e;
1493 } else {
1494 c2 = 0; /* invalid */
1495 }
1496 }
1497 c1 >>= 1;
1498 if(c1 <= 0x2f) {
1499 c1 += 0x70;
1500 } else if(c1 <= 0x3f) {
1501 c1 += 0xb0;
1502 } else {
1503 c1 = 0; /* invalid */
1504 }
1505 bytes[0] = (char)c1;
1506 bytes[1] = (char)c2;
1507 }
1508
1509 /*
1510 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1511 * Katakana.
1512 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1513 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1514 * These were the only fallbacks in ICU's jisx-208.ucm file.
1515 */
1516 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1517 0x2123, /* U+FF61 */
1518 0x2156,
1519 0x2157,
1520 0x2122,
1521 0x2126,
1522 0x2572,
1523 0x2521,
1524 0x2523,
1525 0x2525,
1526 0x2527,
1527 0x2529,
1528 0x2563,
1529 0x2565,
1530 0x2567,
1531 0x2543,
1532 0x213C, /* U+FF70 */
1533 0x2522,
1534 0x2524,
1535 0x2526,
1536 0x2528,
1537 0x252A,
1538 0x252B,
1539 0x252D,
1540 0x252F,
1541 0x2531,
1542 0x2533,
1543 0x2535,
1544 0x2537,
1545 0x2539,
1546 0x253B,
1547 0x253D,
1548 0x253F, /* U+FF80 */
1549 0x2541,
1550 0x2544,
1551 0x2546,
1552 0x2548,
1553 0x254A,
1554 0x254B,
1555 0x254C,
1556 0x254D,
1557 0x254E,
1558 0x254F,
1559 0x2552,
1560 0x2555,
1561 0x2558,
1562 0x255B,
1563 0x255E,
1564 0x255F, /* U+FF90 */
1565 0x2560,
1566 0x2561,
1567 0x2562,
1568 0x2564,
1569 0x2566,
1570 0x2568,
1571 0x2569,
1572 0x256A,
1573 0x256B,
1574 0x256C,
1575 0x256D,
1576 0x256F,
1577 0x2573,
1578 0x212B,
1579 0x212C /* U+FF9F */
1580 };
1581
1582 static void
1583 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1584 UConverter *cnv = args->converter;
1585 UConverterDataISO2022 *converterData;
1586 ISO2022State *pFromU2022State;
1587 uint8_t *target = (uint8_t *) args->target;
1588 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1589 const UChar* source = args->source;
1590 const UChar* sourceLimit = args->sourceLimit;
1591 int32_t* offsets = args->offsets;
1592 UChar32 sourceChar;
1593 char buffer[8];
1594 int32_t len, outLen;
1595 int8_t choices[10];
1596 int32_t choiceCount;
1597 uint32_t targetValue = 0;
1598 UBool useFallback;
1599
1600 int32_t i;
1601 int8_t cs, g;
1602
1603 /* set up the state */
1604 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1605 pFromU2022State = &converterData->fromU2022State;
1606
1607 choiceCount = 0;
1608
1609 /* check if the last codepoint of previous buffer was a lead surrogate*/
1610 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1611 goto getTrail;
1612 }
1613
1614 while(source < sourceLimit) {
1615 if(target < targetLimit) {
1616
1617 sourceChar = *(source++);
1618 /*check if the char is a First surrogate*/
1619 if(UTF_IS_SURROGATE(sourceChar)) {
1620 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1621 getTrail:
1622 /*look ahead to find the trail surrogate*/
1623 if(source < sourceLimit) {
1624 /* test the following code unit */
1625 UChar trail=(UChar) *source;
1626 if(UTF_IS_SECOND_SURROGATE(trail)) {
1627 source++;
1628 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1629 cnv->fromUChar32=0x00;
1630 /* convert this supplementary code point */
1631 /* exit this condition tree */
1632 } else {
1633 /* this is an unmatched lead code unit (1st surrogate) */
1634 /* callback(illegal) */
1635 *err=U_ILLEGAL_CHAR_FOUND;
1636 cnv->fromUChar32=sourceChar;
1637 break;
1638 }
1639 } else {
1640 /* no more input */
1641 cnv->fromUChar32=sourceChar;
1642 break;
1643 }
1644 } else {
1645 /* this is an unmatched trail code unit (2nd surrogate) */
1646 /* callback(illegal) */
1647 *err=U_ILLEGAL_CHAR_FOUND;
1648 cnv->fromUChar32=sourceChar;
1649 break;
1650 }
1651 }
1652
1653 /* do not convert SO/SI/ESC */
1654 if(IS_2022_CONTROL(sourceChar)) {
1655 /* callback(illegal) */
1656 *err=U_ILLEGAL_CHAR_FOUND;
1657 cnv->fromUChar32=sourceChar;
1658 break;
1659 }
1660
1661 /* do the conversion */
1662
1663 if(choiceCount == 0) {
1664 uint16_t csm;
1665
1666 /*
1667 * The csm variable keeps track of which charsets are allowed
1668 * and not used yet while building the choices[].
1669 */
1670 csm = jpCharsetMasks[converterData->version];
1671 choiceCount = 0;
1672
1673 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1674 if(converterData->version == 3 || converterData->version == 4) {
1675 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1676 }
1677 /* Do not try single-byte half-width Katakana for other versions. */
1678 csm &= ~CSM(HWKANA_7BIT);
1679
1680 /* try the current G0 charset */
1681 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1682 csm &= ~CSM(cs);
1683
1684 /* try the current G2 charset */
1685 if((cs = pFromU2022State->cs[2]) != 0) {
1686 choices[choiceCount++] = cs;
1687 csm &= ~CSM(cs);
1688 }
1689
1690 /* try all the other possible charsets */
1691 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1692 cs = (int8_t)jpCharsetPref[i];
1693 if(CSM(cs) & csm) {
1694 choices[choiceCount++] = cs;
1695 csm &= ~CSM(cs);
1696 }
1697 }
1698 }
1699
1700 cs = g = 0;
1701 /*
1702 * len==0: no mapping found yet
1703 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1704 * len>0: found a roundtrip result, done
1705 */
1706 len = 0;
1707 /*
1708 * We will turn off useFallback after finding a fallback,
1709 * but we still get fallbacks from PUA code points as usual.
1710 * Therefore, we will also need to check that we don't overwrite
1711 * an early fallback with a later one.
1712 */
1713 useFallback = cnv->useFallback;
1714
1715 for(i = 0; i < choiceCount && len <= 0; ++i) {
1716 uint32_t value;
1717 int32_t len2;
1718 int8_t cs0 = choices[i];
1719 switch(cs0) {
1720 case ASCII:
1721 if(sourceChar <= 0x7f) {
1722 targetValue = (uint32_t)sourceChar;
1723 len = 1;
1724 cs = cs0;
1725 g = 0;
1726 }
1727 break;
1728 case ISO8859_1:
1729 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1730 targetValue = (uint32_t)sourceChar - 0x80;
1731 len = 1;
1732 cs = cs0;
1733 g = 2;
1734 }
1735 break;
1736 case HWKANA_7BIT:
1737 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1738 if(converterData->version==3) {
1739 /* JIS7: use G1 (SO) */
1740 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1741 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1742 len = 1;
1743 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1744 g = 1;
1745 } else if(converterData->version==4) {
1746 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1747 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1748 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1749 len = 1;
1750
1751 cs = pFromU2022State->cs[0];
1752 if(IS_JP_DBCS(cs)) {
1753 /* switch from a DBCS charset to JISX201 */
1754 cs = (int8_t)JISX201;
1755 }
1756 /* else stay in the current G0 charset */
1757 g = 0;
1758 }
1759 /* else do not use HWKANA_7BIT with other versions */
1760 }
1761 break;
1762 case JISX201:
1763 /* G0 SBCS */
1764 value = jisx201FromU(sourceChar);
1765 if(value <= 0x7f) {
1766 targetValue = value;
1767 len = 1;
1768 cs = cs0;
1769 g = 0;
1770 useFallback = FALSE;
1771 }
1772 break;
1773 case JISX208:
1774 /* G0 DBCS from Shift-JIS table */
1775 len2 = MBCS_FROM_UCHAR32_ISO2022(
1776 converterData->myConverterArray[cs0],
1777 sourceChar, &value,
1778 useFallback, MBCS_OUTPUT_2);
1779 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1780 value = _2022FromSJIS(value);
1781 if(value != 0) {
1782 targetValue = value;
1783 len = len2;
1784 cs = cs0;
1785 g = 0;
1786 useFallback = FALSE;
1787 }
1788 } else if(len == 0 && useFallback &&
1789 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1790 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1791 len = -2;
1792 cs = cs0;
1793 g = 0;
1794 useFallback = FALSE;
1795 }
1796 break;
1797 case ISO8859_7:
1798 /* G0 SBCS forced to 7-bit output */
1799 len2 = MBCS_SINGLE_FROM_UCHAR32(
1800 converterData->myConverterArray[cs0],
1801 sourceChar, &value,
1802 useFallback);
1803 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1804 targetValue = value - 0x80;
1805 len = len2;
1806 cs = cs0;
1807 g = 2;
1808 useFallback = FALSE;
1809 }
1810 break;
1811 default:
1812 /* G0 DBCS */
1813 len2 = MBCS_FROM_UCHAR32_ISO2022(
1814 converterData->myConverterArray[cs0],
1815 sourceChar, &value,
1816 useFallback, MBCS_OUTPUT_2);
1817 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1818 if(cs0 == KSC5601) {
1819 /*
1820 * Check for valid bytes for the encoding scheme.
1821 * This is necessary because the sub-converter (windows-949)
1822 * has a broader encoding scheme than is valid for 2022.
1823 */
1824 value = _2022FromGR94DBCS(value);
1825 if(value == 0) {
1826 break;
1827 }
1828 }
1829 targetValue = value;
1830 len = len2;
1831 cs = cs0;
1832 g = 0;
1833 useFallback = FALSE;
1834 }
1835 break;
1836 }
1837 }
1838
1839 if(len != 0) {
1840 if(len < 0) {
1841 len = -len; /* fallback */
1842 }
1843 outLen = 0; /* count output bytes */
1844
1845 /* write SI if necessary (only for JIS7) */
1846 if(pFromU2022State->g == 1 && g == 0) {
1847 buffer[outLen++] = UCNV_SI;
1848 pFromU2022State->g = 0;
1849 }
1850
1851 /* write the designation sequence if necessary */
1852 if(cs != pFromU2022State->cs[g]) {
1853 int32_t escLen = escSeqCharsLen[cs];
1854 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1855 outLen += escLen;
1856 pFromU2022State->cs[g] = cs;
1857
1858 /* invalidate the choices[] */
1859 choiceCount = 0;
1860 }
1861
1862 /* write the shift sequence if necessary */
1863 if(g != pFromU2022State->g) {
1864 switch(g) {
1865 /* case 0 handled before writing escapes */
1866 case 1:
1867 buffer[outLen++] = UCNV_SO;
1868 pFromU2022State->g = 1;
1869 break;
1870 default: /* case 2 */
1871 buffer[outLen++] = 0x1b;
1872 buffer[outLen++] = 0x4e;
1873 break;
1874 /* no case 3: no SS3 in ISO-2022-JP-x */
1875 }
1876 }
1877
1878 /* write the output bytes */
1879 if(len == 1) {
1880 buffer[outLen++] = (char)targetValue;
1881 } else /* len == 2 */ {
1882 buffer[outLen++] = (char)(targetValue >> 8);
1883 buffer[outLen++] = (char)targetValue;
1884 }
1885 } else {
1886 /*
1887 * if we cannot find the character after checking all codepages
1888 * then this is an error
1889 */
1890 *err = U_INVALID_CHAR_FOUND;
1891 cnv->fromUChar32=sourceChar;
1892 break;
1893 }
1894
1895 if(sourceChar == CR || sourceChar == LF) {
1896 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1897 pFromU2022State->cs[2] = 0;
1898 choiceCount = 0;
1899 }
1900
1901 /* output outLen>0 bytes in buffer[] */
1902 if(outLen == 1) {
1903 *target++ = buffer[0];
1904 if(offsets) {
1905 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1906 }
1907 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1908 *target++ = buffer[0];
1909 *target++ = buffer[1];
1910 if(offsets) {
1911 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1912 *offsets++ = sourceIndex;
1913 *offsets++ = sourceIndex;
1914 }
1915 } else {
1916 fromUWriteUInt8(
1917 cnv,
1918 buffer, outLen,
1919 &target, (const char *)targetLimit,
1920 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1921 err);
1922 if(U_FAILURE(*err)) {
1923 break;
1924 }
1925 }
1926 } /* end if(myTargetIndex<myTargetLength) */
1927 else{
1928 *err =U_BUFFER_OVERFLOW_ERROR;
1929 break;
1930 }
1931
1932 }/* end while(mySourceIndex<mySourceLength) */
1933
1934 /*
1935 * the end of the input stream and detection of truncated input
1936 * are handled by the framework, but for ISO-2022-JP conversion
1937 * we need to be in ASCII mode at the very end
1938 *
1939 * conditions:
1940 * successful
1941 * in SO mode or not in ASCII mode
1942 * end of input and no truncated input
1943 */
1944 if( U_SUCCESS(*err) &&
1945 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1946 args->flush && source>=sourceLimit && cnv->fromUChar32==0
1947 ) {
1948 int32_t sourceIndex;
1949
1950 outLen = 0;
1951
1952 if(pFromU2022State->g != 0) {
1953 buffer[outLen++] = UCNV_SI;
1954 pFromU2022State->g = 0;
1955 }
1956
1957 if(pFromU2022State->cs[0] != ASCII) {
1958 int32_t escLen = escSeqCharsLen[ASCII];
1959 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1960 outLen += escLen;
1961 pFromU2022State->cs[0] = (int8_t)ASCII;
1962 }
1963
1964 /* get the source index of the last input character */
1965 /*
1966 * TODO this would be simpler and more reliable if we used a pair
1967 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1968 * so that we could simply use the prevSourceIndex here;
1969 * this code gives an incorrect result for the rare case of an unmatched
1970 * trail surrogate that is alone in the last buffer of the text stream
1971 */
1972 sourceIndex=(int32_t)(source-args->source);
1973 if(sourceIndex>0) {
1974 --sourceIndex;
1975 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1976 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1977 ) {
1978 --sourceIndex;
1979 }
1980 } else {
1981 sourceIndex=-1;
1982 }
1983
1984 fromUWriteUInt8(
1985 cnv,
1986 buffer, outLen,
1987 &target, (const char *)targetLimit,
1988 &offsets, sourceIndex,
1989 err);
1990 }
1991
1992 /*save the state and return */
1993 args->source = source;
1994 args->target = (char*)target;
1995 }
1996
1997 /*************** to unicode *******************/
1998
/*
 * Converts ISO-2022-JP input bytes in [args->source, args->sourceLimit) to
 * UTF-16 written at args->target, filling args->offsets when it is non-NULL.
 *
 * State carried between calls (read from args->converter and its extraInfo):
 * - myData->key != 0: an escape sequence was only partially read;
 * - converter->toULength == 1: toUBytes[0] holds a double-byte lead byte
 *   whose trail byte had not arrived yet;
 * - myData->toU2022State: the designated charsets cs[] and the active set g.
 *
 * On exit args->source and args->target are advanced past what was consumed
 * and produced. *err is set here to U_BUFFER_OVERFLOW_ERROR (target full) or
 * U_ILLEGAL_ESCAPE_SEQUENCE (empty segment, version 0 only); it is also
 * passed to changeState_2022() and toUnicodeCallback().
 */
1999 static void
2000 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2001 UErrorCode* err){
/* scratch bytes handed to ucnv_MBCSSimpleGetNextUChar() for double-byte lookups */
2002 char tempBuf[2];
2003 const char *mySource = (char *) args->source;
2004 UChar *myTarget = args->target;
2005 const char *mySourceLimit = args->sourceLimit;
2006 uint32_t targetUniChar = 0x0000;
2007 uint32_t mySourceChar = 0x0000;
2008 uint32_t tmpSourceChar = 0x0000;
2009 UConverterDataISO2022* myData;
2010 ISO2022State *pToU2022State;
2011 StateEnum cs;
2012
2013 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2014 pToU2022State = &myData->toU2022State;
2015
2016 if(myData->key != 0) {
2017 /* continue with a partial escape sequence */
2018 goto escape;
2019 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2020 /* continue with a partial double-byte character */
2021 mySourceChar = args->converter->toUBytes[0];
2022 args->converter->toULength = 0;
2023 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2024 targetUniChar = missingCharMarker;
2025 goto getTrailByte;
2026 }
2027
2028 while(mySource < mySourceLimit){
2029
2030 targetUniChar =missingCharMarker;
2031
2032 if(myTarget < args->targetLimit){
2033
2034 mySourceChar= (unsigned char) *mySource++;
2035
2036 switch(mySourceChar) {
2037 case UCNV_SI:
2038 if(myData->version==3) {
/* JIS7: shift in, make G0 the active set again */
2039 pToU2022State->g=0;
2040 continue;
2041 } else {
2042 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2043 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2044 break;
2045 }
2046
2047 case UCNV_SO:
2048 if(myData->version==3) {
2049 /* JIS7: switch to G1 half-width Katakana */
2050 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2051 pToU2022State->g=1;
2052 continue;
2053 } else {
2054 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2055 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2056 break;
2057 }
2058
2059 case ESC_2022:
/* step back so that changeState_2022() sees the ESC byte itself */
2060 mySource--;
2061 escape:
2062 {
/* remembered so the length of the offending escape sequence can be computed below */
2063 const char * mySourceBefore = mySource;
2064 int8_t toULengthBefore = args->converter->toULength;
2065
2066 changeState_2022(args->converter,&(mySource),
2067 mySourceLimit, ISO_2022_JP,err);
2068
2069 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2070 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2071 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2072 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2073 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
2074 }
2075 }
2076
2077 /* invalid or illegal escape sequence */
2078 if(U_FAILURE(*err)){
2079 args->target = myTarget;
2080 args->source = mySource;
2081 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2082 return;
2083 }
2084 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2085 if(myData->key==0) {
2086 myData->isEmptySegment = TRUE;
2087 }
2088 continue;
2089
2090 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2091
2092 case CR:
2093 /*falls through*/
2094 case LF:
2095 /* automatically reset to single-byte mode */
2096 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2097 pToU2022State->cs[0] = (int8_t)ASCII;
2098 }
2099 pToU2022State->cs[2] = 0;
2100 pToU2022State->g = 0;
2101 /* falls through */
2102 default:
2103 /* convert one or two bytes */
2104 myData->isEmptySegment = FALSE;
2105 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2106 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2107 !IS_JP_DBCS(cs)
2108 ) {
2109 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2110 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2111
2112 /* return from a single-shift state to the previous one */
2113 if(pToU2022State->g >= 2) {
2114 pToU2022State->g=pToU2022State->prevG;
2115 }
2116 } else switch(cs) {
2117 case ASCII:
2118 if(mySourceChar <= 0x7f) {
2119 targetUniChar = mySourceChar;
2120 }
2121 break;
2122 case ISO8859_1:
2123 if(mySourceChar <= 0x7f) {
2124 targetUniChar = mySourceChar + 0x80;
2125 }
2126 /* return from a single-shift state to the previous one */
2127 pToU2022State->g=pToU2022State->prevG;
2128 break;
2129 case ISO8859_7:
2130 if(mySourceChar <= 0x7f) {
2131 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2132 targetUniChar =
2133 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2134 myData->myConverterArray[cs],
2135 mySourceChar + 0x80);
2136 }
2137 /* return from a single-shift state to the previous one */
2138 pToU2022State->g=pToU2022State->prevG;
2139 break;
2140 case JISX201:
2141 if(mySourceChar <= 0x7f) {
2142 targetUniChar = jisx201ToU(mySourceChar);
2143 }
2144 break;
2145 case HWKANA_7BIT:
2146 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2147 /* 7-bit halfwidth Katakana */
2148 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2149 }
2150 break;
2151 default:
2152 /* G0 DBCS */
2153 if(mySource < mySourceLimit) {
2154 int leadIsOk, trailIsOk;
2155 uint8_t trailByte;
/* also entered from the top of the function when a lead byte was saved by the previous call */
2156 getTrailByte:
2157 trailByte = (uint8_t)*mySource;
2158 /*
2159 * Ticket 5691: consistent illegal sequences:
2160 * - We include at least the first byte in the illegal sequence.
2161 * - If any of the non-initial bytes could be the start of a character,
2162 * we stop the illegal sequence before the first one of those.
2163 *
2164 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2165 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2166 * Otherwise we convert or report the pair of bytes.
2167 */
2168 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2169 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2170 if (leadIsOk && trailIsOk) {
2171 ++mySource;
2172 tmpSourceChar = (mySourceChar << 8) | trailByte;
2173 if(cs == JISX208) {
/* _2022ToSJIS() fills tempBuf with the byte pair used for the table lookup below */
2174 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2175 mySourceChar = tmpSourceChar;
2176 } else {
2177 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2178 mySourceChar = tmpSourceChar;
2179 if (cs == KSC5601) {
2180 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2181 }
2182 tempBuf[0] = (char)(tmpSourceChar >> 8);
2183 tempBuf[1] = (char)(tmpSourceChar);
2184 }
2185 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2186 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2187 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2188 ++mySource;
2189 /* add another bit so that the code below writes 2 bytes in case of error */
2190 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2191 }
2192 } else {
/* the lead byte is the last byte of this buffer: save it for the next call */
2193 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2194 args->converter->toULength = 1;
2195 goto endloop;
2196 }
2197 } /* End of inner switch */
2198 break;
2199 } /* End of outer switch */
/* a result below 0xfffe is written as a single UChar */
2200 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2201 if(args->offsets){
/* mySourceChar > 0xff means two source bytes were consumed for this character */
2202 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2203 }
2204 *(myTarget++)=(UChar)targetUniChar;
2205 }
2206 else if(targetUniChar > missingCharMarker){
2207 /* disassemble the surrogate pair and write to output*/
2208 targetUniChar-=0x0010000;
2209 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2210 if(args->offsets){
2211 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2212 }
2213 ++myTarget;
2214 if(myTarget< args->targetLimit){
2215 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2216 if(args->offsets){
2217 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2218 }
2219 ++myTarget;
2220 }else{
/* no room left for the trail surrogate: append it to the converter's UCharErrorBuffer */
2221 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2222 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2223 }
2224
2225 }
2226 else{
2227 /* Call the callback function*/
2228 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2229 break;
2230 }
2231 }
2232 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2233 *err =U_BUFFER_OVERFLOW_ERROR;
2234 break;
2235 }
2236 }
2237 endloop:
/* publish how far source and target advanced */
2238 args->target = myTarget;
2239 args->source = mySource;
2240 }
2241
2242
2243 /***************************************************************
2244 * Rules for ISO-2022-KR encoding
2245 * i) The KSC5601 designator sequence should appear only once in a file,
2246 * at the beginning of a line before any KSC5601 characters. This usually
2247 * means that it appears by itself on the first line of the file
2248 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2249 * and SI to shift into single byte mode
2250 */
2251 static void
2252 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2253
2254 UConverter* saveConv = args->converter;
2255 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2256 args->converter=myConverterData->currentConverter;
2257
2258 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2259 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2260 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2261
2262 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2263 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2264 uprv_memcpy(
2265 saveConv->charErrorBuffer,
2266 myConverterData->currentConverter->charErrorBuffer,
2267 myConverterData->currentConverter->charErrorBufferLength);
2268 }
2269 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2270 myConverterData->currentConverter->charErrorBufferLength = 0;
2271 }
2272 args->converter=saveConv;
2273 }
2274
/*
 * Converts UTF-16 from args->source to ISO-2022-KR bytes in args->target,
 * filling args->offsets when it is non-NULL.
 *
 * Version 1 of the converter is delegated to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(). Otherwise each code
 * unit is looked up in the current converter's shared data; single bytes up
 * to 0x7f are written as is, byte pairs are written with 0x80 subtracted from
 * each byte, and SO/SI is written whenever the output switches between the
 * two widths.
 *
 * State carried between calls: converter->fromUnicodeStatus (non-zero while
 * the output is in double-byte mode) and converter->fromUChar32 (a pending
 * lead surrogate, or the code point that caused an error).
 *
 * On return args->source and args->target are advanced. *err is set to
 * U_BUFFER_OVERFLOW_ERROR, U_ILLEGAL_CHAR_FOUND or U_INVALID_CHAR_FOUND as
 * noted below.
 */
2275 static void
2276 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2277
2278 const UChar *source = args->source;
2279 const UChar *sourceLimit = args->sourceLimit;
2280 unsigned char *target = (unsigned char *) args->target;
2281 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2282 int32_t* offsets = args->offsets;
2283 uint32_t targetByteUnit = 0x0000;
2284 UChar32 sourceChar = 0x0000;
2285 UBool isTargetByteDBCS;
2286 UBool oldIsTargetByteDBCS;
2287 UConverterDataISO2022 *converterData;
2288 UConverterSharedData* sharedData;
2289 UBool useFallback;
2290 int32_t length =0;
2291
2292 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2293 /* if the version is 1 then the user is requesting
2294 * conversion with ibm-25546 pass the arguments to
2295 * MBCS converter and return
2296 */
2297 if(converterData->version==1){
2298 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2299 return;
2300 }
2301
2302 /* initialize data */
2303 sharedData = converterData->currentConverter->sharedData;
2304 useFallback = args->converter->useFallback;
2305 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2306 oldIsTargetByteDBCS = isTargetByteDBCS;
2307
/* NOTE(review): repeats the assignment made a few lines above */
2308 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
/* a non-zero fromUChar32 left by the previous call is continued at getTrail */
2309 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2310 goto getTrail;
2311 }
2312 while(source < sourceLimit){
2313
2314 targetByteUnit = missingCharMarker;
2315
2316 if(target < (unsigned char*) args->targetLimit){
2317 sourceChar = *source++;
2318
2319 /* do not convert SO/SI/ESC */
2320 if(IS_2022_CONTROL(sourceChar)) {
2321 /* callback(illegal) */
2322 *err=U_ILLEGAL_CHAR_FOUND;
2323 args->converter->fromUChar32=sourceChar;
2324 break;
2325 }
2326
2327 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2328 if(length < 0) {
2329 length = -length; /* fallback */
2330 }
2331 /* only DBCS or SBCS characters are expected*/
2332 /* DB characters with high bit set to 1 are expected */
/* two-byte results must lie in a1a1..fefe with the low byte in a1..fe */
2333 if( length > 2 || length==0 ||
2334 (length == 1 && targetByteUnit > 0x7f) ||
2335 (length == 2 &&
2336 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2337 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2338 ) {
2339 targetByteUnit=missingCharMarker;
2340 }
2341 if (targetByteUnit != missingCharMarker){
2342
2343 oldIsTargetByteDBCS = isTargetByteDBCS;
2344 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2345 /* append the shift sequence */
2346 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2347
/* room for this one byte was established by the target < targetLimit test above */
2348 if (isTargetByteDBCS)
2349 *target++ = UCNV_SO;
2350 else
2351 *target++ = UCNV_SI;
2352 if(offsets)
2353 *(offsets++) = (int32_t)(source - args->source-1);
2354 }
2355 /* write the targetUniChar to target */
2356 if(targetByteUnit <= 0x00FF){
2357 if( target < targetLimit){
2358 *(target++) = (unsigned char) targetByteUnit;
2359 if(offsets){
2360 *(offsets++) = (int32_t)(source - args->source-1);
2361 }
2362
2363 }else{
/* target is full: keep the byte in the converter's charErrorBuffer */
2364 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2365 *err = U_BUFFER_OVERFLOW_ERROR;
2366 }
2367 }else{
/* each byte of the pair is written with 0x80 subtracted */
2368 if(target < targetLimit){
2369 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2370 if(offsets){
2371 *(offsets++) = (int32_t)(source - args->source-1);
2372 }
2373 if(target < targetLimit){
2374 *(target++) =(unsigned char) (targetByteUnit -0x80);
2375 if(offsets){
2376 *(offsets++) = (int32_t)(source - args->source-1);
2377 }
2378 }else{
2379 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2380 *err = U_BUFFER_OVERFLOW_ERROR;
2381 }
2382 }else{
2383 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2384 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2385 *err = U_BUFFER_OVERFLOW_ERROR;
2386 }
2387 }
2388
2389 }
2390 else{
2391 /* oops.. the code point is unassigned
2392 * set the error and reason
2393 */
2394
2395 /*check if the char is a First surrogate*/
2396 if(UTF_IS_SURROGATE(sourceChar)) {
2397 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
/* also entered from the top of the function when fromUChar32 was pending */
2398 getTrail:
2399 /*look ahead to find the trail surrogate*/
2400 if(source < sourceLimit) {
2401 /* test the following code unit */
2402 UChar trail=(UChar) *source;
2403 if(UTF_IS_SECOND_SURROGATE(trail)) {
2404 source++;
2405 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2406 *err = U_INVALID_CHAR_FOUND;
2407 /* convert this surrogate code point */
2408 /* exit this condition tree */
2409 } else {
2410 /* this is an unmatched lead code unit (1st surrogate) */
2411 /* callback(illegal) */
2412 *err=U_ILLEGAL_CHAR_FOUND;
2413 }
2414 } else {
2415 /* no more input */
2416 *err = U_ZERO_ERROR;
2417 }
2418 } else {
2419 /* this is an unmatched trail code unit (2nd surrogate) */
2420 /* callback(illegal) */
2421 *err=U_ILLEGAL_CHAR_FOUND;
2422 }
2423 } else {
2424 /* callback(unassigned) for a BMP code point */
2425 *err = U_INVALID_CHAR_FOUND;
2426 }
2427
/* remember the code point (or lone lead surrogate) in the converter and stop */
2428 args->converter->fromUChar32=sourceChar;
2429 break;
2430 }
2431 } /* end if(myTargetIndex<myTargetLength) */
2432 else{
2433 *err =U_BUFFER_OVERFLOW_ERROR;
2434 break;
2435 }
2436
2437 }/* end while(mySourceIndex<mySourceLength) */
2438
2439 /*
2440 * the end of the input stream and detection of truncated input
2441 * are handled by the framework, but for ISO-2022-KR conversion
2442 * we need to be in ASCII mode at the very end
2443 *
2444 * conditions:
2445 * successful
2446 * not in ASCII mode
2447 * end of input and no truncated input
2448 */
2449 if( U_SUCCESS(*err) &&
2450 isTargetByteDBCS &&
2451 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2452 ) {
2453 int32_t sourceIndex;
2454
2455 /* we are switching to ASCII */
2456 isTargetByteDBCS=FALSE;
2457
2458 /* get the source index of the last input character */
2459 /*
2460 * TODO this would be simpler and more reliable if we used a pair
2461 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2462 * so that we could simply use the prevSourceIndex here;
2463 * this code gives an incorrect result for the rare case of an unmatched
2464 * trail surrogate that is alone in the last buffer of the text stream
2465 */
2466 sourceIndex=(int32_t)(source-args->source);
2467 if(sourceIndex>0) {
2468 --sourceIndex;
2469 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2470 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2471 ) {
2472 --sourceIndex;
2473 }
2474 } else {
2475 sourceIndex=-1;
2476 }
2477
/* write the single byte of SHIFT_IN_STR, attributed to sourceIndex */
2478 fromUWriteUInt8(
2479 args->converter,
2480 SHIFT_IN_STR, 1,
2481 &target, (const char *)targetLimit,
2482 &offsets, sourceIndex,
2483 err);
2484 }
2485
2486 /*save the state and return */
2487 args->source = source;
2488 args->target = (char*)target;
2489 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2490 }
2491
2492 /************************ To Unicode ***************************************/
2493
/*
 * ISO-2022-KR to-Unicode conversion that delegates to the MBCS sub-converter
 * stored in the converter's extraInfo (currentConverter).
 *
 * The input is processed in segments delimited by getEndOfBuffer_2022(); each
 * segment is converted by ucnv_MBCSToUnicodeWithOffsets() through a copy of
 * the caller's arguments, and the bytes following a segment are handed to
 * changeState_2022(). Partial input (toUBytes/toULength) is moved into the
 * sub-converter before each segment and back afterwards, and the UChar
 * overflow buffer is moved back on U_BUFFER_OVERFLOW_ERROR, so the public
 * converter holds that state.
 *
 * A non-zero myData->key on entry means an escape sequence was only partially
 * read by the previous call; processing then resumes at changeState_2022().
 */
2494 static void
2495 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2496 UErrorCode* err){
2497 char const* sourceStart;
2498 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2499
2500 UConverterToUnicodeArgs subArgs;
2501 int32_t minArgsSize;
2502
2503 /* set up the subconverter arguments */
/* copy no more than the smaller of the caller's args->size and this build's struct size */
2504 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2505 minArgsSize = args->size;
2506 } else {
2507 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2508 }
2509
2510 uprv_memcpy(&subArgs, args, minArgsSize);
2511 subArgs.size = (uint16_t)minArgsSize;
2512 subArgs.converter = myData->currentConverter;
2513
2514 /* remember the original start of the input for offsets */
2515 sourceStart = args->source;
2516
2517 if(myData->key != 0) {
2518 /* continue with a partial escape sequence */
2519 goto escape;
2520 }
2521
2522 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2523 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2524 subArgs.source = args->source;
2525 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2526 if(subArgs.source != subArgs.sourceLimit) {
2527 /*
2528 * get the current partial byte sequence
2529 *
2530 * it needs to be moved between the public and the subconverter
2531 * so that the conversion framework, which only sees the public
2532 * converter, can handle truncated and illegal input etc.
2533 */
2534 if(args->converter->toULength > 0) {
2535 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2536 }
2537 subArgs.converter->toULength = args->converter->toULength;
2538
2539 /*
2540 * Convert up to the end of the input, or to before the next escape character.
2541 * Does not handle conversion extensions because the preToU[] state etc.
2542 * is not copied.
2543 */
2544 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2545
2546 if(args->offsets != NULL && sourceStart != args->source) {
2547 /* update offsets to base them on the actual start of the input */
2548 int32_t *offsets = args->offsets;
2549 UChar *target = args->target;
2550 int32_t delta = (int32_t)(args->source - sourceStart);
/* walk the UChars just produced; negative offsets are left untouched */
2551 while(target < subArgs.target) {
2552 if(*offsets >= 0) {
2553 *offsets += delta;
2554 }
2555 ++offsets;
2556 ++target;
2557 }
2558 }
/* take over the sub-converter's progress */
2559 args->source = subArgs.source;
2560 args->target = subArgs.target;
2561 args->offsets = subArgs.offsets;
2562
2563 /* copy input/error/overflow buffers */
2564 if(subArgs.converter->toULength > 0) {
2565 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2566 }
2567 args->converter->toULength = subArgs.converter->toULength;
2568
2569 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2570 if(subArgs.converter->UCharErrorBufferLength > 0) {
2571 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2572 subArgs.converter->UCharErrorBufferLength);
2573 }
2574 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2575 subArgs.converter->UCharErrorBufferLength = 0;
2576 }
2577 }
2578
/* stop on any error or when all input has been consumed */
2579 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2580 return;
2581 }
2582
/* also entered from the top of the function when an escape sequence was left unfinished */
2583 escape:
2584 changeState_2022(args->converter,
2585 &(args->source),
2586 args->sourceLimit,
2587 ISO_2022_KR,
2588 err);
2589 }
2590 }
2591
2592 static void
2593 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2594 UErrorCode* err){
2595 char tempBuf[2];
2596 const char *mySource = ( char *) args->source;
2597 UChar *myTarget = args->target;
2598 const char *mySourceLimit = args->sourceLimit;
2599 UChar32 targetUniChar = 0x0000;
2600 UChar mySourceChar = 0x0000;
2601 UConverterDataISO2022* myData;
2602 UConverterSharedData* sharedData ;
2603 UBool useFallback;
2604
2605 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2606 if(myData->version==1){
2607 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2608 return;
2609 }
2610
2611 /* initialize state */
2612 sharedData = myData->currentConverter->sharedData;
2613 useFallback = args->converter->useFallback;
2614
2615 if(myData->key != 0) {
2616 /* continue with a partial escape sequence */
2617 goto escape;
2618 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2619 /* continue with a partial double-byte character */
2620 mySourceChar = args->converter->toUBytes[0];
2621 args->converter->toULength = 0;
2622 goto getTrailByte;
2623 }
2624
2625 while(mySource< mySourceLimit){
2626
2627 if(myTarget < args->targetLimit){
2628
2629 mySourceChar= (unsigned char) *mySource++;
2630
2631 if(mySourceChar==UCNV_SI){
2632 myData->toU2022State.g = 0;
2633 if (myData->isEmptySegment) {
2634 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2635 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2636 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2637 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2638 args->converter->toULength = 1;
2639 args->target = myTarget;
2640 args->source = mySource;
2641 return;
2642 }
2643 /*consume the source */
2644 continue;
2645 }else if(mySourceChar==UCNV_SO){
2646 myData->toU2022State.g = 1;
2647 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2648 /*consume the source */
2649 continue;
2650 }else if(mySourceChar==ESC_2022){
2651 mySource--;
2652 escape:
2653 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2654 changeState_2022(args->converter,&(mySource),
2655 mySourceLimit, ISO_2022_KR, err);
2656 if(U_FAILURE(*err)){
2657 args->target = myTarget;
2658 args->source = mySource;
2659 return;
2660 }
2661 continue;
2662 }
2663
2664 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2665 if(myData->toU2022State.g == 1) {
2666 if(mySource < mySourceLimit) {
2667 int leadIsOk, trailIsOk;
2668 uint8_t trailByte;
2669 getTrailByte:
2670 targetUniChar = missingCharMarker;
2671 trailByte = (uint8_t)*mySource;
2672 /*
2673 * Ticket 5691: consistent illegal sequences:
2674 * - We include at least the first byte in the illegal sequence.
2675 * - If any of the non-initial bytes could be the start of a character,
2676 * we stop the illegal sequence before the first one of those.
2677 *
2678 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2679 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2680 * Otherwise we convert or report the pair of bytes.
2681 */
2682 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2683 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2684 if (leadIsOk && trailIsOk) {
2685 ++mySource;
2686 tempBuf[0] = (char)(mySourceChar + 0x80);
2687 tempBuf[1] = (char)(trailByte + 0x80);
2688 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2689 mySourceChar = (mySourceChar << 8) | trailByte;
2690 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2691 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2692 ++mySource;
2693 /* add another bit so that the code below writes 2 bytes in case of error */
2694 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2695 }
2696 } else {
2697 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2698 args->converter->toULength = 1;
2699 break;
2700 }
2701 }
2702 else if(mySourceChar <= 0x7f) {
2703 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2704 } else {
2705 targetUniChar = 0xffff;
2706 }
2707 if(targetUniChar < 0xfffe){
2708 if(args->offsets) {
2709 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2710 }
2711 *(myTarget++)=(UChar)targetUniChar;
2712 }
2713 else {
2714 /* Call the callback function*/
2715 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2716 break;
2717 }
2718 }
2719 else{
2720 *err =U_BUFFER_OVERFLOW_ERROR;
2721 break;
2722 }
2723 }
2724 args->target = myTarget;
2725 args->source = mySource;
2726 }
2727
2728 /*************************** END ISO2022-KR *********************************/
2729
2730 /*************************** ISO-2022-CN *********************************
2731 *
2732 * Rules for ISO-2022-CN Encoding:
2733 * i) The designator sequence must appear once on a line before any instance
2734 * of character set it designates.
2735 * ii) If two lines contain characters from the same character set, both lines
2736 * must include the designator sequence.
2737 * iii) Once the designator sequence is known, a shifting sequence has to be found
2738 * to invoke the shifting
2739 * iv) All lines start in ASCII and end in ASCII.
2740 * v) Four shifting sequences are employed for this purpose:
2741 *
2742 * Sequence ASCII Eq Charsets
2743 * ---------- ------- ---------
2744 * SI <SI> US-ASCII
2745 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2746 * SS2 <ESC>N CNS-11643-1992 Plane 2
2747 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2748 *
2749 * vi)
2750 * SOdesignator : ESC "$" ")" finalchar_for_SO
2751 * SS2designator : ESC "$" "*" finalchar_for_SS2
2752 * SS3designator : ESC "$" "+" finalchar_for_SS3
2753 *
2754 * ESC $ ) A Indicates the bytes following SO are Chinese
2755 * characters as defined in GB 2312-80, until
2756 * another SOdesignation appears
2757 *
2758 *
2759 * ESC $ ) E Indicates the bytes following SO are as defined
2760 * in ISO-IR-165 (for details, see section 2.1),
2761 * until another SOdesignation appears
2762 *
2763 * ESC $ ) G Indicates the bytes following SO are as defined
2764 * in CNS 11643-plane-1, until another
2765 * SOdesignation appears
2766 *
2767 * ESC $ * H Indicates the two bytes immediately following
2768 * SS2 is a Chinese character as defined in CNS
2769 * 11643-plane-2, until another SS2designation
2770 * appears
2771 * (Meaning <ESC>N must precede every 2 byte
2772 * sequence.)
2773 *
2774 * ESC $ + I Indicates the immediate two bytes following SS3
2775 * is a Chinese character as defined in CNS
2776 * 11643-plane-3, until another SS3designation
2777 * appears
2778 * (Meaning <ESC>O must precede every 2 byte
2779 * sequence.)
2780 *
2781 * ESC $ + J Indicates the immediate two bytes following SS3
2782 * is a Chinese character as defined in CNS
2783 * 11643-plane-4, until another SS3designation
2784 * appears
2785 * (In English: <ESC>O must precede every 2 byte
2786 * sequence.)
2787 *
2788 * ESC $ + K Indicates the immediate two bytes following SS3
2789 * is a Chinese character as defined in CNS
2790 * 11643-plane-5, until another SS3designation
2791 * appears
2792 *
2793 * ESC $ + L Indicates the immediate two bytes following SS3
2794 * is a Chinese character as defined in CNS
2795 * 11643-plane-6, until another SS3designation
2796 * appears
2797 *
2798 * ESC $ + M Indicates the immediate two bytes following SS3
2799 * is a Chinese character as defined in CNS
2800 * 11643-plane-7, until another SS3designation
2801 * appears
2802 *
2803 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2804 * has its own designation information before any Chinese characters
2805 * appear
2806 *
2807 */
2808
2809 /* The following are defined this way to make the strings truly read-only */
/*
 * Each string is a 4-byte designation sequence: ESC (0x1B), '$' (0x24),
 * an intermediate byte (')' 0x29, '*' 0x2A or '+' 0x2B) and a final byte.
 */
2810 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
2811 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
2812 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2813 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2814 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2815 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2816 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2817 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2818 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2819
2820 /********************** ISO2022-CN Data **************************/
/*
 * Shift/designation strings for the ISO-2022-CN fromUnicode code.
 * UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC() indexes this table with
 * the charset value cs itself when cs < CNS_11643, and with
 * CNS_11643 + (cs - CNS_11643_1) for the CNS 11643 planes, so the order of
 * the entries must stay in step with those charset constants.
 */
2821 static const char* const escSeqCharsCN[10] ={
2822 SHIFT_IN_STR, /* ASCII */
2823 GB_2312_80_STR,
2824 ISO_IR_165_STR,
2825 CNS_11643_1992_Plane_1_STR,
2826 CNS_11643_1992_Plane_2_STR,
2827 CNS_11643_1992_Plane_3_STR,
2828 CNS_11643_1992_Plane_4_STR,
2829 CNS_11643_1992_Plane_5_STR,
2830 CNS_11643_1992_Plane_6_STR,
2831 CNS_11643_1992_Plane_7_STR
2832 };
2833
/*
 * Converts UTF-16 text from args->source into ISO-2022-CN (version 0) or
 * ISO-2022-CN-EXT (other versions) bytes in args->target, and records the
 * source index of each output byte in args->offsets when that is not NULL.
 *
 * - Code points up to U+007F are written as one byte, preceded by SI when the
 *   converter is not in G0; SO/SI/ESC are reported as illegal, and CR/LF
 *   reset the whole fromUnicode state.
 * - Other code points are looked up in the charsets listed in choices[];
 *   a designation sequence and a shift (SO, ESC N or ESC O) are emitted
 *   in front of the two data bytes when the state requires them.
 * - An unpaired lead surrogate at the end of the input is kept in
 *   cnv->fromUChar32 and picked up again on the next call.
 *
 * The loop stops with *err set to U_ILLEGAL_CHAR_FOUND, U_INVALID_CHAR_FOUND,
 * U_BUFFER_OVERFLOW_ERROR or whatever fromUWriteUInt8() reports.
 * args->source and args->target are updated before returning.
 */
2834 static void
2835 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2836 UConverter *cnv = args->converter;
2837 UConverterDataISO2022 *converterData;
2838 ISO2022State *pFromU2022State;
2839 uint8_t *target = (uint8_t *) args->target;
2840 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2841 const UChar* source = args->source;
2842 const UChar* sourceLimit = args->sourceLimit;
2843 int32_t* offsets = args->offsets;
2844 UChar32 sourceChar;
2845 char buffer[8]; /* at most: 4-byte designator + 2-byte ESC N / ESC O + 2 data bytes */
2846 int32_t len;
2847 int8_t choices[3]; /* charsets to try for non-ASCII input, in order of preference */
2848 int32_t choiceCount; /* 0: choices[] has to be (re)built before the next lookup */
2849 uint32_t targetValue = 0;
2850 UBool useFallback;
2851
2852 /* set up the state */
2853 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2854 pFromU2022State = &converterData->fromU2022State;
2855
2856 choiceCount = 0;
2857
2858 /* check if the last codepoint of previous buffer was a lead surrogate*/
2859 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2860 goto getTrail;
2861 }
2862
2863 while( source < sourceLimit){
2864 if(target < targetLimit){
2865
2866 sourceChar = *(source++);
2867 /*check if the char is a First surrogate*/
2868 if(UTF_IS_SURROGATE(sourceChar)) {
2869 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2870 getTrail:
2871 /*look ahead to find the trail surrogate*/
2872 if(source < sourceLimit) {
2873 /* test the following code unit */
2874 UChar trail=(UChar) *source;
2875 if(UTF_IS_SECOND_SURROGATE(trail)) {
2876 source++;
2877 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2878 cnv->fromUChar32=0x00;
2879 /* convert this supplementary code point */
2880 /* exit this condition tree */
2881 } else {
2882 /* this is an unmatched lead code unit (1st surrogate) */
2883 /* callback(illegal) */
2884 *err=U_ILLEGAL_CHAR_FOUND;
2885 cnv->fromUChar32=sourceChar;
2886 break;
2887 }
2888 } else {
2889 /* no more input */
2890 cnv->fromUChar32=sourceChar;
2891 break;
2892 }
2893 } else {
2894 /* this is an unmatched trail code unit (2nd surrogate) */
2895 /* callback(illegal) */
2896 *err=U_ILLEGAL_CHAR_FOUND;
2897 cnv->fromUChar32=sourceChar;
2898 break;
2899 }
2900 }
2901
2902 /* do the conversion */
2903 if(sourceChar <= 0x007f ){
2904 /* do not convert SO/SI/ESC */
2905 if(IS_2022_CONTROL(sourceChar)) {
2906 /* callback(illegal) */
2907 *err=U_ILLEGAL_CHAR_FOUND;
2908 cnv->fromUChar32=sourceChar;
2909 break;
2910 }
2911
2912 /* US-ASCII */
2913 if(pFromU2022State->g == 0) {
2914 buffer[0] = (char)sourceChar;
2915 len = 1;
2916 } else {
     /* not in G0: emit SI first and remember that we are back in ASCII */
2917 buffer[0] = UCNV_SI;
2918 buffer[1] = (char)sourceChar;
2919 len = 2;
2920 pFromU2022State->g = 0;
2921 choiceCount = 0;
2922 }
2923 if(sourceChar == CR || sourceChar == LF) {
2924 /* reset the state at the end of a line */
2925 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2926 choiceCount = 0;
2927 }
2928 }
2929 else{
2930 /* convert U+0080..U+10ffff */
2931 int32_t i;
2932 int8_t cs, g;
2933
2934 if(choiceCount == 0) {
2935 /* try the current SO/G1 converter first */
2936 choices[0] = pFromU2022State->cs[1];
2937
2938 /* default to GB2312_1 if none is designated yet */
2939 if(choices[0] == 0) {
2940 choices[0] = GB2312_1;
2941 }
2942
2943 if(converterData->version == 0) {
2944 /* ISO-2022-CN */
2945
2946 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2947 if(choices[0] == GB2312_1) {
2948 choices[1] = (int8_t)CNS_11643_1;
2949 } else {
2950 choices[1] = (int8_t)GB2312_1;
2951 }
2952
2953 choiceCount = 2;
2954 } else {
2955 /* ISO-2022-CN-EXT */
2956
2957 /* try one of the other converters */
2958 switch(choices[0]) {
2959 case GB2312_1:
2960 choices[1] = (int8_t)CNS_11643_1;
2961 choices[2] = (int8_t)ISO_IR_165;
2962 break;
2963 case ISO_IR_165:
2964 choices[1] = (int8_t)GB2312_1;
2965 choices[2] = (int8_t)CNS_11643_1;
2966 break;
2967 default: /* CNS_11643_x */
2968 choices[1] = (int8_t)GB2312_1;
2969 choices[2] = (int8_t)ISO_IR_165;
2970 break;
2971 }
2972
2973 choiceCount = 3;
2974 }
2975 }
2976
2977 cs = g = 0;
2978 /*
2979 * len==0: no mapping found yet
2980 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2981 * len>0: found a roundtrip result, done
2982 */
2983 len = 0;
2984 /*
2985 * We will turn off useFallback after finding a fallback,
2986 * but we still get fallbacks from PUA code points as usual.
2987 * Therefore, we will also need to check that we don't overwrite
2988 * an early fallback with a later one.
2989 */
2990 useFallback = cnv->useFallback;
2991
2992 for(i = 0; i < choiceCount && len <= 0; ++i) {
2993 int8_t cs0 = choices[i];
2994 if(cs0 > 0) {
2995 uint32_t value;
2996 int32_t len2;
2997 if(cs0 >= CNS_11643_0) {
2998 len2 = MBCS_FROM_UCHAR32_ISO2022(
2999 converterData->myConverterArray[CNS_11643],
3000 sourceChar,
3001 &value,
3002 useFallback,
3003 MBCS_OUTPUT_3);
3004 if(len2 == 3 || (len2 == -3 && len == 0)) {
3005 targetValue = value;
     /* the top byte of the 3-byte result is 0x80 + plane offset; turn it into a CNS_11643_x charset value */
3006 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3007 if(len2 >= 0) {
3008 len = 2;
3009 } else {
3010 len = -2;
3011 useFallback = FALSE;
3012 }
3013 if(cs == CNS_11643_1) {
3014 g = 1;
3015 } else if(cs == CNS_11643_2) {
3016 g = 2;
3017 } else /* plane 3..7 */ if(converterData->version == 1) {
3018 g = 3;
3019 } else {
3020 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3021 len = 0;
3022 }
3023 }
3024 } else {
3025 /* GB2312_1 or ISO-IR-165 */
3026 len2 = MBCS_FROM_UCHAR32_ISO2022(
3027 converterData->myConverterArray[cs0],
3028 sourceChar,
3029 &value,
3030 useFallback,
3031 MBCS_OUTPUT_2);
3032 if(len2 == 2 || (len2 == -2 && len == 0)) {
3033 targetValue = value;
3034 len = len2;
3035 cs = cs0;
3036 g = 1;
3037 useFallback = FALSE;
3038 }
3039 }
3040 }
3041 }
3042
3043 if(len != 0) {
3044 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3045
3046 /* write the designation sequence if necessary */
3047 if(cs != pFromU2022State->cs[g]) {
3048 if(cs < CNS_11643) {
3049 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3050 } else {
3051 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3052 }
3053 len = 4;
3054 pFromU2022State->cs[g] = cs;
3055 if(g == 1) {
3056 /* changing the SO/G1 charset invalidates the choices[] */
3057 choiceCount = 0;
3058 }
3059 }
3060
3061 /* write the shift sequence if necessary */
3062 if(g != pFromU2022State->g) {
3063 switch(g) {
3064 case 1:
3065 buffer[len++] = UCNV_SO;
3066
3067 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3068 pFromU2022State->g = 1;
3069 break;
3070 case 2:
     /* ESC N */
3071 buffer[len++] = 0x1b;
3072 buffer[len++] = 0x4e;
3073 break;
3074 default: /* case 3 */
     /* ESC O */
3075 buffer[len++] = 0x1b;
3076 buffer[len++] = 0x4f;
3077 break;
3078 }
3079 }
3080
3081 /* write the two output bytes */
3082 buffer[len++] = (char)(targetValue >> 8);
3083 buffer[len++] = (char)targetValue;
3084 } else {
3085 /* if we cannot find the character after checking all codepages
3086 * then this is an error
3087 */
3088 *err = U_INVALID_CHAR_FOUND;
3089 cnv->fromUChar32=sourceChar;
3090 break;
3091 }
3092 }
3093
3094 /* output len>0 bytes in buffer[] */
3095 if(len == 1) {
3096 *target++ = buffer[0];
3097 if(offsets) {
3098 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3099 }
3100 } else if(len == 2 && (target + 2) <= targetLimit) {
3101 *target++ = buffer[0];
3102 *target++ = buffer[1];
3103 if(offsets) {
3104 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3105 *offsets++ = sourceIndex;
3106 *offsets++ = sourceIndex;
3107 }
3108 } else {
     /* longer output, or two bytes that do not both fit: let the helper write them */
3109 fromUWriteUInt8(
3110 cnv,
3111 buffer, len,
3112 &target, (const char *)targetLimit,
3113 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3114 err);
3115 if(U_FAILURE(*err)) {
3116 break;
3117 }
3118 }
3119 } /* end if(target < targetLimit) */
3120 else{
3121 *err =U_BUFFER_OVERFLOW_ERROR;
3122 break;
3123 }
3124
3125 }/* end while(source < sourceLimit) */
3126
3127 /*
3128 * the end of the input stream and detection of truncated input
3129 * are handled by the framework, but for ISO-2022-CN conversion
3130 * we need to be in ASCII mode at the very end
3131 *
3132 * conditions:
3133 * successful
3134 * not in ASCII mode
3135 * end of input and no truncated input
3136 */
3137 if( U_SUCCESS(*err) &&
3138 pFromU2022State->g!=0 &&
3139 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3140 ) {
3141 int32_t sourceIndex;
3142
3143 /* we are switching to ASCII */
3144 pFromU2022State->g=0;
3145
3146 /* get the source index of the last input character */
3147 /*
3148 * TODO this would be simpler and more reliable if we used a pair
3149 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3150 * so that we could simply use the prevSourceIndex here;
3151 * this code gives an incorrect result for the rare case of an unmatched
3152 * trail surrogate that is alone in the last buffer of the text stream
3153 */
3154 sourceIndex=(int32_t)(source-args->source);
3155 if(sourceIndex>0) {
3156 --sourceIndex;
3157 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3158 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3159 ) {
3160 --sourceIndex;
3161 }
3162 } else {
3163 sourceIndex=-1;
3164 }
3165
3166 fromUWriteUInt8(
3167 cnv,
3168 SHIFT_IN_STR, 1,
3169 &target, (const char *)targetLimit,
3170 &offsets, sourceIndex,
3171 err);
3172 }
3173
3174 /*save the state and return */
3175 args->source = source;
3176 args->target = (char*)target;
3177 }
3178
3179
/*
 * Converts ISO-2022-CN / ISO-2022-CN-EXT bytes from args->source into UTF-16
 * in args->target, and records source offsets in args->offsets when that is
 * not NULL.
 *
 * - SI and SO change pToU2022State->g; an SI or a designator that directly
 *   follows SO (an "empty segment") is reported as U_ILLEGAL_ESCAPE_SEQUENCE.
 * - ESC is handed to changeState_2022().
 * - CR and LF reset the toUnicode state and are then converted like any
 *   other byte.
 * - Any other byte is converted on its own when g == 0 (only values up to
 *   0x7f map to a character), otherwise together with the next byte through
 *   the converter selected by pToU2022State->cs[g].
 *
 * Two kinds of state survive between calls: a partial escape sequence
 * (myData->key != 0) and a lead byte waiting for its trail byte
 * (converter->toUBytes[0] with toULength == 1).
 * args->source and args->target are updated before returning.
 */
3180 static void
3181 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3182 UErrorCode* err){
3183 char tempBuf[3]; /* lookup input: the byte pair, with a leading plane byte for CNS 11643 */
3184 const char *mySource = (char *) args->source;
3185 UChar *myTarget = args->target;
3186 const char *mySourceLimit = args->sourceLimit;
3187 uint32_t targetUniChar = 0x0000;
3188 uint32_t mySourceChar = 0x0000;
3189 UConverterDataISO2022* myData;
3190 ISO2022State *pToU2022State;
3191
3192 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3193 pToU2022State = &myData->toU2022State;
3194
3195 if(myData->key != 0) {
3196 /* continue with a partial escape sequence */
3197 goto escape;
3198 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3199 /* continue with a partial double-byte character */
3200 mySourceChar = args->converter->toUBytes[0];
3201 args->converter->toULength = 0;
3202 targetUniChar = missingCharMarker;
3203 goto getTrailByte;
3204 }
3205
3206 while(mySource < mySourceLimit){
3207
3208 targetUniChar =missingCharMarker;
3209
3210 if(myTarget < args->targetLimit){
3211
3212 mySourceChar= (unsigned char) *mySource++;
3213
3214 switch(mySourceChar){
3215 case UCNV_SI:
3216 pToU2022State->g=0;
3217 if (myData->isEmptySegment) {
3218 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3219 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3220 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3221 args->converter->toUBytes[0] = mySourceChar;
3222 args->converter->toULength = 1;
3223 args->target = myTarget;
3224 args->source = mySource;
3225 return;
3226 }
3227 continue;
3228
3229 case UCNV_SO:
3230 if(pToU2022State->cs[1] != 0) {
3231 pToU2022State->g=1;
3232 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3233 continue;
3234 } else {
3235 /* illegal to have SO before a matching designator */
3236 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
     /* targetUniChar is still missingCharMarker, so the code after the switch calls toUnicodeCallback() */
3237 break;
3238 }
3239
3240 case ESC_2022:
     /* un-read the ESC so that changeState_2022() sees the whole sequence */
3241 mySource--;
3242 escape:
3243 {
3244 const char * mySourceBefore = mySource;
3245 int8_t toULengthBefore = args->converter->toULength;
3246
3247 changeState_2022(args->converter,&(mySource),
3248 mySourceLimit, ISO_2022_CN,err);
3249
3250 /* After SO there must be at least one character before a designator (designator error handled separately) */
3251 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3252 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3253 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3254 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
3255 }
3256 }
3257
3258 /* invalid or illegal escape sequence */
3259 if(U_FAILURE(*err)){
3260 args->target = myTarget;
3261 args->source = mySource;
3262 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3263 return;
3264 }
3265 continue;
3266
3267 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3268
3269 case CR:
3270 /*falls through*/
3271 case LF:
3272 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3273 /* falls through */
3274 default:
3275 /* convert one or two bytes */
3276 myData->isEmptySegment = FALSE;
3277 if(pToU2022State->g != 0) {
3278 if(mySource < mySourceLimit) {
3279 UConverterSharedData *cnv;
3280 StateEnum tempState;
3281 int32_t tempBufLen;
3282 int leadIsOk, trailIsOk;
3283 uint8_t trailByte;
3284 getTrailByte:
3285 trailByte = (uint8_t)*mySource;
3286 /*
3287 * Ticket 5691: consistent illegal sequences:
3288 * - We include at least the first byte in the illegal sequence.
3289 * - If any of the non-initial bytes could be the start of a character,
3290 * we stop the illegal sequence before the first one of those.
3291 *
3292 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3293 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3294 * Otherwise we convert or report the pair of bytes.
3295 */
3296 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3297 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3298 if (leadIsOk && trailIsOk) {
3299 ++mySource;
3300 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3301 if(tempState >= CNS_11643_0) {
     /* CNS 11643 lookups take 3 bytes: 0x80 + (charset - CNS_11643_0), then the pair */
3302 cnv = myData->myConverterArray[CNS_11643];
3303 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3304 tempBuf[1] = (char) (mySourceChar);
3305 tempBuf[2] = (char) trailByte;
3306 tempBufLen = 3;
3307
3308 }else{
3309 cnv = myData->myConverterArray[tempState];
3310 tempBuf[0] = (char) (mySourceChar);
3311 tempBuf[1] = (char) trailByte;
3312 tempBufLen = 2;
3313 }
3314 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3315 mySourceChar = (mySourceChar << 8) | trailByte;
3316 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3317 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3318 ++mySource;
3319 /* add another bit so that the code below writes 2 bytes in case of error */
3320 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3321 }
3322 if(pToU2022State->g>=2) {
3323 /* return from a single-shift state to the previous one */
3324 pToU2022State->g=pToU2022State->prevG;
3325 }
3326 } else {
     /* the trail byte is not in this buffer: save the lead byte for the next call */
3327 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3328 args->converter->toULength = 1;
3329 goto endloop;
3330 }
3331 }
3332 else{
3333 if(mySourceChar <= 0x7f) {
3334 targetUniChar = (UChar) mySourceChar;
3335 }
3336 }
3337 break;
3338 }
3339 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3340 if(args->offsets){
     /* the character started 1 or 2 bytes before the current read position */
3341 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3342 }
3343 *(myTarget++)=(UChar)targetUniChar;
3344 }
3345 else if(targetUniChar > missingCharMarker){
3346 /* disassemble the surrogate pair and write to output*/
3347 targetUniChar-=0x0010000;
3348 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3349 if(args->offsets){
3350 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3351 }
3352 ++myTarget;
3353 if(myTarget< args->targetLimit){
3354 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3355 if(args->offsets){
3356 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3357 }
3358 ++myTarget;
3359 }else{
     /* no room left in the target: park the trail surrogate in the converter's UCharErrorBuffer */
3360 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3361 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3362 }
3363
3364 }
3365 else{
3366 /* Call the callback function*/
3367 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3368 break;
3369 }
3370 }
3371 else{
3372 *err =U_BUFFER_OVERFLOW_ERROR;
3373 break;
3374 }
3375 }
3376 endloop:
3377 args->target = myTarget;
3378 args->source = mySource;
3379 }
3380
/*
 * Writes the converter's substitution character for the ISO 2022 variant
 * selected by myConverterData->locale[0]:
 *   'j'  switch back from G1 with SI and to ASCII with ESC ( B where needed,
 *        then write subchar[0]
 *   'c'  switch to ASCII mode with SI where needed, then write subchar[0]
 *   'k'  version 0: write SI or SO as needed for a 1- or 2-byte subchar;
 *        other versions: hand the job to the sub-converter (currentConverter)
 *        through ucnv_cbFromUWriteSub() and return
 * The fromUnicode state is updated to match the bytes written.
 * Except for the delegated 'k' case, the assembled bytes go out through
 * ucnv_cbFromUWriteBytes() with the caller's offsetIndex.
 */
3381 static void
3382 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3383 UConverter *cnv = args->converter;
3384 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3385 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3386 char *p, *subchar;
3387 char buffer[8];
3388 int32_t length;
3389
3390 subchar=(char *)cnv->subChars;
3391 length=cnv->subCharLen; /* assume length==1 for most variants */
3392
3393 p = buffer;
3394 switch(myConverterData->locale[0]){
3395 case 'j':
3396 {
3397 int8_t cs;
3398
3399 if(pFromU2022State->g == 1) {
3400 /* JIS7: switch from G1 to G0 */
3401 pFromU2022State->g = 0;
3402 *p++ = UCNV_SI;
3403 }
3404
3405 cs = pFromU2022State->cs[0];
3406 if(cs != ASCII && cs != JISX201) {
3407 /* not in ASCII or JIS X 0201: switch to ASCII */
3408 pFromU2022State->cs[0] = (int8_t)ASCII;
     /* ESC ( B */
3409 *p++ = '\x1b';
3410 *p++ = '\x28';
3411 *p++ = '\x42';
3412 }
3413
3414 *p++ = subchar[0];
3415 break;
3416 }
3417 case 'c':
3418 if(pFromU2022State->g != 0) {
3419 /* not in ASCII mode: switch to ASCII */
3420 pFromU2022State->g = 0;
3421 *p++ = UCNV_SI;
3422 }
3423 *p++ = subchar[0];
3424 break;
3425 case 'k':
3426 if(myConverterData->version == 0) {
3427 if(length == 1) {
3428 if((UBool)args->converter->fromUnicodeStatus) {
3429 /* in DBCS mode: switch to SBCS */
3430 args->converter->fromUnicodeStatus = 0;
3431 *p++ = UCNV_SI;
3432 }
3433 *p++ = subchar[0];
3434 } else /* length == 2*/ {
3435 if(!(UBool)args->converter->fromUnicodeStatus) {
3436 /* in SBCS mode: switch to DBCS */
3437 args->converter->fromUnicodeStatus = 1;
3438 *p++ = UCNV_SO;
3439 }
3440 *p++ = subchar[0];
3441 *p++ = subchar[1];
3442 }
3443 break;
3444 } else {
3445 /* save the subconverter's substitution string */
3446 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3447 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3448
3449 /* set our substitution string into the subconverter */
3450 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3451 myConverterData->currentConverter->subCharLen = (int8_t)length;
3452
3453 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3454 args->converter = myConverterData->currentConverter;
3455 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3456 ucnv_cbFromUWriteSub(args, 0, err);
3457 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3458 args->converter = cnv;
3459
3460 /* restore the subconverter's substitution string */
3461 myConverterData->currentConverter->subChars = currentSubChars;
3462 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3463
3464 if(*err == U_BUFFER_OVERFLOW_ERROR) {
     /* move the bytes that overflowed from the subconverter's error buffer into ours */
3465 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3466 uprv_memcpy(
3467 cnv->charErrorBuffer,
3468 myConverterData->currentConverter->charErrorBuffer,
3469 myConverterData->currentConverter->charErrorBufferLength);
3470 }
3471 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3472 myConverterData->currentConverter->charErrorBufferLength = 0;
3473 }
3474 return;
3475 }
3476 default:
3477 /* not expected */
3478 break;
3479 }
3480 ucnv_cbFromUWriteBytes(args,
3481 buffer, (int32_t)(p - buffer),
3482 offsetIndex, err);
3483 }
3484
3485 /*
3486 * Structure for cloning an ISO 2022 converter into a single memory block.
3487 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3488 * and then ucnv_safeClone() of the sub-converter may additionally align
3489 * currentConverter inside the cloneStruct, for which we need the deadSpace
3490 * after currentConverter.
3491 * This is because UAlignedMemory may be larger than the actually
3492 * necessary alignment size for the platform.
3493 * The other cloneStruct fields will not be moved around,
3494 * and are aligned properly with cloneStruct's alignment.
3495 */
3496 struct cloneStruct
3497 {
3498 UConverter cnv; /* the clone itself; _ISO_2022_SafeClone() returns its address */
3499 UConverter currentConverter; /* buffer handed to ucnv_safeClone() for the sub-converter clone */
3500 UAlignedMemory deadSpace; /* padding, included in the size passed for the sub-converter clone */
3501 UConverterDataISO2022 mydata; /* copy of the original's extraInfo; becomes cnv.extraInfo */
3502 };
3503
3504
/*
 * Finishes cloning an ISO 2022 converter into stackBuffer, which is used as a
 * struct cloneStruct.
 *
 * cnv          the converter that is being cloned
 * stackBuffer  destination memory; ucnv_safeClone() has already copied the
 *              main UConverter into it
 * pBufferSize  a value of 0 is a preflighting request: the required size is
 *              stored here and NULL is returned
 * status       ICU error code; if it already indicates a failure on entry,
 *              nothing is done and NULL is returned
 *
 * Returns the address of the cloned converter, or NULL for preflighting and
 * on failure.
 * The converter-specific data is copied into the clone, the current
 * sub-converter (if there is one) is cloned into the same memory block, and
 * the sub-converters in myConverterArray[] are shared: only their reference
 * counts are incremented.
 */
3505 static UConverter *
3506 _ISO_2022_SafeClone(
3507 const UConverter *cnv,
3508 void *stackBuffer,
3509 int32_t *pBufferSize,
3510 UErrorCode *status)
3511 {
3512 struct cloneStruct * localClone;
3513 UConverterDataISO2022 *cnvData;
3514 int32_t i, size;
3515
     /*
      * ICU convention: do nothing when the caller already has an error.
      * Without this check a failed status could still bump the reference
      * counts below and return a clone when there is no currentConverter.
      */
     if (U_FAILURE(*status)) {
         return NULL;
     }

3516 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3517 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3518 return NULL;
3519 }
3520
3521 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3522 localClone = (struct cloneStruct *)stackBuffer;
3523
3524 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3525
3526 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3527 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3528 localClone->cnv.isExtraLocal = TRUE;
3529
3530 /* share the subconverters */
3531
3532 if(cnvData->currentConverter != NULL) {
3533 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3534 localClone->mydata.currentConverter =
3535 ucnv_safeClone(cnvData->currentConverter,
3536 &localClone->currentConverter,
3537 &size, status);
3538 if(U_FAILURE(*status)) {
3539 return NULL;
3540 }
3541 }
3542
     /* the clone refers to the same shared sub-converters, so count one more user for each */
3543 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3544 if(cnvData->myConverterArray[i] != NULL) {
3545 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3546 }
3547 }
3548
3549 return &localClone->cnv;
3550 }
3551
/*
 * Adds to the set behind sa the Unicode code points that this ISO 2022
 * converter can convert.
 *
 * cnv         the ISO 2022 converter; its extraInfo selects the variant
 * sa          set adder whose add/addRange/remove/removeRange callbacks are used
 * which       passed on to the sub-converter queries; for the 'j' variants it
 *             also decides whether half-width Katakana are included
 * pErrorCode  ICU error code; nothing is done if it already indicates a failure
 *
 * Steps: add the ranges that are hardcoded for the variant (locale[0] of
 * 'j', 'c'/'z' or 'k'), add the sets of all sub-converters in
 * myConverterArray[] (filtered where a variant uses only part of a table),
 * and finally remove SO, SI, ESC and U+0080..U+009F.
 */
3552 static void
3553 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3554 const USetAdder *sa,
3555 UConverterUnicodeSet which,
3556 UErrorCode *pErrorCode)
3557 {
3558 int32_t i;
3559 UConverterDataISO2022* cnvData;
3560
3561 if (U_FAILURE(*pErrorCode)) {
3562 return;
3563 }
3564 #ifdef U_ENABLE_GENERIC_ISO_2022
3565 if (cnv->sharedData == &_ISO2022Data) {
3566 /* We use UTF-8 in this case */
3567 sa->addRange(sa->set, 0, 0xd7FF);
3568 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3569 return;
3570 }
3571 #endif
3572
3573 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3574
3575 /* open a set and initialize it with code points that are algorithmically round-tripped */
3576 switch(cnvData->locale[0]){
3577 case 'j':
3578 /* include JIS X 0201 which is hardcoded */
3579 sa->add(sa->set, 0xa5);
3580 sa->add(sa->set, 0x203e);
3581 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3582 /* include Latin-1 for some variants of JP */
3583 sa->addRange(sa->set, 0, 0xff);
3584 } else {
3585 /* include ASCII for JP */
3586 sa->addRange(sa->set, 0, 0x7f);
3587 }
3588 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3589 /*
3590 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3591 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3592 * use half-width Katakana.
3593 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3594 * half-width Katakana via the ESC ( I sequence.
3595 * However, we only emit (fromUnicode) half-width Katakana according to the
3596 * definition of each variant.
3597 *
3598 * When including fallbacks,
3599 * we need to include half-width Katakana Unicode code points for all JP variants because
3600 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3601 */
3602 /* include half-width Katakana for JP */
3603 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3604 }
3605 break;
3606 case 'c':
3607 case 'z':
3608 /* include ASCII for CN */
3609 sa->addRange(sa->set, 0, 0x7f);
3610 break;
3611 case 'k':
3612 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3613 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3614 cnvData->currentConverter, sa, which, pErrorCode);
3615 /* the loop over myConverterArray[] will simply not find another converter */
3616 break;
3617 default:
3618 break;
3619 }
3620
3621 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3622 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3623 cnvData->version==0 && i==CNS_11643
3624 ) {
3625 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3626 ucnv_MBCSGetUnicodeSetForBytes(
3627 cnvData->myConverterArray[i],
3628 sa, UCNV_ROUNDTRIP_SET,
3629 0, 0x81, 0x82,
3630 pErrorCode);
3631 }
3632 #endif
3633
     /* add what each sub-converter table supports, picking a filter per table */
3634 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3635 UConverterSetFilter filter;
3636 if(cnvData->myConverterArray[i]!=NULL) {
3637 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3638 cnvData->version==0 && i==CNS_11643
3639 ) {
3640 /*
3641 * Version-specific for CN:
3642 * CN version 0 does not map CNS planes 3..7 although
3643 * they are all available in the CNS conversion table;
3644 * CN version 1 (-EXT) does map them all.
3645 * The two versions create different Unicode sets.
3646 */
3647 filter=UCNV_SET_FILTER_2022_CN;
3648 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3649 /*
3650 * Only add code points that map to Shift-JIS codes
3651 * corresponding to JIS X 0208.
3652 */
3653 filter=UCNV_SET_FILTER_SJIS;
3654 } else if(i==KSC5601) {
3655 /*
3656 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3657 * are broader than GR94.
3658 */
3659 filter=UCNV_SET_FILTER_GR94DBCS;
3660 } else {
3661 filter=UCNV_SET_FILTER_NONE;
3662 }
3663 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3664 }
3665 }
3666
3667 /*
3668 * ISO 2022 converters must not convert SO/SI/ESC despite what
3669 * sub-converters do by themselves.
3670 * Remove these characters from the set.
3671 */
3672 sa->remove(sa->set, 0x0e);
3673 sa->remove(sa->set, 0x0f);
3674 sa->remove(sa->set, 0x1b);
3675
3676 /* ISO 2022 converters do not convert C1 controls either */
3677 sa->removeRange(sa->set, 0x80, 0x9f);
3678 }
3679
/*
 * Function table for the generic ISO_2022 converter.
 * The four conversion function slots are filled in only when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are all NULL.
 * NOTE(review): the entries are positional; the slot meanings are taken from
 * the function names here - confirm against the UConverterImpl declaration.
 */
3680 static const UConverterImpl _ISO2022Impl={
3681 UCNV_ISO_2022,
3682
3683 NULL,
3684 NULL,
3685
3686 _ISO2022Open,
3687 _ISO2022Close,
3688 _ISO2022Reset,
3689
3690 #ifdef U_ENABLE_GENERIC_ISO_2022
3691 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3692 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3693 ucnv_fromUnicode_UTF8,
3694 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3695 #else
3696 NULL,
3697 NULL,
3698 NULL,
3699 NULL,
3700 #endif
3701 NULL,
3702
3703 NULL,
3704 _ISO2022getName,
3705 _ISO_2022_WriteSub,
3706 _ISO_2022_SafeClone,
3707 _ISO_2022_GetUnicodeSet
3708 };
/*
 * Static description of the generic converter, named "ISO_2022".
 * NOTE(review): positional initializer; only the max-bytes entry is
 * annotated below - see the UConverterStaticData declaration for the rest.
 */
3709 static const UConverterStaticData _ISO2022StaticData={
3710 sizeof(UConverterStaticData),
3711 "ISO_2022",
3712 2022,
3713 UCNV_IBM,
3714 UCNV_ISO_2022,
3715 1,
3716 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3717 { 0x1a, 0, 0, 0 },
3718 1,
3719 FALSE,
3720 FALSE,
3721 0,
3722 0,
3723 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3724 };
/*
 * Shared data object that ties _ISO2022StaticData to _ISO2022Impl.
 * It is not declared static, and _ISO_2022_GetUnicodeSet() compares
 * cnv->sharedData against its address when U_ENABLE_GENERIC_ISO_2022 is
 * defined.
 */
3725 const UConverterSharedData _ISO2022Data={
3726 sizeof(UConverterSharedData),
3727 ~((uint32_t) 0),
3728 NULL,
3729 NULL,
3730 &_ISO2022StaticData,
3731 FALSE,
3732 &_ISO2022Impl,
3733 0
3734 };
3735
3736 /*************JP****************/
/*
 * Function table for ISO_2022_JP. It shares the open/close/reset, getName,
 * WriteSub, SafeClone and GetUnicodeSet functions with the other ISO 2022
 * variants in this file; only the conversion functions are JP-specific.
 * Each conversion function is listed twice - presumably once for the slot
 * without offsets and once for the slot with offsets (TODO confirm against
 * the UConverterImpl declaration).
 */
3737 static const UConverterImpl _ISO2022JPImpl={
3738 UCNV_ISO_2022,
3739
3740 NULL,
3741 NULL,
3742
3743 _ISO2022Open,
3744 _ISO2022Close,
3745 _ISO2022Reset,
3746
3747 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3748 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3749 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3750 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3751 NULL,
3752
3753 NULL,
3754 _ISO2022getName,
3755 _ISO_2022_WriteSub,
3756 _ISO_2022_SafeClone,
3757 _ISO_2022_GetUnicodeSet
3758 };
/* Static description of the converter named "ISO_2022_JP" (positional initializer). */
3759 static const UConverterStaticData _ISO2022JPStaticData={
3760 sizeof(UConverterStaticData),
3761 "ISO_2022_JP",
3762 0,
3763 UCNV_IBM,
3764 UCNV_ISO_2022,
3765 1,
3766 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3767 { 0x1a, 0, 0, 0 },
3768 1,
3769 FALSE,
3770 FALSE,
3771 0,
3772 0,
3773 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3774 };
/* Shared data object that ties _ISO2022JPStaticData to _ISO2022JPImpl. */
3775 static const UConverterSharedData _ISO2022JPData={
3776 sizeof(UConverterSharedData),
3777 ~((uint32_t) 0),
3778 NULL,
3779 NULL,
3780 &_ISO2022JPStaticData,
3781 FALSE,
3782 &_ISO2022JPImpl,
3783 0
3784 };
3785
3786 /************* KR ***************/
/*
 * Function table for ISO_2022_KR. It shares the open/close/reset, getName,
 * WriteSub, SafeClone and GetUnicodeSet functions with the other ISO 2022
 * variants in this file; only the conversion functions are KR-specific.
 * Each conversion function is listed twice - presumably once for the slot
 * without offsets and once for the slot with offsets (TODO confirm against
 * the UConverterImpl declaration).
 */
3787 static const UConverterImpl _ISO2022KRImpl={
3788 UCNV_ISO_2022,
3789
3790 NULL,
3791 NULL,
3792
3793 _ISO2022Open,
3794 _ISO2022Close,
3795 _ISO2022Reset,
3796
3797 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3798 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3799 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3800 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3801 NULL,
3802
3803 NULL,
3804 _ISO2022getName,
3805 _ISO_2022_WriteSub,
3806 _ISO_2022_SafeClone,
3807 _ISO_2022_GetUnicodeSet
3808 };
/* Static description of the converter named "ISO_2022_KR" (positional initializer). */
3809 static const UConverterStaticData _ISO2022KRStaticData={
3810 sizeof(UConverterStaticData),
3811 "ISO_2022_KR",
3812 0,
3813 UCNV_IBM,
3814 UCNV_ISO_2022,
3815 1,
3816 3, /* max 3 bytes per UChar: SO+DBCS */
3817 { 0x1a, 0, 0, 0 },
3818 1,
3819 FALSE,
3820 FALSE,
3821 0,
3822 0,
3823 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3824 };
/* Shared data object that ties _ISO2022KRStaticData to _ISO2022KRImpl. */
3825 static const UConverterSharedData _ISO2022KRData={
3826 sizeof(UConverterSharedData),
3827 ~((uint32_t) 0),
3828 NULL,
3829 NULL,
3830 &_ISO2022KRStaticData,
3831 FALSE,
3832 &_ISO2022KRImpl,
3833 0
3834 };
3835
3836 /*************** CN ***************/
/*
 * Function table for ISO_2022_CN. It shares the open/close/reset, getName,
 * WriteSub, SafeClone and GetUnicodeSet functions with the other ISO 2022
 * variants in this file; only the conversion functions are CN-specific.
 * Each conversion function is listed twice - presumably once for the slot
 * without offsets and once for the slot with offsets (TODO confirm against
 * the UConverterImpl declaration). Both CN functions test the offsets
 * pointer before writing through it.
 */
3837 static const UConverterImpl _ISO2022CNImpl={
3838
3839 UCNV_ISO_2022,
3840
3841 NULL,
3842 NULL,
3843
3844 _ISO2022Open,
3845 _ISO2022Close,
3846 _ISO2022Reset,
3847
3848 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3849 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3850 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3851 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3852 NULL,
3853
3854 NULL,
3855 _ISO2022getName,
3856 _ISO_2022_WriteSub,
3857 _ISO_2022_SafeClone,
3858 _ISO_2022_GetUnicodeSet
3859 };
/*
 * Static description of the converter named "ISO_2022_CN" (positional
 * initializer). The maximum of 8 bytes matches the size of the output
 * buffer[] in UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC().
 */
3860 static const UConverterStaticData _ISO2022CNStaticData={
3861 sizeof(UConverterStaticData),
3862 "ISO_2022_CN",
3863 0,
3864 UCNV_IBM,
3865 UCNV_ISO_2022,
3866 1,
3867 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3868 { 0x1a, 0, 0, 0 },
3869 1,
3870 FALSE,
3871 FALSE,
3872 0,
3873 0,
3874 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3875 };
/* Shared data object that ties _ISO2022CNStaticData to _ISO2022CNImpl. */
3876 static const UConverterSharedData _ISO2022CNData={
3877 sizeof(UConverterSharedData),
3878 ~((uint32_t) 0),
3879 NULL,
3880 NULL,
3881 &_ISO2022CNStaticData,
3882 FALSE,
3883 &_ISO2022CNImpl,
3884 0
3885 };
3886
3887
3888
3889 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */