2 **********************************************************************
3 * Copyright (C) 2000-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
8 * tab size: 8 (not used)
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
29 #include "unicode/utypes.h"
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
46 #ifdef U_ENABLE_GENERIC_ISO_2022
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
74 * Markus Scherer 2003-dec-03
78 static const char SHIFT_IN_STR
[] = "\x0F";
79 static const char SHIFT_OUT_STR
[] = "\x0E";
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
113 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
115 /* for ISO-2022-JP and -CN implementations */
132 HWKANA_7BIT
=8, /* Halfwidth Katakana 7 bit */
135 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
141 * these are used in StateEnum and ISO2022State variables,
142 * but CNS_11643 must be used to index into myConverterArray[]
154 /* is the StateEnum charset value for a DBCS charset? */
155 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
157 #define CSM(cs) ((uint16_t)1<<(cs))
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
/*
 * Per-version charset masks for ISO-2022-JP: entry x ORs together the CSM()
 * bits of the charsets accepted by version x. The table has 5 entries.
 * NOTE(review): _ISO2022Open() indexes this table with
 * (options & UCNV_OPTIONS_VERSION_MASK) and that mask is 0xf, so versions
 * 5..15 would read past the end unless they are rejected somewhere that is
 * not visible in this excerpt -- confirm.
 */
168 static const uint16_t jpCharsetMasks
[5]={
169 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
),
170 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
),
171 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
172 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
173 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
)
185 typedef struct ISO2022State
{
186 int8_t cs
[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
187 int8_t g
; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
188 int8_t prevG
; /* g before single shift (SS2 or SS3) */
191 #define UCNV_OPTIONS_VERSION_MASK 0xf
192 #define UCNV_2022_MAX_CONVERTERS 10
195 UConverterSharedData
*myConverterArray
[UCNV_2022_MAX_CONVERTERS
];
196 UConverter
*currentConverter
;
197 Cnv2022Type currentType
;
198 ISO2022State toU2022State
, fromU2022State
;
201 #ifdef U_ENABLE_GENERIC_ISO_2022
204 UBool isEmptySegment
;
207 }UConverterDataISO2022
;
210 /* ISO-2022 ----------------------------------------------------------------- */
212 /*Forward declaration */
214 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
* args
,
217 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
,
220 #define ESC_2022 0x1B /*ESC*/
224 INVALID_2022
= -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
225 VALID_NON_TERMINAL_2022
= 0, /*so far corresponds to a valid iso 2022 escape sequence*/
226 VALID_TERMINAL_2022
= 1, /*corresponds to a valid iso 2022 escape sequence*/
227 VALID_MAYBE_TERMINAL_2022
= 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
228 } UCNV_TableStates_2022
;
231 * The way these state transition arrays work is:
232 * ex : ESC$B is the sequence for JISX208
233 * a) First Iteration: char is ESC
234 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
235 * int x = normalize_esq_chars_2022[27] which is equal to 1
236 * ii) Search for this value in escSeqStateTable_Key_2022[]
237 * value of x is stored at escSeqStateTable_Key_2022[0]
238 * iii) Save this index as offset
239 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
240 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
241 * b) Switch on this state and continue to next char
242 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
243 * which is normalize_esq_chars_2022[36] == 4
244 * ii) x is currently 1(from above)
245 * x<<=5 -- x is now 32
246 * x+=normalize_esq_chars_2022[36]
248 * iii) Search for this value in escSeqStateTable_Key_2022[]
249 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
250 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
251 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
252 * c) Switch on this state and continue to next char
253 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
254 * ii) x is currently 36 (from above)
255 * x<<=5 -- x is now 1152
256 * x+=normalize_esq_chars_2022[66]
258 * iii) Search for this value in escSeqStateTable_Key_2022[]
259 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
260 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
261 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
262 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
266 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Byte-to-code lookup used while matching escape sequences: getKey_2022()
 * reads normalize_esq_chars_2022[(uint8_t)c] and folds the result into the
 * running key as (*key << 5) + code.
 * Examples from the rows below: index 27 (ESC) -> 1, index 36 ('$') -> 4,
 * index 66 ('B') -> 9.
 * Only the rows for indexes 0..249 are visible in this excerpt.
 */
267 static const int8_t normalize_esq_chars_2022
[256] = {
268 /* 0 1 2 3 4 5 6 7 8 9 */
270 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
271 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
272 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
273 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
274 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
275 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
276 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
277 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
278 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
279 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
280 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
281 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
282 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298 #ifdef U_ENABLE_GENERIC_ISO_2022
300 * When the generic ISO-2022 converter is completely removed, not just disabled
301 * per #ifdef, then the following state table and the associated tables that are
302 * dimensioned with MAX_STATES_2022 should be trimmed.
304 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
305 * the associated escape sequences starting with ESC ( B should be removed.
306 * This includes the ones with key values 1097 and all of the ones above 1000000.
308 * For the latter, the tables can simply be truncated.
309 * For the former, since the tables must be kept parallel, it is probably best
310 * to simply duplicate an adjacent table cell, parallel in all tables.
312 * It may make sense to restructure the tables, especially by using small search
313 * tables for the variants instead of indexing them parallel to the table here.
/*
 * MAX_STATES_2022 dimensions this table and the tables that are indexed in
 * parallel with it (escSeqStateTable_Value_2022, escSeqStateTable_Result_2022,
 * nextStateToUnicodeJP, nextStateToUnicodeCN).
 * escSeqStateTable_Key_2022 holds the accumulated escape-sequence keys that
 * getKey_2022() looks up with a binary search; the values are listed in
 * ascending order, which that search depends on.
 */
317 #define MAX_STATES_2022 74
318 static const int32_t escSeqStateTable_Key_2022
[MAX_STATES_2022
] = {
319 /* 0 1 2 3 4 5 6 7 8 9 */
321 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
322 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
323 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
324 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
325 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
326 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
327 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
328 ,35947631 ,35947635 ,35947636 ,35947638
331 #ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * Converter names, indexed in parallel with escSeqStateTable_Key_2022.
 * changeState_2022() reads escSeqStateTable_Result_2022[offset] in its
 * U_ENABLE_GENERIC_ISO_2022 code and passes a non-NULL name to ucnv_open();
 * a NULL entry is reported there as U_UNSUPPORTED_ESCAPE_SEQUENCE.
 */
333 static const char* const escSeqStateTable_Result_2022
[MAX_STATES_2022
] = {
334 /* 0 1 2 3 4 5 6 7 8 9 */
336 NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,"latin1" ,"latin1"
337 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
338 ,"latin1" ,NULL
,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL
,NULL
,NULL
,NULL
,"UTF8"
339 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL
,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
340 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
341 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
342 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
343 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
/*
 * Match state for each key, indexed in parallel with
 * escSeqStateTable_Key_2022. getKey_2022() returns
 * escSeqStateTable_Value_2022[mid] when its binary search finds the key.
 * Entries are UCNV_TableStates_2022 values stored as int8_t; index 10
 * (key 1097) is the only VALID_MAYBE_TERMINAL_2022 entry.
 */
348 static const int8_t escSeqStateTable_Value_2022
[MAX_STATES_2022
] = {
349 /* 0 1 2 3 4 5 6 7 8 9 */
350 VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
351 ,VALID_MAYBE_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
352 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
353 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
354 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
361 /* Type def for refactoring changeState_2022 code*/
363 #ifdef U_ENABLE_GENERIC_ISO_2022
371 /*********** ISO 2022 Converter Protos ***********/
373 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
);
376 _ISO2022Close(UConverter
*converter
);
379 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
);
382 _ISO2022getName(const UConverter
* cnv
);
385 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
);
388 _ISO_2022_SafeClone(const UConverter
*cnv
, void *stackBuffer
, int32_t *pBufferSize
, UErrorCode
*status
);
390 #ifdef U_ENABLE_GENERIC_ISO_2022
392 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
, UErrorCode
* err
);
395 /*const UConverterSharedData _ISO2022Data;*/
396 static const UConverterSharedData _ISO2022JPData
;
397 static const UConverterSharedData _ISO2022KRData
;
398 static const UConverterSharedData _ISO2022CNData
;
400 /*************** Converter implementations ******************/
402 /* The purpose of this function is to get around gcc compiler warnings. */
/*
 * Wrapper around ucnv_fromUWriteBytes() for callers that hold the output
 * position as uint8_t **target: the current position is copied into a local
 * char *, the address of that local is passed on, and the resulting pointer
 * is stored back into *target.
 * cnv, bytes, length, targetLimit and pErrorCode are forwarded unchanged, as
 * are offsets and sourceIndex (their declarations are not visible in this
 * excerpt).
 */
404 fromUWriteUInt8(UConverter
*cnv
,
405 const char *bytes
, int32_t length
,
406 uint8_t **target
, const char *targetLimit
,
409 UErrorCode
*pErrorCode
)
411 char *targetChars
= (char *)*target
;
412 ucnv_fromUWriteBytes(cnv
, bytes
, length
, &targetChars
, targetLimit
,
413 offsets
, sourceIndex
, pErrorCode
);
414 *target
= (uint8_t*)targetChars
;
/*
 * Resets the to-Unicode state used by ISO-2022-KR.
 * Only when myConverterData->version is 1: clears toUnicodeStatus, mode and
 * toULength on the sub-converter myConverterData->currentConverter (the
 * trailing comments on those assignments name the roles the fields play).
 * The converter parameter is not referenced in the lines visible here.
 */
419 setInitialStateToUnicodeKR(UConverter
* converter
, UConverterDataISO2022
*myConverterData
){
420 if(myConverterData
->version
== 1) {
421 UConverter
*cnv
= myConverterData
->currentConverter
;
423 cnv
->toUnicodeStatus
=0; /* offset */
424 cnv
->mode
=0; /* state */
425 cnv
->toULength
=0; /* byteIndex */
/*
 * Resets the from-Unicode state used by ISO-2022-KR.
 * When converter->charErrorBufferLength is 0, the four bytes
 * 0x1b 0x24 0x29 0x43 (ASCII: ESC $ ) C) are placed in
 * converter->charErrorBuffer and the length is set to 4.
 * When myConverterData->version is 1, fromUnicodeStatus of the sub-converter
 * (currentConverter) is set to 1. Some lines of this function are not visible
 * in this excerpt, so further state may be reset as well.
 */
430 setInitialStateFromUnicodeKR(UConverter
* converter
,UConverterDataISO2022
*myConverterData
){
431 /* in ISO-2022-KR the designator sequence appears only once
432 * in a file so we append it only once
434 if( converter
->charErrorBufferLength
==0){
436 converter
->charErrorBufferLength
= 4;
437 converter
->charErrorBuffer
[0] = 0x1b;
438 converter
->charErrorBuffer
[1] = 0x24;
439 converter
->charErrorBuffer
[2] = 0x29;
440 converter
->charErrorBuffer
[3] = 0x43;
442 if(myConverterData
->version
== 1) {
443 UConverter
*cnv
= myConverterData
->currentConverter
;
446 cnv
->fromUnicodeStatus
=1; /* prevLength */
/*
 * Opens an ISO-2022 converter variant selected by locale and options.
 *
 * Allocates a UConverterDataISO2022 into cnv->extraInfo, zeroes it, and stores
 * version = options & UCNV_OPTIONS_VERSION_MASK. The first characters of
 * locale then select the variant:
 *  - "ja"/"jp": loads the shared data for the charsets enabled in
 *    jpCharsetMasks[version], points cnv->sharedData at _ISO2022JPData and
 *    builds the name "ISO_2022,locale=ja,version=" followed by one character.
 *  - "ko"/"kr": opens a sub-converter ("icu-internal-25546" or "ibm-949"; the
 *    selecting condition is not visible here), initializes the KR state and
 *    points cnv->sharedData at _ISO2022KRData.
 *  - "zh"/"cn": loads "ibm-5478", "iso-ir-165" and "cns-11643-1992" and points
 *    cnv->sharedData at _ISO2022CNData.
 *  - otherwise: the generic converter when U_ENABLE_GENERIC_ISO_2022 is
 *    defined, else *errorCode = U_UNSUPPORTED_ERROR.
 * *errorCode is set to U_MEMORY_ALLOCATION_ERROR near the end of the function;
 * this is presumably the branch for a failed allocation (the branch structure
 * is not visible in this excerpt).
 *
 * NOTE(review): jpCharsetMasks has 5 entries but version can be 0..15 here, so
 * jpCharsetMasks[version] may read out of bounds, and the name suffix
 * (version + '0') is only a digit for 0..9 -- confirm whether version is
 * range-checked on a line not visible in this excerpt.
 * NOTE(review): uprv_strncpy(myLocale, locale, sizeof(myLocale)) does not
 * guarantee NUL termination; the visible code only inspects myLocale[0..2].
 * Whether locale can be NULL at this point is not visible here -- confirm.
 * NOTE(review): the Chinese branch stores locale "cn" while the converter
 * name says "locale=zh" -- confirm this is intentional.
 */
451 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
){
453 char myLocale
[6]={' ',' ',' ',' ',' ',' '};
455 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataISO2022
));
456 if(cnv
->extraInfo
!= NULL
) {
457 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
460 uprv_memset(myConverterData
, 0, sizeof(UConverterDataISO2022
));
461 myConverterData
->currentType
= ASCII1
;
462 cnv
->fromUnicodeStatus
=FALSE
;
464 uprv_strncpy(myLocale
, locale
, sizeof(myLocale
));
466 version
= options
& UCNV_OPTIONS_VERSION_MASK
;
467 myConverterData
->version
= version
;
468 if(myLocale
[0]=='j' && (myLocale
[1]=='a'|| myLocale
[1]=='p') &&
469 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
472 /* open the required converters and cache them */
473 if(jpCharsetMasks
[version
]&CSM(ISO8859_7
)) {
474 myConverterData
->myConverterArray
[ISO8859_7
]= ucnv_loadSharedData("ISO8859_7", NULL
, errorCode
);
476 myConverterData
->myConverterArray
[JISX208
] = ucnv_loadSharedData("Shift-JIS", NULL
, errorCode
);
477 if(jpCharsetMasks
[version
]&CSM(JISX212
)) {
478 myConverterData
->myConverterArray
[JISX212
] = ucnv_loadSharedData("jisx-212", NULL
, errorCode
);
480 if(jpCharsetMasks
[version
]&CSM(GB2312
)) {
481 myConverterData
->myConverterArray
[GB2312
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
); /* gb_2312_80-1 */
483 if(jpCharsetMasks
[version
]&CSM(KSC5601
)) {
484 myConverterData
->myConverterArray
[KSC5601
] = ucnv_loadSharedData("ksc_5601", NULL
, errorCode
);
487 /* set the function pointers to appropriate functions */
488 cnv
->sharedData
=(UConverterSharedData
*)(&_ISO2022JPData
);
489 uprv_strcpy(myConverterData
->locale
,"ja");
491 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ja,version=");
492 len
= uprv_strlen(myConverterData
->name
);
493 myConverterData
->name
[len
]=(char)(myConverterData
->version
+(int)'0');
494 myConverterData
->name
[len
+1]='\0';
496 else if(myLocale
[0]=='k' && (myLocale
[1]=='o'|| myLocale
[1]=='r') &&
497 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
500 myConverterData
->currentConverter
=
501 ucnv_open("icu-internal-25546",errorCode
);
503 if (U_FAILURE(*errorCode
)) {
508 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=1");
509 uprv_memcpy(cnv
->subChars
, myConverterData
->currentConverter
->subChars
, 4);
510 cnv
->subCharLen
= myConverterData
->currentConverter
->subCharLen
;
512 myConverterData
->currentConverter
=ucnv_open("ibm-949",errorCode
);
514 if (U_FAILURE(*errorCode
)) {
519 myConverterData
->version
= 0;
520 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=0");
523 /* initialize the state variables */
524 setInitialStateToUnicodeKR(cnv
, myConverterData
);
525 setInitialStateFromUnicodeKR(cnv
, myConverterData
);
527 /* set the function pointers to appropriate functions */
528 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022KRData
;
529 uprv_strcpy(myConverterData
->locale
,"ko");
531 else if(((myLocale
[0]=='z' && myLocale
[1]=='h') || (myLocale
[0]=='c'&& myLocale
[1]=='n'))&&
532 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
535 /* open the required converters and cache them */
536 myConverterData
->myConverterArray
[GB2312_1
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
);
538 myConverterData
->myConverterArray
[ISO_IR_165
] = ucnv_loadSharedData("iso-ir-165", NULL
, errorCode
);
540 myConverterData
->myConverterArray
[CNS_11643
] = ucnv_loadSharedData("cns-11643-1992", NULL
, errorCode
);
543 /* set the function pointers to appropriate functions */
544 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022CNData
;
545 uprv_strcpy(myConverterData
->locale
,"cn");
548 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=1");
550 myConverterData
->version
= 0;
551 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=0");
555 #ifdef U_ENABLE_GENERIC_ISO_2022
556 myConverterData
->isFirstBuffer
= TRUE
;
558 /* append the UTF-8 escape sequence */
559 cnv
->charErrorBufferLength
= 3;
560 cnv
->charErrorBuffer
[0] = 0x1b;
561 cnv
->charErrorBuffer
[1] = 0x25;
562 cnv
->charErrorBuffer
[2] = 0x42;
564 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022Data
;
565 /* initialize the state variables */
566 uprv_strcpy(myConverterData
->name
,"ISO_2022");
568 *errorCode
= U_UNSUPPORTED_ERROR
;
573 cnv
->maxBytesPerUChar
=cnv
->sharedData
->staticData
->maxBytesPerChar
;
575 if(U_FAILURE(*errorCode
)) {
579 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
/*
 * Releases everything owned by the converter's UConverterDataISO2022.
 * When converter->extraInfo is non-NULL: passes each of the
 * UCNV_2022_MAX_CONVERTERS entries of myConverterArray to
 * ucnv_unloadSharedDataIfReady(), closes currentConverter with ucnv_close(),
 * and, unless converter->isExtraLocal is set, frees extraInfo and resets the
 * pointer to NULL.
 * NOTE(review): myData->myConverterArray is evaluated in the declarations
 * before the extraInfo != NULL check; no element is read until inside the
 * check, but the member access itself happens on a possibly NULL pointer.
 */
585 _ISO2022Close(UConverter
*converter
) {
586 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
587 UConverterSharedData
**array
= myData
->myConverterArray
;
590 if (converter
->extraInfo
!= NULL
) {
591 /*close the array of converter pointers and free the memory*/
592 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
594 ucnv_unloadSharedDataIfReady(array
[i
]);
598 ucnv_close(myData
->currentConverter
);
600 if(!converter
->isExtraLocal
){
601 uprv_free (converter
->extraInfo
);
602 converter
->extraInfo
= NULL
;
/*
 * Resets the ISO-2022 conversion state held in converter->extraInfo.
 *  - choice <= UCNV_RESET_TO_UNICODE: zeroes toU2022State and clears key and
 *    isEmptySegment.
 *  - choice != UCNV_RESET_TO_UNICODE: zeroes fromU2022State.
 *  - With U_ENABLE_GENERIC_ISO_2022 and an empty locale: resets the generic
 *    converter's state (closing currentConverter when mode is UCNV_SO) and
 *    reloads a 3-byte escape sequence into charErrorBuffer.
 *  - Locale starting with 'k': re-runs the KR initial-state helpers for the
 *    selected direction(s).
 * NOTE(review): the generic branch reloads 0x1b 0x28 0x42 under the comment
 * "re-append UTF-8 escape sequence", whereas _ISO2022Open() loads
 * 0x1b 0x25 0x42 under a similar comment -- confirm which byte is intended.
 */
608 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
) {
609 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
610 if(choice
<=UCNV_RESET_TO_UNICODE
) {
611 uprv_memset(&myConverterData
->toU2022State
, 0, sizeof(ISO2022State
));
612 myConverterData
->key
= 0;
613 myConverterData
->isEmptySegment
= FALSE
;
615 if(choice
!=UCNV_RESET_TO_UNICODE
) {
616 uprv_memset(&myConverterData
->fromU2022State
, 0, sizeof(ISO2022State
));
618 #ifdef U_ENABLE_GENERIC_ISO_2022
619 if(myConverterData
->locale
[0] == 0){
620 if(choice
<=UCNV_RESET_TO_UNICODE
) {
621 myConverterData
->isFirstBuffer
= TRUE
;
622 myConverterData
->key
= 0;
623 if (converter
->mode
== UCNV_SO
){
624 ucnv_close (myConverterData
->currentConverter
);
625 myConverterData
->currentConverter
=NULL
;
627 converter
->mode
= UCNV_SI
;
629 if(choice
!=UCNV_RESET_TO_UNICODE
) {
630 /* re-append UTF-8 escape sequence */
631 converter
->charErrorBufferLength
= 3;
632 converter
->charErrorBuffer
[0] = 0x1b;
633 converter
->charErrorBuffer
[1] = 0x28;
634 converter
->charErrorBuffer
[2] = 0x42;
640 /* reset the state variables */
641 if(myConverterData
->locale
[0] == 'k'){
642 if(choice
<=UCNV_RESET_TO_UNICODE
) {
643 setInitialStateToUnicodeKR(converter
, myConverterData
);
645 if(choice
!=UCNV_RESET_TO_UNICODE
) {
646 setInitialStateFromUnicodeKR(converter
, myConverterData
);
/*
 * Name accessor for the ISO-2022 converter: fetches the UConverterDataISO2022
 * from cnv->extraInfo. The rest of the body, including the return statement,
 * is not visible in this excerpt.
 * NOTE(review): presumably returns the name string that _ISO2022Open() stores
 * in myConverterData->name -- confirm against the full source.
 */
653 _ISO2022getName(const UConverter
* cnv
){
655 UConverterDataISO2022
* myData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
662 /*************** to unicode *******************/
663 /****************************************************************************
664 * Recognized escape sequences are
/*
 * Charset/state selected by each escape sequence for ISO-2022-JP, indexed by
 * the offset that getKey_2022() reports (the same index space as
 * escSeqStateTable_Key_2022). changeState_2022() reads
 * nextStateToUnicodeJP[offset] and casts it to StateEnum.
 * For example, index 24 (key 1161) maps to JISX208 and index 5 maps to
 * SS2_STATE.
 */
676 static const int8_t nextStateToUnicodeJP
[MAX_STATES_2022
]= {
677 /* 0 1 2 3 4 5 6 7 8 9 */
678 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
679 ,ASCII
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,JISX201
,HWKANA_7BIT
,JISX201
,INVALID_STATE
680 ,INVALID_STATE
,INVALID_STATE
,JISX208
,GB2312
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
681 ,ISO8859_1
,ISO8859_7
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,KSC5601
,JISX212
,INVALID_STATE
682 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
683 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
684 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
685 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
688 /*************** to unicode *******************/
/*
 * Charset/state selected by each escape sequence for ISO-2022-CN, indexed by
 * the offset that getKey_2022() reports (the same index space as
 * escSeqStateTable_Key_2022). changeState_2022() reads
 * nextStateToUnicodeCN[offset] and casts it to StateEnum.
 * Non-INVALID entries: 5 = SS2_STATE, 6 = SS3_STATE, 47 = GB2312_1,
 * 49 = ISO_IR_165, 50..56 = CNS_11643_1 .. CNS_11643_7.
 */
689 static const int8_t nextStateToUnicodeCN
[MAX_STATES_2022
]= {
690 /* 0 1 2 3 4 5 6 7 8 9 */
691 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,SS3_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
692 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
693 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
694 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
695 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,GB2312_1
,INVALID_STATE
,ISO_IR_165
696 ,CNS_11643_1
,CNS_11643_2
,CNS_11643_3
,CNS_11643_4
,CNS_11643_5
,CNS_11643_6
,CNS_11643_7
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
697 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
698 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
/*
 * Advances escape-sequence matching by one byte.
 * The byte c is mapped through normalize_esq_chars_2022[] and combined with
 * the running key as (*key << 5) + code; the result is then looked up in
 * escSeqStateTable_Key_2022[] with a binary search between low and hi
 * (hi starts at MAX_STATES_2022).
 * On a hit the parallel entry escSeqStateTable_Value_2022[mid] is returned as
 * a UCNV_TableStates_2022.
 * The updates of *key and *offset and the handling of a byte or key that is
 * not found are on lines not visible in this excerpt.
 */
702 static UCNV_TableStates_2022
703 getKey_2022(char c
,int32_t* key
,int32_t* offset
){
706 int32_t hi
= MAX_STATES_2022
;
709 togo
= normalize_esq_chars_2022
[(uint8_t)c
];
711 /* not a valid character anywhere in an escape sequence */
716 togo
= (*key
<< 5) + togo
;
718 while (hi
!= low
) /*binary search*/{
720 register int32_t mid
= (hi
+low
) >> 1; /*Finds median*/
725 if (escSeqStateTable_Key_2022
[mid
] > togo
){
728 else if (escSeqStateTable_Key_2022
[mid
] < togo
){
731 else /*we found it*/{
734 return (UCNV_TableStates_2022
)escSeqStateTable_Value_2022
[mid
];
745 /*runs through a state machine to determine the escape sequence - codepage correspondence
/*
 * Consumes an escape sequence from *source and applies the state change it
 * designates.
 *
 * Each byte read is appended to _this->toUBytes[] and fed to getKey_2022()
 * together with the key saved in the converter data; the key is written back
 * to myData2022->key after the loop. The outcome is then handled by value:
 *  - VALID_NON_TERMINAL_2022: the sequence is incomplete.
 *  - INVALID_2022: *err = U_ILLEGAL_ESCAPE_SEQUENCE.
 *  - terminal: the variant-specific code runs. With
 *    U_ENABLE_GENERIC_ISO_2022 a converter named by
 *    escSeqStateTable_Result_2022[offset] is opened; for JP and CN the new
 *    state comes from nextStateToUnicodeJP[offset] / nextStateToUnicodeCN[offset]
 *    and is stored into toU2022State.cs[] or applied as a single shift
 *    (g = 2 or 3, with prevG remembered). A single shift before its designator
 *    (cs[2] or cs[3] still 0) is U_ILLEGAL_ESCAPE_SEQUENCE; charsets not
 *    enabled for the version are U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * Afterwards: on success toULength is reset to 0; on
 * U_ILLEGAL_ESCAPE_SEQUENCE with more than one collected byte, every byte
 * except the first is backed out (from *source, or queued in preToU[] with a
 * negative preToULength when they came from an earlier buffer); on
 * U_UNSUPPORTED_ESCAPE_SEQUENCE toUCallbackReason is set to UCNV_UNASSIGNED.
 *
 * NOTE(review): jpCharsetMasks[myData2022->version] relies on version being
 * within the 5-entry table -- see where version is assigned in _ISO2022Open().
 */
748 changeState_2022(UConverter
* _this
,
750 const char* sourceLimit
,
753 UCNV_TableStates_2022 value
;
754 UConverterDataISO2022
* myData2022
= ((UConverterDataISO2022
*)_this
->extraInfo
);
755 uint32_t key
= myData2022
->key
;
757 int8_t initialToULength
= _this
->toULength
;
760 value
= VALID_NON_TERMINAL_2022
;
761 while (*source
< sourceLimit
) {
763 _this
->toUBytes
[_this
->toULength
++]=(uint8_t)c
;
764 value
= getKey_2022(c
,(int32_t *) &key
, &offset
);
768 case VALID_NON_TERMINAL_2022
:
769 /* continue with the loop */
772 case VALID_TERMINAL_2022
:
779 case VALID_MAYBE_TERMINAL_2022
:
780 #ifdef U_ENABLE_GENERIC_ISO_2022
781 /* ESC ( B is ambiguous only for ISO_2022 itself */
782 if(var
== ISO_2022
) {
783 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
784 _this
->toULength
= 0;
786 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
788 /* continue with the loop */
789 value
= VALID_NON_TERMINAL_2022
;
794 /* not ISO_2022 itself, finish here */
795 value
= VALID_TERMINAL_2022
;
803 myData2022
->key
= key
;
805 if (value
== VALID_NON_TERMINAL_2022
) {
806 /* indicate that the escape sequence is incomplete: key!=0 */
808 } else if (value
== INVALID_2022
) {
809 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
810 } else /* value == VALID_TERMINAL_2022 */ {
812 #ifdef U_ENABLE_GENERIC_ISO_2022
815 const char *chosenConverterName
= escSeqStateTable_Result_2022
[offset
];
816 if(chosenConverterName
== NULL
) {
818 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
819 _this
->toUCallbackReason
= UCNV_UNASSIGNED
;
823 _this
->mode
= UCNV_SI
;
824 ucnv_close(myData2022
->currentConverter
);
825 myData2022
->currentConverter
= myUConverter
= ucnv_open(chosenConverterName
, err
);
826 if(U_SUCCESS(*err
)) {
827 myUConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
828 _this
->mode
= UCNV_SO
;
835 StateEnum tempState
=(StateEnum
)nextStateToUnicodeJP
[offset
];
838 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
841 if(myData2022
->toU2022State
.cs
[2]!=0) {
842 if(myData2022
->toU2022State
.g
<2) {
843 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
845 myData2022
->toU2022State
.g
=2;
847 /* illegal to have SS2 before a matching designator */
848 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
851 /* case SS3_STATE: not used in ISO-2022-JP-x */
854 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
855 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
857 /* G2 charset for SS2 */
858 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
862 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
863 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
866 myData2022
->toU2022State
.cs
[0]=(int8_t)tempState
;
874 StateEnum tempState
=(StateEnum
)nextStateToUnicodeCN
[offset
];
877 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
880 if(myData2022
->toU2022State
.cs
[2]!=0) {
881 if(myData2022
->toU2022State
.g
<2) {
882 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
884 myData2022
->toU2022State
.g
=2;
886 /* illegal to have SS2 before a matching designator */
887 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
891 if(myData2022
->toU2022State
.cs
[3]!=0) {
892 if(myData2022
->toU2022State
.g
<2) {
893 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
895 myData2022
->toU2022State
.g
=3;
897 /* illegal to have SS3 before a matching designator */
898 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
902 if(myData2022
->version
==0) {
903 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
910 myData2022
->toU2022State
.cs
[1]=(int8_t)tempState
;
913 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
916 /* other CNS 11643 planes */
917 if(myData2022
->version
==0) {
918 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
920 myData2022
->toU2022State
.cs
[3]=(int8_t)tempState
;
928 /* nothing to be done, just accept this one escape sequence */
930 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
935 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
939 if(U_SUCCESS(*err
)) {
940 _this
->toULength
= 0;
941 } else if(*err
==U_ILLEGAL_ESCAPE_SEQUENCE
) {
942 if(_this
->toULength
>1) {
944 * Ticket 5691: consistent illegal sequences:
945 * - We include at least the first byte (ESC) in the illegal sequence.
946 * - If any of the non-initial bytes could be the start of a character,
947 * we stop the illegal sequence before the first one of those.
948 * In escape sequences, all following bytes are "printable", that is,
949 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
950 * they are valid single/lead bytes.
951 * For simplicity, we always only report the initial ESC byte as the
952 * illegal sequence and back out all other bytes we looked at.
954 /* Back out some bytes. */
955 int8_t backOutDistance
=_this
->toULength
-1;
956 int8_t bytesFromThisBuffer
=_this
->toULength
-initialToULength
;
957 if(backOutDistance
<=bytesFromThisBuffer
) {
958 /* same as initialToULength<=1 */
959 *source
-=backOutDistance
;
961 /* Back out bytes from the previous buffer: Need to replay them. */
962 _this
->preToULength
=(int8_t)(bytesFromThisBuffer
-backOutDistance
);
963 /* same as -(initialToULength-1) */
964 /* preToULength is negative! */
965 uprv_memcpy(_this
->preToU
, _this
->toUBytes
+1, -_this
->preToULength
);
966 *source
-=bytesFromThisBuffer
;
970 } else if(*err
==U_UNSUPPORTED_ESCAPE_SEQUENCE
) {
971 _this
->toUCallbackReason
= UCNV_UNASSIGNED
;
975 /*Checks the characters of the buffer against valid 2022 escape sequences
976 *if they match we return a pointer to the initial start of the sequence, otherwise
977 *we return sourceLimit
979 /*for 2022 looks ahead in the stream
980 *to determine the longest possible convertible
/*
 * Scans forward from *source and returns a const char * position within
 * [*source, sourceLimit].
 * With U_ENABLE_GENERIC_ISO_2022 the scan walks the buffer and, at each
 * ESC_2022 byte, runs getKey_2022() over the following bytes while the state
 * stays VALID_NON_TERMINAL_2022 to test whether an escape sequence starts
 * there. The last visible statement is a plain loop that runs while the
 * current byte is not ESC_2022 and the limit has not been reached.
 * The return statements and the flush parameter's declaration are on lines
 * not visible in this excerpt.
 */
983 static U_INLINE
const char*
984 getEndOfBuffer_2022(const char** source
,
985 const char* sourceLimit
,
988 const char* mySource
= *source
;
990 #ifdef U_ENABLE_GENERIC_ISO_2022
991 if (*source
>= sourceLimit
)
996 if (*mySource
== ESC_2022
){
1000 UCNV_TableStates_2022 value
= VALID_NON_TERMINAL_2022
;
1002 /* Kludge: I could not
1003 * figure out the reason for validating an escape sequence
1004 * twice - once here and once in changeState_2022().
1005 * is it possible to have an ESC character in a ISO2022
1006 * byte stream which is valid in a code page? Is it legal?
1009 (mySource
+i
< sourceLimit
)&&(value
== VALID_NON_TERMINAL_2022
);
1011 value
= getKey_2022(*(mySource
+i
), &key
, &offset
);
1013 if (value
> 0 || *mySource
==ESC_2022
)
1016 if ((value
== VALID_NON_TERMINAL_2022
)&&(!flush
) )
1019 }while (++mySource
< sourceLimit
);
1023 while(mySource
< sourceLimit
&& *mySource
!= ESC_2022
) {
1031 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1032 * any future change in _MBCSFromUChar32() function should be reflected here.
1033 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1035 static U_INLINE
int32_t
1036 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData
* sharedData
,
1043 const uint16_t *table
;
1044 uint32_t stage2Entry
;
1049 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1050 * Use internal version of ucnv_open() that verifies that the new structures are available,
1051 * else U_INTERNAL_PROGRAM_ERROR.
1053 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1054 if(c
<0x10000 || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1055 table
=sharedData
->mbcs
.fromUnicodeTable
;
1056 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
1057 /* get the bytes and the length for the output */
1058 if(outputType
==MBCS_OUTPUT_2
){
1059 myValue
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1065 } else /* outputType==MBCS_OUTPUT_3 */ {
1066 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1067 myValue
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
1070 } else if(myValue
<=0xffff) {
1076 /* is this code point assigned, or do we use fallbacks? */
1077 if((stage2Entry
&(1<<(16+(c
&0xf))))!=0) {
1081 } else if(FROM_U_USE_FALLBACK(useFallback
, c
) && myValue
!=0) {
1083 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1084 * There is no way with this data structure for fallback output
1085 * to be a zero byte.
1092 cx
=sharedData
->mbcs
.extIndexes
;
1094 return ucnv_extSimpleMatchFromU(cx
, c
, value
, useFallback
);
1101 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1102 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1103 * @param retval pointer to output byte
1104 * @return 1: roundtrip byte; 0: no mapping; -1: fallback byte
1106 static U_INLINE
int32_t
1107 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData
* sharedData
,
1112 const uint16_t *table
;
1114 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1115 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1118 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1119 table
=sharedData
->mbcs
.fromUnicodeTable
;
1120 /* get the byte for the output */
1121 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
1122 /* is this code point assigned, or do we use fallbacks? */
1123 *retval
=(uint32_t)(value
&0xff);
1125 return 1; /* roundtrip */
1126 } else if(useFallback
? value
>=0x800 : value
>=0xc00) {
1127 return -1; /* fallback taken */
1129 return 0; /* no mapping */
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 *
 * @param value candidate DBCS code: lead byte in bits 15..8, trail byte in bits 7..0
 * @return value-0x8080 if both bytes are in A1..FE;
 *         0 if out of range, including any value wider than 16 bits
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /*
     * The narrowing casts below only look at the low 16 (resp. 8) bits,
     * so values above 0xffff must be rejected explicitly; otherwise
     * e.g. 0x1a1a1 would pass the range checks and yield a 3-byte result.
     */
    if( value <= 0xffff &&
        (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&  /* whole pair within A1A1..FEFE */
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)            /* trail byte within A1..FE */
    ) {
        return value - 0x8080; /* shift down to 21..7e byte range */
    } else {
        return 0; /* not valid for ISO 2022 */
    }
}
1150 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1152 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1153 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1156 static U_INLINE
uint32_t
1157 _2022ToGR94DBCS(uint32_t value
) {
1158 uint32_t returnValue
= value
+ 0x8080;
1159 if( (uint16_t)(returnValue
- 0xa1a1) <= (0xfefe - 0xa1a1) &&
1160 (uint8_t)(returnValue
- 0xa1) <= (0xfe - 0xa1)) {
1168 #ifdef U_ENABLE_GENERIC_ISO_2022
1170 /**********************************************************************************
1171 * ISO-2022 Converter
1177 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
,
1179 const char* mySourceLimit
, *realSourceLimit
;
1180 const char* sourceStart
;
1181 const UChar
* myTargetStart
;
1182 UConverter
* saveThis
;
1183 UConverterDataISO2022
* myData
;
1186 saveThis
= args
->converter
;
1187 myData
=((UConverterDataISO2022
*)(saveThis
->extraInfo
));
1189 realSourceLimit
= args
->sourceLimit
;
1190 while (args
->source
< realSourceLimit
) {
1191 if(myData
->key
== 0) { /* are we in the middle of an escape sequence? */
1192 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1193 mySourceLimit
= getEndOfBuffer_2022(&(args
->source
), realSourceLimit
, args
->flush
);
1195 if(args
->source
< mySourceLimit
) {
1196 if(myData
->currentConverter
==NULL
) {
1197 myData
->currentConverter
= ucnv_open("ASCII",err
);
1198 if(U_FAILURE(*err
)){
1202 myData
->currentConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
1203 saveThis
->mode
= UCNV_SO
;
1206 /* convert to before the ESC or until the end of the buffer */
1207 myData
->isFirstBuffer
=FALSE
;
1208 sourceStart
= args
->source
;
1209 myTargetStart
= args
->target
;
1210 args
->converter
= myData
->currentConverter
;
1211 ucnv_toUnicode(args
->converter
,
1217 (UBool
)(args
->flush
&& mySourceLimit
== realSourceLimit
),
1219 args
->converter
= saveThis
;
1221 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
1222 /* move the overflow buffer */
1223 length
= saveThis
->UCharErrorBufferLength
= myData
->currentConverter
->UCharErrorBufferLength
;
1224 myData
->currentConverter
->UCharErrorBufferLength
= 0;
1226 uprv_memcpy(saveThis
->UCharErrorBuffer
,
1227 myData
->currentConverter
->UCharErrorBuffer
,
1228 length
*U_SIZEOF_UCHAR
);
1235 * -Error while converting
1236 * -Done with entire buffer
1237 * -Need to write offsets or update the current offset
1238 * (leave that up to the code in ucnv.c)
1240 * or else we just stopped at an ESC byte and continue with changeState_2022()
1242 if (U_FAILURE(*err
) ||
1243 (args
->source
== realSourceLimit
) ||
1244 (args
->offsets
!= NULL
&& (args
->target
!= myTargetStart
|| args
->source
!= sourceStart
) ||
1245 (mySourceLimit
< realSourceLimit
&& myData
->currentConverter
->toULength
> 0))
1247 /* copy partial or error input for truncated detection and error handling */
1248 if(U_FAILURE(*err
)) {
1249 length
= saveThis
->invalidCharLength
= myData
->currentConverter
->invalidCharLength
;
1251 uprv_memcpy(saveThis
->invalidCharBuffer
, myData
->currentConverter
->invalidCharBuffer
, length
);
1254 length
= saveThis
->toULength
= myData
->currentConverter
->toULength
;
1256 uprv_memcpy(saveThis
->toUBytes
, myData
->currentConverter
->toUBytes
, length
);
1257 if(args
->source
< mySourceLimit
) {
1258 *err
= U_TRUNCATED_CHAR_FOUND
; /* truncated input before ESC */
1267 sourceStart
= args
->source
;
1268 changeState_2022(args
->converter
,
1273 if (U_FAILURE(*err
) || (args
->source
!= sourceStart
&& args
->offsets
!= NULL
)) {
1274 /* let the ucnv.c code update its current offset */
1283 * To Unicode Callback helper function
1286 toUnicodeCallback(UConverter
*cnv
,
1287 const uint32_t sourceChar
, const uint32_t targetUniChar
,
1289 if(sourceChar
>0xff){
1290 cnv
->toUBytes
[0] = (uint8_t)(sourceChar
>>8);
1291 cnv
->toUBytes
[1] = (uint8_t)sourceChar
;
1295 cnv
->toUBytes
[0] =(char) sourceChar
;
1299 if(targetUniChar
== (missingCharMarker
-1/*0xfffe*/)){
1300 *err
= U_INVALID_CHAR_FOUND
;
1303 *err
= U_ILLEGAL_CHAR_FOUND
;
1307 /**************************************ISO-2022-JP*************************************************/
1309 /************************************** IMPORTANT **************************************************
1310 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1311 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1312 * The converter iterates over each Unicode codepoint
1313 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1314 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1315 * would do as far as possible.
1317 * If the implementation of these macros or structure of sharedData struct change in the future, make
1318 * sure that ISO-2022 is also changed.
1319 ***************************************************************************************************
1322 /***************************************************************************************************
1323 * Rules for ISO-2022-jp encoding
1324 * (i) Escape sequences must be fully contained within a line; they should not
1325 * span new lines or CRs
1326 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1327 * JIS-Roman character escape sequence should follow before the line terminates
1328 * (iii) If the first character on the line is represented by two bytes then a two
1329 * byte character escape sequence should precede it
1330 * (iv) If no escape sequence is encountered then the characters are ASCII
1331 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1332 * and invoked with SS2 (ESC N).
1333 * (vi) If there is any G0 designation in text, there must be a switch to
1334 * ASCII or to JIS X 0201-Roman before a space character (but not
1335 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1336 * characters such as tab or CRLF.
1337 * (vii) Supported encodings:
1338 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1342 * JISX201, JISX208,JISX212 : new .cnv data files created
1343 * KSC5601 : alias to ibm-949 mapping table
1344 * GB2312 : alias to ibm-1386 mapping table
1345 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1346 * ISO-8859-7 : alias to ibm-9409 mapping table
1349 /* preference order of JP charsets */
1350 static const StateEnum jpCharsetPref
[]={
1363 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1364 * not in order of jpCharsetPref[]!
1366 static const char escSeqChars
[][6] ={
1367 "\x1B\x28\x42", /* <ESC>(B ASCII */
1368 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
1369 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
1370 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
1371 "\x1B\x24\x42", /* <ESC>$B JISX-208 */
1372 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
1373 "\x1B\x24\x41", /* <ESC>$A GB2312 */
1374 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
1375 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
1378 static const int8_t escSeqCharsLen
[] ={
1379 3, /* length of <ESC>(B ASCII */
1380 3, /* length of <ESC>.A ISO-8859-1 */
1381 3, /* length of <ESC>.F ISO-8859-7 */
1382 3, /* length of <ESC>(J JISX-201 */
1383 3, /* length of <ESC>$B JISX-208 */
1384 4, /* length of <ESC>$(D JISX-212 */
1385 3, /* length of <ESC>$A GB2312 */
1386 4, /* length of <ESC>$(C KSC5601 */
1387 3 /* length of <ESC>(I HWKANA_7BIT */
1391 * The iteration over various code pages works this way:
1392 * i) Get the currentState from myConverterData->currentState
1393 * ii) Check if the character is mapped to a valid character in the currentState
1394 * Yes -> a) set the initIterState to currentState
1395 * b) remain in this state until an invalid character is found
1396 * No -> a) go to the next code page and find the character
1397 * iii) Before changing the state increment the current state check if the current state
1398 * is equal to the initIterState
1399 * Yes -> A character that cannot be represented in any of the supported encodings
1400 * break and return a U_INVALID_CHARACTER error
1401 * No -> Continue and find the character in next code page
1404 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1407 /* Map 00..7F to Unicode according to JIS X 0201. */
1408 static U_INLINE
uint32_t
1409 jisx201ToU(uint32_t value
) {
1412 } else if(value
== 0x5c) {
1414 } else if(value
== 0x7e) {
1416 } else /* value <= 0x7f */ {
1421 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
1422 static U_INLINE
uint32_t
1423 jisx201FromU(uint32_t value
) {
1425 if(value
!=0x5c && value
!=0x7e) {
1428 } else if(value
==0xa5) {
1430 } else if(value
==0x203e) {
1437 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1438 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1439 * Return 0 if the byte pair is out of range.
1441 static U_INLINE
uint32_t
1442 _2022FromSJIS(uint32_t value
) {
1445 if(value
> 0xEFFC) {
1446 return 0; /* beyond JIS X 0208 */
1449 trail
= (uint8_t)value
;
1451 value
&= 0xff00; /* lead byte */
1452 if(value
<= 0x9f00) {
1454 } else /* 0xe000 <= value <= 0xef00 */ {
1462 value
|= trail
- 0x1f;
1464 value
|= trail
- 0x20;
1466 } else /* trail <= 0xfc */ {
1467 value
|= trail
- 0x7e;
1473 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
1474 * If either byte is outside 21..7E make sure that the result is not valid
1475 * for Shift-JIS so that the converter catches it.
1476 * Some invalid byte values already turn into equally invalid Shift-JIS
1477 * byte values and need not be tested explicitly.
1479 static U_INLINE
void
1480 _2022ToSJIS(uint8_t c1
, uint8_t c2
, char bytes
[2]) {
1485 } else if(c2
<= 0x7e) {
1488 c2
= 0; /* invalid */
1491 if((uint8_t)(c2
-0x21) <= ((0x7e)-0x21)) {
1494 c2
= 0; /* invalid */
1500 } else if(c1
<= 0x3f) {
1503 c1
= 0; /* invalid */
1505 bytes
[0] = (char)c1
;
1506 bytes
[1] = (char)c2
;
1510 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1512 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1513 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1514 * These were the only fallbacks in ICU's jisx-208.ucm file.
1516 static const uint16_t hwkana_fb
[HWKANA_END
- HWKANA_START
+ 1] = {
1517 0x2123, /* U+FF61 */
1532 0x213C, /* U+FF70 */
1548 0x253F, /* U+FF80 */
1564 0x255F, /* U+FF90 */
1583 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
1584 UConverter
*cnv
= args
->converter
;
1585 UConverterDataISO2022
*converterData
;
1586 ISO2022State
*pFromU2022State
;
1587 uint8_t *target
= (uint8_t *) args
->target
;
1588 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
1589 const UChar
* source
= args
->source
;
1590 const UChar
* sourceLimit
= args
->sourceLimit
;
1591 int32_t* offsets
= args
->offsets
;
1594 int32_t len
, outLen
;
1596 int32_t choiceCount
;
1597 uint32_t targetValue
= 0;
1603 /* set up the state */
1604 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
1605 pFromU2022State
= &converterData
->fromU2022State
;
1609 /* check if the last codepoint of previous buffer was a lead surrogate*/
1610 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
1614 while(source
< sourceLimit
) {
1615 if(target
< targetLimit
) {
1617 sourceChar
= *(source
++);
1618 /*check if the char is a First surrogate*/
1619 if(UTF_IS_SURROGATE(sourceChar
)) {
1620 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
1622 /*look ahead to find the trail surrogate*/
1623 if(source
< sourceLimit
) {
1624 /* test the following code unit */
1625 UChar trail
=(UChar
) *source
;
1626 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1628 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
1629 cnv
->fromUChar32
=0x00;
1630 /* convert this supplementary code point */
1631 /* exit this condition tree */
1633 /* this is an unmatched lead code unit (1st surrogate) */
1634 /* callback(illegal) */
1635 *err
=U_ILLEGAL_CHAR_FOUND
;
1636 cnv
->fromUChar32
=sourceChar
;
1641 cnv
->fromUChar32
=sourceChar
;
1645 /* this is an unmatched trail code unit (2nd surrogate) */
1646 /* callback(illegal) */
1647 *err
=U_ILLEGAL_CHAR_FOUND
;
1648 cnv
->fromUChar32
=sourceChar
;
1653 /* do not convert SO/SI/ESC */
1654 if(IS_2022_CONTROL(sourceChar
)) {
1655 /* callback(illegal) */
1656 *err
=U_ILLEGAL_CHAR_FOUND
;
1657 cnv
->fromUChar32
=sourceChar
;
1661 /* do the conversion */
1663 if(choiceCount
== 0) {
1667 * The csm variable keeps track of which charsets are allowed
1668 * and not used yet while building the choices[].
1670 csm
= jpCharsetMasks
[converterData
->version
];
1673 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1674 if(converterData
->version
== 3 || converterData
->version
== 4) {
1675 choices
[choiceCount
++] = (int8_t)HWKANA_7BIT
;
1677 /* Do not try single-byte half-width Katakana for other versions. */
1678 csm
&= ~CSM(HWKANA_7BIT
);
1680 /* try the current G0 charset */
1681 choices
[choiceCount
++] = cs
= pFromU2022State
->cs
[0];
1684 /* try the current G2 charset */
1685 if((cs
= pFromU2022State
->cs
[2]) != 0) {
1686 choices
[choiceCount
++] = cs
;
1690 /* try all the other possible charsets */
1691 for(i
= 0; i
< LENGTHOF(jpCharsetPref
); ++i
) {
1692 cs
= (int8_t)jpCharsetPref
[i
];
1694 choices
[choiceCount
++] = cs
;
1702 * len==0: no mapping found yet
1703 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1704 * len>0: found a roundtrip result, done
1708 * We will turn off useFallback after finding a fallback,
1709 * but we still get fallbacks from PUA code points as usual.
1710 * Therefore, we will also need to check that we don't overwrite
1711 * an early fallback with a later one.
1713 useFallback
= cnv
->useFallback
;
1715 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
1718 int8_t cs0
= choices
[i
];
1721 if(sourceChar
<= 0x7f) {
1722 targetValue
= (uint32_t)sourceChar
;
1729 if(GR96_START
<= sourceChar
&& sourceChar
<= GR96_END
) {
1730 targetValue
= (uint32_t)sourceChar
- 0x80;
1737 if((uint32_t)(sourceChar
- HWKANA_START
) <= (HWKANA_END
- HWKANA_START
)) {
1738 if(converterData
->version
==3) {
1739 /* JIS7: use G1 (SO) */
1740 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1741 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0x21));
1743 pFromU2022State
->cs
[1] = cs
= cs0
; /* do not output an escape sequence */
1745 } else if(converterData
->version
==4) {
1746 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1747 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1748 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0xa1));
1751 cs
= pFromU2022State
->cs
[0];
1752 if(IS_JP_DBCS(cs
)) {
1753 /* switch from a DBCS charset to JISX201 */
1754 cs
= (int8_t)JISX201
;
1756 /* else stay in the current G0 charset */
1759 /* else do not use HWKANA_7BIT with other versions */
1764 value
= jisx201FromU(sourceChar
);
1766 targetValue
= value
;
1770 useFallback
= FALSE
;
1774 /* G0 DBCS from Shift-JIS table */
1775 len2
= MBCS_FROM_UCHAR32_ISO2022(
1776 converterData
->myConverterArray
[cs0
],
1778 useFallback
, MBCS_OUTPUT_2
);
1779 if(len2
== 2 || (len2
== -2 && len
== 0)) { /* only accept DBCS: abs(len)==2 */
1780 value
= _2022FromSJIS(value
);
1782 targetValue
= value
;
1786 useFallback
= FALSE
;
1788 } else if(len
== 0 && useFallback
&&
1789 (uint32_t)(sourceChar
- HWKANA_START
) <= (HWKANA_END
- HWKANA_START
)) {
1790 targetValue
= hwkana_fb
[sourceChar
- HWKANA_START
];
1794 useFallback
= FALSE
;
1798 /* G0 SBCS forced to 7-bit output */
1799 len2
= MBCS_SINGLE_FROM_UCHAR32(
1800 converterData
->myConverterArray
[cs0
],
1803 if(len2
!= 0 && !(len2
< 0 && len
!= 0) && GR96_START
<= value
&& value
<= GR96_END
) {
1804 targetValue
= value
- 0x80;
1808 useFallback
= FALSE
;
1813 len2
= MBCS_FROM_UCHAR32_ISO2022(
1814 converterData
->myConverterArray
[cs0
],
1816 useFallback
, MBCS_OUTPUT_2
);
1817 if(len2
== 2 || (len2
== -2 && len
== 0)) { /* only accept DBCS: abs(len)==2 */
1818 if(cs0
== KSC5601
) {
1820 * Check for valid bytes for the encoding scheme.
1821 * This is necessary because the sub-converter (windows-949)
1822 * has a broader encoding scheme than is valid for 2022.
1824 value
= _2022FromGR94DBCS(value
);
1829 targetValue
= value
;
1833 useFallback
= FALSE
;
1841 len
= -len
; /* fallback */
1843 outLen
= 0; /* count output bytes */
1845 /* write SI if necessary (only for JIS7) */
1846 if(pFromU2022State
->g
== 1 && g
== 0) {
1847 buffer
[outLen
++] = UCNV_SI
;
1848 pFromU2022State
->g
= 0;
1851 /* write the designation sequence if necessary */
1852 if(cs
!= pFromU2022State
->cs
[g
]) {
1853 int32_t escLen
= escSeqCharsLen
[cs
];
1854 uprv_memcpy(buffer
+ outLen
, escSeqChars
[cs
], escLen
);
1856 pFromU2022State
->cs
[g
] = cs
;
1858 /* invalidate the choices[] */
1862 /* write the shift sequence if necessary */
1863 if(g
!= pFromU2022State
->g
) {
1865 /* case 0 handled before writing escapes */
1867 buffer
[outLen
++] = UCNV_SO
;
1868 pFromU2022State
->g
= 1;
1870 default: /* case 2 */
1871 buffer
[outLen
++] = 0x1b;
1872 buffer
[outLen
++] = 0x4e;
1874 /* no case 3: no SS3 in ISO-2022-JP-x */
1878 /* write the output bytes */
1880 buffer
[outLen
++] = (char)targetValue
;
1881 } else /* len == 2 */ {
1882 buffer
[outLen
++] = (char)(targetValue
>> 8);
1883 buffer
[outLen
++] = (char)targetValue
;
1887 * if we cannot find the character after checking all codepages
1888 * then this is an error
1890 *err
= U_INVALID_CHAR_FOUND
;
1891 cnv
->fromUChar32
=sourceChar
;
1895 if(sourceChar
== CR
|| sourceChar
== LF
) {
1896 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1897 pFromU2022State
->cs
[2] = 0;
1901 /* output outLen>0 bytes in buffer[] */
1903 *target
++ = buffer
[0];
1905 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
1907 } else if(outLen
== 2 && (target
+ 2) <= targetLimit
) {
1908 *target
++ = buffer
[0];
1909 *target
++ = buffer
[1];
1911 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
1912 *offsets
++ = sourceIndex
;
1913 *offsets
++ = sourceIndex
;
1919 &target
, (const char *)targetLimit
,
1920 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
1922 if(U_FAILURE(*err
)) {
1926 } /* end if(myTargetIndex<myTargetLength) */
1928 *err
=U_BUFFER_OVERFLOW_ERROR
;
1932 }/* end while(mySourceIndex<mySourceLength) */
1935 * the end of the input stream and detection of truncated input
1936 * are handled by the framework, but for ISO-2022-JP conversion
1937 * we need to be in ASCII mode at the very end
1941 * in SO mode or not in ASCII mode
1942 * end of input and no truncated input
1944 if( U_SUCCESS(*err
) &&
1945 (pFromU2022State
->g
!=0 || pFromU2022State
->cs
[0]!=ASCII
) &&
1946 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
1948 int32_t sourceIndex
;
1952 if(pFromU2022State
->g
!= 0) {
1953 buffer
[outLen
++] = UCNV_SI
;
1954 pFromU2022State
->g
= 0;
1957 if(pFromU2022State
->cs
[0] != ASCII
) {
1958 int32_t escLen
= escSeqCharsLen
[ASCII
];
1959 uprv_memcpy(buffer
+ outLen
, escSeqChars
[ASCII
], escLen
);
1961 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
1964 /* get the source index of the last input character */
1966 * TODO this would be simpler and more reliable if we used a pair
1967 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1968 * so that we could simply use the prevSourceIndex here;
1969 * this code gives an incorrect result for the rare case of an unmatched
1970 * trail surrogate that is alone in the last buffer of the text stream
1972 sourceIndex
=(int32_t)(source
-args
->source
);
1975 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
1976 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
1987 &target
, (const char *)targetLimit
,
1988 &offsets
, sourceIndex
,
1992 /*save the state and return */
1993 args
->source
= source
;
1994 args
->target
= (char*)target
;
1997 /*************** to unicode *******************/
2000 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2003 const char *mySource
= (char *) args
->source
;
2004 UChar
*myTarget
= args
->target
;
2005 const char *mySourceLimit
= args
->sourceLimit
;
2006 uint32_t targetUniChar
= 0x0000;
2007 uint32_t mySourceChar
= 0x0000;
2008 uint32_t tmpSourceChar
= 0x0000;
2009 UConverterDataISO2022
* myData
;
2010 ISO2022State
*pToU2022State
;
2013 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2014 pToU2022State
= &myData
->toU2022State
;
2016 if(myData
->key
!= 0) {
2017 /* continue with a partial escape sequence */
2019 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2020 /* continue with a partial double-byte character */
2021 mySourceChar
= args
->converter
->toUBytes
[0];
2022 args
->converter
->toULength
= 0;
2023 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2024 targetUniChar
= missingCharMarker
;
2028 while(mySource
< mySourceLimit
){
2030 targetUniChar
=missingCharMarker
;
2032 if(myTarget
< args
->targetLimit
){
2034 mySourceChar
= (unsigned char) *mySource
++;
2036 switch(mySourceChar
) {
2038 if(myData
->version
==3) {
2042 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2043 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
2048 if(myData
->version
==3) {
2049 /* JIS7: switch to G1 half-width Katakana */
2050 pToU2022State
->cs
[1] = (int8_t)HWKANA_7BIT
;
2054 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2055 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
2063 const char * mySourceBefore
= mySource
;
2064 int8_t toULengthBefore
= args
->converter
->toULength
;
2066 changeState_2022(args
->converter
,&(mySource
),
2067 mySourceLimit
, ISO_2022_JP
,err
);
2069 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2070 if(myData
->version
==0 && myData
->key
==0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
2071 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
2072 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
2073 args
->converter
->toULength
= toULengthBefore
+ (mySource
- mySourceBefore
);
2077 /* invalid or illegal escape sequence */
2078 if(U_FAILURE(*err
)){
2079 args
->target
= myTarget
;
2080 args
->source
= mySource
;
2081 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
2084 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2085 if(myData
->key
==0) {
2086 myData
->isEmptySegment
= TRUE
;
2090 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2095 /* automatically reset to single-byte mode */
2096 if((StateEnum
)pToU2022State
->cs
[0] != ASCII
&& (StateEnum
)pToU2022State
->cs
[0] != JISX201
) {
2097 pToU2022State
->cs
[0] = (int8_t)ASCII
;
2099 pToU2022State
->cs
[2] = 0;
2100 pToU2022State
->g
= 0;
2103 /* convert one or two bytes */
2104 myData
->isEmptySegment
= FALSE
;
2105 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2106 if( (uint8_t)(mySourceChar
- 0xa1) <= (0xdf - 0xa1) && myData
->version
==4 &&
2109 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2110 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0xa1);
2112 /* return from a single-shift state to the previous one */
2113 if(pToU2022State
->g
>= 2) {
2114 pToU2022State
->g
=pToU2022State
->prevG
;
2118 if(mySourceChar
<= 0x7f) {
2119 targetUniChar
= mySourceChar
;
2123 if(mySourceChar
<= 0x7f) {
2124 targetUniChar
= mySourceChar
+ 0x80;
2126 /* return from a single-shift state to the previous one */
2127 pToU2022State
->g
=pToU2022State
->prevG
;
2130 if(mySourceChar
<= 0x7f) {
2131 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2133 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2134 myData
->myConverterArray
[cs
],
2135 mySourceChar
+ 0x80);
2137 /* return from a single-shift state to the previous one */
2138 pToU2022State
->g
=pToU2022State
->prevG
;
2141 if(mySourceChar
<= 0x7f) {
2142 targetUniChar
= jisx201ToU(mySourceChar
);
2146 if((uint8_t)(mySourceChar
- 0x21) <= (0x5f - 0x21)) {
2147 /* 7-bit halfwidth Katakana */
2148 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0x21);
2153 if(mySource
< mySourceLimit
) {
2154 int leadIsOk
, trailIsOk
;
2157 trailByte
= (uint8_t)*mySource
;
2159 * Ticket 5691: consistent illegal sequences:
2160 * - We include at least the first byte in the illegal sequence.
2161 * - If any of the non-initial bytes could be the start of a character,
2162 * we stop the illegal sequence before the first one of those.
2164 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2165 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2166 * Otherwise we convert or report the pair of bytes.
2168 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
2169 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
2170 if (leadIsOk
&& trailIsOk
) {
2172 tmpSourceChar
= (mySourceChar
<< 8) | trailByte
;
2174 _2022ToSJIS((uint8_t)mySourceChar
, trailByte
, tempBuf
);
2175 mySourceChar
= tmpSourceChar
;
2177 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2178 mySourceChar
= tmpSourceChar
;
2179 if (cs
== KSC5601
) {
2180 tmpSourceChar
+= 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2182 tempBuf
[0] = (char)(tmpSourceChar
>> 8);
2183 tempBuf
[1] = (char)(tmpSourceChar
);
2185 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->myConverterArray
[cs
], tempBuf
, 2, FALSE
);
2186 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
2187 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2189 /* add another bit so that the code below writes 2 bytes in case of error */
2190 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
2193 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2194 args
->converter
->toULength
= 1;
2197 } /* End of inner switch */
2199 } /* End of outer switch */
2200 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
2202 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2204 *(myTarget
++)=(UChar
)targetUniChar
;
2206 else if(targetUniChar
> missingCharMarker
){
2207 /* disassemble the surrogate pair and write to output*/
2208 targetUniChar
-=0x0010000;
2209 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
2211 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2214 if(myTarget
< args
->targetLimit
){
2215 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2217 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2221 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
2222 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2227 /* Call the callback function*/
2228 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2232 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2233 *err
=U_BUFFER_OVERFLOW_ERROR
;
2238 args
->target
= myTarget
;
2239 args
->source
= mySource
;
2243 /***************************************************************
2244 * Rules for ISO-2022-KR encoding
2245 * i) The KSC5601 designator sequence should appear only once in a file,
2246 * at the beginning of a line before any KSC5601 characters. This usually
2247 * means that it appears by itself on the first line of the file
2248 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2249 * and SI to shift into single byte mode
/*
 * ISO-2022-KR fromUnicode path that delegates to the sub-converter kept in
 * the ISO-2022 data (myConverterData->currentConverter):
 *  - args->converter is temporarily replaced by currentConverter;
 *  - the pending fromUChar32 is handed to the sub-converter before the call
 *    to ucnv_MBCSFromUnicodeWithOffsets() and copied back afterwards;
 *  - on U_BUFFER_OVERFLOW_ERROR with a non-empty sub-converter
 *    charErrorBuffer, that buffer and its length are transferred to the
 *    outer converter and the sub-converter's length is reset to 0;
 *  - args->converter is restored at the end.
 * NOTE(review): this listing omits some source lines (for example the name
 * of the call that receives the three buffer arguments, presumably a memory
 * copy) - confirm against the complete file.
 */
2252 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2254 UConverter
* saveConv
= args
->converter
;
2255 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*)saveConv
->extraInfo
;
2256 args
->converter
=myConverterData
->currentConverter
;
2258 myConverterData
->currentConverter
->fromUChar32
= saveConv
->fromUChar32
;
2259 ucnv_MBCSFromUnicodeWithOffsets(args
,err
);
2260 saveConv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
2262 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2263 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
2265 saveConv
->charErrorBuffer
,
2266 myConverterData
->currentConverter
->charErrorBuffer
,
2267 myConverterData
->currentConverter
->charErrorBufferLength
);
2269 saveConv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
2270 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
2272 args
->converter
=saveConv
;
/*
 * Converts UTF-16 input (args->source) to ISO-2022-KR bytes (args->target)
 * and records source offsets.
 *
 * - When converterData->version==1 the work is passed to
 *   UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 * - Otherwise each code unit is looked up with MBCS_FROM_UCHAR32_ISO2022()
 *   in currentConverter's sharedData (MBCS_OUTPUT_2). SO/SI/ESC
 *   (IS_2022_CONTROL) are rejected with U_ILLEGAL_CHAR_FOUND.
 * - A result is replaced by missingCharMarker when its length is 0 or >2,
 *   when a one-byte result is above 0x7f, or when the value is outside
 *   a1a1..fefe / its low byte is outside a1..fe (presumably only checked for
 *   two-byte results; that part of the condition is not in this listing).
 * - UCNV_SO or UCNV_SI is written whenever the single/double-byte mode
 *   changes; double-byte values are written with 0x80 subtracted from each
 *   byte. Bytes that do not fit into the target go to charErrorBuffer and
 *   U_BUFFER_OVERFLOW_ERROR is set.
 * - Unmappable input sets U_INVALID_CHAR_FOUND or U_ILLEGAL_CHAR_FOUND and
 *   stores the code point in converter->fromUChar32.
 * - At the end of flushed input the state is switched back to single-byte
 *   (isTargetByteDBCS=FALSE); the DBCS flag is saved in
 *   converter->fromUnicodeStatus.
 *
 * NOTE(review): this listing omits source lines (else branches, braces,
 * some call names); verify details against the complete file.
 */
2276 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2278 const UChar
*source
= args
->source
;
2279 const UChar
*sourceLimit
= args
->sourceLimit
;
2280 unsigned char *target
= (unsigned char *) args
->target
;
2281 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
2282 int32_t* offsets
= args
->offsets
;
2283 uint32_t targetByteUnit
= 0x0000;
2284 UChar32 sourceChar
= 0x0000;
2285 UBool isTargetByteDBCS
;
2286 UBool oldIsTargetByteDBCS
;
2287 UConverterDataISO2022
*converterData
;
2288 UConverterSharedData
* sharedData
;
2292 converterData
=(UConverterDataISO2022
*)args
->converter
->extraInfo
;
2293 /* if the version is 1 then the user is requesting
2294 * conversion with ibm-25546 pass the arguments to
2295 * MBCS converter and return
2297 if(converterData
->version
==1){
2298 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2302 /* initialize data */
2303 sharedData
= converterData
->currentConverter
->sharedData
;
2304 useFallback
= args
->converter
->useFallback
;
2305 isTargetByteDBCS
=(UBool
)args
->converter
->fromUnicodeStatus
;
2306 oldIsTargetByteDBCS
= isTargetByteDBCS
;
2308 isTargetByteDBCS
= (UBool
) args
->converter
->fromUnicodeStatus
;
2309 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
<targetLimit
) {
2312 while(source
< sourceLimit
){
2314 targetByteUnit
= missingCharMarker
;
2316 if(target
< (unsigned char*) args
->targetLimit
){
2317 sourceChar
= *source
++;
2319 /* do not convert SO/SI/ESC */
2320 if(IS_2022_CONTROL(sourceChar
)) {
2321 /* callback(illegal) */
2322 *err
=U_ILLEGAL_CHAR_FOUND
;
2323 args
->converter
->fromUChar32
=sourceChar
;
2327 length
= MBCS_FROM_UCHAR32_ISO2022(sharedData
,sourceChar
,&targetByteUnit
,useFallback
,MBCS_OUTPUT_2
);
2329 length
= -length
; /* fallback */
2331 /* only DBCS or SBCS characters are expected*/
2332 /* DB characters with high bit set to 1 are expected */
2333 if( length
> 2 || length
==0 ||
2334 (length
== 1 && targetByteUnit
> 0x7f) ||
2336 ((uint16_t)(targetByteUnit
- 0xa1a1) > (0xfefe - 0xa1a1) ||
2337 (uint8_t)(targetByteUnit
- 0xa1) > (0xfe - 0xa1)))
2339 targetByteUnit
=missingCharMarker
;
2341 if (targetByteUnit
!= missingCharMarker
){
2343 oldIsTargetByteDBCS
= isTargetByteDBCS
;
2344 isTargetByteDBCS
= (UBool
)(targetByteUnit
>0x00FF);
2345 /* append the shift sequence */
2346 if (oldIsTargetByteDBCS
!= isTargetByteDBCS
){
2348 if (isTargetByteDBCS
)
2349 *target
++ = UCNV_SO
;
2351 *target
++ = UCNV_SI
;
2353 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2355 /* write the targetUniChar to target */
2356 if(targetByteUnit
<= 0x00FF){
2357 if( target
< targetLimit
){
2358 *(target
++) = (unsigned char) targetByteUnit
;
2360 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2364 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
);
2365 *err
= U_BUFFER_OVERFLOW_ERROR
;
2368 if(target
< targetLimit
){
2369 *(target
++) =(unsigned char) ((targetByteUnit
>>8) -0x80);
2371 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2373 if(target
< targetLimit
){
2374 *(target
++) =(unsigned char) (targetByteUnit
-0x80);
2376 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2379 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2380 *err
= U_BUFFER_OVERFLOW_ERROR
;
2383 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) ((targetByteUnit
>>8) -0x80);
2384 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2385 *err
= U_BUFFER_OVERFLOW_ERROR
;
2391 /* oops.. the code point is unassigned
2392 * set the error and reason
2395 /*check if the char is a First surrogate*/
2396 if(UTF_IS_SURROGATE(sourceChar
)) {
2397 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2399 /*look ahead to find the trail surrogate*/
2400 if(source
< sourceLimit
) {
2401 /* test the following code unit */
2402 UChar trail
=(UChar
) *source
;
2403 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2405 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2406 *err
= U_INVALID_CHAR_FOUND
;
2407 /* convert this surrogate code point */
2408 /* exit this condition tree */
2410 /* this is an unmatched lead code unit (1st surrogate) */
2411 /* callback(illegal) */
2412 *err
=U_ILLEGAL_CHAR_FOUND
;
2416 *err
= U_ZERO_ERROR
;
2419 /* this is an unmatched trail code unit (2nd surrogate) */
2420 /* callback(illegal) */
2421 *err
=U_ILLEGAL_CHAR_FOUND
;
2424 /* callback(unassigned) for a BMP code point */
2425 *err
= U_INVALID_CHAR_FOUND
;
2428 args
->converter
->fromUChar32
=sourceChar
;
2431 } /* end if(myTargetIndex<myTargetLength) */
2433 *err
=U_BUFFER_OVERFLOW_ERROR
;
2437 }/* end while(mySourceIndex<mySourceLength) */
2440 * the end of the input stream and detection of truncated input
2441 * are handled by the framework, but for ISO-2022-KR conversion
2442 * we need to be in ASCII mode at the very end
2447 * end of input and no truncated input
2449 if( U_SUCCESS(*err
) &&
2451 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2453 int32_t sourceIndex
;
2455 /* we are switching to ASCII */
2456 isTargetByteDBCS
=FALSE
;
2458 /* get the source index of the last input character */
2460 * TODO this would be simpler and more reliable if we used a pair
2461 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2462 * so that we could simply use the prevSourceIndex here;
2463 * this code gives an incorrect result for the rare case of an unmatched
2464 * trail surrogate that is alone in the last buffer of the text stream
2466 sourceIndex
=(int32_t)(source
-args
->source
);
2469 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2470 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2481 &target
, (const char *)targetLimit
,
2482 &offsets
, sourceIndex
,
2486 /*save the state and return */
2487 args
->source
= source
;
2488 args
->target
= (char*)target
;
2489 args
->converter
->fromUnicodeStatus
= (uint32_t)isTargetByteDBCS
;
2492 /************************ To Unicode ***************************************/
/*
 * ISO-2022-KR toUnicode path that delegates the byte-to-Unicode work to the
 * sub-converter myData->currentConverter.
 *
 * - A local UConverterToUnicodeArgs (subArgs) is filled by copying at most
 *   sizeof(UConverterToUnicodeArgs) bytes from args, and its converter is set
 *   to the sub-converter.
 * - While there is no error and input remains, the span up to the position
 *   returned by getEndOfBuffer_2022() is converted with
 *   ucnv_MBCSToUnicodeWithOffsets(). Pending toUBytes/toULength are moved
 *   into the sub-converter before the call and back to the public converter
 *   afterwards.
 * - When offsets are requested and the input has advanced past sourceStart,
 *   the offsets written by the sub-converter are walked to rebase them by
 *   delta (the loop body is not part of this listing - TODO confirm).
 * - On U_BUFFER_OVERFLOW_ERROR the sub-converter's UCharErrorBuffer and its
 *   length are transferred to the public converter.
 * - changeState_2022() is called after a converted span; its remaining
 *   arguments are not visible here.
 *
 * NOTE(review): this listing omits source lines; verify against the
 * complete file.
 */
2495 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs
*args
,
2497 char const* sourceStart
;
2498 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2500 UConverterToUnicodeArgs subArgs
;
2501 int32_t minArgsSize
;
2503 /* set up the subconverter arguments */
2504 if(args
->size
<sizeof(UConverterToUnicodeArgs
)) {
2505 minArgsSize
= args
->size
;
2507 minArgsSize
= (int32_t)sizeof(UConverterToUnicodeArgs
);
2510 uprv_memcpy(&subArgs
, args
, minArgsSize
);
2511 subArgs
.size
= (uint16_t)minArgsSize
;
2512 subArgs
.converter
= myData
->currentConverter
;
2514 /* remember the original start of the input for offsets */
2515 sourceStart
= args
->source
;
2517 if(myData
->key
!= 0) {
2518 /* continue with a partial escape sequence */
2522 while(U_SUCCESS(*err
) && args
->source
< args
->sourceLimit
) {
2523 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2524 subArgs
.source
= args
->source
;
2525 subArgs
.sourceLimit
= getEndOfBuffer_2022(&(args
->source
), args
->sourceLimit
, args
->flush
);
2526 if(subArgs
.source
!= subArgs
.sourceLimit
) {
2528 * get the current partial byte sequence
2530 * it needs to be moved between the public and the subconverter
2531 * so that the conversion framework, which only sees the public
2532 * converter, can handle truncated and illegal input etc.
2534 if(args
->converter
->toULength
> 0) {
2535 uprv_memcpy(subArgs
.converter
->toUBytes
, args
->converter
->toUBytes
, args
->converter
->toULength
);
2537 subArgs
.converter
->toULength
= args
->converter
->toULength
;
2540 * Convert up to the end of the input, or to before the next escape character.
2541 * Does not handle conversion extensions because the preToU[] state etc.
2544 ucnv_MBCSToUnicodeWithOffsets(&subArgs
, err
);
2546 if(args
->offsets
!= NULL
&& sourceStart
!= args
->source
) {
2547 /* update offsets to base them on the actual start of the input */
2548 int32_t *offsets
= args
->offsets
;
2549 UChar
*target
= args
->target
;
2550 int32_t delta
= (int32_t)(args
->source
- sourceStart
);
2551 while(target
< subArgs
.target
) {
2559 args
->source
= subArgs
.source
;
2560 args
->target
= subArgs
.target
;
2561 args
->offsets
= subArgs
.offsets
;
2563 /* copy input/error/overflow buffers */
2564 if(subArgs
.converter
->toULength
> 0) {
2565 uprv_memcpy(args
->converter
->toUBytes
, subArgs
.converter
->toUBytes
, subArgs
.converter
->toULength
);
2567 args
->converter
->toULength
= subArgs
.converter
->toULength
;
2569 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2570 if(subArgs
.converter
->UCharErrorBufferLength
> 0) {
2571 uprv_memcpy(args
->converter
->UCharErrorBuffer
, subArgs
.converter
->UCharErrorBuffer
,
2572 subArgs
.converter
->UCharErrorBufferLength
);
2574 args
->converter
->UCharErrorBufferLength
=subArgs
.converter
->UCharErrorBufferLength
;
2575 subArgs
.converter
->UCharErrorBufferLength
= 0;
2579 if (U_FAILURE(*err
) || (args
->source
== args
->sourceLimit
)) {
2584 changeState_2022(args
->converter
,
/*
 * Converts ISO-2022-KR bytes (args->source) to UTF-16 (args->target) and
 * records source offsets.
 *
 * - When myData->version==1 the work is passed to
 *   UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 * - A single byte saved in converter->toUBytes[0] (toULength==1) is picked
 *   up again as the lead byte of a partial double-byte character.
 * - UCNV_SI sets toU2022State.g to 0; if the segment opened by the last SO
 *   is still empty (isEmptySegment) it reports U_ILLEGAL_ESCAPE_SEQUENCE
 *   with reason UCNV_IRREGULAR and returns the SI byte in toUBytes.
 * - UCNV_SO sets toU2022State.g to 1 and marks a new, empty segment.
 * - ESC_2022 is handed to changeState_2022(..., ISO_2022_KR, err).
 * - With g==1, a lead/trail pair whose bytes are both in 0x21..0x7e is
 *   converted by adding 0x80 to each byte and calling
 *   ucnv_MBCSSimpleGetNextUChar() with length 2. If the trail byte is neither
 *   in that range nor an ISO-2022 control, both bytes are reported together
 *   (flagged by 0x10000 in mySourceChar). A lead byte at the end of the input
 *   is stored in toUBytes with toULength=1.
 * - Outside of g==1, bytes <=0x7f are converted as single bytes; the other
 *   visible assignment yields 0xffff.
 * - Results below 0xfffe are written to the target; otherwise
 *   toUnicodeCallback() is invoked. A full target sets
 *   U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): this listing omits source lines (else branches, braces);
 * verify against the complete file.
 */
2593 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2596 const char *mySource
= ( char *) args
->source
;
2597 UChar
*myTarget
= args
->target
;
2598 const char *mySourceLimit
= args
->sourceLimit
;
2599 UChar32 targetUniChar
= 0x0000;
2600 UChar mySourceChar
= 0x0000;
2601 UConverterDataISO2022
* myData
;
2602 UConverterSharedData
* sharedData
;
2605 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2606 if(myData
->version
==1){
2607 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2611 /* initialize state */
2612 sharedData
= myData
->currentConverter
->sharedData
;
2613 useFallback
= args
->converter
->useFallback
;
2615 if(myData
->key
!= 0) {
2616 /* continue with a partial escape sequence */
2618 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2619 /* continue with a partial double-byte character */
2620 mySourceChar
= args
->converter
->toUBytes
[0];
2621 args
->converter
->toULength
= 0;
2625 while(mySource
< mySourceLimit
){
2627 if(myTarget
< args
->targetLimit
){
2629 mySourceChar
= (unsigned char) *mySource
++;
2631 if(mySourceChar
==UCNV_SI
){
2632 myData
->toU2022State
.g
= 0;
2633 if (myData
->isEmptySegment
) {
2634 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
2635 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
2636 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
2637 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2638 args
->converter
->toULength
= 1;
2639 args
->target
= myTarget
;
2640 args
->source
= mySource
;
2643 /*consume the source */
2645 }else if(mySourceChar
==UCNV_SO
){
2646 myData
->toU2022State
.g
= 1;
2647 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
2648 /*consume the source */
2650 }else if(mySourceChar
==ESC_2022
){
2653 myData
->isEmptySegment
= FALSE
; /* Any invalid ESC sequences will be detected separately, so just reset this */
2654 changeState_2022(args
->converter
,&(mySource
),
2655 mySourceLimit
, ISO_2022_KR
, err
);
2656 if(U_FAILURE(*err
)){
2657 args
->target
= myTarget
;
2658 args
->source
= mySource
;
2664 myData
->isEmptySegment
= FALSE
; /* Any invalid char errors will be detected separately, so just reset this */
2665 if(myData
->toU2022State
.g
== 1) {
2666 if(mySource
< mySourceLimit
) {
2667 int leadIsOk
, trailIsOk
;
2670 targetUniChar
= missingCharMarker
;
2671 trailByte
= (uint8_t)*mySource
;
2673 * Ticket 5691: consistent illegal sequences:
2674 * - We include at least the first byte in the illegal sequence.
2675 * - If any of the non-initial bytes could be the start of a character,
2676 * we stop the illegal sequence before the first one of those.
2678 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2679 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2680 * Otherwise we convert or report the pair of bytes.
2682 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
2683 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
2684 if (leadIsOk
&& trailIsOk
) {
2686 tempBuf
[0] = (char)(mySourceChar
+ 0x80);
2687 tempBuf
[1] = (char)(trailByte
+ 0x80);
2688 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, tempBuf
, 2, useFallback
);
2689 mySourceChar
= (mySourceChar
<< 8) | trailByte
;
2690 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
2691 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2693 /* add another bit so that the code below writes 2 bytes in case of error */
2694 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
2697 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2698 args
->converter
->toULength
= 1;
2702 else if(mySourceChar
<= 0x7f) {
2703 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, mySource
- 1, 1, useFallback
);
2705 targetUniChar
= 0xffff;
2707 if(targetUniChar
< 0xfffe){
2709 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2711 *(myTarget
++)=(UChar
)targetUniChar
;
2714 /* Call the callback function*/
2715 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2720 *err
=U_BUFFER_OVERFLOW_ERROR
;
2724 args
->target
= myTarget
;
2725 args
->source
= mySource
;
2728 /*************************** END ISO2022-KR *********************************/
2730 /*************************** ISO-2022-CN *********************************
2732 * Rules for ISO-2022-CN Encoding:
2733 * i) The designator sequence must appear once on a line before any instance
2734 * of character set it designates.
2735 * ii) If two lines contain characters from the same character set, both lines
2736 * must include the designator sequence.
2737 * iii) Once the designator sequence is known, a shifting sequence has to be found
2738 * to invoke the shifting
2739 * iv) All lines start in ASCII and end in ASCII.
2740 * v) Four shifting sequences are employed for this purpose:
2742 * Sequence ASCII Eq Charsets
2743 * ---------- ------- ---------
2745 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2746 * SS2 <ESC>N CNS-11643-1992 Plane 2
2747 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2750 * SOdesignator : ESC "$" ")" finalchar_for_SO
2751 * SS2designator : ESC "$" "*" finalchar_for_SS2
2752 * SS3designator : ESC "$" "+" finalchar_for_SS3
2754 * ESC $ ) A Indicates the bytes following SO are Chinese
2755 * characters as defined in GB 2312-80, until
2756 * another SOdesignation appears
2759 * ESC $ ) E Indicates the bytes following SO are as defined
2760 * in ISO-IR-165 (for details, see section 2.1),
2761 * until another SOdesignation appears
2763 * ESC $ ) G Indicates the bytes following SO are as defined
2764 * in CNS 11643-plane-1, until another
2765 * SOdesignation appears
2767 * ESC $ * H Indicates the two bytes immediately following
2768 * SS2 is a Chinese character as defined in CNS
2769 * 11643-plane-2, until another SS2designation
2771 * (Meaning <ESC>N must precede every 2 byte
2774 * ESC $ + I Indicates the immediate two bytes following SS3
2775 * is a Chinese character as defined in CNS
2776 * 11643-plane-3, until another SS3designation
2778 * (Meaning <ESC>O must precede every 2 byte
2781 * ESC $ + J Indicates the immediate two bytes following SS3
2782 * is a Chinese character as defined in CNS
2783 * 11643-plane-4, until another SS3designation
2785 * (In English: <ESC>O must precede every 2 byte
2788 * ESC $ + K Indicates the immediate two bytes following SS3
2789 * is a Chinese character as defined in CNS
2790 * 11643-plane-5, until another SS3designation
2793 * ESC $ + L Indicates the immediate two bytes following SS3
2794 * is a Chinese character as defined in CNS
2795 * 11643-plane-6, until another SS3designation
2798 * ESC $ + M Indicates the immediate two bytes following SS3
2799 * is a Chinese character as defined in CNS
2800 * 11643-plane-7, until another SS3designation
2803 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2804 * has its own designation information before any Chinese characters
2809 /* The following are defined this way to make the strings truly read-only */
/*
 * Four-byte designation sequences for ISO-2022-CN, spelled as hex escapes:
 * 0x1B 0x24 followed by 0x29, 0x2A or 0x2B and one final byte.
 *   0x29 + 0x41 / 0x45 / 0x47 : GB 2312-80, ISO-IR-165, CNS 11643 plane 1
 *   0x2A + 0x48               : CNS 11643 plane 2
 *   0x2B + 0x49 .. 0x4D       : CNS 11643 planes 3 to 7
 * Each array also holds the terminating NUL added by the string literal.
 */
2810 static const char GB_2312_80_STR
[] = "\x1B\x24\x29\x41";
2811 static const char ISO_IR_165_STR
[] = "\x1B\x24\x29\x45";
2812 static const char CNS_11643_1992_Plane_1_STR
[] = "\x1B\x24\x29\x47";
2813 static const char CNS_11643_1992_Plane_2_STR
[] = "\x1B\x24\x2A\x48";
2814 static const char CNS_11643_1992_Plane_3_STR
[] = "\x1B\x24\x2B\x49";
2815 static const char CNS_11643_1992_Plane_4_STR
[] = "\x1B\x24\x2B\x4A";
2816 static const char CNS_11643_1992_Plane_5_STR
[] = "\x1B\x24\x2B\x4B";
2817 static const char CNS_11643_1992_Plane_6_STR
[] = "\x1B\x24\x2B\x4C";
2818 static const char CNS_11643_1992_Plane_7_STR
[] = "\x1B\x24\x2B\x4D";
2820 /********************** ISO2022-CN Data **************************/
/*
 * Table of the byte sequences used by the ISO-2022-CN converter, declared
 * with 10 entries. Entry 0 is SHIFT_IN_STR (ASCII); the visible tail holds
 * the designators for CNS 11643 planes 1 to 7.
 * NOTE(review): two entries between SHIFT_IN_STR and the CNS plane 1 entry
 * are missing from this listing (presumably the GB 2312-80 and ISO-IR-165
 * designators) - confirm against the complete file.
 */
2821 static const char* const escSeqCharsCN
[10] ={
2822 SHIFT_IN_STR
, /* ASCII */
2825 CNS_11643_1992_Plane_1_STR
,
2826 CNS_11643_1992_Plane_2_STR
,
2827 CNS_11643_1992_Plane_3_STR
,
2828 CNS_11643_1992_Plane_4_STR
,
2829 CNS_11643_1992_Plane_5_STR
,
2830 CNS_11643_1992_Plane_6_STR
,
2831 CNS_11643_1992_Plane_7_STR
/*
 * Converts UTF-16 input (args->source) to ISO-2022-CN bytes (args->target)
 * and records source offsets. State is kept in converterData->fromU2022State.
 *
 * - Surrogates: a lead followed by a trail is combined with
 *   UTF16_GET_PAIR_VALUE(); unmatched surrogates set U_ILLEGAL_CHAR_FOUND and
 *   are stored in cnv->fromUChar32.
 * - Code points <=0x7f: SO/SI/ESC (IS_2022_CONTROL) are rejected as illegal.
 *   Otherwise the byte is emitted directly when g==0, or preceded by UCNV_SI
 *   (and g reset to 0) when not. CR and LF clear the whole ISO2022State.
 * - Other code points: a list of candidate charsets (choices[]) is built,
 *   starting with the current SO/G1 charset cs[1] (GB2312_1 if none is
 *   designated). version==0 alternates between GB2312_1 and CNS_11643_1; the
 *   other branch (commented ISO-2022-CN-EXT) also tries ISO_IR_165. Each
 *   candidate is queried with MBCS_FROM_UCHAR32_ISO2022(); a roundtrip result
 *   ends the search, a fallback result is kept only if none was found
 *   before, and useFallback is turned off after the first hit.
 * - For CNS 11643 the plane is derived from bits 16 and up of the 3-byte
 *   result.
 * - Output per character: the 4-byte designation from escSeqCharsCN when the
 *   charset for g changes, then a shift (UCNV_SO as locking shift with g=1,
 *   or ESC 0x4e / ESC 0x4f), then the two bytes of targetValue.
 * - No mapping in any candidate: U_INVALID_CHAR_FOUND, code point saved in
 *   cnv->fromUChar32.
 * - At the end of flushed input with g!=0, g is reset to 0 (switch to ASCII).
 *
 * NOTE(review): this listing omits source lines (case labels, else branches,
 * several call names and arguments); verify against the complete file.
 */
2835 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2836 UConverter
*cnv
= args
->converter
;
2837 UConverterDataISO2022
*converterData
;
2838 ISO2022State
*pFromU2022State
;
2839 uint8_t *target
= (uint8_t *) args
->target
;
2840 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
2841 const UChar
* source
= args
->source
;
2842 const UChar
* sourceLimit
= args
->sourceLimit
;
2843 int32_t* offsets
= args
->offsets
;
2848 int32_t choiceCount
;
2849 uint32_t targetValue
= 0;
2852 /* set up the state */
2853 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
2854 pFromU2022State
= &converterData
->fromU2022State
;
2858 /* check if the last codepoint of previous buffer was a lead surrogate*/
2859 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
2863 while( source
< sourceLimit
){
2864 if(target
< targetLimit
){
2866 sourceChar
= *(source
++);
2867 /*check if the char is a First surrogate*/
2868 if(UTF_IS_SURROGATE(sourceChar
)) {
2869 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2871 /*look ahead to find the trail surrogate*/
2872 if(source
< sourceLimit
) {
2873 /* test the following code unit */
2874 UChar trail
=(UChar
) *source
;
2875 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2877 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2878 cnv
->fromUChar32
=0x00;
2879 /* convert this supplementary code point */
2880 /* exit this condition tree */
2882 /* this is an unmatched lead code unit (1st surrogate) */
2883 /* callback(illegal) */
2884 *err
=U_ILLEGAL_CHAR_FOUND
;
2885 cnv
->fromUChar32
=sourceChar
;
2890 cnv
->fromUChar32
=sourceChar
;
2894 /* this is an unmatched trail code unit (2nd surrogate) */
2895 /* callback(illegal) */
2896 *err
=U_ILLEGAL_CHAR_FOUND
;
2897 cnv
->fromUChar32
=sourceChar
;
2902 /* do the conversion */
2903 if(sourceChar
<= 0x007f ){
2904 /* do not convert SO/SI/ESC */
2905 if(IS_2022_CONTROL(sourceChar
)) {
2906 /* callback(illegal) */
2907 *err
=U_ILLEGAL_CHAR_FOUND
;
2908 cnv
->fromUChar32
=sourceChar
;
2913 if(pFromU2022State
->g
== 0) {
2914 buffer
[0] = (char)sourceChar
;
2917 buffer
[0] = UCNV_SI
;
2918 buffer
[1] = (char)sourceChar
;
2920 pFromU2022State
->g
= 0;
2923 if(sourceChar
== CR
|| sourceChar
== LF
) {
2924 /* reset the state at the end of a line */
2925 uprv_memset(pFromU2022State
, 0, sizeof(ISO2022State
));
2930 /* convert U+0080..U+10ffff */
2934 if(choiceCount
== 0) {
2935 /* try the current SO/G1 converter first */
2936 choices
[0] = pFromU2022State
->cs
[1];
2938 /* default to GB2312_1 if none is designated yet */
2939 if(choices
[0] == 0) {
2940 choices
[0] = GB2312_1
;
2943 if(converterData
->version
== 0) {
2946 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2947 if(choices
[0] == GB2312_1
) {
2948 choices
[1] = (int8_t)CNS_11643_1
;
2950 choices
[1] = (int8_t)GB2312_1
;
2955 /* ISO-2022-CN-EXT */
2957 /* try one of the other converters */
2958 switch(choices
[0]) {
2960 choices
[1] = (int8_t)CNS_11643_1
;
2961 choices
[2] = (int8_t)ISO_IR_165
;
2964 choices
[1] = (int8_t)GB2312_1
;
2965 choices
[2] = (int8_t)CNS_11643_1
;
2967 default: /* CNS_11643_x */
2968 choices
[1] = (int8_t)GB2312_1
;
2969 choices
[2] = (int8_t)ISO_IR_165
;
2979 * len==0: no mapping found yet
2980 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2981 * len>0: found a roundtrip result, done
2985 * We will turn off useFallback after finding a fallback,
2986 * but we still get fallbacks from PUA code points as usual.
2987 * Therefore, we will also need to check that we don't overwrite
2988 * an early fallback with a later one.
2990 useFallback
= cnv
->useFallback
;
2992 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
2993 int8_t cs0
= choices
[i
];
2997 if(cs0
>= CNS_11643_0
) {
2998 len2
= MBCS_FROM_UCHAR32_ISO2022(
2999 converterData
->myConverterArray
[CNS_11643
],
3004 if(len2
== 3 || (len2
== -3 && len
== 0)) {
3005 targetValue
= value
;
3006 cs
= (int8_t)(CNS_11643_0
+ (value
>> 16) - 0x80);
3011 useFallback
= FALSE
;
3013 if(cs
== CNS_11643_1
) {
3015 } else if(cs
== CNS_11643_2
) {
3017 } else /* plane 3..7 */ if(converterData
->version
== 1) {
3020 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3025 /* GB2312_1 or ISO-IR-165 */
3026 len2
= MBCS_FROM_UCHAR32_ISO2022(
3027 converterData
->myConverterArray
[cs0
],
3032 if(len2
== 2 || (len2
== -2 && len
== 0)) {
3033 targetValue
= value
;
3037 useFallback
= FALSE
;
3044 len
= 0; /* count output bytes; it must have been abs(len) == 2 */
3046 /* write the designation sequence if necessary */
3047 if(cs
!= pFromU2022State
->cs
[g
]) {
3048 if(cs
< CNS_11643
) {
3049 uprv_memcpy(buffer
, escSeqCharsCN
[cs
], 4);
3051 uprv_memcpy(buffer
, escSeqCharsCN
[CNS_11643
+ (cs
- CNS_11643_1
)], 4);
3054 pFromU2022State
->cs
[g
] = cs
;
3056 /* changing the SO/G1 charset invalidates the choices[] */
3061 /* write the shift sequence if necessary */
3062 if(g
!= pFromU2022State
->g
) {
3065 buffer
[len
++] = UCNV_SO
;
3067 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3068 pFromU2022State
->g
= 1;
3071 buffer
[len
++] = 0x1b;
3072 buffer
[len
++] = 0x4e;
3074 default: /* case 3 */
3075 buffer
[len
++] = 0x1b;
3076 buffer
[len
++] = 0x4f;
3081 /* write the two output bytes */
3082 buffer
[len
++] = (char)(targetValue
>> 8);
3083 buffer
[len
++] = (char)targetValue
;
3085 /* if we cannot find the character after checking all codepages
3086 * then this is an error
3088 *err
= U_INVALID_CHAR_FOUND
;
3089 cnv
->fromUChar32
=sourceChar
;
3094 /* output len>0 bytes in buffer[] */
3096 *target
++ = buffer
[0];
3098 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
3100 } else if(len
== 2 && (target
+ 2) <= targetLimit
) {
3101 *target
++ = buffer
[0];
3102 *target
++ = buffer
[1];
3104 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
3105 *offsets
++ = sourceIndex
;
3106 *offsets
++ = sourceIndex
;
3112 &target
, (const char *)targetLimit
,
3113 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
3115 if(U_FAILURE(*err
)) {
3119 } /* end if(myTargetIndex<myTargetLength) */
3121 *err
=U_BUFFER_OVERFLOW_ERROR
;
3125 }/* end while(mySourceIndex<mySourceLength) */
3128 * the end of the input stream and detection of truncated input
3129 * are handled by the framework, but for ISO-2022-CN conversion
3130 * we need to be in ASCII mode at the very end
3135 * end of input and no truncated input
3137 if( U_SUCCESS(*err
) &&
3138 pFromU2022State
->g
!=0 &&
3139 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
3141 int32_t sourceIndex
;
3143 /* we are switching to ASCII */
3144 pFromU2022State
->g
=0;
3146 /* get the source index of the last input character */
3148 * TODO this would be simpler and more reliable if we used a pair
3149 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3150 * so that we could simply use the prevSourceIndex here;
3151 * this code gives an incorrect result for the rare case of an unmatched
3152 * trail surrogate that is alone in the last buffer of the text stream
3154 sourceIndex
=(int32_t)(source
-args
->source
);
3157 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
3158 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
3169 &target
, (const char *)targetLimit
,
3170 &offsets
, sourceIndex
,
3174 /*save the state and return */
3175 args
->source
= source
;
3176 args
->target
= (char*)target
;
/*
 * Converts ISO-2022-CN bytes (args->source) to UTF-16 (args->target) and
 * records source offsets. State is kept in myData->toU2022State.
 *
 * - A single byte saved in converter->toUBytes[0] (toULength==1) is picked
 *   up again as the lead byte of a partial double-byte character.
 * - The byte read is dispatched through a switch (case labels are not part
 *   of this listing). The visible branches:
 *     * an empty segment (isEmptySegment) is reported as
 *       U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR;
 *     * a new segment is started only if cs[1] is designated, otherwise the
 *       byte is treated as illegal ("SO before a matching designator");
 *     * escape sequences go to changeState_2022(..., ISO_2022_CN, err); a
 *       designator arriving while the segment is still empty is also
 *       reported as U_ILLEGAL_ESCAPE_SEQUENCE;
 *     * one branch clears the whole ISO2022State with uprv_memset
 *       (presumably for CR/LF - TODO confirm).
 * - Ordinary bytes with g!=0: a lead/trail pair with both bytes in
 *   0x21..0x7e is converted via ucnv_MBCSSimpleGetNextUChar(). For CNS 11643
 *   states (tempState >= CNS_11643_0) the buffer is prefixed with the plane
 *   byte 0x80+(tempState-CNS_11643_0). With g>=2 (single shift) g returns to
 *   prevG. A lead byte at the end of the input is saved in toUBytes.
 * - Bytes <=0x7f are passed through as the code point itself.
 * - Results below 0xfffe are written as one UChar; results above
 *   missingCharMarker are written as a surrogate pair, the trail surrogate
 *   going to UCharErrorBuffer if the target is full; everything else goes to
 *   toUnicodeCallback(). A full target sets U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): this listing omits source lines (case labels, else branches,
 * braces); verify against the complete file.
 */
3181 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
3184 const char *mySource
= (char *) args
->source
;
3185 UChar
*myTarget
= args
->target
;
3186 const char *mySourceLimit
= args
->sourceLimit
;
3187 uint32_t targetUniChar
= 0x0000;
3188 uint32_t mySourceChar
= 0x0000;
3189 UConverterDataISO2022
* myData
;
3190 ISO2022State
*pToU2022State
;
3192 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
3193 pToU2022State
= &myData
->toU2022State
;
3195 if(myData
->key
!= 0) {
3196 /* continue with a partial escape sequence */
3198 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
3199 /* continue with a partial double-byte character */
3200 mySourceChar
= args
->converter
->toUBytes
[0];
3201 args
->converter
->toULength
= 0;
3202 targetUniChar
= missingCharMarker
;
3206 while(mySource
< mySourceLimit
){
3208 targetUniChar
=missingCharMarker
;
3210 if(myTarget
< args
->targetLimit
){
3212 mySourceChar
= (unsigned char) *mySource
++;
3214 switch(mySourceChar
){
3217 if (myData
->isEmptySegment
) {
3218 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
3219 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
3220 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
3221 args
->converter
->toUBytes
[0] = mySourceChar
;
3222 args
->converter
->toULength
= 1;
3223 args
->target
= myTarget
;
3224 args
->source
= mySource
;
3230 if(pToU2022State
->cs
[1] != 0) {
3232 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
3235 /* illegal to have SO before a matching designator */
3236 myData
->isEmptySegment
= FALSE
; /* Handling a different error, reset this to avoid future spurious errs */
3244 const char * mySourceBefore
= mySource
;
3245 int8_t toULengthBefore
= args
->converter
->toULength
;
3247 changeState_2022(args
->converter
,&(mySource
),
3248 mySourceLimit
, ISO_2022_CN
,err
);
3250 /* After SO there must be at least one character before a designator (designator error handled separately) */
3251 if(myData
->key
==0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
3252 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
3253 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
3254 args
->converter
->toULength
= toULengthBefore
+ (mySource
- mySourceBefore
);
3258 /* invalid or illegal escape sequence */
3259 if(U_FAILURE(*err
)){
3260 args
->target
= myTarget
;
3261 args
->source
= mySource
;
3262 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
3267 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3272 uprv_memset(pToU2022State
, 0, sizeof(ISO2022State
));
3275 /* convert one or two bytes */
3276 myData
->isEmptySegment
= FALSE
;
3277 if(pToU2022State
->g
!= 0) {
3278 if(mySource
< mySourceLimit
) {
3279 UConverterSharedData
*cnv
;
3280 StateEnum tempState
;
3282 int leadIsOk
, trailIsOk
;
3285 trailByte
= (uint8_t)*mySource
;
3287 * Ticket 5691: consistent illegal sequences:
3288 * - We include at least the first byte in the illegal sequence.
3289 * - If any of the non-initial bytes could be the start of a character,
3290 * we stop the illegal sequence before the first one of those.
3292 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3293 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3294 * Otherwise we convert or report the pair of bytes.
3296 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
3297 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
3298 if (leadIsOk
&& trailIsOk
) {
3300 tempState
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
3301 if(tempState
>= CNS_11643_0
) {
3302 cnv
= myData
->myConverterArray
[CNS_11643
];
3303 tempBuf
[0] = (char) (0x80+(tempState
-CNS_11643_0
));
3304 tempBuf
[1] = (char) (mySourceChar
);
3305 tempBuf
[2] = (char) trailByte
;
3309 cnv
= myData
->myConverterArray
[tempState
];
3310 tempBuf
[0] = (char) (mySourceChar
);
3311 tempBuf
[1] = (char) trailByte
;
3314 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(cnv
, tempBuf
, tempBufLen
, FALSE
);
3315 mySourceChar
= (mySourceChar
<< 8) | trailByte
;
3316 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
3317 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3319 /* add another bit so that the code below writes 2 bytes in case of error */
3320 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
3322 if(pToU2022State
->g
>=2) {
3323 /* return from a single-shift state to the previous one */
3324 pToU2022State
->g
=pToU2022State
->prevG
;
3327 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
3328 args
->converter
->toULength
= 1;
3333 if(mySourceChar
<= 0x7f) {
3334 targetUniChar
= (UChar
) mySourceChar
;
3339 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
3341 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3343 *(myTarget
++)=(UChar
)targetUniChar
;
3345 else if(targetUniChar
> missingCharMarker
){
3346 /* disassemble the surrogate pair and write to output*/
3347 targetUniChar
-=0x0010000;
3348 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
3350 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3353 if(myTarget
< args
->targetLimit
){
3354 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
3356 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3360 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
3361 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
3366 /* Call the callback function*/
3367 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
3372 *err
=U_BUFFER_OVERFLOW_ERROR
;
3377 args
->target
= myTarget
;
3378 args
->source
= mySource
;
/*
 * Substitution-character writer for the ISO-2022 converters.
 * Reads the substitution bytes (subChars/subCharLen) of the converter in args,
 * adjusts the fromUnicode shift/designation state depending on locale[0] and version,
 * and passes the bytes collected between buffer and p to ucnv_cbFromUWriteBytes().
 * One branch instead lends the substitution string to the current subconverter
 * and lets ucnv_cbFromUWriteSub() write it.
 *
 * args        - fromUnicode callback arguments; args->converter is swapped temporarily
 *               in the subconverter branch and restored afterwards
 * offsetIndex - NOTE(review): its use is not visible in this listing; confirm against the full file
 * err         - error code; U_BUFFER_OVERFLOW_ERROR gets special handling after the subconverter call
 *
 * NOTE(review): the embedded line numbers show gaps (declarations, case labels,
 * the statements that append bytes through p, closing braces); verify details there.
 */
3382 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
3383 UConverter
*cnv
= args
->converter
;
3384 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
3385 ISO2022State
*pFromU2022State
=&myConverterData
->fromU2022State
;
/* start from the substitution bytes configured on this converter */
3390 subchar
=(char *)cnv
->subChars
;
3391 length
=cnv
->subCharLen
; /* assume length==1 for most variants */
/* the first character of the locale selects the variant-specific handling (case labels not visible here) */
3394 switch(myConverterData
->locale
[0]){
3399 if(pFromU2022State
->g
== 1) {
3400 /* JIS7: switch from G1 to G0 */
3401 pFromU2022State
->g
= 0;
3405 cs
= pFromU2022State
->cs
[0];
3406 if(cs
!= ASCII
&& cs
!= JISX201
) {
3407 /* not in ASCII or JIS X 0201: switch to ASCII */
3408 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
3418 if(pFromU2022State
->g
!= 0) {
3419 /* not in ASCII mode: switch to ASCII */
3420 pFromU2022State
->g
= 0;
/* in this branch fromUnicodeStatus serves as the mode flag: 0 for SBCS, 1 for DBCS */
3426 if(myConverterData
->version
== 0) {
3428 if((UBool
)args
->converter
->fromUnicodeStatus
) {
3429 /* in DBCS mode: switch to SBCS */
3430 args
->converter
->fromUnicodeStatus
= 0;
3434 } else /* length == 2*/ {
3435 if(!(UBool
)args
->converter
->fromUnicodeStatus
) {
3436 /* in SBCS mode: switch to DBCS */
3437 args
->converter
->fromUnicodeStatus
= 1;
3445 /* save the subconverter's substitution string */
3446 uint8_t *currentSubChars
= myConverterData
->currentConverter
->subChars
;
3447 int8_t currentSubCharLen
= myConverterData
->currentConverter
->subCharLen
;
3449 /* set our substitution string into the subconverter */
3450 myConverterData
->currentConverter
->subChars
= (uint8_t *)subchar
;
3451 myConverterData
->currentConverter
->subCharLen
= (int8_t)length
;
3453 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3454 args
->converter
= myConverterData
->currentConverter
;
3455 myConverterData
->currentConverter
->fromUChar32
= cnv
->fromUChar32
;
3456 ucnv_cbFromUWriteSub(args
, 0, err
);
3457 cnv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
3458 args
->converter
= cnv
;
3460 /* restore the subconverter's substitution string */
3461 myConverterData
->currentConverter
->subChars
= currentSubChars
;
3462 myConverterData
->currentConverter
->subCharLen
= currentSubCharLen
;
/*
 * On overflow, take over the bytes that the subconverter buffered:
 * the outer converter receives the charErrorBuffer length and the subconverter is reset to 0.
 * NOTE(review): the call taking the three arguments below is not visible; presumably a memory copy.
 */
3464 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
3465 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
3467 cnv
->charErrorBuffer
,
3468 myConverterData
->currentConverter
->charErrorBuffer
,
3469 myConverterData
->currentConverter
->charErrorBufferLength
);
3471 cnv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
3472 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
/* hand the bytes collected between buffer and p to the write-bytes callback */
3480 ucnv_cbFromUWriteBytes(args
,
3481 buffer
, (int32_t)(p
- buffer
),
/*
 * Members visible in this listing: currentConverter (storage for the cloned subconverter),
 * deadSpace (alignment padding after it) and mydata (copy of the ISO-2022 extra data).
 * _ISO_2022_SafeClone also accesses a member named cnv that is not shown here.
 * NOTE(review): the struct header line and the comment delimiters are missing from this listing.
 */
3486 * Structure for cloning an ISO 2022 converter into a single memory block.
3487 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3488 * and then ucnv_safeClone() of the sub-converter may additionally align
3489 * currentConverter inside the cloneStruct, for which we need the deadSpace
3490 * after currentConverter.
3491 * This is because UAlignedMemory may be larger than the actually
3492 * necessary alignment size for the platform.
3493 * The other cloneStruct fields will not be moved around,
3494 * and are aligned properly with cloneStruct's alignment.
3499 UConverter currentConverter
;
3500 UAlignedMemory deadSpace
;
3501 UConverterDataISO2022 mydata
;
/*
 * Safe-clone hook shared by the ISO-2022 converters.
 * Treats stackBuffer as a struct cloneStruct, copies the UConverterDataISO2022 of cnv
 * into its mydata member, points the clone at that copy, clones the current subconverter
 * into the embedded currentConverter storage, and adds a reference to every
 * non-NULL entry of myConverterArray[].
 *
 * cnv         - converter being cloned (its extraInfo is the source of the copy)
 * pBufferSize - when *pBufferSize == 0 this is a preflight request and the required size
 *               (sizeof(struct cloneStruct)) is stored there
 * Returns a pointer to the cnv member of the clone on the path shown at the end.
 *
 * NOTE(review): the stackBuffer and status parameter lines and the early-return statements
 * are missing from this listing; confirm what is returned for preflight and on failure.
 */
3506 _ISO_2022_SafeClone(
3507 const UConverter
*cnv
,
3509 int32_t *pBufferSize
,
3512 struct cloneStruct
* localClone
;
3513 UConverterDataISO2022
*cnvData
;
3516 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3517 *pBufferSize
= (int32_t)sizeof(struct cloneStruct
);
3521 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3522 localClone
= (struct cloneStruct
*)stackBuffer
;
3524 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* give the clone its own copy of the ISO-2022 state and mark that copy as living inside the clone */
3526 uprv_memcpy(&localClone
->mydata
, cnvData
, sizeof(UConverterDataISO2022
));
3527 localClone
->cnv
.extraInfo
= &localClone
->mydata
; /* set pointer to extra data */
3528 localClone
->cnv
.isExtraLocal
= TRUE
;
3530 /* share the subconverters */
/* the current subconverter is cloned into the storage reserved inside the clone struct */
3532 if(cnvData
->currentConverter
!= NULL
) {
3533 size
= (int32_t)(sizeof(UConverter
) + sizeof(UAlignedMemory
)); /* include size of padding */
3534 localClone
->mydata
.currentConverter
=
3535 ucnv_safeClone(cnvData
->currentConverter
,
3536 &localClone
->currentConverter
,
3538 if(U_FAILURE(*status
)) {
/* the array entries are shared with the original, so each one gets an additional reference */
3543 for(i
=0; i
<UCNV_2022_MAX_CONVERTERS
; ++i
) {
3544 if(cnvData
->myConverterArray
[i
] != NULL
) {
3545 ucnv_incrementRefCount(cnvData
->myConverterArray
[i
]);
3549 return &localClone
->cnv
;
/*
 * Adds to the set behind sa the Unicode code points handled by this ISO-2022 converter.
 * Steps visible in this listing:
 *   1. hardcoded ranges chosen by locale[0] (JIS X 0201 extras, Latin-1 or ASCII,
 *      half-width Katakana, or delegation to the single KR subconverter),
 *   2. the sets of all non-NULL sub-converters in myConverterArray[], each with a filter,
 *   3. removal of SO, SI, ESC and of the C1 controls 0x80..0x9f.
 *
 * cnv        - converter whose extraInfo holds the UConverterDataISO2022 state
 * sa         - set-adder callbacks (add, addRange, remove, removeRange) plus the target set
 * which      - requested set type; UCNV_ROUNDTRIP_AND_FALLBACK_SET also enables the
 *              half-width Katakana range in the JP handling
 * pErrorCode - tested with U_FAILURE() on entry and passed to the sub-converter calls
 *
 * NOTE(review): case labels, else lines, closing braces and the #endif lines are missing
 * from this listing; confirm branch boundaries against the full file.
 */
3553 _ISO_2022_GetUnicodeSet(const UConverter
*cnv
,
3554 const USetAdder
*sa
,
3555 UConverterUnicodeSet which
,
3556 UErrorCode
*pErrorCode
)
3559 UConverterDataISO2022
* cnvData
;
3561 if (U_FAILURE(*pErrorCode
)) {
3564 #ifdef U_ENABLE_GENERIC_ISO_2022
3565 if (cnv
->sharedData
== &_ISO2022Data
) {
3566 /* We use UTF-8 in this case */
3567 sa
->addRange(sa
->set
, 0, 0xd7FF);
3568 sa
->addRange(sa
->set
, 0xE000, 0x10FFFF);
3573 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3575 /* open a set and initialize it with code points that are algorithmically round-tripped */
3576 switch(cnvData
->locale
[0]){
3578 /* include JIS X 0201 which is hardcoded */
3579 sa
->add(sa
->set
, 0xa5);
3580 sa
->add(sa
->set
, 0x203e);
3581 if(jpCharsetMasks
[cnvData
->version
]&CSM(ISO8859_1
)) {
3582 /* include Latin-1 for some variants of JP */
3583 sa
->addRange(sa
->set
, 0, 0xff);
3585 /* include ASCII for JP */
3586 sa
->addRange(sa
->set
, 0, 0x7f);
3588 if(cnvData
->version
==3 || cnvData
->version
==4 || which
==UCNV_ROUNDTRIP_AND_FALLBACK_SET
) {
3590 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3591 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3592 * use half-width Katakana.
3593 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3594 * half-width Katakana via the ESC ( I sequence.
3595 * However, we only emit (fromUnicode) half-width Katakana according to the
3596 * definition of each variant.
3598 * When including fallbacks,
3599 * we need to include half-width Katakana Unicode code points for all JP variants because
3600 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3602 /* include half-width Katakana for JP */
3603 sa
->addRange(sa
->set
, HWKANA_START
, HWKANA_END
);
3608 /* include ASCII for CN */
3609 sa
->addRange(sa
->set
, 0, 0x7f);
3612 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3613 cnvData
->currentConverter
->sharedData
->impl
->getUnicodeSet(
3614 cnvData
->currentConverter
, sa
, which
, pErrorCode
);
3615 /* the loop over myConverterArray[] will simply not find another converter */
3621 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3622 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3623 cnvData
->version
==0 && i
==CNS_11643
3625 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3626 ucnv_MBCSGetUnicodeSetForBytes(
3627 cnvData
->myConverterArray
[i
],
3628 sa
, UCNV_ROUNDTRIP_SET
,
/* add the set of every loaded sub-converter; the filter is picked from the locale, the version and the array index */
3634 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
3635 UConverterSetFilter filter
;
3636 if(cnvData
->myConverterArray
[i
]!=NULL
) {
3637 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3638 cnvData
->version
==0 && i
==CNS_11643
3641 * Version-specific for CN:
3642 * CN version 0 does not map CNS planes 3..7 although
3643 * they are all available in the CNS conversion table;
3644 * CN version 1 (-EXT) does map them all.
3645 * The two versions create different Unicode sets.
3647 filter
=UCNV_SET_FILTER_2022_CN
;
3648 } else if(cnvData
->locale
[0]=='j' && i
==JISX208
) {
3650 * Only add code points that map to Shift-JIS codes
3651 * corresponding to JIS X 0208.
3653 filter
=UCNV_SET_FILTER_SJIS
;
3654 } else if(i
==KSC5601
) {
3656 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3657 * are broader than GR94.
3659 filter
=UCNV_SET_FILTER_GR94DBCS
;
3661 filter
=UCNV_SET_FILTER_NONE
;
3663 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData
->myConverterArray
[i
], sa
, which
, filter
, pErrorCode
);
3668 * ISO 2022 converters must not convert SO/SI/ESC despite what
3669 * sub-converters do by themselves.
3670 * Remove these characters from the set.
3672 sa
->remove(sa
->set
, 0x0e);
3673 sa
->remove(sa
->set
, 0x0f);
3674 sa
->remove(sa
->set
, 0x1b);
3676 /* ISO 2022 converters do not convert C1 controls either */
3677 sa
->removeRange(sa
->set
, 0x80, 0x9f);
/*
 * Function table for the generic ISO-2022 converter.
 * The conversion entries shown are only compiled with U_ENABLE_GENERIC_ISO_2022:
 * the ISO-2022 toUnicode function appears twice and fromUnicode uses the UTF-8 functions.
 * The clone and Unicode-set hooks are the shared _ISO_2022_* functions.
 * NOTE(review): the remaining initializer entries are not visible in this listing.
 */
3680 static const UConverterImpl _ISO2022Impl
={
3690 #ifdef U_ENABLE_GENERIC_ISO_2022
3691 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3692 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3693 ucnv_fromUnicode_UTF8
,
3694 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
3706 _ISO_2022_SafeClone
,
3707 _ISO_2022_GetUnicodeSet
/*
 * Static description of the generic ISO-2022 converter.
 * Visible fields: the structure size, a maximum of 3 output bytes per UChar, and the zeroed reserved array.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3709 static const UConverterStaticData _ISO2022StaticData
={
3710 sizeof(UConverterStaticData
),
3716 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3723 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record of the generic ISO-2022 converter; it is not declared static,
 * and _ISO_2022_GetUnicodeSet compares cnv->sharedData against its address.
 * It refers to _ISO2022StaticData.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3725 const UConverterSharedData _ISO2022Data
={
3726 sizeof(UConverterSharedData
),
3730 &_ISO2022StaticData
,
3736 /*************JP****************/
/*
 * Function table for ISO-2022-JP.
 * The same OFFSETS_LOGIC function is listed twice per direction
 * (presumably for the slots without and with offsets; confirm against UConverterImpl).
 * NOTE(review): the remaining initializer entries are not visible in this listing.
 */
3737 static const UConverterImpl _ISO2022JPImpl
={
3747 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3748 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3749 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3750 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3756 _ISO_2022_SafeClone
,
3757 _ISO_2022_GetUnicodeSet
/*
 * Static description of the ISO-2022-JP converter.
 * Visible fields: the structure size, a maximum of 6 output bytes per UChar, and the zeroed reserved array.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3759 static const UConverterStaticData _ISO2022JPStaticData
={
3760 sizeof(UConverterStaticData
),
3766 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3773 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record of the ISO-2022-JP converter; it refers to _ISO2022JPStaticData.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3775 static const UConverterSharedData _ISO2022JPData
={
3776 sizeof(UConverterSharedData
),
3780 &_ISO2022JPStaticData
,
3786 /************* KR ***************/
/*
 * Function table for ISO-2022-KR.
 * The same OFFSETS_LOGIC function is listed twice per direction
 * (presumably for the slots without and with offsets; confirm against UConverterImpl).
 * NOTE(review): the remaining initializer entries are not visible in this listing.
 */
3787 static const UConverterImpl _ISO2022KRImpl
={
3797 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3798 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3799 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3800 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3806 _ISO_2022_SafeClone
,
3807 _ISO_2022_GetUnicodeSet
/*
 * Static description of the ISO-2022-KR converter.
 * Visible fields: the structure size, a maximum of 3 output bytes per UChar, and the zeroed reserved array.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3809 static const UConverterStaticData _ISO2022KRStaticData
={
3810 sizeof(UConverterStaticData
),
3816 3, /* max 3 bytes per UChar: SO+DBCS */
3823 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record of the ISO-2022-KR converter; it refers to _ISO2022KRStaticData.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3825 static const UConverterSharedData _ISO2022KRData
={
3826 sizeof(UConverterSharedData
),
3830 &_ISO2022KRStaticData
,
3836 /*************** CN ***************/
/*
 * Function table for ISO-2022-CN.
 * The same OFFSETS_LOGIC function is listed twice per direction
 * (presumably for the slots without and with offsets; confirm against UConverterImpl).
 * NOTE(review): the remaining initializer entries are not visible in this listing.
 */
3837 static const UConverterImpl _ISO2022CNImpl
={
3848 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3849 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3850 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3851 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3857 _ISO_2022_SafeClone
,
3858 _ISO_2022_GetUnicodeSet
/*
 * Static description of the ISO-2022-CN converter.
 * Visible fields: the structure size, a maximum of 8 output bytes per UChar, and the zeroed reserved array.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3860 static const UConverterStaticData _ISO2022CNStaticData
={
3861 sizeof(UConverterStaticData
),
3867 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3874 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record of the ISO-2022-CN converter; it refers to _ISO2022CNStaticData.
 * NOTE(review): the other initializer entries are not visible in this listing.
 */
3876 static const UConverterSharedData _ISO2022CNData
={
3877 sizeof(UConverterSharedData
),
3881 &_ISO2022CNStaticData
,
3889 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */