2 **********************************************************************
3 * Copyright (C) 2000-2006,2008 International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
8 * tab size: 8 (not used)
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
29 #include "unicode/utypes.h"
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
/*
 * Number of elements in a true array (not a pointer), as int32_t.
 * The whole expansion is parenthesized so that the cast cannot bind to a
 * neighbouring operator at the use site.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
46 #ifdef U_ENABLE_GENERIC_ISO_2022
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
74 * Markus Scherer 2003-dec-03
78 static const char SHIFT_IN_STR
[] = "\x0F";
79 static const char SHIFT_OUT_STR
[] = "\x0E";
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range test is done on the value converted to uint32_t, so a negative
 * argument is rejected before it can reach the shift (a negative shift count
 * is undefined behavior). The argument may be evaluated twice.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
95 /* for ISO-2022-JP and -CN implementations */
112 HWKANA_7BIT
=8, /* Halfwidth Katakana 7 bit */
115 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
121 * these are used in StateEnum and ISO2022State variables,
122 * but CNS_11643 must be used to index into myConverterArray[]
134 /* is the StateEnum charset value for a DBCS charset? */
/* True when cs lies in the inclusive range JISX208..KSC5601; cs is evaluated twice. */
135 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
/* Charset mask: a value with only bit number cs set; the jpCharsetMasks[] entries are ORs of these. */
137 #define CSM(cs) ((uint16_t)1<<(cs))
140 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
141 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
143 * Note: The converter uses some leniency:
144 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
145 * all versions, not just JIS7 and JIS8.
146 * - ICU does not distinguish between different versions of JIS X 0208.
/*
 * jpCharsetMasks[x] is the set of charsets, as an OR of CSM() bits, for
 * version x of the Japanese variant. The array has exactly 5 entries:
 * entry 1 adds JISX212 to entry 0, and entries 2..4 are identical and further
 * add GB2312, KSC5601, ISO8859_1 and ISO8859_7.
 * Callers index this with the converter version, so the version must be < 5.
 */
148 static const uint16_t jpCharsetMasks
[5]={
149 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
),
150 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
),
151 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
152 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
153 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
)
/*
 * Shift state for one conversion direction: the charset designated to each
 * of G0..G3, the G set currently invoked, and the G set that was active
 * before a single shift (SS2/SS3).
 * NOTE(review): the closing line of this typedef is not visible in this listing.
 */
165 typedef struct ISO2022State
{
166 int8_t cs
[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
167 int8_t g
; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
168 int8_t prevG
; /* g before single shift (SS2 or SS3) */
171 #define UCNV_OPTIONS_VERSION_MASK 0xf
172 #define UCNV_2022_MAX_CONVERTERS 10
/*
 * Per-converter ISO-2022 data; _ISO2022Open() allocates one of these and
 * stores it in UConverter.extraInfo. The members visible here are the array
 * of shared data for the sub-charset converters (indexed by charset
 * constant), the current sub-converter, and one ISO2022State for each
 * conversion direction.
 * NOTE(review): the opening line of the typedef and several members that
 * other functions use (key, version, name, locale, isFirstBuffer) are not
 * visible in this listing.
 */
175 UConverterSharedData
*myConverterArray
[UCNV_2022_MAX_CONVERTERS
];
176 UConverter
*currentConverter
;
177 Cnv2022Type currentType
;
178 ISO2022State toU2022State
, fromU2022State
;
181 #ifdef U_ENABLE_GENERIC_ISO_2022
184 UBool isEmptySegment
;
187 }UConverterDataISO2022
;
190 /* ISO-2022 ----------------------------------------------------------------- */
192 /*Forward declaration */
194 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
* args
,
197 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
,
200 #define ESC_2022 0x1B /*ESC*/
/*
 * Classification of an escape-sequence prefix as looked up in the state
 * tables; this is the return type of getKey_2022().
 * NOTE(review): the opening "typedef enum" line is not visible in this listing.
 */
204 INVALID_2022
= -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
205 VALID_NON_TERMINAL_2022
= 0, /*so far corresponds to a valid iso 2022 escape sequence*/
206 VALID_TERMINAL_2022
= 1, /*corresponds to a valid iso 2022 escape sequence*/
207 VALID_MAYBE_TERMINAL_2022
= 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
208 } UCNV_TableStates_2022
;
211 * The way these state transition arrays work is:
212 * ex : ESC$B is the sequence for JISX208
213 * a) First Iteration: char is ESC
214 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
215 * int x = normalize_esq_chars_2022[27] which is equal to 1
216 * ii) Search for this value in escSeqStateTable_Key_2022[]
217 * value of x is stored at escSeqStateTable_Key_2022[0]
218 * iii) Save this index as offset
219 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
220 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
221 * b) Switch on this state and continue to next char
222 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
223 * which is normalize_esq_chars_2022[36] == 4
224 * ii) x is currently 1(from above)
225 * x<<=5 -- x is now 32
226 * x+=normalize_esq_chars_2022[36]
228 * iii) Search for this value in escSeqStateTable_Key_2022[]
229 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
230 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
231 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
232 * c) Switch on this state and continue to next char
233 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
234 * ii) x is currently 36 (from above)
235 * x<<=5 -- x is now 1152
236 * x+=normalize_esq_chars_2022[66]
238 * iii) Search for this value in escSeqStateTable_Key_2022[]
239 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[21]
241 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
242 * v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208
246 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Byte-to-code table for escape-sequence matching, indexed by the byte value
 * (10 entries per row, so row r column k is byte 10*r+k).
 * A nonzero entry is the small code (all visible codes are below 32) that
 * getKey_2022() adds into the running key after shifting the key left by
 * 5 bits; getKey_2022() treats 0 as "not a valid character anywhere in an
 * escape sequence".
 * NOTE(review): only 250 of the 256 entries and no closing line are visible
 * in this listing.
 */
247 static const int8_t normalize_esq_chars_2022
[256] = {
248 /* 0 1 2 3 4 5 6 7 8 9 */
250 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
251 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
252 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
253 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
254 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
255 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
256 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
257 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
258 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
259 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
260 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
261 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
262 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
263 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
264 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
265 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
266 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
267 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
268 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
269 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
270 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
271 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
272 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
273 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
274 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
278 #ifdef U_ENABLE_GENERIC_ISO_2022
280 * When the generic ISO-2022 converter is completely removed, not just disabled
281 * per #ifdef, then the following state table and the associated tables that are
282 * dimensioned with MAX_STATES_2022 should be trimmed.
284 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
285 * the associated escape sequences starting with ESC ( B should be removed.
286 * This includes the ones with key values 1097 and all of the ones above 1000000.
288 * For the latter, the tables can simply be truncated.
289 * For the former, since the tables must be kept parallel, it is probably best
290 * to simply duplicate an adjacent table cell, parallel in all tables.
292 * It may make sense to restructure the tables, especially by using small search
293 * tables for the variants instead of indexing them parallel to the table here.
/*
 * Keys of all recognized escape-sequence prefixes, in ascending order
 * (74 entries = MAX_STATES_2022). getKey_2022() binary-searches this table;
 * the index it finds is then used for the parallel tables
 * (escSeqStateTable_Value_2022, escSeqStateTable_Result_2022,
 * nextStateToUnicodeJP, nextStateToUnicodeCN), which must stay aligned with
 * this one.
 */
297 #define MAX_STATES_2022 74
298 static const int32_t escSeqStateTable_Key_2022
[MAX_STATES_2022
] = {
299 /* 0 1 2 3 4 5 6 7 8 9 */
301 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
302 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
303 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
304 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
305 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
306 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
307 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
308 ,35947631 ,35947635 ,35947636 ,35947638
311 #ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * Converter name for each index of escSeqStateTable_Key_2022, or NULL where
 * the table names no converter. In the generic ISO-2022 build,
 * changeState_2022() passes the name to ucnv_open() and reports
 * U_UNSUPPORTED_ESCAPE_SEQUENCE for a NULL entry.
 */
313 static const char* const escSeqStateTable_Result_2022
[MAX_STATES_2022
] = {
314 /* 0 1 2 3 4 5 6 7 8 9 */
316 NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,"latin1" ,"latin1"
317 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
318 ,"latin1" ,NULL
,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL
,NULL
,NULL
,NULL
,"UTF8"
319 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL
,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
320 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
321 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
322 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
323 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
/*
 * State classification for each index of escSeqStateTable_Key_2022:
 * whether the prefix with that key is a complete escape sequence, a proper
 * prefix of one, or (index 10, key 1097, i.e. ESC ( B) possibly both.
 * getKey_2022() returns the entry at the index it found.
 */
328 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022
[MAX_STATES_2022
] = {
329 /* 0 1 2 3 4 5 6 7 8 9 */
330 VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
331 ,VALID_MAYBE_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
332 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
333 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
334 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
335 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
336 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
337 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
341 /* Type def for refactoring changeState_2022 code*/
343 #ifdef U_ENABLE_GENERIC_ISO_2022
351 /*********** ISO 2022 Converter Protos ***********/
353 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
);
356 _ISO2022Close(UConverter
*converter
);
359 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
);
362 _ISO2022getName(const UConverter
* cnv
);
365 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
);
368 _ISO_2022_SafeClone(const UConverter
*cnv
, void *stackBuffer
, int32_t *pBufferSize
, UErrorCode
*status
);
370 #ifdef U_ENABLE_GENERIC_ISO_2022
372 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
, UErrorCode
* err
);
375 /*const UConverterSharedData _ISO2022Data;*/
376 static const UConverterSharedData _ISO2022JPData
;
377 static const UConverterSharedData _ISO2022KRData
;
378 static const UConverterSharedData _ISO2022CNData
;
380 /*************** Converter implementations ******************/
382 /* The purpose of this function is to get around gcc compiler warnings. */
/*
 * Wrapper around ucnv_fromUWriteBytes() for callers whose target pointer is
 * a uint8_t*: *target is converted to a char* for the call, and the pointer
 * as advanced by ucnv_fromUWriteBytes() is stored back into *target.
 * All other arguments are passed through unchanged.
 * NOTE(review): the declarations of the offsets and sourceIndex parameters
 * are not visible in this listing.
 */
384 fromUWriteUInt8(UConverter
*cnv
,
385 const char *bytes
, int32_t length
,
386 uint8_t **target
, const char *targetLimit
,
389 UErrorCode
*pErrorCode
)
391 char *targetChars
= (char *)*target
;
392 ucnv_fromUWriteBytes(cnv
, bytes
, length
, &targetChars
, targetLimit
,
393 offsets
, sourceIndex
, pErrorCode
);
394 *target
= (uint8_t*)targetChars
;
/*
 * ISO-2022-KR, to-Unicode direction: for version 1 only, zeroes the
 * toUnicodeStatus, mode and toULength fields of the current sub-converter
 * (myConverterData->currentConverter). The `converter` argument is not used
 * in the visible lines.
 */
399 setInitialStateToUnicodeKR(UConverter
* converter
, UConverterDataISO2022
*myConverterData
){
400 if(myConverterData
->version
== 1) {
401 UConverter
*cnv
= myConverterData
->currentConverter
;
403 cnv
->toUnicodeStatus
=0; /* offset */
404 cnv
->mode
=0; /* state */
405 cnv
->toULength
=0; /* byteIndex */
/*
 * ISO-2022-KR, from-Unicode direction: when the converter's charErrorBuffer
 * is empty, queues the 4-byte designator 1B 24 29 43 (ESC $ ) C) there; for
 * version 1 it also sets fromUnicodeStatus of the current sub-converter to 1.
 * NOTE(review): at least one statement of the version-1 branch is not
 * visible in this listing.
 */
410 setInitialStateFromUnicodeKR(UConverter
* converter
,UConverterDataISO2022
*myConverterData
){
411 /* in ISO-2022-KR the designator sequence appears only once
412 * in a file so we append it only once
414 if( converter
->charErrorBufferLength
==0){
416 converter
->charErrorBufferLength
= 4;
417 converter
->charErrorBuffer
[0] = 0x1b;
418 converter
->charErrorBuffer
[1] = 0x24;
419 converter
->charErrorBuffer
[2] = 0x29;
420 converter
->charErrorBuffer
[3] = 0x43;
422 if(myConverterData
->version
== 1) {
423 UConverter
*cnv
= myConverterData
->currentConverter
;
426 cnv
->fromUnicodeStatus
=1; /* prevLength */
431 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
){
433 char myLocale
[6]={' ',' ',' ',' ',' ',' '};
435 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataISO2022
));
436 if(cnv
->extraInfo
!= NULL
) {
437 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
440 uprv_memset(myConverterData
, 0, sizeof(UConverterDataISO2022
));
441 myConverterData
->currentType
= ASCII1
;
442 cnv
->fromUnicodeStatus
=FALSE
;
444 uprv_strncpy(myLocale
, locale
, sizeof(myLocale
));
446 version
= options
& UCNV_OPTIONS_VERSION_MASK
;
447 myConverterData
->version
= version
;
448 if(myLocale
[0]=='j' && (myLocale
[1]=='a'|| myLocale
[1]=='p') &&
449 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
452 /* open the required converters and cache them */
453 if(jpCharsetMasks
[version
]&CSM(ISO8859_7
)) {
454 myConverterData
->myConverterArray
[ISO8859_7
]= ucnv_loadSharedData("ISO8859_7", NULL
, errorCode
);
456 myConverterData
->myConverterArray
[JISX201
] = ucnv_loadSharedData("JISX0201", NULL
, errorCode
);
457 myConverterData
->myConverterArray
[JISX208
] = ucnv_loadSharedData("jisx-208", NULL
, errorCode
);
458 if(jpCharsetMasks
[version
]&CSM(JISX212
)) {
459 myConverterData
->myConverterArray
[JISX212
] = ucnv_loadSharedData("jisx-212", NULL
, errorCode
);
461 if(jpCharsetMasks
[version
]&CSM(GB2312
)) {
462 myConverterData
->myConverterArray
[GB2312
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
); /* gb_2312_80-1 */
464 if(jpCharsetMasks
[version
]&CSM(KSC5601
)) {
465 myConverterData
->myConverterArray
[KSC5601
] = ucnv_loadSharedData("ksc_5601", NULL
, errorCode
);
468 /* set the function pointers to appropriate funtions */
469 cnv
->sharedData
=(UConverterSharedData
*)(&_ISO2022JPData
);
470 uprv_strcpy(myConverterData
->locale
,"ja");
472 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ja,version=");
473 len
= uprv_strlen(myConverterData
->name
);
474 myConverterData
->name
[len
]=(char)(myConverterData
->version
+(int)'0');
475 myConverterData
->name
[len
+1]='\0';
477 else if(myLocale
[0]=='k' && (myLocale
[1]=='o'|| myLocale
[1]=='r') &&
478 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
481 myConverterData
->currentConverter
=
482 ucnv_open("icu-internal-25546",errorCode
);
484 if (U_FAILURE(*errorCode
)) {
489 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=1");
490 uprv_memcpy(cnv
->subChars
, myConverterData
->currentConverter
->subChars
, 4);
491 cnv
->subCharLen
= myConverterData
->currentConverter
->subCharLen
;
493 myConverterData
->currentConverter
=ucnv_open("ibm-949",errorCode
);
495 if (U_FAILURE(*errorCode
)) {
500 myConverterData
->version
= 0;
501 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=0");
504 /* initialize the state variables */
505 setInitialStateToUnicodeKR(cnv
, myConverterData
);
506 setInitialStateFromUnicodeKR(cnv
, myConverterData
);
508 /* set the function pointers to appropriate funtions */
509 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022KRData
;
510 uprv_strcpy(myConverterData
->locale
,"ko");
512 else if(((myLocale
[0]=='z' && myLocale
[1]=='h') || (myLocale
[0]=='c'&& myLocale
[1]=='n'))&&
513 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
516 /* open the required converters and cache them */
517 myConverterData
->myConverterArray
[GB2312_1
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
);
519 myConverterData
->myConverterArray
[ISO_IR_165
] = ucnv_loadSharedData("iso-ir-165", NULL
, errorCode
);
521 myConverterData
->myConverterArray
[CNS_11643
] = ucnv_loadSharedData("cns-11643-1992", NULL
, errorCode
);
524 /* set the function pointers to appropriate funtions */
525 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022CNData
;
526 uprv_strcpy(myConverterData
->locale
,"cn");
529 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=1");
531 myConverterData
->version
= 0;
532 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=0");
536 #ifdef U_ENABLE_GENERIC_ISO_2022
537 myConverterData
->isFirstBuffer
= TRUE
;
539 /* append the UTF-8 escape sequence */
540 cnv
->charErrorBufferLength
= 3;
541 cnv
->charErrorBuffer
[0] = 0x1b;
542 cnv
->charErrorBuffer
[1] = 0x25;
543 cnv
->charErrorBuffer
[2] = 0x42;
545 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022Data
;
546 /* initialize the state variables */
547 uprv_strcpy(myConverterData
->name
,"ISO_2022");
549 *errorCode
= U_UNSUPPORTED_ERROR
;
554 cnv
->maxBytesPerUChar
=cnv
->sharedData
->staticData
->maxBytesPerChar
;
556 if(U_FAILURE(*errorCode
)) {
560 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
566 _ISO2022Close(UConverter
*converter
) {
567 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
568 UConverterSharedData
**array
= myData
->myConverterArray
;
571 if (converter
->extraInfo
!= NULL
) {
572 /*close the array of converter pointers and free the memory*/
573 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
575 ucnv_unloadSharedDataIfReady(array
[i
]);
579 ucnv_close(myData
->currentConverter
);
581 if(!converter
->isExtraLocal
){
582 uprv_free (converter
->extraInfo
);
583 converter
->extraInfo
= NULL
;
589 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
) {
590 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
591 if(choice
<=UCNV_RESET_TO_UNICODE
) {
592 uprv_memset(&myConverterData
->toU2022State
, 0, sizeof(ISO2022State
));
593 myConverterData
->key
= 0;
594 myConverterData
->isEmptySegment
= FALSE
;
596 if(choice
!=UCNV_RESET_TO_UNICODE
) {
597 uprv_memset(&myConverterData
->fromU2022State
, 0, sizeof(ISO2022State
));
599 #ifdef U_ENABLE_GENERIC_ISO_2022
600 if(myConverterData
->locale
[0] == 0){
601 if(choice
<=UCNV_RESET_TO_UNICODE
) {
602 myConverterData
->isFirstBuffer
= TRUE
;
603 myConverterData
->key
= 0;
604 if (converter
->mode
== UCNV_SO
){
605 ucnv_close (myConverterData
->currentConverter
);
606 myConverterData
->currentConverter
=NULL
;
608 converter
->mode
= UCNV_SI
;
610 if(choice
!=UCNV_RESET_TO_UNICODE
) {
611 /* re-append UTF-8 escape sequence */
612 converter
->charErrorBufferLength
= 3;
613 converter
->charErrorBuffer
[0] = 0x1b;
614 converter
->charErrorBuffer
[1] = 0x28;
615 converter
->charErrorBuffer
[2] = 0x42;
621 /* reset the state variables */
622 if(myConverterData
->locale
[0] == 'k'){
623 if(choice
<=UCNV_RESET_TO_UNICODE
) {
624 setInitialStateToUnicodeKR(converter
, myConverterData
);
626 if(choice
!=UCNV_RESET_TO_UNICODE
) {
627 setInitialStateFromUnicodeKR(converter
, myConverterData
);
/*
 * Name accessor for an ISO-2022 converter: views cnv->extraInfo as the
 * converter's UConverterDataISO2022.
 * NOTE(review): the return statements (presumably returning the name field
 * that _ISO2022Open() fills in) are not visible in this listing - confirm.
 */
634 _ISO2022getName(const UConverter
* cnv
){
636 UConverterDataISO2022
* myData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
643 /*************** to unicode *******************/
644 /****************************************************************************
645 * Recognized escape sequences are
/*
 * ISO-2022-JP: for each index of escSeqStateTable_Key_2022, the StateEnum
 * value (charset or SS2_STATE) selected by that escape sequence, or
 * INVALID_STATE where this table assigns none.
 * changeState_2022() indexes this table with the offset found by
 * getKey_2022(), so it must stay parallel to the key table.
 */
657 static const StateEnum nextStateToUnicodeJP
[MAX_STATES_2022
]= {
658 /* 0 1 2 3 4 5 6 7 8 9 */
659 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
660 ,ASCII
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,JISX201
,HWKANA_7BIT
,JISX201
,INVALID_STATE
661 ,INVALID_STATE
,INVALID_STATE
,JISX208
,GB2312
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
662 ,ISO8859_1
,ISO8859_7
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,KSC5601
,JISX212
,INVALID_STATE
663 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
664 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
665 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
666 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
669 /*************** to unicode *******************/
/*
 * ISO-2022-CN: for each index of escSeqStateTable_Key_2022, the StateEnum
 * value (charset, SS2_STATE or SS3_STATE) selected by that escape sequence,
 * or INVALID_STATE where this table assigns none.
 * changeState_2022() indexes this table with the offset found by
 * getKey_2022(), so it must stay parallel to the key table.
 */
670 static const StateEnum nextStateToUnicodeCN
[MAX_STATES_2022
]= {
671 /* 0 1 2 3 4 5 6 7 8 9 */
672 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,SS3_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
673 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
674 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
675 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
676 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,GB2312_1
,INVALID_STATE
,ISO_IR_165
677 ,CNS_11643_1
,CNS_11643_2
,CNS_11643_3
,CNS_11643_4
,CNS_11643_5
,CNS_11643_6
,CNS_11643_7
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
678 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
679 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
/*
 * Advances the escape-sequence key by one input byte and looks the new key up.
 *
 * The byte c is mapped through normalize_esq_chars_2022[]; the candidate key
 * is (*key << 5) plus that code. escSeqStateTable_Key_2022[] is then
 * binary-searched for the candidate, and on a hit the matching entry of
 * escSeqStateTable_Value_2022[] is returned.
 *
 * @param c       next byte of the (possible) escape sequence
 * @param key     in: key accumulated so far
 * @param offset  index into the state tables
 * @return        state classification of the sequence seen so far
 *
 * NOTE(review): the stores to *key and *offset and the not-found / invalid
 * character return paths are not visible in this listing.
 */
683 static UCNV_TableStates_2022
684 getKey_2022(char c
,int32_t* key
,int32_t* offset
){
687 int32_t hi
= MAX_STATES_2022
;
690 togo
= normalize_esq_chars_2022
[(uint8_t)c
];
692 /* not a valid character anywhere in an escape sequence */
697 togo
= (*key
<< 5) + togo
;
699 while (hi
!= low
) /*binary search*/{
701 register int32_t mid
= (hi
+low
) >> 1; /*Finds median*/
706 if (escSeqStateTable_Key_2022
[mid
] > togo
){
709 else if (escSeqStateTable_Key_2022
[mid
] < togo
){
712 else /*we found it*/{
715 return escSeqStateTable_Value_2022
[mid
];
726 /*runs through a state machine to determine the escape sequence - codepage correspondence
/*
 * Escape-sequence state machine for the to-Unicode direction.
 *
 * While input remains before sourceLimit, each byte is appended to
 * _this->toUBytes[] and fed to getKey_2022(), starting from the key saved in
 * the converter data (myData2022->key); the resulting key is saved back
 * afterwards so that an incomplete sequence can be continued later.
 *
 * Outcome by final state:
 *  - VALID_NON_TERMINAL_2022: sequence incomplete (indicated by key != 0).
 *  - INVALID_2022: *err = U_ILLEGAL_ESCAPE_SEQUENCE.
 *  - VALID_TERMINAL_2022: the table offset is translated per variant:
 *      generic build: the converter named by escSeqStateTable_Result_2022[]
 *        replaces currentConverter (U_UNSUPPORTED_ESCAPE_SEQUENCE if NULL);
 *      JP: nextStateToUnicodeJP[offset] is checked against
 *        jpCharsetMasks[version] and stored in toU2022State.cs[0] or cs[2],
 *        or SS2 is applied (illegal if no G2 charset is designated);
 *      CN: nextStateToUnicodeCN[offset] is stored in toU2022State.cs[1..3],
 *        or SS2/SS3 is applied (illegal if G2/G3 is not designated); some
 *        designations are unsupported in version 0.
 *    VALID_MAYBE_TERMINAL_2022 (ESC ( B) is treated as terminal except for
 *    the generic ISO_2022 variant.
 *
 * On success _this->toULength is reset to 0; otherwise the bytes of the
 * sequence are left in toUBytes[].
 *
 * NOTE(review): several parameters, the switch/case skeleton and the lines
 * that read from *source are not visible in this listing.
 */
729 changeState_2022(UConverter
* _this
,
731 const char* sourceLimit
,
734 UCNV_TableStates_2022 value
;
735 UConverterDataISO2022
* myData2022
= ((UConverterDataISO2022
*)_this
->extraInfo
);
736 uint32_t key
= myData2022
->key
;
740 value
= VALID_NON_TERMINAL_2022
;
741 while (*source
< sourceLimit
) {
743 _this
->toUBytes
[_this
->toULength
++]=(uint8_t)c
;
744 value
= getKey_2022(c
,(int32_t *) &key
, &offset
);
748 case VALID_NON_TERMINAL_2022
:
749 /* continue with the loop */
752 case VALID_TERMINAL_2022
:
759 case VALID_MAYBE_TERMINAL_2022
:
760 #ifdef U_ENABLE_GENERIC_ISO_2022
761 /* ESC ( B is ambiguous only for ISO_2022 itself */
762 if(var
== ISO_2022
) {
763 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
764 _this
->toULength
= 0;
766 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
768 /* continue with the loop */
769 value
= VALID_NON_TERMINAL_2022
;
774 /* not ISO_2022 itself, finish here */
775 value
= VALID_TERMINAL_2022
;
783 myData2022
->key
= key
;
785 if (value
== VALID_NON_TERMINAL_2022
) {
786 /* indicate that the escape sequence is incomplete: key!=0 */
788 } else if (value
== INVALID_2022
) {
789 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
791 } else /* value == VALID_TERMINAL_2022 */ {
793 #ifdef U_ENABLE_GENERIC_ISO_2022
796 const char *chosenConverterName
= escSeqStateTable_Result_2022
[offset
];
797 if(chosenConverterName
== NULL
) {
799 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
803 _this
->mode
= UCNV_SI
;
804 ucnv_close(myData2022
->currentConverter
);
805 myData2022
->currentConverter
= myUConverter
= ucnv_open(chosenConverterName
, err
);
806 if(U_SUCCESS(*err
)) {
807 myUConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
808 _this
->mode
= UCNV_SO
;
815 StateEnum tempState
=nextStateToUnicodeJP
[offset
];
818 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
821 if(myData2022
->toU2022State
.cs
[2]!=0) {
822 if(myData2022
->toU2022State
.g
<2) {
823 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
825 myData2022
->toU2022State
.g
=2;
827 /* illegal to have SS2 before a matching designator */
828 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
831 /* case SS3_STATE: not used in ISO-2022-JP-x */
834 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
835 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
837 /* G2 charset for SS2 */
838 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
842 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
843 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
846 myData2022
->toU2022State
.cs
[0]=(int8_t)tempState
;
854 StateEnum tempState
=nextStateToUnicodeCN
[offset
];
857 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
860 if(myData2022
->toU2022State
.cs
[2]!=0) {
861 if(myData2022
->toU2022State
.g
<2) {
862 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
864 myData2022
->toU2022State
.g
=2;
866 /* illegal to have SS2 before a matching designator */
867 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
871 if(myData2022
->toU2022State
.cs
[3]!=0) {
872 if(myData2022
->toU2022State
.g
<2) {
873 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
875 myData2022
->toU2022State
.g
=3;
877 /* illegal to have SS3 before a matching designator */
878 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
882 if(myData2022
->version
==0) {
883 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
890 myData2022
->toU2022State
.cs
[1]=(int8_t)tempState
;
893 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
896 /* other CNS 11643 planes */
897 if(myData2022
->version
==0) {
898 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
900 myData2022
->toU2022State
.cs
[3]=(int8_t)tempState
;
908 /* nothing to be done, just accept this one escape sequence */
910 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
915 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
919 if(U_SUCCESS(*err
)) {
920 _this
->toULength
= 0;
924 /*Checks the characters of the buffer against valid 2022 escape sequences;
925 *if they match, we return a pointer to the initial start of the sequence, otherwise
926 *we return sourceLimit
928 /*for 2022 looks ahead in the stream
929 *to determine the longest possible convertible
932 static U_INLINE
const char*
933 getEndOfBuffer_2022(const char** source
,
934 const char* sourceLimit
,
937 const char* mySource
= *source
;
939 #ifdef U_ENABLE_GENERIC_ISO_2022
940 if (*source
>= sourceLimit
)
945 if (*mySource
== ESC_2022
){
949 UCNV_TableStates_2022 value
= VALID_NON_TERMINAL_2022
;
951 /* Kludge: I could not
952 * figure out the reason for validating an escape sequence
953 * twice - once here and once in changeState_2022().
954 * is it possible to have an ESC character in an ISO2022
955 * byte stream which is valid in a code page? Is it legal?
958 (mySource
+i
< sourceLimit
)&&(value
== VALID_NON_TERMINAL_2022
);
960 value
= getKey_2022(*(mySource
+i
), &key
, &offset
);
962 if (value
> 0 || *mySource
==ESC_2022
)
965 if ((value
== VALID_NON_TERMINAL_2022
)&&(!flush
) )
968 }while (++mySource
< sourceLimit
);
972 while(mySource
< sourceLimit
&& *mySource
!= ESC_2022
) {
980 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
981 * any future change in _MBCSFromUChar32() function should be reflected in
985 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData
* sharedData
,
993 const uint16_t *table
;
994 uint32_t stage2Entry
;
997 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
998 if(c
<0x10000 || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
999 table
=sharedData
->mbcs
.fromUnicodeTable
;
1000 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
1001 /* get the bytes and the length for the output */
1002 if(outputType
==MBCS_OUTPUT_2
){
1003 myValue
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1009 } else /* outputType==MBCS_OUTPUT_3 */ {
1010 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1011 myValue
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
1014 } else if(myValue
<=0xffff) {
1020 /* is this code point assigned, or do we use fallbacks? */
1021 if( (stage2Entry
&(1<<(16+(c
&0xf))))!=0 ||
1022 (FROM_U_USE_FALLBACK(useFallback
, c
) && myValue
!=0)
1025 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1026 * There is no way with this data structure for fallback output
1027 * to be a zero byte.
1035 cx
=sharedData
->mbcs
.extIndexes
;
1037 *length
=ucnv_extSimpleMatchFromU(cx
, c
, value
, useFallback
);
1045 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1046 * any future change in _MBCSSingleFromUChar32() function should be reflected in
1049 static U_INLINE
void
1050 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData
* sharedData
,
1055 const uint16_t *table
;
1057 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1058 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1059 *retval
=(uint16_t)-1;
1062 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1063 table
=sharedData
->mbcs
.fromUnicodeTable
;
1064 /* get the byte for the output */
1065 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
1066 /* is this code point assigned, or do we use fallbacks? */
1067 if(useFallback
? value
>=0x800 : value
>=0xc00) {
1072 *retval
=(uint16_t) value
;
1075 #ifdef U_ENABLE_GENERIC_ISO_2022
1077 /**********************************************************************************
1078 * ISO-2022 Converter
1084 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
,
1086 const char* mySourceLimit
, *realSourceLimit
;
1087 const char* sourceStart
;
1088 const UChar
* myTargetStart
;
1089 UConverter
* saveThis
;
1090 UConverterDataISO2022
* myData
;
1093 saveThis
= args
->converter
;
1094 myData
=((UConverterDataISO2022
*)(saveThis
->extraInfo
));
1096 realSourceLimit
= args
->sourceLimit
;
1097 while (args
->source
< realSourceLimit
) {
1098 if(myData
->key
== 0) { /* are we in the middle of an escape sequence? */
1099 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1100 mySourceLimit
= getEndOfBuffer_2022(&(args
->source
), realSourceLimit
, args
->flush
);
1102 if(args
->source
< mySourceLimit
) {
1103 if(myData
->currentConverter
==NULL
) {
1104 myData
->currentConverter
= ucnv_open("ASCII",err
);
1105 if(U_FAILURE(*err
)){
1109 myData
->currentConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
1110 saveThis
->mode
= UCNV_SO
;
1113 /* convert to before the ESC or until the end of the buffer */
1114 myData
->isFirstBuffer
=FALSE
;
1115 sourceStart
= args
->source
;
1116 myTargetStart
= args
->target
;
1117 args
->converter
= myData
->currentConverter
;
1118 ucnv_toUnicode(args
->converter
,
1124 (UBool
)(args
->flush
&& mySourceLimit
== realSourceLimit
),
1126 args
->converter
= saveThis
;
1128 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
1129 /* move the overflow buffer */
1130 length
= saveThis
->UCharErrorBufferLength
= myData
->currentConverter
->UCharErrorBufferLength
;
1131 myData
->currentConverter
->UCharErrorBufferLength
= 0;
1133 uprv_memcpy(saveThis
->UCharErrorBuffer
,
1134 myData
->currentConverter
->UCharErrorBuffer
,
1135 length
*U_SIZEOF_UCHAR
);
1142 * -Error while converting
1143 * -Done with entire buffer
1144 * -Need to write offsets or update the current offset
1145 * (leave that up to the code in ucnv.c)
1147 * or else we just stopped at an ESC byte and continue with changeState_2022()
1149 if (U_FAILURE(*err
) ||
1150 (args
->source
== realSourceLimit
) ||
1151 (args
->offsets
!= NULL
&& (args
->target
!= myTargetStart
|| args
->source
!= sourceStart
) ||
1152 (mySourceLimit
< realSourceLimit
&& myData
->currentConverter
->toULength
> 0))
1154 /* copy partial or error input for truncated detection and error handling */
1155 if(U_FAILURE(*err
)) {
1156 length
= saveThis
->invalidCharLength
= myData
->currentConverter
->invalidCharLength
;
1158 uprv_memcpy(saveThis
->invalidCharBuffer
, myData
->currentConverter
->invalidCharBuffer
, length
);
1161 length
= saveThis
->toULength
= myData
->currentConverter
->toULength
;
1163 uprv_memcpy(saveThis
->toUBytes
, myData
->currentConverter
->toUBytes
, length
);
1164 if(args
->source
< mySourceLimit
) {
1165 *err
= U_TRUNCATED_CHAR_FOUND
; /* truncated input before ESC */
1174 sourceStart
= args
->source
;
1175 changeState_2022(args
->converter
,
1180 if (U_FAILURE(*err
) || (args
->source
!= sourceStart
&& args
->offsets
!= NULL
)) {
1181 /* let the ucnv.c code update its current offset */
1190 * To Unicode Callback helper function
1193 toUnicodeCallback(UConverter
*cnv
,
1194 const uint32_t sourceChar
, const uint32_t targetUniChar
,
1196 if(sourceChar
>0xff){
1197 cnv
->toUBytes
[0] = (uint8_t)(sourceChar
>>8);
1198 cnv
->toUBytes
[1] = (uint8_t)sourceChar
;
1202 cnv
->toUBytes
[0] =(char) sourceChar
;
1206 if(targetUniChar
== (missingCharMarker
-1/*0xfffe*/)){
1207 *err
= U_INVALID_CHAR_FOUND
;
1210 *err
= U_ILLEGAL_CHAR_FOUND
;
1214 /**************************************ISO-2022-JP*************************************************/
1216 /************************************** IMPORTANT **************************************************
1217 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1218 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1219 * The converter iterates over each Unicode codepoint
1220 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1221 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1222 * would do as far as possible.
1224 * If the implementation of these macros or structure of sharedData struct change in the future, make
1225 * sure that ISO-2022 is also changed.
1226 ***************************************************************************************************
1229 /***************************************************************************************************
1230 * Rules for ISO-2022-jp encoding
1231 * (i) Escape sequences must be fully contained within a line; they should not
1232 * span new lines or CRs
1233 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1234 * JIS-Roman character escape sequence should follow before the line terminates
1235 * (iii) If the first character on the line is represented by two bytes then a two
1236 * byte character escape sequence should precede it
1237 * (iv) If no escape sequence is encountered then the characters are ASCII
1238 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1239 * and invoked with SS2 (ESC N).
1240 * (vi) If there is any G0 designation in text, there must be a switch to
1241 * ASCII or to JIS X 0201-Roman before a space character (but not
1242 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1243 * characters such as tab or CRLF.
1244 * (vii) Supported encodings:
1245 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1249 * JISX201, JISX208,JISX212 : new .cnv data files created
1250 * KSC5601 : alias to ibm-949 mapping table
1251 * GB2312 : alias to ibm-1386 mapping table
1252 * ISO-8859-1 : Algorithmically implemented as LATIN1 case
1253 * ISO-8859-7 : alias to ibm-9409 mapping table
1256 /* preference order of JP charsets */
1257 static const StateEnum jpCharsetPref
[]={
1270 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1271 * not in order of jpCharsetPref[]!
1273 static const char escSeqChars
[][6] ={
1274 "\x1B\x28\x42", /* <ESC>(B ASCII */
1275 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
1276 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
1277 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
1278 "\x1B\x24\x42", /* <ESC>$B JISX-208 */
1279 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
1280 "\x1B\x24\x41", /* <ESC>$A GB2312 */
1281 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
1282 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
1285 static const int32_t escSeqCharsLen
[] ={
1286 3, /* length of <ESC>(B ASCII */
1287 3, /* length of <ESC>.A ISO-8859-1 */
1288 3, /* length of <ESC>.F ISO-8859-7 */
1289 3, /* length of <ESC>(J JISX-201 */
1290 3, /* length of <ESC>$B JISX-208 */
1291 4, /* length of <ESC>$(D JISX-212 */
1292 3, /* length of <ESC>$A GB2312 */
1293 4, /* length of <ESC>$(C KSC5601 */
1294 3 /* length of <ESC>(I HWKANA_7BIT */
1298 * The iteration over various code pages works this way:
1299 * i) Get the currentState from myConverterData->currentState
1300 * ii) Check if the character is mapped to a valid character in the currentState
1301 * Yes -> a) set the initIterState to currentState
1302 * b) remain in this state until an invalid character is found
1303 * No -> a) go to the next code page and find the character
1304 * iii) Before changing the state, increment the current state and check whether it
1305 * is equal to the initIterState
1306 * Yes -> A character that cannot be represented in any of the supported encodings;
1307 * break and return a U_INVALID_CHAR_FOUND error
1308 * No -> Continue and find the character in next code page
1311 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1315 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
1316 UConverterDataISO2022
*converterData
;
1317 ISO2022State
*pFromU2022State
;
1318 uint8_t *target
= (uint8_t *) args
->target
;
1319 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
1320 const UChar
* source
= args
->source
;
1321 const UChar
* sourceLimit
= args
->sourceLimit
;
1322 int32_t* offsets
= args
->offsets
;
1325 int32_t len
, outLen
;
1327 int32_t choiceCount
;
1328 uint32_t targetValue
= 0;
1334 /* set up the state */
1335 converterData
= (UConverterDataISO2022
*)args
->converter
->extraInfo
;
1336 pFromU2022State
= &converterData
->fromU2022State
;
1337 useFallback
= args
->converter
->useFallback
;
1341 /* check if the last codepoint of previous buffer was a lead surrogate*/
1342 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
< targetLimit
) {
1346 while(source
< sourceLimit
) {
1347 if(target
< targetLimit
) {
1349 sourceChar
= *(source
++);
1350 /*check if the char is a First surrogate*/
1351 if(UTF_IS_SURROGATE(sourceChar
)) {
1352 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
1354 /*look ahead to find the trail surrogate*/
1355 if(source
< sourceLimit
) {
1356 /* test the following code unit */
1357 UChar trail
=(UChar
) *source
;
1358 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1360 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
1361 args
->converter
->fromUChar32
=0x00;
1362 /* convert this supplementary code point */
1363 /* exit this condition tree */
1365 /* this is an unmatched lead code unit (1st surrogate) */
1366 /* callback(illegal) */
1367 *err
=U_ILLEGAL_CHAR_FOUND
;
1368 args
->converter
->fromUChar32
=sourceChar
;
1373 args
->converter
->fromUChar32
=sourceChar
;
1377 /* this is an unmatched trail code unit (2nd surrogate) */
1378 /* callback(illegal) */
1379 *err
=U_ILLEGAL_CHAR_FOUND
;
1380 args
->converter
->fromUChar32
=sourceChar
;
1385 /* do not convert SO/SI/ESC */
1386 if(IS_2022_CONTROL(sourceChar
)) {
1387 /* callback(illegal) */
1388 *err
=U_ILLEGAL_CHAR_FOUND
;
1389 args
->converter
->fromUChar32
=sourceChar
;
1393 /* do the conversion */
1395 if(choiceCount
== 0) {
1399 * The csm variable keeps track of which charsets are allowed
1400 * and not used yet while building the choices[].
1402 csm
= jpCharsetMasks
[converterData
->version
];
1405 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1406 if(converterData
->version
== 3 || converterData
->version
== 4) {
1407 choices
[choiceCount
++] = cs
= (int8_t)HWKANA_7BIT
;
1411 /* try the current G0 charset */
1412 choices
[choiceCount
++] = cs
= pFromU2022State
->cs
[0];
1415 /* try the current G2 charset */
1416 if((cs
= pFromU2022State
->cs
[2]) != 0) {
1417 choices
[choiceCount
++] = cs
;
1421 /* try all the other possible charsets */
1422 for(i
= 0; i
< LENGTHOF(jpCharsetPref
); ++i
) {
1423 cs
= (int8_t)jpCharsetPref
[i
];
1425 choices
[choiceCount
++] = cs
;
1434 for(i
= 0; i
< choiceCount
&& len
== 0; ++i
) {
1438 if(sourceChar
<= 0x7f) {
1439 targetValue
= (uint32_t)sourceChar
;
1444 if(0x80 <= sourceChar
&& sourceChar
<= 0xff) {
1445 targetValue
= (uint32_t)sourceChar
- 0x80;
1451 if((uint32_t)(0xff9f-sourceChar
)<=(0xff9f-0xff61)) {
1452 targetValue
= (uint32_t)(sourceChar
- (0xff61 - 0x21));
1455 if(converterData
->version
==3) {
1456 /* JIS7: use G1 (SO) */
1457 pFromU2022State
->cs
[1] = cs
; /* do not output an escape sequence */
1459 } else if(converterData
->version
==4) {
1460 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1463 targetValue
+= 0x80;
1465 cs0
= pFromU2022State
->cs
[0];
1466 if(IS_JP_DBCS(cs0
)) {
1467 /* switch from a DBCS charset to JISX201 */
1468 cs
= (int8_t)JISX201
;
1470 /* stay in the current G0 charset */
1478 MBCS_SINGLE_FROM_UCHAR32(
1479 converterData
->myConverterArray
[cs
],
1480 sourceChar
, &targetValue
,
1482 if(targetValue
<= 0x7f) {
1487 /* G0 SBCS forced to 7-bit output */
1488 MBCS_SINGLE_FROM_UCHAR32(
1489 converterData
->myConverterArray
[cs
],
1490 sourceChar
, &targetValue
,
1492 if(0x80 <= targetValue
&& targetValue
<= 0xff) {
1493 targetValue
-= 0x80;
1500 MBCS_FROM_UCHAR32_ISO2022(
1501 converterData
->myConverterArray
[cs
],
1502 sourceChar
, &targetValue
,
1503 useFallback
, &len
, MBCS_OUTPUT_2
);
1512 outLen
= 0; /* count output bytes */
1514 /* write SI if necessary (only for JIS7) */
1515 if(pFromU2022State
->g
== 1 && g
== 0) {
1516 buffer
[outLen
++] = UCNV_SI
;
1517 pFromU2022State
->g
= 0;
1520 /* write the designation sequence if necessary */
1521 if(cs
!= pFromU2022State
->cs
[g
]) {
1522 int32_t escLen
= escSeqCharsLen
[cs
];
1523 uprv_memcpy(buffer
+ outLen
, escSeqChars
[cs
], escLen
);
1525 pFromU2022State
->cs
[g
] = cs
;
1527 /* invalidate the choices[] */
1531 /* write the shift sequence if necessary */
1532 if(g
!= pFromU2022State
->g
) {
1534 /* case 0 handled before writing escapes */
1536 buffer
[outLen
++] = UCNV_SO
;
1537 pFromU2022State
->g
= 1;
1539 default: /* case 2 */
1540 buffer
[outLen
++] = 0x1b;
1541 buffer
[outLen
++] = 0x4e;
1543 /* no case 3: no SS3 in ISO-2022-JP-x */
1547 /* write the output bytes */
1549 buffer
[outLen
++] = (char)targetValue
;
1550 } else /* len == 2 */ {
1551 buffer
[outLen
++] = (char)(targetValue
>> 8);
1552 buffer
[outLen
++] = (char)targetValue
;
1556 * if we cannot find the character after checking all codepages
1557 * then this is an error
1559 *err
= U_INVALID_CHAR_FOUND
;
1560 args
->converter
->fromUChar32
=sourceChar
;
1564 if(sourceChar
== CR
|| sourceChar
== LF
) {
1565 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1566 pFromU2022State
->cs
[2] = 0;
1570 /* output outLen>0 bytes in buffer[] */
1572 *target
++ = buffer
[0];
1574 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
1576 } else if(outLen
== 2 && (target
+ 2) <= targetLimit
) {
1577 *target
++ = buffer
[0];
1578 *target
++ = buffer
[1];
1580 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
1581 *offsets
++ = sourceIndex
;
1582 *offsets
++ = sourceIndex
;
1588 &target
, (const char *)targetLimit
,
1589 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
1591 if(U_FAILURE(*err
)) {
1595 } /* end if(myTargetIndex<myTargetLength) */
1597 *err
=U_BUFFER_OVERFLOW_ERROR
;
1601 }/* end while(mySourceIndex<mySourceLength) */
1604 * the end of the input stream and detection of truncated input
1605 * are handled by the framework, but for ISO-2022-JP conversion
1606 * we need to be in ASCII mode at the very end
1610 * in SO mode or not in ASCII mode
1611 * end of input and no truncated input
1613 if( U_SUCCESS(*err
) &&
1614 (pFromU2022State
->g
!=0 || pFromU2022State
->cs
[0]!=ASCII
) &&
1615 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
1617 int32_t sourceIndex
;
1621 if(pFromU2022State
->g
!= 0) {
1622 buffer
[outLen
++] = UCNV_SI
;
1623 pFromU2022State
->g
= 0;
1626 if(pFromU2022State
->cs
[0] != ASCII
) {
1627 int32_t escLen
= escSeqCharsLen
[ASCII
];
1628 uprv_memcpy(buffer
+ outLen
, escSeqChars
[ASCII
], escLen
);
1630 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
1633 /* get the source index of the last input character */
1635 * TODO this would be simpler and more reliable if we used a pair
1636 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1637 * so that we could simply use the prevSourceIndex here;
1638 * this code gives an incorrect result for the rare case of an unmatched
1639 * trail surrogate that is alone in the last buffer of the text stream
1641 sourceIndex
=(int32_t)(source
-args
->source
);
1644 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
1645 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
1656 &target
, (const char *)targetLimit
,
1657 &offsets
, sourceIndex
,
1661 /*save the state and return */
1662 args
->source
= source
;
1663 args
->target
= (char*)target
;
1666 /*************** to unicode *******************/
1669 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
1672 const char *mySource
= (char *) args
->source
;
1673 UChar
*myTarget
= args
->target
;
1674 const char *mySourceLimit
= args
->sourceLimit
;
1675 uint32_t targetUniChar
= 0x0000;
1676 uint32_t mySourceChar
= 0x0000;
1677 UConverterDataISO2022
* myData
;
1678 ISO2022State
*pToU2022State
;
1681 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
1682 pToU2022State
= &myData
->toU2022State
;
1684 if(myData
->key
!= 0) {
1685 /* continue with a partial escape sequence */
1687 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
1688 /* continue with a partial double-byte character */
1689 mySourceChar
= args
->converter
->toUBytes
[0];
1690 args
->converter
->toULength
= 0;
1691 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
1695 while(mySource
< mySourceLimit
){
1697 targetUniChar
=missingCharMarker
;
1699 if(myTarget
< args
->targetLimit
){
1701 mySourceChar
= (unsigned char) *mySource
++;
1703 switch(mySourceChar
) {
1705 if(myData
->version
==3) {
1709 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1710 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
1715 if(myData
->version
==3) {
1716 /* JIS7: switch to G1 half-width Katakana */
1717 pToU2022State
->cs
[1] = (int8_t)HWKANA_7BIT
;
1721 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1722 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
1730 const char * mySourceBefore
= mySource
;
1731 int8_t toULengthBefore
= args
->converter
->toULength
;
1733 changeState_2022(args
->converter
,&(mySource
),
1734 mySourceLimit
, ISO_2022_JP
,err
);
1736 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
1737 if ( myData
->version
== 0 && myData
->key
== 0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
1738 *err
= U_PARSE_ERROR
; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
1739 args
->converter
->toULength
= toULengthBefore
+ (mySource
- mySourceBefore
);
1743 /* invalid or illegal escape sequence */
1744 if(U_FAILURE(*err
)){
1745 args
->target
= myTarget
;
1746 args
->source
= mySource
;
1747 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
1750 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
1751 if (myData
->key
== 0) {
1752 myData
->isEmptySegment
= TRUE
;
1756 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1761 /* automatically reset to single-byte mode */
1762 if((StateEnum
)pToU2022State
->cs
[0] != ASCII
&& (StateEnum
)pToU2022State
->cs
[0] != JISX201
) {
1763 pToU2022State
->cs
[0] = (int8_t)ASCII
;
1765 pToU2022State
->cs
[2] = 0;
1766 pToU2022State
->g
= 0;
1769 /* convert one or two bytes */
1770 myData
->isEmptySegment
= FALSE
;
1771 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
1772 if( (uint8_t)(mySourceChar
- 0xa1) <= (0xdf - 0xa1) && myData
->version
==4 &&
1775 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1776 targetUniChar
= mySourceChar
+ (0xff61 - 0xa1);
1778 /* return from a single-shift state to the previous one */
1779 if(pToU2022State
->g
>= 2) {
1780 pToU2022State
->g
=pToU2022State
->prevG
;
1784 if(mySourceChar
<= 0x7f) {
1785 targetUniChar
= mySourceChar
;
1789 if(mySourceChar
<= 0x7f) {
1790 targetUniChar
= mySourceChar
+ 0x80;
1792 /* return from a single-shift state to the previous one */
1793 pToU2022State
->g
=pToU2022State
->prevG
;
1796 if(mySourceChar
<= 0x7f) {
1797 /* convert mySourceChar+0x80 to use a normal 8-bit table */
1799 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1800 myData
->myConverterArray
[cs
],
1801 mySourceChar
+ 0x80);
1803 /* return from a single-shift state to the previous one */
1804 pToU2022State
->g
=pToU2022State
->prevG
;
1807 if(mySourceChar
<= 0x7f) {
1809 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1810 myData
->myConverterArray
[cs
],
1815 if((uint8_t)(mySourceChar
- 0x21) <= (0x5f - 0x21)) {
1816 /* 7-bit halfwidth Katakana */
1817 targetUniChar
= mySourceChar
+ (0xff61 - 0x21);
1822 if(mySource
< mySourceLimit
) {
1825 tempBuf
[0] = (char) (mySourceChar
);
1826 tempBuf
[1] = trailByte
= *mySource
++;
1827 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
1828 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->myConverterArray
[cs
], tempBuf
, 2, FALSE
);
1830 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
1831 args
->converter
->toULength
= 1;
1837 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
1839 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
1841 *(myTarget
++)=(UChar
)targetUniChar
;
1843 else if(targetUniChar
> missingCharMarker
){
1844 /* disassemble the surrogate pair and write to output*/
1845 targetUniChar
-=0x0010000;
1846 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
1848 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
1851 if(myTarget
< args
->targetLimit
){
1852 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
1854 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
1858 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
1859 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
1864 /* Call the callback function*/
1865 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
1870 *err
=U_BUFFER_OVERFLOW_ERROR
;
1875 args
->target
= myTarget
;
1876 args
->source
= mySource
;
1880 /***************************************************************
1881 * Rules for ISO-2022-KR encoding
1882 * i) The KSC5601 designator sequence should appear only once in a file,
1883 * at the beginning of a line before any KSC5601 characters. This usually
1884 * means that it appears by itself on the first line of the file
1885 * ii) There are only 2 shifting sequences SO to shift into double byte mode
1886 * and SI to shift into single byte mode
1889 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
1891 UConverter
* saveConv
= args
->converter
;
1892 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*)saveConv
->extraInfo
;
1893 args
->converter
=myConverterData
->currentConverter
;
1895 myConverterData
->currentConverter
->fromUChar32
= saveConv
->fromUChar32
;
1896 ucnv_MBCSFromUnicodeWithOffsets(args
,err
);
1897 saveConv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
1899 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
1900 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
1902 saveConv
->charErrorBuffer
,
1903 myConverterData
->currentConverter
->charErrorBuffer
,
1904 myConverterData
->currentConverter
->charErrorBufferLength
);
1906 saveConv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
1907 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
1909 args
->converter
=saveConv
;
1913 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
1915 const UChar
*source
= args
->source
;
1916 const UChar
*sourceLimit
= args
->sourceLimit
;
1917 unsigned char *target
= (unsigned char *) args
->target
;
1918 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
1919 int32_t* offsets
= args
->offsets
;
1920 uint32_t targetByteUnit
= 0x0000;
1921 UChar32 sourceChar
= 0x0000;
1922 UBool isTargetByteDBCS
;
1923 UBool oldIsTargetByteDBCS
;
1924 UConverterDataISO2022
*converterData
;
1925 UConverterSharedData
* sharedData
;
1929 converterData
=(UConverterDataISO2022
*)args
->converter
->extraInfo
;
1930 /* if the version is 1 then the user is requesting
1931 * conversion with ibm-25546; pass the arguments to
1932 * the MBCS converter and return
1934 if(converterData
->version
==1){
1935 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
1939 /* initialize data */
1940 sharedData
= converterData
->currentConverter
->sharedData
;
1941 useFallback
= args
->converter
->useFallback
;
1942 isTargetByteDBCS
=(UBool
)args
->converter
->fromUnicodeStatus
;
1943 oldIsTargetByteDBCS
= isTargetByteDBCS
;
1945 isTargetByteDBCS
= (UBool
) args
->converter
->fromUnicodeStatus
;
1946 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
<targetLimit
) {
1949 while(source
< sourceLimit
){
1951 targetByteUnit
= missingCharMarker
;
1953 if(target
< (unsigned char*) args
->targetLimit
){
1954 sourceChar
= *source
++;
1956 /* do not convert SO/SI/ESC */
1957 if(IS_2022_CONTROL(sourceChar
)) {
1958 /* callback(illegal) */
1959 *err
=U_ILLEGAL_CHAR_FOUND
;
1960 args
->converter
->fromUChar32
=sourceChar
;
1964 /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
1965 sourceChar,&targetByteUnit,args->converter->useFallback);*/
1966 MBCS_FROM_UCHAR32_ISO2022(sharedData
,sourceChar
,&targetByteUnit
,useFallback
,&length
,MBCS_OUTPUT_2
);
1967 /* only DBCS or SBCS characters are expected*/
1968 /* DB characters with high bit set to 1 are expected */
1969 if(length
> 2 || length
==0 ||(((targetByteUnit
& 0x8080) != 0x8080)&& length
==2)){
1970 targetByteUnit
=missingCharMarker
;
1972 if (targetByteUnit
!= missingCharMarker
){
1974 oldIsTargetByteDBCS
= isTargetByteDBCS
;
1975 isTargetByteDBCS
= (UBool
)(targetByteUnit
>0x00FF);
1976 /* append the shift sequence */
1977 if (oldIsTargetByteDBCS
!= isTargetByteDBCS
){
1979 if (isTargetByteDBCS
)
1980 *target
++ = UCNV_SO
;
1982 *target
++ = UCNV_SI
;
1984 *(offsets
++) = (int32_t)(source
- args
->source
-1);
1986 /* write the targetByteUnit to target */
1987 if(targetByteUnit
<= 0x00FF){
1988 if( target
< targetLimit
){
1989 *(target
++) = (unsigned char) targetByteUnit
;
1991 *(offsets
++) = (int32_t)(source
- args
->source
-1);
1995 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
);
1996 *err
= U_BUFFER_OVERFLOW_ERROR
;
1999 if(target
< targetLimit
){
2000 *(target
++) =(unsigned char) ((targetByteUnit
>>8) -0x80);
2002 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2004 if(target
< targetLimit
){
2005 *(target
++) =(unsigned char) (targetByteUnit
-0x80);
2007 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2010 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2011 *err
= U_BUFFER_OVERFLOW_ERROR
;
2014 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) ((targetByteUnit
>>8) -0x80);
2015 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2016 *err
= U_BUFFER_OVERFLOW_ERROR
;
2022 /* oops.. the code point is unassigned
2023 * set the error and reason
2026 /*check if the char is a First surrogate*/
2027 if(UTF_IS_SURROGATE(sourceChar
)) {
2028 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2030 /*look ahead to find the trail surrogate*/
2031 if(source
< sourceLimit
) {
2032 /* test the following code unit */
2033 UChar trail
=(UChar
) *source
;
2034 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2036 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2037 *err
= U_INVALID_CHAR_FOUND
;
2038 /* convert this surrogate code point */
2039 /* exit this condition tree */
2041 /* this is an unmatched lead code unit (1st surrogate) */
2042 /* callback(illegal) */
2043 *err
=U_ILLEGAL_CHAR_FOUND
;
2047 *err
= U_ZERO_ERROR
;
2050 /* this is an unmatched trail code unit (2nd surrogate) */
2051 /* callback(illegal) */
2052 *err
=U_ILLEGAL_CHAR_FOUND
;
2055 /* callback(unassigned) for a BMP code point */
2056 *err
= U_INVALID_CHAR_FOUND
;
2059 args
->converter
->fromUChar32
=sourceChar
;
2062 } /* end if(myTargetIndex<myTargetLength) */
2064 *err
=U_BUFFER_OVERFLOW_ERROR
;
2068 }/* end while(mySourceIndex<mySourceLength) */
2071 * the end of the input stream and detection of truncated input
2072 * are handled by the framework, but for ISO-2022-KR conversion
2073 * we need to be in ASCII mode at the very end
2078 * end of input and no truncated input
2080 if( U_SUCCESS(*err
) &&
2082 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2084 int32_t sourceIndex
;
2086 /* we are switching to ASCII */
2087 isTargetByteDBCS
=FALSE
;
2089 /* get the source index of the last input character */
2091 * TODO this would be simpler and more reliable if we used a pair
2092 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2093 * so that we could simply use the prevSourceIndex here;
2094 * this code gives an incorrect result for the rare case of an unmatched
2095 * trail surrogate that is alone in the last buffer of the text stream
2097 sourceIndex
=(int32_t)(source
-args
->source
);
2100 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2101 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2112 &target
, (const char *)targetLimit
,
2113 &offsets
, sourceIndex
,
2117 /*save the state and return */
2118 args
->source
= source
;
2119 args
->target
= (char*)target
;
2120 args
->converter
->fromUnicodeStatus
= (uint32_t)isTargetByteDBCS
;
2123 /************************ To Unicode ***************************************/
2126 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs
*args
,
2128 char const* sourceStart
;
2129 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2131 UConverterToUnicodeArgs subArgs
;
2132 int32_t minArgsSize
;
2134 /* set up the subconverter arguments */
2135 if(args
->size
<sizeof(UConverterToUnicodeArgs
)) {
2136 minArgsSize
= args
->size
;
2138 minArgsSize
= (int32_t)sizeof(UConverterToUnicodeArgs
);
2141 uprv_memcpy(&subArgs
, args
, minArgsSize
);
2142 subArgs
.size
= (uint16_t)minArgsSize
;
2143 subArgs
.converter
= myData
->currentConverter
;
2145 /* remember the original start of the input for offsets */
2146 sourceStart
= args
->source
;
2148 if(myData
->key
!= 0) {
2149 /* continue with a partial escape sequence */
2153 while(U_SUCCESS(*err
) && args
->source
< args
->sourceLimit
) {
2154 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2155 subArgs
.source
= args
->source
;
2156 subArgs
.sourceLimit
= getEndOfBuffer_2022(&(args
->source
), args
->sourceLimit
, args
->flush
);
2157 if(subArgs
.source
!= subArgs
.sourceLimit
) {
2159 * get the current partial byte sequence
2161 * it needs to be moved between the public and the subconverter
2162 * so that the conversion framework, which only sees the public
2163 * converter, can handle truncated and illegal input etc.
2165 if(args
->converter
->toULength
> 0) {
2166 uprv_memcpy(subArgs
.converter
->toUBytes
, args
->converter
->toUBytes
, args
->converter
->toULength
);
2168 subArgs
.converter
->toULength
= args
->converter
->toULength
;
2171 * Convert up to the end of the input, or to before the next escape character.
2172 * Does not handle conversion extensions because the preToU[] state etc.
2175 ucnv_MBCSToUnicodeWithOffsets(&subArgs
, err
);
2177 if(args
->offsets
!= NULL
&& sourceStart
!= args
->source
) {
2178 /* update offsets to base them on the actual start of the input */
2179 int32_t *offsets
= args
->offsets
;
2180 UChar
*target
= args
->target
;
2181 int32_t delta
= (int32_t)(args
->source
- sourceStart
);
2182 while(target
< subArgs
.target
) {
2190 args
->source
= subArgs
.source
;
2191 args
->target
= subArgs
.target
;
2192 args
->offsets
= subArgs
.offsets
;
2194 /* copy input/error/overflow buffers */
2195 if(subArgs
.converter
->toULength
> 0) {
2196 uprv_memcpy(args
->converter
->toUBytes
, subArgs
.converter
->toUBytes
, subArgs
.converter
->toULength
);
2198 args
->converter
->toULength
= subArgs
.converter
->toULength
;
2200 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2201 if(subArgs
.converter
->UCharErrorBufferLength
> 0) {
2202 uprv_memcpy(args
->converter
->UCharErrorBuffer
, subArgs
.converter
->UCharErrorBuffer
,
2203 subArgs
.converter
->UCharErrorBufferLength
);
2205 args
->converter
->UCharErrorBufferLength
=subArgs
.converter
->UCharErrorBufferLength
;
2206 subArgs
.converter
->UCharErrorBufferLength
= 0;
2210 if (U_FAILURE(*err
) || (args
->source
== args
->sourceLimit
)) {
2215 changeState_2022(args
->converter
,
2224 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2227 const char *mySource
= ( char *) args
->source
;
2228 UChar
*myTarget
= args
->target
;
2229 const char *mySourceLimit
= args
->sourceLimit
;
2230 UChar32 targetUniChar
= 0x0000;
2231 UChar mySourceChar
= 0x0000;
2232 UConverterDataISO2022
* myData
;
2233 UConverterSharedData
* sharedData
;
2236 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2237 if(myData
->version
==1){
2238 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2242 /* initialize state */
2243 sharedData
= myData
->currentConverter
->sharedData
;
2244 useFallback
= args
->converter
->useFallback
;
2246 if(myData
->key
!= 0) {
2247 /* continue with a partial escape sequence */
2249 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2250 /* continue with a partial double-byte character */
2251 mySourceChar
= args
->converter
->toUBytes
[0];
2252 args
->converter
->toULength
= 0;
2256 while(mySource
< mySourceLimit
){
2258 if(myTarget
< args
->targetLimit
){
2260 mySourceChar
= (unsigned char) *mySource
++;
2262 if(mySourceChar
==UCNV_SI
){
2263 myData
->toU2022State
.g
= 0;
2264 if (myData
->isEmptySegment
) {
2265 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
2266 *err
= U_PARSE_ERROR
; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2267 args
->converter
->toUBytes
[0] = mySourceChar
;
2268 args
->converter
->toULength
= 1;
2269 args
->target
= myTarget
;
2270 args
->source
= mySource
;
2273 /*consume the source */
2275 }else if(mySourceChar
==UCNV_SO
){
2276 myData
->toU2022State
.g
= 1;
2277 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
2278 /*consume the source */
2280 }else if(mySourceChar
==ESC_2022
){
2283 myData
->isEmptySegment
= FALSE
; /* Any invalid ESC sequences will be detected separately, so just reset this */
2284 changeState_2022(args
->converter
,&(mySource
),
2285 mySourceLimit
, ISO_2022_KR
, err
);
2286 if(U_FAILURE(*err
)){
2287 args
->target
= myTarget
;
2288 args
->source
= mySource
;
2294 myData
->isEmptySegment
= FALSE
; /* Any invalid char errors will be detected separately, so just reset this */
2295 if(myData
->toU2022State
.g
== 1) {
2296 if(mySource
< mySourceLimit
) {
2299 trailByte
= *mySource
++;
2300 tempBuf
[0] = (char)(mySourceChar
+ 0x80);
2301 tempBuf
[1] = (char)(trailByte
+ 0x80);
2302 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
2303 if((mySourceChar
& 0x8080) == 0) {
2304 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, tempBuf
, 2, useFallback
);
2306 /* illegal bytes > 0x7f */
2307 targetUniChar
= missingCharMarker
;
2310 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2311 args
->converter
->toULength
= 1;
2316 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, mySource
- 1, 1, useFallback
);
2318 if(targetUniChar
< 0xfffe){
2320 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2322 *(myTarget
++)=(UChar
)targetUniChar
;
2325 /* Call the callback function*/
2326 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2331 *err
=U_BUFFER_OVERFLOW_ERROR
;
2335 args
->target
= myTarget
;
2336 args
->source
= mySource
;
2339 /*************************** END ISO2022-KR *********************************/
2341 /*************************** ISO-2022-CN *********************************
2343 * Rules for ISO-2022-CN Encoding:
2344 * i) The designator sequence must appear once on a line before any instance
2345 * of character set it designates.
2346 * ii) If two lines contain characters from the same character set, both lines
2347 * must include the designator sequence.
2348 * iii) Once the designator sequence is known, a shifting sequence has to be found
2349 * to invoke the shifting
2350 * iv) All lines start in ASCII and end in ASCII.
2351 * v) Four shifting sequences are employed for this purpose:
2353 * Sequence ASCII Eq Charsets
2354 * ---------- ------- ---------
2356 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2357 * SS2 <ESC>N CNS-11643-1992 Plane 2
2358 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2361 * SOdesignator : ESC "$" ")" finalchar_for_SO
2362 * SS2designator : ESC "$" "*" finalchar_for_SS2
2363 * SS3designator : ESC "$" "+" finalchar_for_SS3
2365 * ESC $ ) A Indicates the bytes following SO are Chinese
2366 * characters as defined in GB 2312-80, until
2367 * another SOdesignation appears
2370 * ESC $ ) E Indicates the bytes following SO are as defined
2371 * in ISO-IR-165 (for details, see section 2.1),
2372 * until another SOdesignation appears
2374 * ESC $ ) G Indicates the bytes following SO are as defined
2375 * in CNS 11643-plane-1, until another
2376 * SOdesignation appears
2378 * ESC $ * H Indicates the two bytes immediately following
2379 * SS2 is a Chinese character as defined in CNS
2380 * 11643-plane-2, until another SS2designation
2382 * (Meaning <ESC>N must precede every 2 byte
2385 * ESC $ + I Indicates the immediate two bytes following SS3
2386 * is a Chinese character as defined in CNS
2387 * 11643-plane-3, until another SS3designation
2389 * (Meaning <ESC>O must precede every 2 byte
2392 * ESC $ + J Indicates the immediate two bytes following SS3
2393 * is a Chinese character as defined in CNS
2394 * 11643-plane-4, until another SS3designation
2396 * (In English: <ESC>O must precede every 2 byte
2399 * ESC $ + K Indicates the immediate two bytes following SS3
2400 * is a Chinese character as defined in CNS
2401 * 11643-plane-5, until another SS3designation
2404 * ESC $ + L Indicates the immediate two bytes following SS3
2405 * is a Chinese character as defined in CNS
2406 * 11643-plane-6, until another SS3designation
2409 * ESC $ + M Indicates the immediate two bytes following SS3
2410 * is a Chinese character as defined in CNS
2411 * 11643-plane-7, until another SS3designation
2414 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2415 * has its own designation information before any Chinese characters
/*
 * ISO-2022-CN designator escape sequences.
 *
 * These are defined as static const char arrays so that the strings are
 * truly read-only.  Every sequence is ESC (0x1B), 0x24 ("$" in ASCII), an
 * intermediate byte and a final byte.  The intermediate byte selects which
 * shift function the designated charset is used with:
 *   0x29 ")"  SO designator
 *   0x2A "*"  SS2 designator
 *   0x2B "+"  SS3 designator
 * The bytes are written as hex escapes, exactly as in the designator table
 * of the ISO-2022-CN rules comment.
 */

/* SO designators: ESC $ ) finalchar */
static const char GB_2312_80_STR[]             = "\x1B\x24\x29\x41"; /* ESC $ ) A : GB 2312-80 */
static const char ISO_IR_165_STR[]             = "\x1B\x24\x29\x45"; /* ESC $ ) E : ISO-IR-165 */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G : CNS 11643 plane 1 */

/* SS2 designator: ESC $ * finalchar */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H : CNS 11643 plane 2 */

/* SS3 designators: ESC $ + finalchar */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I : CNS 11643 plane 3 */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J : CNS 11643 plane 4 */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K : CNS 11643 plane 5 */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L : CNS 11643 plane 6 */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M : CNS 11643 plane 7 */
2431 /********************** ISO2022-CN Data **************************/
2432 static const char* const escSeqCharsCN
[10] ={
2433 SHIFT_IN_STR
, /* ASCII */
2436 CNS_11643_1992_Plane_1_STR
,
2437 CNS_11643_1992_Plane_2_STR
,
2438 CNS_11643_1992_Plane_3_STR
,
2439 CNS_11643_1992_Plane_4_STR
,
2440 CNS_11643_1992_Plane_5_STR
,
2441 CNS_11643_1992_Plane_6_STR
,
2442 CNS_11643_1992_Plane_7_STR
2446 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2448 UConverterDataISO2022
*converterData
;
2449 ISO2022State
*pFromU2022State
;
2450 uint8_t *target
= (uint8_t *) args
->target
;
2451 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
2452 const UChar
* source
= args
->source
;
2453 const UChar
* sourceLimit
= args
->sourceLimit
;
2454 int32_t* offsets
= args
->offsets
;
2459 int32_t choiceCount
;
2460 uint32_t targetValue
= 0;
2463 /* set up the state */
2464 converterData
= (UConverterDataISO2022
*)args
->converter
->extraInfo
;
2465 pFromU2022State
= &converterData
->fromU2022State
;
2466 useFallback
= args
->converter
->useFallback
;
2470 /* check if the last codepoint of previous buffer was a lead surrogate*/
2471 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
< targetLimit
) {
2475 while( source
< sourceLimit
){
2476 if(target
< targetLimit
){
2478 sourceChar
= *(source
++);
2479 /*check if the char is a First surrogate*/
2480 if(UTF_IS_SURROGATE(sourceChar
)) {
2481 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2483 /*look ahead to find the trail surrogate*/
2484 if(source
< sourceLimit
) {
2485 /* test the following code unit */
2486 UChar trail
=(UChar
) *source
;
2487 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2489 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2490 args
->converter
->fromUChar32
=0x00;
2491 /* convert this supplementary code point */
2492 /* exit this condition tree */
2494 /* this is an unmatched lead code unit (1st surrogate) */
2495 /* callback(illegal) */
2496 *err
=U_ILLEGAL_CHAR_FOUND
;
2497 args
->converter
->fromUChar32
=sourceChar
;
2502 args
->converter
->fromUChar32
=sourceChar
;
2506 /* this is an unmatched trail code unit (2nd surrogate) */
2507 /* callback(illegal) */
2508 *err
=U_ILLEGAL_CHAR_FOUND
;
2509 args
->converter
->fromUChar32
=sourceChar
;
2514 /* do the conversion */
2515 if(sourceChar
<= 0x007f ){
2516 /* do not convert SO/SI/ESC */
2517 if(IS_2022_CONTROL(sourceChar
)) {
2518 /* callback(illegal) */
2519 *err
=U_ILLEGAL_CHAR_FOUND
;
2520 args
->converter
->fromUChar32
=sourceChar
;
2525 if(pFromU2022State
->g
== 0) {
2526 buffer
[0] = (char)sourceChar
;
2529 buffer
[0] = UCNV_SI
;
2530 buffer
[1] = (char)sourceChar
;
2532 pFromU2022State
->g
= 0;
2535 if(sourceChar
== CR
|| sourceChar
== LF
) {
2536 /* reset the state at the end of a line */
2537 uprv_memset(pFromU2022State
, 0, sizeof(ISO2022State
));
2542 /* convert U+0080..U+10ffff */
2543 UConverterSharedData
*cnv
;
2547 if(choiceCount
== 0) {
2548 /* try the current SO/G1 converter first */
2549 choices
[0] = pFromU2022State
->cs
[1];
2551 /* default to GB2312_1 if none is designated yet */
2552 if(choices
[0] == 0) {
2553 choices
[0] = GB2312_1
;
2556 if(converterData
->version
== 0) {
2559 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2560 if(choices
[0] == GB2312_1
) {
2561 choices
[1] = (int8_t)CNS_11643_1
;
2563 choices
[1] = (int8_t)GB2312_1
;
2568 /* ISO-2022-CN-EXT */
2570 /* try one of the other converters */
2571 switch(choices
[0]) {
2573 choices
[1] = (int8_t)CNS_11643_1
;
2574 choices
[2] = (int8_t)ISO_IR_165
;
2577 choices
[1] = (int8_t)GB2312_1
;
2578 choices
[2] = (int8_t)CNS_11643_1
;
2580 default: /* CNS_11643_x */
2581 choices
[1] = (int8_t)GB2312_1
;
2582 choices
[2] = (int8_t)ISO_IR_165
;
2593 for(i
= 0; i
< choiceCount
&& len
== 0; ++i
) {
2596 if(cs
> CNS_11643_0
) {
2597 cnv
= converterData
->myConverterArray
[CNS_11643
];
2598 MBCS_FROM_UCHAR32_ISO2022(cnv
,sourceChar
,&targetValue
,useFallback
,&len
,MBCS_OUTPUT_3
);
2600 cs
= (int8_t)(CNS_11643_0
+ (targetValue
>> 16) - 0x80);
2602 if(cs
== CNS_11643_1
) {
2604 } else if(cs
== CNS_11643_2
) {
2606 } else /* plane 3..7 */ if(converterData
->version
== 1) {
2609 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2614 /* GB2312_1 or ISO-IR-165 */
2615 cnv
= converterData
->myConverterArray
[cs
];
2616 MBCS_FROM_UCHAR32_ISO2022(cnv
,sourceChar
,&targetValue
,useFallback
,&len
,MBCS_OUTPUT_2
);
2617 g
= 1; /* used if len == 2 */
2623 len
= 0; /* count output bytes; it must have been len == 2 */
2625 /* write the designation sequence if necessary */
2626 if(cs
!= pFromU2022State
->cs
[g
]) {
2627 if(cs
< CNS_11643
) {
2628 uprv_memcpy(buffer
, escSeqCharsCN
[cs
], 4);
2630 uprv_memcpy(buffer
, escSeqCharsCN
[CNS_11643
+ (cs
- CNS_11643_1
)], 4);
2633 pFromU2022State
->cs
[g
] = cs
;
2635 /* changing the SO/G1 charset invalidates the choices[] */
2640 /* write the shift sequence if necessary */
2641 if(g
!= pFromU2022State
->g
) {
2644 buffer
[len
++] = UCNV_SO
;
2646 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2647 pFromU2022State
->g
= 1;
2650 buffer
[len
++] = 0x1b;
2651 buffer
[len
++] = 0x4e;
2653 default: /* case 3 */
2654 buffer
[len
++] = 0x1b;
2655 buffer
[len
++] = 0x4f;
2660 /* write the two output bytes */
2661 buffer
[len
++] = (char)(targetValue
>> 8);
2662 buffer
[len
++] = (char)targetValue
;
2664 /* if we cannot find the character after checking all codepages
2665 * then this is an error
2667 *err
= U_INVALID_CHAR_FOUND
;
2668 args
->converter
->fromUChar32
=sourceChar
;
2673 /* output len>0 bytes in buffer[] */
2675 *target
++ = buffer
[0];
2677 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
2679 } else if(len
== 2 && (target
+ 2) <= targetLimit
) {
2680 *target
++ = buffer
[0];
2681 *target
++ = buffer
[1];
2683 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
2684 *offsets
++ = sourceIndex
;
2685 *offsets
++ = sourceIndex
;
2691 &target
, (const char *)targetLimit
,
2692 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
2694 if(U_FAILURE(*err
)) {
2698 } /* end if(myTargetIndex<myTargetLength) */
2700 *err
=U_BUFFER_OVERFLOW_ERROR
;
2704 }/* end while(mySourceIndex<mySourceLength) */
2707 * the end of the input stream and detection of truncated input
2708 * are handled by the framework, but for ISO-2022-CN conversion
2709 * we need to be in ASCII mode at the very end
2714 * end of input and no truncated input
2716 if( U_SUCCESS(*err
) &&
2717 pFromU2022State
->g
!=0 &&
2718 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2720 int32_t sourceIndex
;
2722 /* we are switching to ASCII */
2723 pFromU2022State
->g
=0;
2725 /* get the source index of the last input character */
2727 * TODO this would be simpler and more reliable if we used a pair
2728 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2729 * so that we could simply use the prevSourceIndex here;
2730 * this code gives an incorrect result for the rare case of an unmatched
2731 * trail surrogate that is alone in the last buffer of the text stream
2733 sourceIndex
=(int32_t)(source
-args
->source
);
2736 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2737 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2748 &target
, (const char *)targetLimit
,
2749 &offsets
, sourceIndex
,
2753 /*save the state and return */
2754 args
->source
= source
;
2755 args
->target
= (char*)target
;
2760 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2763 const char *mySource
= (char *) args
->source
;
2764 UChar
*myTarget
= args
->target
;
2765 const char *mySourceLimit
= args
->sourceLimit
;
2766 uint32_t targetUniChar
= 0x0000;
2767 uint32_t mySourceChar
= 0x0000;
2768 UConverterDataISO2022
* myData
;
2769 ISO2022State
*pToU2022State
;
2771 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2772 pToU2022State
= &myData
->toU2022State
;
2774 if(myData
->key
!= 0) {
2775 /* continue with a partial escape sequence */
2777 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2778 /* continue with a partial double-byte character */
2779 mySourceChar
= args
->converter
->toUBytes
[0];
2780 args
->converter
->toULength
= 0;
2784 while(mySource
< mySourceLimit
){
2786 targetUniChar
=missingCharMarker
;
2788 if(myTarget
< args
->targetLimit
){
2790 mySourceChar
= (unsigned char) *mySource
++;
2792 switch(mySourceChar
){
2795 if (myData
->isEmptySegment
) {
2796 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
2797 *err
= U_PARSE_ERROR
; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2798 args
->converter
->toUBytes
[0] = mySourceChar
;
2799 args
->converter
->toULength
= 1;
2800 args
->target
= myTarget
;
2801 args
->source
= mySource
;
2807 if(pToU2022State
->cs
[1] != 0) {
2809 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
2812 /* illegal to have SO before a matching designator */
2813 myData
->isEmptySegment
= FALSE
; /* Handling a different error, reset this to avoid future spurious errs */
2821 const char * mySourceBefore
= mySource
;
2822 int8_t toULengthBefore
= args
->converter
->toULength
;
2824 changeState_2022(args
->converter
,&(mySource
),
2825 mySourceLimit
, ISO_2022_CN
,err
);
2827 /* After SO there must be at least one character before a designator (designator error handled separately) */
2828 if ( myData
->key
== 0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
2829 *err
= U_PARSE_ERROR
; /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2830 args
->converter
->toULength
= toULengthBefore
+ (mySource
- mySourceBefore
);
2834 /* invalid or illegal escape sequence */
2835 if(U_FAILURE(*err
)){
2836 args
->target
= myTarget
;
2837 args
->source
= mySource
;
2838 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
2843 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2848 uprv_memset(pToU2022State
, 0, sizeof(ISO2022State
));
2851 /* convert one or two bytes */
2852 myData
->isEmptySegment
= FALSE
;
2853 if(pToU2022State
->g
!= 0) {
2854 if(mySource
< mySourceLimit
) {
2855 UConverterSharedData
*cnv
;
2856 StateEnum tempState
;
2860 trailByte
= *mySource
++;
2861 tempState
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2862 if(tempState
> CNS_11643_0
) {
2863 cnv
= myData
->myConverterArray
[CNS_11643
];
2864 tempBuf
[0] = (char) (0x80+(tempState
-CNS_11643_0
));
2865 tempBuf
[1] = (char) (mySourceChar
);
2866 tempBuf
[2] = trailByte
;
2870 cnv
= myData
->myConverterArray
[tempState
];
2871 tempBuf
[0] = (char) (mySourceChar
);
2872 tempBuf
[1] = trailByte
;
2875 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
2876 if(pToU2022State
->g
>=2) {
2877 /* return from a single-shift state to the previous one */
2878 pToU2022State
->g
=pToU2022State
->prevG
;
2880 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(cnv
, tempBuf
, tempBufLen
, FALSE
);
2882 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2883 args
->converter
->toULength
= 1;
2888 if(mySourceChar
<= 0x7f) {
2889 targetUniChar
= (UChar
) mySourceChar
;
2894 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
2896 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2898 *(myTarget
++)=(UChar
)targetUniChar
;
2900 else if(targetUniChar
> missingCharMarker
){
2901 /* disassemble the surrogate pair and write to output*/
2902 targetUniChar
-=0x0010000;
2903 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
2905 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2908 if(myTarget
< args
->targetLimit
){
2909 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2911 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2915 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
2916 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2921 /* Call the callback function*/
2922 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2927 *err
=U_BUFFER_OVERFLOW_ERROR
;
2932 args
->target
= myTarget
;
2933 args
->source
= mySource
;
2937 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
2938 UConverter
*cnv
= args
->converter
;
2939 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
2940 ISO2022State
*pFromU2022State
=&myConverterData
->fromU2022State
;
2945 subchar
=(char *)cnv
->subChars
;
2946 length
=cnv
->subCharLen
; /* assume length==1 for most variants */
2949 switch(myConverterData
->locale
[0]){
2954 if(pFromU2022State
->g
== 1) {
2955 /* JIS7: switch from G1 to G0 */
2956 pFromU2022State
->g
= 0;
2960 cs
= pFromU2022State
->cs
[0];
2961 if(cs
!= ASCII
&& cs
!= JISX201
) {
2962 /* not in ASCII or JIS X 0201: switch to ASCII */
2963 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
2973 if(pFromU2022State
->g
!= 0) {
2974 /* not in ASCII mode: switch to ASCII */
2975 pFromU2022State
->g
= 0;
2981 if(myConverterData
->version
== 0) {
2983 if((UBool
)args
->converter
->fromUnicodeStatus
) {
2984 /* in DBCS mode: switch to SBCS */
2985 args
->converter
->fromUnicodeStatus
= 0;
2989 } else /* length == 2*/ {
2990 if(!(UBool
)args
->converter
->fromUnicodeStatus
) {
2991 /* in SBCS mode: switch to DBCS */
2992 args
->converter
->fromUnicodeStatus
= 1;
3000 /* save the subconverter's substitution string */
3001 uint8_t *currentSubChars
= myConverterData
->currentConverter
->subChars
;
3002 int8_t currentSubCharLen
= myConverterData
->currentConverter
->subCharLen
;
3004 /* set our substitution string into the subconverter */
3005 myConverterData
->currentConverter
->subChars
= (uint8_t *)subchar
;
3006 myConverterData
->currentConverter
->subCharLen
= (int8_t)length
;
3008 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3009 args
->converter
= myConverterData
->currentConverter
;
3010 myConverterData
->currentConverter
->fromUChar32
= cnv
->fromUChar32
;
3011 ucnv_cbFromUWriteSub(args
, 0, err
);
3012 cnv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
3013 args
->converter
= cnv
;
3015 /* restore the subconverter's substitution string */
3016 myConverterData
->currentConverter
->subChars
= currentSubChars
;
3017 myConverterData
->currentConverter
->subCharLen
= currentSubCharLen
;
3019 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
3020 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
3022 cnv
->charErrorBuffer
,
3023 myConverterData
->currentConverter
->charErrorBuffer
,
3024 myConverterData
->currentConverter
->charErrorBufferLength
);
3026 cnv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
3027 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
3035 ucnv_cbFromUWriteBytes(args
,
3036 buffer
, (int32_t)(p
- buffer
),
3041 * Structure for cloning an ISO 2022 converter into a single memory block.
3042 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3043 * and then ucnv_safeClone() of the sub-converter may additionally align
3044 * currentConverter inside the cloneStruct, for which we need the deadSpace
3045 * after currentConverter.
3046 * This is because UAlignedMemory may be larger than the actually
3047 * necessary alignment size for the platform.
3048 * The other cloneStruct fields will not be moved around,
3049 * and are aligned properly with cloneStruct's alignment.
3054 UConverter currentConverter
;
3055 UAlignedMemory deadSpace
;
3056 UConverterDataISO2022 mydata
;
3061 _ISO_2022_SafeClone(
3062 const UConverter
*cnv
,
3064 int32_t *pBufferSize
,
3067 struct cloneStruct
* localClone
;
3068 UConverterDataISO2022
*cnvData
;
3071 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3072 *pBufferSize
= (int32_t)sizeof(struct cloneStruct
);
3076 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3077 localClone
= (struct cloneStruct
*)stackBuffer
;
3079 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3081 uprv_memcpy(&localClone
->mydata
, cnvData
, sizeof(UConverterDataISO2022
));
3082 localClone
->cnv
.extraInfo
= &localClone
->mydata
; /* set pointer to extra data */
3083 localClone
->cnv
.isExtraLocal
= TRUE
;
3085 /* share the subconverters */
3087 if(cnvData
->currentConverter
!= NULL
) {
3088 size
= (int32_t)(sizeof(UConverter
) + sizeof(UAlignedMemory
)); /* include size of padding */
3089 localClone
->mydata
.currentConverter
=
3090 ucnv_safeClone(cnvData
->currentConverter
,
3091 &localClone
->currentConverter
,
3093 if(U_FAILURE(*status
)) {
3098 for(i
=0; i
<UCNV_2022_MAX_CONVERTERS
; ++i
) {
3099 if(cnvData
->myConverterArray
[i
] != NULL
) {
3100 ucnv_incrementRefCount(cnvData
->myConverterArray
[i
]);
3104 return &localClone
->cnv
;
3108 _ISO_2022_GetUnicodeSet(const UConverter
*cnv
,
3109 const USetAdder
*sa
,
3110 UConverterUnicodeSet which
,
3111 UErrorCode
*pErrorCode
)
3114 UConverterDataISO2022
* cnvData
;
3116 if (U_FAILURE(*pErrorCode
)) {
3119 #ifdef U_ENABLE_GENERIC_ISO_2022
3120 if (cnv
->sharedData
== &_ISO2022Data
) {
3121 /* We use UTF-8 in this case */
3122 sa
->addRange(sa
->set
, 0, 0xd7FF);
3123 sa
->addRange(sa
->set
, 0xE000, 0x10FFFF);
3128 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3130 /* open a set and initialize it with code points that are algorithmically round-tripped */
3131 switch(cnvData
->locale
[0]){
3133 if(jpCharsetMasks
[cnvData
->version
]&CSM(ISO8859_1
)) {
3134 /* include Latin-1 for some variants of JP */
3135 sa
->addRange(sa
->set
, 0, 0xff);
3137 /* include ASCII for JP */
3138 sa
->addRange(sa
->set
, 0, 0x7f);
3140 if(jpCharsetMasks
[cnvData
->version
]&CSM(HWKANA_7BIT
)) {
3141 /* include half-width Katakana for JP */
3142 sa
->addRange(sa
->set
, 0xff61, 0xff9f);
3147 /* include ASCII for CN */
3148 sa
->addRange(sa
->set
, 0, 0x7f);
3151 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3152 cnvData
->currentConverter
->sharedData
->impl
->getUnicodeSet(
3153 cnvData
->currentConverter
, sa
, which
, pErrorCode
);
3154 /* the loop over myConverterArray[] will simply not find another converter */
3161 * Version-specific for CN:
3162 * CN version 0 does not map CNS planes 3..7 although
3163 * they are all available in the CNS conversion table;
3164 * CN version 1 does map them all.
3165 * The two versions create different Unicode sets.
3167 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
3168 if(cnvData
->myConverterArray
[i
]!=NULL
) {
3169 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3170 cnvData
->version
==0 && i
==CNS_11643
3172 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3173 ucnv_MBCSGetUnicodeSetForBytes(
3174 cnvData
->myConverterArray
[i
],
3175 sa
, UCNV_ROUNDTRIP_SET
,
3179 ucnv_MBCSGetUnicodeSetForUnicode(cnvData
->myConverterArray
[i
], sa
, which
, pErrorCode
);
3185 * ISO 2022 converters must not convert SO/SI/ESC despite what
3186 * sub-converters do by themselves.
3187 * Remove these characters from the set.
3189 sa
->remove(sa
->set
, 0x0e);
3190 sa
->remove(sa
->set
, 0x0f);
3191 sa
->remove(sa
->set
, 0x1b);
3194 static const UConverterImpl _ISO2022Impl
={
3204 #ifdef U_ENABLE_GENERIC_ISO_2022
3205 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3206 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3207 ucnv_fromUnicode_UTF8
,
3208 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
3220 _ISO_2022_SafeClone
,
3221 _ISO_2022_GetUnicodeSet
3223 static const UConverterStaticData _ISO2022StaticData
={
3224 sizeof(UConverterStaticData
),
3230 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3237 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3239 const UConverterSharedData _ISO2022Data
={
3240 sizeof(UConverterSharedData
),
3244 &_ISO2022StaticData
,
3250 /*************JP****************/
3251 static const UConverterImpl _ISO2022JPImpl
={
3261 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3262 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3263 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3264 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3270 _ISO_2022_SafeClone
,
3271 _ISO_2022_GetUnicodeSet
3273 static const UConverterStaticData _ISO2022JPStaticData
={
3274 sizeof(UConverterStaticData
),
3280 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3287 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3289 static const UConverterSharedData _ISO2022JPData
={
3290 sizeof(UConverterSharedData
),
3294 &_ISO2022JPStaticData
,
3300 /************* KR ***************/
3301 static const UConverterImpl _ISO2022KRImpl
={
3311 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3312 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3313 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3314 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3320 _ISO_2022_SafeClone
,
3321 _ISO_2022_GetUnicodeSet
3323 static const UConverterStaticData _ISO2022KRStaticData
={
3324 sizeof(UConverterStaticData
),
3330 3, /* max 3 bytes per UChar: SO+DBCS */
3337 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3339 static const UConverterSharedData _ISO2022KRData
={
3340 sizeof(UConverterSharedData
),
3344 &_ISO2022KRStaticData
,
3350 /*************** CN ***************/
3351 static const UConverterImpl _ISO2022CNImpl
={
3362 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3363 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3364 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3365 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3371 _ISO_2022_SafeClone
,
3372 _ISO_2022_GetUnicodeSet
3374 static const UConverterStaticData _ISO2022CNStaticData
={
3375 sizeof(UConverterStaticData
),
3381 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3388 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3390 static const UConverterSharedData _ISO2022CNData
={
3391 sizeof(UConverterSharedData
),
3395 &_ISO2022CNStaticData
,
3403 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */