2 **********************************************************************
3 * Copyright (C) 2000-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
8 * tab size: 8 (not used)
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
29 #include "unicode/utypes.h"
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is wrapped in parentheses so that the cast stays
 * attached to its operand no matter which operator the caller places
 * next to the macro (for example sizeof LENGTHOF(a)).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
46 #ifdef U_ENABLE_GENERIC_ISO_2022
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
74 * Markus Scherer 2003-dec-03
/* One-byte, NUL-terminated strings holding the SI (0x0F) and SO (0x0E) control bytes. */
static const char SHIFT_IN_STR[]  = { 0x0F, '\0' };
static const char SHIFT_OUT_STR[] = { 0x0E, '\0' };
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
/*
 * Nonzero if c is SO (0x0e), SI (0x0f) or ESC (0x1b), the bits set in 0x0800c000.
 * The range test is done on (uint32_t)c so that a negative c is rejected
 * before it can be used as a shift count (shifting by a negative amount is
 * undefined behaviour). c is expected to be at most 32 bits wide.
 * Note: c is evaluated more than once; do not pass expressions with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
115 /* for ISO-2022-JP and -CN implementations */
132 HWKANA_7BIT
=8, /* Halfwidth Katakana 7 bit */
135 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
141 * these are used in StateEnum and ISO2022State variables,
142 * but CNS_11643 must be used to index into myConverterArray[]
154 /* is the StateEnum charset value for a DBCS charset? */
155 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
157 #define CSM(cs) ((uint16_t)1<<(cs))
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
168 enum { MAX_JA_VERSION
=4 };
169 static const uint16_t jpCharsetMasks
[MAX_JA_VERSION
+1]={
170 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
),
171 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
),
172 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
173 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
174 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
)
186 typedef struct ISO2022State
{
187 int8_t cs
[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
188 int8_t g
; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
189 int8_t prevG
; /* g before single shift (SS2 or SS3) */
192 #define UCNV_OPTIONS_VERSION_MASK 0xf
193 #define UCNV_2022_MAX_CONVERTERS 10
196 UConverterSharedData
*myConverterArray
[UCNV_2022_MAX_CONVERTERS
];
197 UConverter
*currentConverter
;
198 Cnv2022Type currentType
;
199 ISO2022State toU2022State
, fromU2022State
;
202 #ifdef U_ENABLE_GENERIC_ISO_2022
205 UBool isEmptySegment
;
208 }UConverterDataISO2022
;
211 /* ISO-2022 ----------------------------------------------------------------- */
213 /*Forward declaration */
215 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
* args
,
218 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
,
221 #define ESC_2022 0x1B /*ESC*/
225 INVALID_2022
= -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
226 VALID_NON_TERMINAL_2022
= 0, /*so far corresponds to a valid iso 2022 escape sequence*/
227 VALID_TERMINAL_2022
= 1, /*corresponds to a valid iso 2022 escape sequence*/
228 VALID_MAYBE_TERMINAL_2022
= 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
229 } UCNV_TableStates_2022
;
232 * The way these state transition arrays work is:
233 * ex : ESC$B is the sequence for JISX208
234 * a) First Iteration: char is ESC
235 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
236 * int x = normalize_esq_chars_2022[27] which is equal to 1
237 * ii) Search for this value in escSeqStateTable_Key_2022[]
238 * value of x is stored at escSeqStateTable_Key_2022[0]
239 * iii) Save this index as offset
240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
241 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
242 * b) Switch on this state and continue to next char
243 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
244 * which is normalize_esq_chars_2022[36] == 4
245 * ii) x is currently 1(from above)
246 * x<<=5 -- x is now 32
247 * x+=normalize_esq_chars_2022[36]
249 * iii) Search for this value in escSeqStateTable_Key_2022[]
250 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
251 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
252 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
253 * c) Switch on this state and continue to next char
254 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
255 * ii) x is currently 36 (from above)
256 * x<<=5 -- x is now 1152
257 * x+=normalize_esq_chars_2022[66]
259 * iii) Search for this value in escSeqStateTable_Key_2022[]
260 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
261 * iv) Get state of this sequence from escSeqStateTable_Value_2022[21]
262 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
263 * v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208
267 /*Below are the 3 arrays depicting a state transition table*/
268 static const int8_t normalize_esq_chars_2022
[256] = {
269 /* 0 1 2 3 4 5 6 7 8 9 */
271 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
272 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
273 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
274 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
275 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
276 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
277 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
278 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
279 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
280 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
281 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
282 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
283 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
299 #ifdef U_ENABLE_GENERIC_ISO_2022
301 * When the generic ISO-2022 converter is completely removed, not just disabled
302 * per #ifdef, then the following state table and the associated tables that are
303 * dimensioned with MAX_STATES_2022 should be trimmed.
305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
306 * the associated escape sequences starting with ESC ( B should be removed.
307 * This includes the ones with key values 1097 and all of the ones above 1000000.
309 * For the latter, the tables can simply be truncated.
310 * For the former, since the tables must be kept parallel, it is probably best
311 * to simply duplicate an adjacent table cell, parallel in all tables.
313 * It may make sense to restructure the tables, especially by using small search
314 * tables for the variants instead of indexing them parallel to the table here.
318 #define MAX_STATES_2022 74
319 static const int32_t escSeqStateTable_Key_2022
[MAX_STATES_2022
] = {
320 /* 0 1 2 3 4 5 6 7 8 9 */
322 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
323 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
324 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
325 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
326 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
327 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
328 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
329 ,35947631 ,35947635 ,35947636 ,35947638
332 #ifdef U_ENABLE_GENERIC_ISO_2022
334 static const char* const escSeqStateTable_Result_2022
[MAX_STATES_2022
] = {
335 /* 0 1 2 3 4 5 6 7 8 9 */
337 NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,"latin1" ,"latin1"
338 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
339 ,"latin1" ,NULL
,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL
,NULL
,NULL
,NULL
,"UTF8"
340 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL
,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
341 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
342 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
343 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
344 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
349 static const int8_t escSeqStateTable_Value_2022
[MAX_STATES_2022
] = {
350 /* 0 1 2 3 4 5 6 7 8 9 */
351 VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
352 ,VALID_MAYBE_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
353 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
354 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
358 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
362 /* Type def for refactoring changeState_2022 code*/
364 #ifdef U_ENABLE_GENERIC_ISO_2022
372 /*********** ISO 2022 Converter Protos ***********/
374 _ISO2022Open(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
);
377 _ISO2022Close(UConverter
*converter
);
380 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
);
383 _ISO2022getName(const UConverter
* cnv
);
386 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
);
389 _ISO_2022_SafeClone(const UConverter
*cnv
, void *stackBuffer
, int32_t *pBufferSize
, UErrorCode
*status
);
391 #ifdef U_ENABLE_GENERIC_ISO_2022
393 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
, UErrorCode
* err
);
396 /*const UConverterSharedData _ISO2022Data;*/
397 static const UConverterSharedData _ISO2022JPData
;
398 static const UConverterSharedData _ISO2022KRData
;
399 static const UConverterSharedData _ISO2022CNData
;
401 /*************** Converter implementations ******************/
403 /* The purpose of this function is to get around gcc compiler warnings. */
405 fromUWriteUInt8(UConverter
*cnv
,
406 const char *bytes
, int32_t length
,
407 uint8_t **target
, const char *targetLimit
,
410 UErrorCode
*pErrorCode
)
412 char *targetChars
= (char *)*target
;
413 ucnv_fromUWriteBytes(cnv
, bytes
, length
, &targetChars
, targetLimit
,
414 offsets
, sourceIndex
, pErrorCode
);
415 *target
= (uint8_t*)targetChars
;
420 setInitialStateToUnicodeKR(UConverter
* converter
, UConverterDataISO2022
*myConverterData
){
421 if(myConverterData
->version
== 1) {
422 UConverter
*cnv
= myConverterData
->currentConverter
;
424 cnv
->toUnicodeStatus
=0; /* offset */
425 cnv
->mode
=0; /* state */
426 cnv
->toULength
=0; /* byteIndex */
431 setInitialStateFromUnicodeKR(UConverter
* converter
,UConverterDataISO2022
*myConverterData
){
432 /* in ISO-2022-KR the designator sequence appears only once
433 * in a file so we append it only once
435 if( converter
->charErrorBufferLength
==0){
437 converter
->charErrorBufferLength
= 4;
438 converter
->charErrorBuffer
[0] = 0x1b;
439 converter
->charErrorBuffer
[1] = 0x24;
440 converter
->charErrorBuffer
[2] = 0x29;
441 converter
->charErrorBuffer
[3] = 0x43;
443 if(myConverterData
->version
== 1) {
444 UConverter
*cnv
= myConverterData
->currentConverter
;
447 cnv
->fromUnicodeStatus
=1; /* prevLength */
452 _ISO2022Open(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
){
454 char myLocale
[6]={' ',' ',' ',' ',' ',' '};
456 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataISO2022
));
457 if(cnv
->extraInfo
!= NULL
) {
458 UConverterNamePieces stackPieces
;
459 UConverterLoadArgs stackArgs
={ (int32_t)sizeof(UConverterLoadArgs
) };
460 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
463 stackArgs
.onlyTestIsLoadable
= pArgs
->onlyTestIsLoadable
;
465 uprv_memset(myConverterData
, 0, sizeof(UConverterDataISO2022
));
466 myConverterData
->currentType
= ASCII1
;
467 cnv
->fromUnicodeStatus
=FALSE
;
469 uprv_strncpy(myLocale
, pArgs
->locale
, sizeof(myLocale
));
471 version
= pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
;
472 myConverterData
->version
= version
;
473 if(myLocale
[0]=='j' && (myLocale
[1]=='a'|| myLocale
[1]=='p') &&
474 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
477 /* open the required converters and cache them */
478 if(version
>MAX_JA_VERSION
) {
479 /* prevent indexing beyond jpCharsetMasks[] */
480 myConverterData
->version
= version
= 0;
482 if(jpCharsetMasks
[version
]&CSM(ISO8859_7
)) {
483 myConverterData
->myConverterArray
[ISO8859_7
] =
484 ucnv_loadSharedData("ISO8859_7", &stackPieces
, &stackArgs
, errorCode
);
486 myConverterData
->myConverterArray
[JISX208
] =
487 ucnv_loadSharedData("Shift-JIS", &stackPieces
, &stackArgs
, errorCode
);
488 if(jpCharsetMasks
[version
]&CSM(JISX212
)) {
489 myConverterData
->myConverterArray
[JISX212
] =
490 ucnv_loadSharedData("jisx-212", &stackPieces
, &stackArgs
, errorCode
);
492 if(jpCharsetMasks
[version
]&CSM(GB2312
)) {
493 myConverterData
->myConverterArray
[GB2312
] =
494 ucnv_loadSharedData("ibm-5478", &stackPieces
, &stackArgs
, errorCode
); /* gb_2312_80-1 */
496 if(jpCharsetMasks
[version
]&CSM(KSC5601
)) {
497 myConverterData
->myConverterArray
[KSC5601
] =
498 ucnv_loadSharedData("ksc_5601", &stackPieces
, &stackArgs
, errorCode
);
501 /* set the function pointers to appropriate functions */
502 cnv
->sharedData
=(UConverterSharedData
*)(&_ISO2022JPData
);
503 uprv_strcpy(myConverterData
->locale
,"ja");
505 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ja,version=");
506 len
= uprv_strlen(myConverterData
->name
);
507 myConverterData
->name
[len
]=(char)(myConverterData
->version
+(int)'0');
508 myConverterData
->name
[len
+1]='\0';
510 else if(myLocale
[0]=='k' && (myLocale
[1]=='o'|| myLocale
[1]=='r') &&
511 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
515 cnvName
="icu-internal-25546";
518 myConverterData
->version
=version
=0;
520 if(pArgs
->onlyTestIsLoadable
) {
521 ucnv_canCreateConverter(cnvName
, errorCode
); /* errorCode carries result */
522 uprv_free(cnv
->extraInfo
);
526 myConverterData
->currentConverter
=ucnv_open(cnvName
, errorCode
);
527 if (U_FAILURE(*errorCode
)) {
533 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=1");
534 uprv_memcpy(cnv
->subChars
, myConverterData
->currentConverter
->subChars
, 4);
535 cnv
->subCharLen
= myConverterData
->currentConverter
->subCharLen
;
537 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=0");
540 /* initialize the state variables */
541 setInitialStateToUnicodeKR(cnv
, myConverterData
);
542 setInitialStateFromUnicodeKR(cnv
, myConverterData
);
544 /* set the function pointers to appropriate functions */
545 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022KRData
;
546 uprv_strcpy(myConverterData
->locale
,"ko");
549 else if(((myLocale
[0]=='z' && myLocale
[1]=='h') || (myLocale
[0]=='c'&& myLocale
[1]=='n'))&&
550 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
553 /* open the required converters and cache them */
554 myConverterData
->myConverterArray
[GB2312_1
] =
555 ucnv_loadSharedData("ibm-5478", &stackPieces
, &stackArgs
, errorCode
);
557 myConverterData
->myConverterArray
[ISO_IR_165
] =
558 ucnv_loadSharedData("iso-ir-165", &stackPieces
, &stackArgs
, errorCode
);
560 myConverterData
->myConverterArray
[CNS_11643
] =
561 ucnv_loadSharedData("cns-11643-1992", &stackPieces
, &stackArgs
, errorCode
);
564 /* set the function pointers to appropriate functions */
565 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022CNData
;
566 uprv_strcpy(myConverterData
->locale
,"cn");
569 myConverterData
->version
= 0;
570 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=0");
571 }else if (version
==1){
572 myConverterData
->version
= 1;
573 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=1");
575 myConverterData
->version
= 2;
576 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=2");
580 #ifdef U_ENABLE_GENERIC_ISO_2022
581 myConverterData
->isFirstBuffer
= TRUE
;
583 /* append the UTF-8 escape sequence */
584 cnv
->charErrorBufferLength
= 3;
585 cnv
->charErrorBuffer
[0] = 0x1b;
586 cnv
->charErrorBuffer
[1] = 0x25;
587 cnv
->charErrorBuffer
[2] = 0x42;
589 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022Data
;
590 /* initialize the state variables */
591 uprv_strcpy(myConverterData
->name
,"ISO_2022");
593 *errorCode
= U_UNSUPPORTED_ERROR
;
598 cnv
->maxBytesPerUChar
=cnv
->sharedData
->staticData
->maxBytesPerChar
;
600 if(U_FAILURE(*errorCode
) || pArgs
->onlyTestIsLoadable
) {
604 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
610 _ISO2022Close(UConverter
*converter
) {
611 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
612 UConverterSharedData
**array
= myData
->myConverterArray
;
615 if (converter
->extraInfo
!= NULL
) {
616 /*close the array of converter pointers and free the memory*/
617 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
619 ucnv_unloadSharedDataIfReady(array
[i
]);
623 ucnv_close(myData
->currentConverter
);
625 if(!converter
->isExtraLocal
){
626 uprv_free (converter
->extraInfo
);
627 converter
->extraInfo
= NULL
;
633 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
) {
634 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
635 if(choice
<=UCNV_RESET_TO_UNICODE
) {
636 uprv_memset(&myConverterData
->toU2022State
, 0, sizeof(ISO2022State
));
637 myConverterData
->key
= 0;
638 myConverterData
->isEmptySegment
= FALSE
;
640 if(choice
!=UCNV_RESET_TO_UNICODE
) {
641 uprv_memset(&myConverterData
->fromU2022State
, 0, sizeof(ISO2022State
));
643 #ifdef U_ENABLE_GENERIC_ISO_2022
644 if(myConverterData
->locale
[0] == 0){
645 if(choice
<=UCNV_RESET_TO_UNICODE
) {
646 myConverterData
->isFirstBuffer
= TRUE
;
647 myConverterData
->key
= 0;
648 if (converter
->mode
== UCNV_SO
){
649 ucnv_close (myConverterData
->currentConverter
);
650 myConverterData
->currentConverter
=NULL
;
652 converter
->mode
= UCNV_SI
;
654 if(choice
!=UCNV_RESET_TO_UNICODE
) {
655 /* re-append UTF-8 escape sequence */
656 converter
->charErrorBufferLength
= 3;
657 converter
->charErrorBuffer
[0] = 0x1b;
658 converter
->charErrorBuffer
[1] = 0x28;
659 converter
->charErrorBuffer
[2] = 0x42;
665 /* reset the state variables */
666 if(myConverterData
->locale
[0] == 'k'){
667 if(choice
<=UCNV_RESET_TO_UNICODE
) {
668 setInitialStateToUnicodeKR(converter
, myConverterData
);
670 if(choice
!=UCNV_RESET_TO_UNICODE
) {
671 setInitialStateFromUnicodeKR(converter
, myConverterData
);
678 _ISO2022getName(const UConverter
* cnv
){
680 UConverterDataISO2022
* myData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
687 /*************** to unicode *******************/
688 /****************************************************************************
689 * Recognized escape sequences are
701 static const int8_t nextStateToUnicodeJP
[MAX_STATES_2022
]= {
702 /* 0 1 2 3 4 5 6 7 8 9 */
703 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
704 ,ASCII
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,JISX201
,HWKANA_7BIT
,JISX201
,INVALID_STATE
705 ,INVALID_STATE
,INVALID_STATE
,JISX208
,GB2312
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
706 ,ISO8859_1
,ISO8859_7
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,KSC5601
,JISX212
,INVALID_STATE
707 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
708 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
709 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
710 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
713 /*************** to unicode *******************/
714 static const int8_t nextStateToUnicodeCN
[MAX_STATES_2022
]= {
715 /* 0 1 2 3 4 5 6 7 8 9 */
716 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,SS3_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
717 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
718 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
719 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
720 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,GB2312_1
,INVALID_STATE
,ISO_IR_165
721 ,CNS_11643_1
,CNS_11643_2
,CNS_11643_3
,CNS_11643_4
,CNS_11643_5
,CNS_11643_6
,CNS_11643_7
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
722 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
723 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
727 static UCNV_TableStates_2022
728 getKey_2022(char c
,int32_t* key
,int32_t* offset
){
731 int32_t hi
= MAX_STATES_2022
;
734 togo
= normalize_esq_chars_2022
[(uint8_t)c
];
736 /* not a valid character anywhere in an escape sequence */
741 togo
= (*key
<< 5) + togo
;
743 while (hi
!= low
) /*binary search*/{
745 register int32_t mid
= (hi
+low
) >> 1; /*Finds median*/
750 if (escSeqStateTable_Key_2022
[mid
] > togo
){
753 else if (escSeqStateTable_Key_2022
[mid
] < togo
){
756 else /*we found it*/{
759 return (UCNV_TableStates_2022
)escSeqStateTable_Value_2022
[mid
];
770 /*runs through a state machine to determine the escape sequence - codepage correspondence
773 changeState_2022(UConverter
* _this
,
775 const char* sourceLimit
,
778 UCNV_TableStates_2022 value
;
779 UConverterDataISO2022
* myData2022
= ((UConverterDataISO2022
*)_this
->extraInfo
);
780 uint32_t key
= myData2022
->key
;
782 int8_t initialToULength
= _this
->toULength
;
785 value
= VALID_NON_TERMINAL_2022
;
786 while (*source
< sourceLimit
) {
788 _this
->toUBytes
[_this
->toULength
++]=(uint8_t)c
;
789 value
= getKey_2022(c
,(int32_t *) &key
, &offset
);
793 case VALID_NON_TERMINAL_2022
:
794 /* continue with the loop */
797 case VALID_TERMINAL_2022
:
804 case VALID_MAYBE_TERMINAL_2022
:
805 #ifdef U_ENABLE_GENERIC_ISO_2022
806 /* ESC ( B is ambiguous only for ISO_2022 itself */
807 if(var
== ISO_2022
) {
808 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
809 _this
->toULength
= 0;
811 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
813 /* continue with the loop */
814 value
= VALID_NON_TERMINAL_2022
;
819 /* not ISO_2022 itself, finish here */
820 value
= VALID_TERMINAL_2022
;
828 myData2022
->key
= key
;
830 if (value
== VALID_NON_TERMINAL_2022
) {
831 /* indicate that the escape sequence is incomplete: key!=0 */
833 } else if (value
== INVALID_2022
) {
834 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
835 } else /* value == VALID_TERMINAL_2022 */ {
837 #ifdef U_ENABLE_GENERIC_ISO_2022
840 const char *chosenConverterName
= escSeqStateTable_Result_2022
[offset
];
841 if(chosenConverterName
== NULL
) {
843 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
844 _this
->toUCallbackReason
= UCNV_UNASSIGNED
;
848 _this
->mode
= UCNV_SI
;
849 ucnv_close(myData2022
->currentConverter
);
850 myData2022
->currentConverter
= myUConverter
= ucnv_open(chosenConverterName
, err
);
851 if(U_SUCCESS(*err
)) {
852 myUConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
853 _this
->mode
= UCNV_SO
;
860 StateEnum tempState
=(StateEnum
)nextStateToUnicodeJP
[offset
];
863 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
866 if(myData2022
->toU2022State
.cs
[2]!=0) {
867 if(myData2022
->toU2022State
.g
<2) {
868 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
870 myData2022
->toU2022State
.g
=2;
872 /* illegal to have SS2 before a matching designator */
873 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
876 /* case SS3_STATE: not used in ISO-2022-JP-x */
879 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
880 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
882 /* G2 charset for SS2 */
883 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
887 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
888 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
891 myData2022
->toU2022State
.cs
[0]=(int8_t)tempState
;
899 StateEnum tempState
=(StateEnum
)nextStateToUnicodeCN
[offset
];
902 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
905 if(myData2022
->toU2022State
.cs
[2]!=0) {
906 if(myData2022
->toU2022State
.g
<2) {
907 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
909 myData2022
->toU2022State
.g
=2;
911 /* illegal to have SS2 before a matching designator */
912 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
916 if(myData2022
->toU2022State
.cs
[3]!=0) {
917 if(myData2022
->toU2022State
.g
<2) {
918 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
920 myData2022
->toU2022State
.g
=3;
922 /* illegal to have SS3 before a matching designator */
923 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
927 if(myData2022
->version
==0) {
928 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
935 myData2022
->toU2022State
.cs
[1]=(int8_t)tempState
;
938 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
941 /* other CNS 11643 planes */
942 if(myData2022
->version
==0) {
943 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
945 myData2022
->toU2022State
.cs
[3]=(int8_t)tempState
;
953 /* nothing to be done, just accept this one escape sequence */
955 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
960 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
964 if(U_SUCCESS(*err
)) {
965 _this
->toULength
= 0;
966 } else if(*err
==U_ILLEGAL_ESCAPE_SEQUENCE
) {
967 if(_this
->toULength
>1) {
969 * Ticket 5691: consistent illegal sequences:
970 * - We include at least the first byte (ESC) in the illegal sequence.
971 * - If any of the non-initial bytes could be the start of a character,
972 * we stop the illegal sequence before the first one of those.
973 * In escape sequences, all following bytes are "printable", that is,
974 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
975 * they are valid single/lead bytes.
976 * For simplicity, we always only report the initial ESC byte as the
977 * illegal sequence and back out all other bytes we looked at.
979 /* Back out some bytes. */
980 int8_t backOutDistance
=_this
->toULength
-1;
981 int8_t bytesFromThisBuffer
=_this
->toULength
-initialToULength
;
982 if(backOutDistance
<=bytesFromThisBuffer
) {
983 /* same as initialToULength<=1 */
984 *source
-=backOutDistance
;
986 /* Back out bytes from the previous buffer: Need to replay them. */
987 _this
->preToULength
=(int8_t)(bytesFromThisBuffer
-backOutDistance
);
988 /* same as -(initialToULength-1) */
989 /* preToULength is negative! */
990 uprv_memcpy(_this
->preToU
, _this
->toUBytes
+1, -_this
->preToULength
);
991 *source
-=bytesFromThisBuffer
;
995 } else if(*err
==U_UNSUPPORTED_ESCAPE_SEQUENCE
) {
996 _this
->toUCallbackReason
= UCNV_UNASSIGNED
;
1000 /*Checks the characters of the buffer against valid 2022 escape sequences
1001 *if they match, we return a pointer to the initial start of the sequence; otherwise
1002 *we return sourceLimit
1004 /*for 2022 looks ahead in the stream
1005 *to determine the longest possible convertible
1008 static U_INLINE
const char*
1009 getEndOfBuffer_2022(const char** source
,
1010 const char* sourceLimit
,
1013 const char* mySource
= *source
;
1015 #ifdef U_ENABLE_GENERIC_ISO_2022
1016 if (*source
>= sourceLimit
)
1021 if (*mySource
== ESC_2022
){
1025 UCNV_TableStates_2022 value
= VALID_NON_TERMINAL_2022
;
1027 /* Kludge: I could not
1028 * figure out the reason for validating an escape sequence
1029 * twice - once here and once in changeState_2022().
1030 * is it possible to have an ESC character in a ISO2022
1031 * byte stream which is valid in a code page? Is it legal?
1034 (mySource
+i
< sourceLimit
)&&(value
== VALID_NON_TERMINAL_2022
);
1036 value
= getKey_2022(*(mySource
+i
), &key
, &offset
);
1038 if (value
> 0 || *mySource
==ESC_2022
)
1041 if ((value
== VALID_NON_TERMINAL_2022
)&&(!flush
) )
1044 }while (++mySource
< sourceLimit
);
1048 while(mySource
< sourceLimit
&& *mySource
!= ESC_2022
) {
1056 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1057 * any future change in _MBCSFromUChar32() function should be reflected here.
1058 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1060 static U_INLINE
int32_t
1061 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData
* sharedData
,
1068 const uint16_t *table
;
1069 uint32_t stage2Entry
;
1074 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1075 * Use internal version of ucnv_open() that verifies that the new structures are available,
1076 * else U_INTERNAL_PROGRAM_ERROR.
1078 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1079 if(c
<0x10000 || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1080 table
=sharedData
->mbcs
.fromUnicodeTable
;
1081 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
1082 /* get the bytes and the length for the output */
1083 if(outputType
==MBCS_OUTPUT_2
){
1084 myValue
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1090 } else /* outputType==MBCS_OUTPUT_3 */ {
1091 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1092 myValue
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
1095 } else if(myValue
<=0xffff) {
1101 /* is this code point assigned, or do we use fallbacks? */
1102 if((stage2Entry
&(1<<(16+(c
&0xf))))!=0) {
1106 } else if(FROM_U_USE_FALLBACK(useFallback
, c
) && myValue
!=0) {
1108 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1109 * There is no way with this data structure for fallback output
1110 * to be a zero byte.
1117 cx
=sharedData
->mbcs
.extIndexes
;
1119 return ucnv_extSimpleMatchFromU(cx
, c
, value
, useFallback
);
1126 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1127 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1128 * @param retval pointer to output byte
1129 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1131 static U_INLINE
int32_t
1132 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData
* sharedData
,
1137 const uint16_t *table
;
1139 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1140 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1143 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1144 table
=sharedData
->mbcs
.fromUnicodeTable
;
1145 /* get the byte for the output */
1146 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
1147 /* is this code point assigned, or do we use fallbacks? */
1148 *retval
=(uint32_t)(value
&0xff);
1150 return 1; /* roundtrip */
1151 } else if(useFallback
? value
>=0x800 : value
>=0xc00) {
1152 return -1; /* fallback taken */
1154 return 0; /* no mapping */
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 *
 * @param value candidate double-byte code: lead byte in bits 15..8,
 *              trail byte in bits 7..0
 * @return the byte pair shifted down into 21..7E, or 0 if out of range
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /*
     * The 16-bit comparison bounds the lead byte (and rejects pairs below
     * A1A1 or above FEFE); the 8-bit comparison bounds the trail byte.
     * Unsigned wrap-around makes each "x - lo <= hi - lo" a range check.
     */
    if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080; /* shift down to 21..7e byte range */
    } else {
        return 0; /* not valid for ISO 2022 */
    }
}
1175 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1177 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1178 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1181 static U_INLINE
uint32_t
1182 _2022ToGR94DBCS(uint32_t value
) {
1183 uint32_t returnValue
= value
+ 0x8080;
1184 if( (uint16_t)(returnValue
- 0xa1a1) <= (0xfefe - 0xa1a1) &&
1185 (uint8_t)(returnValue
- 0xa1) <= (0xfe - 0xa1)) {
1193 #ifdef U_ENABLE_GENERIC_ISO_2022
1195 /**********************************************************************************
1196 * ISO-2022 Converter
1202 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
,
1204 const char* mySourceLimit
, *realSourceLimit
;
1205 const char* sourceStart
;
1206 const UChar
* myTargetStart
;
1207 UConverter
* saveThis
;
1208 UConverterDataISO2022
* myData
;
1211 saveThis
= args
->converter
;
1212 myData
=((UConverterDataISO2022
*)(saveThis
->extraInfo
));
1214 realSourceLimit
= args
->sourceLimit
;
1215 while (args
->source
< realSourceLimit
) {
1216 if(myData
->key
== 0) { /* are we in the middle of an escape sequence? */
1217 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1218 mySourceLimit
= getEndOfBuffer_2022(&(args
->source
), realSourceLimit
, args
->flush
);
1220 if(args
->source
< mySourceLimit
) {
1221 if(myData
->currentConverter
==NULL
) {
1222 myData
->currentConverter
= ucnv_open("ASCII",err
);
1223 if(U_FAILURE(*err
)){
1227 myData
->currentConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
1228 saveThis
->mode
= UCNV_SO
;
1231 /* convert to before the ESC or until the end of the buffer */
1232 myData
->isFirstBuffer
=FALSE
;
1233 sourceStart
= args
->source
;
1234 myTargetStart
= args
->target
;
1235 args
->converter
= myData
->currentConverter
;
1236 ucnv_toUnicode(args
->converter
,
1242 (UBool
)(args
->flush
&& mySourceLimit
== realSourceLimit
),
1244 args
->converter
= saveThis
;
1246 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
1247 /* move the overflow buffer */
1248 length
= saveThis
->UCharErrorBufferLength
= myData
->currentConverter
->UCharErrorBufferLength
;
1249 myData
->currentConverter
->UCharErrorBufferLength
= 0;
1251 uprv_memcpy(saveThis
->UCharErrorBuffer
,
1252 myData
->currentConverter
->UCharErrorBuffer
,
1253 length
*U_SIZEOF_UCHAR
);
1260 * -Error while converting
1261 * -Done with entire buffer
1262 * -Need to write offsets or update the current offset
1263 * (leave that up to the code in ucnv.c)
1265 * or else we just stopped at an ESC byte and continue with changeState_2022()
1267 if (U_FAILURE(*err
) ||
1268 (args
->source
== realSourceLimit
) ||
1269 (args
->offsets
!= NULL
&& (args
->target
!= myTargetStart
|| args
->source
!= sourceStart
) ||
1270 (mySourceLimit
< realSourceLimit
&& myData
->currentConverter
->toULength
> 0))
1272 /* copy partial or error input for truncated detection and error handling */
1273 if(U_FAILURE(*err
)) {
1274 length
= saveThis
->invalidCharLength
= myData
->currentConverter
->invalidCharLength
;
1276 uprv_memcpy(saveThis
->invalidCharBuffer
, myData
->currentConverter
->invalidCharBuffer
, length
);
1279 length
= saveThis
->toULength
= myData
->currentConverter
->toULength
;
1281 uprv_memcpy(saveThis
->toUBytes
, myData
->currentConverter
->toUBytes
, length
);
1282 if(args
->source
< mySourceLimit
) {
1283 *err
= U_TRUNCATED_CHAR_FOUND
; /* truncated input before ESC */
1292 sourceStart
= args
->source
;
1293 changeState_2022(args
->converter
,
1298 if (U_FAILURE(*err
) || (args
->source
!= sourceStart
&& args
->offsets
!= NULL
)) {
1299 /* let the ucnv.c code update its current offset */
1308 * To Unicode Callback helper function
1311 toUnicodeCallback(UConverter
*cnv
,
1312 const uint32_t sourceChar
, const uint32_t targetUniChar
,
1314 if(sourceChar
>0xff){
1315 cnv
->toUBytes
[0] = (uint8_t)(sourceChar
>>8);
1316 cnv
->toUBytes
[1] = (uint8_t)sourceChar
;
1320 cnv
->toUBytes
[0] =(char) sourceChar
;
1324 if(targetUniChar
== (missingCharMarker
-1/*0xfffe*/)){
1325 *err
= U_INVALID_CHAR_FOUND
;
1328 *err
= U_ILLEGAL_CHAR_FOUND
;
1332 /**************************************ISO-2022-JP*************************************************/
1334 /************************************** IMPORTANT **************************************************
1335 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1336 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1337 * The converter iterates over each Unicode codepoint
1338 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1339 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1340 * would do as far as possible.
1342 * If the implementation of these macros or structure of sharedData struct change in the future, make
1343 * sure that ISO-2022 is also changed.
1344 ***************************************************************************************************
1347 /***************************************************************************************************
1348 * Rules for ISO-2022-jp encoding
1349 * (i) Escape sequences must be fully contained within a line they should not
1350 * span new lines or CRs
1351 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1352 * JIS-Roman character escape sequence should follow before the line terminates
1353 * (iii) If the first character on the line is represented by two bytes then a two
1354 * byte character escape sequence should precede it
1355 * (iv) If no escape sequence is encountered then the characters are ASCII
1356 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1357 * and invoked with SS2 (ESC N).
1358 * (vi) If there is any G0 designation in text, there must be a switch to
1359 * ASCII or to JIS X 0201-Roman before a space character (but not
1360 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1361 * characters such as tab or CRLF.
1362 * (vii) Supported encodings:
1363 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1367 * JISX201, JISX208,JISX212 : new .cnv data files created
1368 * KSC5601 : alias to ibm-949 mapping table
1369 * GB2312 : alias to ibm-1386 mapping table
1370 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1371 * ISO-8859-7 : alias to ibm-9409 mapping table
1374 /* preference order of JP charsets */
1375 static const StateEnum jpCharsetPref
[]={
/*
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 *
 * Each row is a NUL-terminated designation sequence; the matching entry of
 * escSeqCharsLen[] below holds its length without the terminator.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};

/* Lengths of the sequences in escSeqChars[], in the same order. */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1416 * The iteration over various code pages works this way:
1417 * i) Get the currentState from myConverterData->currentState
1418 * ii) Check if the character is mapped to a valid character in the currentState
1419 * Yes -> a) set the initIterState to currentState
1420 * b) remain in this state until an invalid character is found
1421 * No -> a) go to the next code page and find the character
1422 * iii) Before changing the state increment the current state check if the current state
1423 * is equal to the initIterState
1424 * Yes -> A character that cannot be represented in any of the supported encodings
1425 * break and return a U_INVALID_CHARACTER error
1426 * No -> Continue and find the character in next code page
1429 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 *
 * JIS X 0201 Roman matches ASCII except for two positions:
 * 0x5c is YEN SIGN (U+00A5) and 0x7e is OVERLINE (U+203E).
 *
 * @param value byte value; callers pass 00..7F (other values are returned
 *              unchanged, no range check is made here)
 * @return the Unicode code point
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    if(value < 0x5c) {
        return value;
    } else if(value == 0x5c) {
        return 0xa5;
    } else if(value == 0x7e) {
        return 0x203e;
    } else /* value <= 0x7f */ {
        return value;
    }
}
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 *
 * U+005C and U+007E have no JIS X 0201 Roman equivalent because their byte
 * positions are taken by YEN SIGN (U+00A5 -> 0x5c) and OVERLINE (U+203E -> 0x7e).
 *
 * @param value Unicode code point
 * @return the byte 00..7F, or 0xfffe if there is no mapping
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
        /* 0x5c and 0x7e fall through to "unmappable" */
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 *
 * @param value Shift-JIS double-byte code: lead byte in bits 15..8,
 *              trail byte in bits 7..0; assumed to be a valid Shift-JIS pair
 *              (only the upper bound is checked here)
 * @return the JIS X 0208 byte pair (each byte 21..7E), or 0 if beyond JIS X 0208
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /*
     * Each Shift-JIS lead byte covers two JIS rows. Rebase the lead byte
     * (81..9F or E0..EF) and double it to get the even row of the pair,
     * kept in bits 15..8.
     */
    value &= 0xff00;  /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    value <<= 1;

    if(trail <= 0x9e) {
        /* first half of the lead byte's range: the odd row (one lower) */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            /* Shift-JIS skips trail byte 7F */
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        value |= trail - 0x7e;
    }
    return value;
}
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first (row) byte of the JIS X 0208 pair
 * @param c2    second (cell) byte of the JIS X 0208 pair
 * @param bytes receives the Shift-JIS lead byte in bytes[0] and the trail
 *              byte in bytes[1]; a byte detected as invalid is written as 0
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: first half of the Shift-JIS trail byte range (skips 7F) */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail byte range */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    /* two JIS rows share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1535 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1537 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1538 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1539 * These were the only fallbacks in ICU's jisx-208.ucm file.
1541 static const uint16_t hwkana_fb
[HWKANA_END
- HWKANA_START
+ 1] = {
1542 0x2123, /* U+FF61 */
1557 0x213C, /* U+FF70 */
1573 0x253F, /* U+FF80 */
1589 0x255F, /* U+FF90 */
1608 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
1609 UConverter
*cnv
= args
->converter
;
1610 UConverterDataISO2022
*converterData
;
1611 ISO2022State
*pFromU2022State
;
1612 uint8_t *target
= (uint8_t *) args
->target
;
1613 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
1614 const UChar
* source
= args
->source
;
1615 const UChar
* sourceLimit
= args
->sourceLimit
;
1616 int32_t* offsets
= args
->offsets
;
1619 int32_t len
, outLen
;
1621 int32_t choiceCount
;
1622 uint32_t targetValue
= 0;
1628 /* set up the state */
1629 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
1630 pFromU2022State
= &converterData
->fromU2022State
;
1634 /* check if the last codepoint of previous buffer was a lead surrogate*/
1635 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
1639 while(source
< sourceLimit
) {
1640 if(target
< targetLimit
) {
1642 sourceChar
= *(source
++);
1643 /*check if the char is a First surrogate*/
1644 if(UTF_IS_SURROGATE(sourceChar
)) {
1645 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
1647 /*look ahead to find the trail surrogate*/
1648 if(source
< sourceLimit
) {
1649 /* test the following code unit */
1650 UChar trail
=(UChar
) *source
;
1651 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1653 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
1654 cnv
->fromUChar32
=0x00;
1655 /* convert this supplementary code point */
1656 /* exit this condition tree */
1658 /* this is an unmatched lead code unit (1st surrogate) */
1659 /* callback(illegal) */
1660 *err
=U_ILLEGAL_CHAR_FOUND
;
1661 cnv
->fromUChar32
=sourceChar
;
1666 cnv
->fromUChar32
=sourceChar
;
1670 /* this is an unmatched trail code unit (2nd surrogate) */
1671 /* callback(illegal) */
1672 *err
=U_ILLEGAL_CHAR_FOUND
;
1673 cnv
->fromUChar32
=sourceChar
;
1678 /* do not convert SO/SI/ESC */
1679 if(IS_2022_CONTROL(sourceChar
)) {
1680 /* callback(illegal) */
1681 *err
=U_ILLEGAL_CHAR_FOUND
;
1682 cnv
->fromUChar32
=sourceChar
;
1686 /* do the conversion */
1688 if(choiceCount
== 0) {
1692 * The csm variable keeps track of which charsets are allowed
1693 * and not used yet while building the choices[].
1695 csm
= jpCharsetMasks
[converterData
->version
];
1698 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1699 if(converterData
->version
== 3 || converterData
->version
== 4) {
1700 choices
[choiceCount
++] = (int8_t)HWKANA_7BIT
;
1702 /* Do not try single-byte half-width Katakana for other versions. */
1703 csm
&= ~CSM(HWKANA_7BIT
);
1705 /* try the current G0 charset */
1706 choices
[choiceCount
++] = cs
= pFromU2022State
->cs
[0];
1709 /* try the current G2 charset */
1710 if((cs
= pFromU2022State
->cs
[2]) != 0) {
1711 choices
[choiceCount
++] = cs
;
1715 /* try all the other possible charsets */
1716 for(i
= 0; i
< LENGTHOF(jpCharsetPref
); ++i
) {
1717 cs
= (int8_t)jpCharsetPref
[i
];
1719 choices
[choiceCount
++] = cs
;
1727 * len==0: no mapping found yet
1728 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1729 * len>0: found a roundtrip result, done
1733 * We will turn off useFallback after finding a fallback,
1734 * but we still get fallbacks from PUA code points as usual.
1735 * Therefore, we will also need to check that we don't overwrite
1736 * an early fallback with a later one.
1738 useFallback
= cnv
->useFallback
;
1740 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
1743 int8_t cs0
= choices
[i
];
1746 if(sourceChar
<= 0x7f) {
1747 targetValue
= (uint32_t)sourceChar
;
1754 if(GR96_START
<= sourceChar
&& sourceChar
<= GR96_END
) {
1755 targetValue
= (uint32_t)sourceChar
- 0x80;
1762 if((uint32_t)(sourceChar
- HWKANA_START
) <= (HWKANA_END
- HWKANA_START
)) {
1763 if(converterData
->version
==3) {
1764 /* JIS7: use G1 (SO) */
1765 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1766 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0x21));
1768 pFromU2022State
->cs
[1] = cs
= cs0
; /* do not output an escape sequence */
1770 } else if(converterData
->version
==4) {
1771 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1772 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1773 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0xa1));
1776 cs
= pFromU2022State
->cs
[0];
1777 if(IS_JP_DBCS(cs
)) {
1778 /* switch from a DBCS charset to JISX201 */
1779 cs
= (int8_t)JISX201
;
1781 /* else stay in the current G0 charset */
1784 /* else do not use HWKANA_7BIT with other versions */
1789 value
= jisx201FromU(sourceChar
);
1791 targetValue
= value
;
1795 useFallback
= FALSE
;
1799 /* G0 DBCS from Shift-JIS table */
1800 len2
= MBCS_FROM_UCHAR32_ISO2022(
1801 converterData
->myConverterArray
[cs0
],
1803 useFallback
, MBCS_OUTPUT_2
);
1804 if(len2
== 2 || (len2
== -2 && len
== 0)) { /* only accept DBCS: abs(len)==2 */
1805 value
= _2022FromSJIS(value
);
1807 targetValue
= value
;
1811 useFallback
= FALSE
;
1813 } else if(len
== 0 && useFallback
&&
1814 (uint32_t)(sourceChar
- HWKANA_START
) <= (HWKANA_END
- HWKANA_START
)) {
1815 targetValue
= hwkana_fb
[sourceChar
- HWKANA_START
];
1819 useFallback
= FALSE
;
1823 /* G0 SBCS forced to 7-bit output */
1824 len2
= MBCS_SINGLE_FROM_UCHAR32(
1825 converterData
->myConverterArray
[cs0
],
1828 if(len2
!= 0 && !(len2
< 0 && len
!= 0) && GR96_START
<= value
&& value
<= GR96_END
) {
1829 targetValue
= value
- 0x80;
1833 useFallback
= FALSE
;
1838 len2
= MBCS_FROM_UCHAR32_ISO2022(
1839 converterData
->myConverterArray
[cs0
],
1841 useFallback
, MBCS_OUTPUT_2
);
1842 if(len2
== 2 || (len2
== -2 && len
== 0)) { /* only accept DBCS: abs(len)==2 */
1843 if(cs0
== KSC5601
) {
1845 * Check for valid bytes for the encoding scheme.
1846 * This is necessary because the sub-converter (windows-949)
1847 * has a broader encoding scheme than is valid for 2022.
1849 value
= _2022FromGR94DBCS(value
);
1854 targetValue
= value
;
1858 useFallback
= FALSE
;
1866 len
= -len
; /* fallback */
1868 outLen
= 0; /* count output bytes */
1870 /* write SI if necessary (only for JIS7) */
1871 if(pFromU2022State
->g
== 1 && g
== 0) {
1872 buffer
[outLen
++] = UCNV_SI
;
1873 pFromU2022State
->g
= 0;
1876 /* write the designation sequence if necessary */
1877 if(cs
!= pFromU2022State
->cs
[g
]) {
1878 int32_t escLen
= escSeqCharsLen
[cs
];
1879 uprv_memcpy(buffer
+ outLen
, escSeqChars
[cs
], escLen
);
1881 pFromU2022State
->cs
[g
] = cs
;
1883 /* invalidate the choices[] */
1887 /* write the shift sequence if necessary */
1888 if(g
!= pFromU2022State
->g
) {
1890 /* case 0 handled before writing escapes */
1892 buffer
[outLen
++] = UCNV_SO
;
1893 pFromU2022State
->g
= 1;
1895 default: /* case 2 */
1896 buffer
[outLen
++] = 0x1b;
1897 buffer
[outLen
++] = 0x4e;
1899 /* no case 3: no SS3 in ISO-2022-JP-x */
1903 /* write the output bytes */
1905 buffer
[outLen
++] = (char)targetValue
;
1906 } else /* len == 2 */ {
1907 buffer
[outLen
++] = (char)(targetValue
>> 8);
1908 buffer
[outLen
++] = (char)targetValue
;
1912 * if we cannot find the character after checking all codepages
1913 * then this is an error
1915 *err
= U_INVALID_CHAR_FOUND
;
1916 cnv
->fromUChar32
=sourceChar
;
1920 if(sourceChar
== CR
|| sourceChar
== LF
) {
1921 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1922 pFromU2022State
->cs
[2] = 0;
1926 /* output outLen>0 bytes in buffer[] */
1928 *target
++ = buffer
[0];
1930 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
1932 } else if(outLen
== 2 && (target
+ 2) <= targetLimit
) {
1933 *target
++ = buffer
[0];
1934 *target
++ = buffer
[1];
1936 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
1937 *offsets
++ = sourceIndex
;
1938 *offsets
++ = sourceIndex
;
1944 &target
, (const char *)targetLimit
,
1945 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
1947 if(U_FAILURE(*err
)) {
1951 } /* end if(myTargetIndex<myTargetLength) */
1953 *err
=U_BUFFER_OVERFLOW_ERROR
;
1957 }/* end while(mySourceIndex<mySourceLength) */
1960 * the end of the input stream and detection of truncated input
1961 * are handled by the framework, but for ISO-2022-JP conversion
1962 * we need to be in ASCII mode at the very end
1966 * in SO mode or not in ASCII mode
1967 * end of input and no truncated input
1969 if( U_SUCCESS(*err
) &&
1970 (pFromU2022State
->g
!=0 || pFromU2022State
->cs
[0]!=ASCII
) &&
1971 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
1973 int32_t sourceIndex
;
1977 if(pFromU2022State
->g
!= 0) {
1978 buffer
[outLen
++] = UCNV_SI
;
1979 pFromU2022State
->g
= 0;
1982 if(pFromU2022State
->cs
[0] != ASCII
) {
1983 int32_t escLen
= escSeqCharsLen
[ASCII
];
1984 uprv_memcpy(buffer
+ outLen
, escSeqChars
[ASCII
], escLen
);
1986 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
1989 /* get the source index of the last input character */
1991 * TODO this would be simpler and more reliable if we used a pair
1992 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1993 * so that we could simply use the prevSourceIndex here;
1994 * this code gives an incorrect result for the rare case of an unmatched
1995 * trail surrogate that is alone in the last buffer of the text stream
1997 sourceIndex
=(int32_t)(source
-args
->source
);
2000 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2001 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2012 &target
, (const char *)targetLimit
,
2013 &offsets
, sourceIndex
,
2017 /*save the state and return */
2018 args
->source
= source
;
2019 args
->target
= (char*)target
;
2022 /*************** to unicode *******************/
2025 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2028 const char *mySource
= (char *) args
->source
;
2029 UChar
*myTarget
= args
->target
;
2030 const char *mySourceLimit
= args
->sourceLimit
;
2031 uint32_t targetUniChar
= 0x0000;
2032 uint32_t mySourceChar
= 0x0000;
2033 uint32_t tmpSourceChar
= 0x0000;
2034 UConverterDataISO2022
* myData
;
2035 ISO2022State
*pToU2022State
;
2038 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2039 pToU2022State
= &myData
->toU2022State
;
2041 if(myData
->key
!= 0) {
2042 /* continue with a partial escape sequence */
2044 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2045 /* continue with a partial double-byte character */
2046 mySourceChar
= args
->converter
->toUBytes
[0];
2047 args
->converter
->toULength
= 0;
2048 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2049 targetUniChar
= missingCharMarker
;
2053 while(mySource
< mySourceLimit
){
2055 targetUniChar
=missingCharMarker
;
2057 if(myTarget
< args
->targetLimit
){
2059 mySourceChar
= (unsigned char) *mySource
++;
2061 switch(mySourceChar
) {
2063 if(myData
->version
==3) {
2067 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2068 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
2073 if(myData
->version
==3) {
2074 /* JIS7: switch to G1 half-width Katakana */
2075 pToU2022State
->cs
[1] = (int8_t)HWKANA_7BIT
;
2079 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2080 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
2088 const char * mySourceBefore
= mySource
;
2089 int8_t toULengthBefore
= args
->converter
->toULength
;
2091 changeState_2022(args
->converter
,&(mySource
),
2092 mySourceLimit
, ISO_2022_JP
,err
);
2094 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2095 if(myData
->version
==0 && myData
->key
==0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
2096 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
2097 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
2098 args
->converter
->toULength
= (int8_t)(toULengthBefore
+ (mySource
- mySourceBefore
));
2102 /* invalid or illegal escape sequence */
2103 if(U_FAILURE(*err
)){
2104 args
->target
= myTarget
;
2105 args
->source
= mySource
;
2106 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
2109 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2110 if(myData
->key
==0) {
2111 myData
->isEmptySegment
= TRUE
;
2115 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2120 /* automatically reset to single-byte mode */
2121 if((StateEnum
)pToU2022State
->cs
[0] != ASCII
&& (StateEnum
)pToU2022State
->cs
[0] != JISX201
) {
2122 pToU2022State
->cs
[0] = (int8_t)ASCII
;
2124 pToU2022State
->cs
[2] = 0;
2125 pToU2022State
->g
= 0;
2128 /* convert one or two bytes */
2129 myData
->isEmptySegment
= FALSE
;
2130 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2131 if( (uint8_t)(mySourceChar
- 0xa1) <= (0xdf - 0xa1) && myData
->version
==4 &&
2134 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2135 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0xa1);
2137 /* return from a single-shift state to the previous one */
2138 if(pToU2022State
->g
>= 2) {
2139 pToU2022State
->g
=pToU2022State
->prevG
;
2143 if(mySourceChar
<= 0x7f) {
2144 targetUniChar
= mySourceChar
;
2148 if(mySourceChar
<= 0x7f) {
2149 targetUniChar
= mySourceChar
+ 0x80;
2151 /* return from a single-shift state to the previous one */
2152 pToU2022State
->g
=pToU2022State
->prevG
;
2155 if(mySourceChar
<= 0x7f) {
2156 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2158 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2159 myData
->myConverterArray
[cs
],
2160 mySourceChar
+ 0x80);
2162 /* return from a single-shift state to the previous one */
2163 pToU2022State
->g
=pToU2022State
->prevG
;
2166 if(mySourceChar
<= 0x7f) {
2167 targetUniChar
= jisx201ToU(mySourceChar
);
2171 if((uint8_t)(mySourceChar
- 0x21) <= (0x5f - 0x21)) {
2172 /* 7-bit halfwidth Katakana */
2173 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0x21);
2178 if(mySource
< mySourceLimit
) {
2179 int leadIsOk
, trailIsOk
;
2182 trailByte
= (uint8_t)*mySource
;
2184 * Ticket 5691: consistent illegal sequences:
2185 * - We include at least the first byte in the illegal sequence.
2186 * - If any of the non-initial bytes could be the start of a character,
2187 * we stop the illegal sequence before the first one of those.
2189 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2190 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2191 * Otherwise we convert or report the pair of bytes.
2193 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
2194 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
2195 if (leadIsOk
&& trailIsOk
) {
2197 tmpSourceChar
= (mySourceChar
<< 8) | trailByte
;
2199 _2022ToSJIS((uint8_t)mySourceChar
, trailByte
, tempBuf
);
2200 mySourceChar
= tmpSourceChar
;
2202 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2203 mySourceChar
= tmpSourceChar
;
2204 if (cs
== KSC5601
) {
2205 tmpSourceChar
+= 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2207 tempBuf
[0] = (char)(tmpSourceChar
>> 8);
2208 tempBuf
[1] = (char)(tmpSourceChar
);
2210 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->myConverterArray
[cs
], tempBuf
, 2, FALSE
);
2211 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
2212 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2214 /* add another bit so that the code below writes 2 bytes in case of error */
2215 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
2218 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2219 args
->converter
->toULength
= 1;
2222 } /* End of inner switch */
2224 } /* End of outer switch */
2225 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
2227 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2229 *(myTarget
++)=(UChar
)targetUniChar
;
2231 else if(targetUniChar
> missingCharMarker
){
2232 /* disassemble the surrogate pair and write to output*/
2233 targetUniChar
-=0x0010000;
2234 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
2236 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2239 if(myTarget
< args
->targetLimit
){
2240 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2242 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2246 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
2247 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2252 /* Call the callback function*/
2253 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2257 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2258 *err
=U_BUFFER_OVERFLOW_ERROR
;
2263 args
->target
= myTarget
;
2264 args
->source
= mySource
;
2268 /***************************************************************
2269 * Rules for ISO-2022-KR encoding
2270 * i) The KSC5601 designator sequence should appear only once in a file,
2271 * at the beginning of a line before any KSC5601 characters. This usually
2272 * means that it appears by itself on the first line of the file
2273 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2274 * and SI to shift into single byte mode
/*
 * ISO-2022-KR fromUnicode for converter version 1 (the caller's comment names
 * this the ibm-25546 case): the conversion itself is done by the MBCS code.
 * args->converter is temporarily replaced by myConverterData->currentConverter,
 * and the pending code point (fromUChar32) and any overflow bytes
 * (charErrorBuffer) are carried between the public converter and the
 * sub-converter around the call.
 */
2277 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
/* keep the public converter so that it can be put back into args at the end */
2279 UConverter
* saveConv
= args
->converter
;
2280 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*)saveConv
->extraInfo
;
/* from here on args refers to the sub-converter */
2281 args
->converter
=myConverterData
->currentConverter
;
/* hand the pending code point to the sub-converter, convert, and take it back */
2283 myConverterData
->currentConverter
->fromUChar32
= saveConv
->fromUChar32
;
2284 ucnv_MBCSFromUnicodeWithOffsets(args
,err
);
2285 saveConv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
/* on overflow, move the sub-converter's overflow bytes to the public converter */
2287 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2288 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
2290 saveConv
->charErrorBuffer
,
2291 myConverterData
->currentConverter
->charErrorBuffer
,
2292 myConverterData
->currentConverter
->charErrorBufferLength
);
2294 saveConv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
2295 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
/* restore the public converter in the caller's arguments */
2297 args
->converter
=saveConv
;
/*
 * ISO-2022-KR fromUnicode with offsets.
 *
 * For converter version 1 the work is passed to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 * Otherwise each source code unit is looked up with MBCS_FROM_UCHAR32_ISO2022()
 * in the current converter's shared data. The single-/double-byte mode is kept
 * in isTargetByteDBCS (loaded from and stored back to
 * args->converter->fromUnicodeStatus); UCNV_SO or UCNV_SI is written whenever
 * the mode changes. Double-byte results are written with 0x80 subtracted from
 * each byte. Bytes that do not fit into the target go to charErrorBuffer and
 * *err is set to U_BUFFER_OVERFLOW_ERROR. Unconvertible input is reported
 * through *err with the code point stored in fromUChar32.
 */
2301 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2303 const UChar
*source
= args
->source
;
2304 const UChar
*sourceLimit
= args
->sourceLimit
;
2305 unsigned char *target
= (unsigned char *) args
->target
;
2306 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
2307 int32_t* offsets
= args
->offsets
;
2308 uint32_t targetByteUnit
= 0x0000;
2309 UChar32 sourceChar
= 0x0000;
2310 UBool isTargetByteDBCS
;
2311 UBool oldIsTargetByteDBCS
;
2312 UConverterDataISO2022
*converterData
;
2313 UConverterSharedData
* sharedData
;
2317 converterData
=(UConverterDataISO2022
*)args
->converter
->extraInfo
;
2318 /* if the version is 1 then the user is requesting
2319 * conversion with ibm-25546 pass the arguments to
2320 * MBCS converter and return
2322 if(converterData
->version
==1){
2323 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2327 /* initialize data */
2328 sharedData
= converterData
->currentConverter
->sharedData
;
2329 useFallback
= args
->converter
->useFallback
;
2330 isTargetByteDBCS
=(UBool
)args
->converter
->fromUnicodeStatus
;
2331 oldIsTargetByteDBCS
= isTargetByteDBCS
;
/* NOTE(review): isTargetByteDBCS is loaded from fromUnicodeStatus a second time here; this repeats the assignment above */
2333 isTargetByteDBCS
= (UBool
) args
->converter
->fromUnicodeStatus
;
/* a non-zero fromUChar32 is a code point saved by an earlier call; it is only picked up if there is room in the target */
2334 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
<targetLimit
) {
2337 while(source
< sourceLimit
){
2339 targetByteUnit
= missingCharMarker
;
2341 if(target
< (unsigned char*) args
->targetLimit
){
2342 sourceChar
= *source
++;
2344 /* do not convert SO/SI/ESC */
2345 if(IS_2022_CONTROL(sourceChar
)) {
2346 /* callback(illegal) */
2347 *err
=U_ILLEGAL_CHAR_FOUND
;
2348 args
->converter
->fromUChar32
=sourceChar
;
2352 length
= MBCS_FROM_UCHAR32_ISO2022(sharedData
,sourceChar
,&targetByteUnit
,useFallback
,MBCS_OUTPUT_2
);
2354 length
= -length
; /* fallback */
2356 /* only DBCS or SBCS characters are expected*/
2357 /* DB characters with high bit set to 1 are expected */
2358 if( length
> 2 || length
==0 ||
2359 (length
== 1 && targetByteUnit
> 0x7f) ||
2361 ((uint16_t)(targetByteUnit
- 0xa1a1) > (0xfefe - 0xa1a1) ||
2362 (uint8_t)(targetByteUnit
- 0xa1) > (0xfe - 0xa1)))
2364 targetByteUnit
=missingCharMarker
;
2366 if (targetByteUnit
!= missingCharMarker
){
/* a result above 0xFF is a double-byte character; compare with the previous mode to decide on SO/SI */
2368 oldIsTargetByteDBCS
= isTargetByteDBCS
;
2369 isTargetByteDBCS
= (UBool
)(targetByteUnit
>0x00FF);
2370 /* append the shift sequence */
2371 if (oldIsTargetByteDBCS
!= isTargetByteDBCS
){
2373 if (isTargetByteDBCS
)
2374 *target
++ = UCNV_SO
;
2376 *target
++ = UCNV_SI
;
2378 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2380 /* write the targetUniChar to target */
2381 if(targetByteUnit
<= 0x00FF){
2382 if( target
< targetLimit
){
2383 *(target
++) = (unsigned char) targetByteUnit
;
2385 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2389 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
);
2390 *err
= U_BUFFER_OVERFLOW_ERROR
;
/* double-byte output: each byte is written with 0x80 subtracted; bytes that do not fit go to charErrorBuffer */
2393 if(target
< targetLimit
){
2394 *(target
++) =(unsigned char) ((targetByteUnit
>>8) -0x80);
2396 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2398 if(target
< targetLimit
){
2399 *(target
++) =(unsigned char) (targetByteUnit
-0x80);
2401 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2404 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2405 *err
= U_BUFFER_OVERFLOW_ERROR
;
2408 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) ((targetByteUnit
>>8) -0x80);
2409 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2410 *err
= U_BUFFER_OVERFLOW_ERROR
;
2416 /* oops.. the code point is unassigned
2417 * set the error and reason
2420 /*check if the char is a First surrogate*/
2421 if(UTF_IS_SURROGATE(sourceChar
)) {
2422 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2424 /*look ahead to find the trail surrogate*/
2425 if(source
< sourceLimit
) {
2426 /* test the following code unit */
2427 UChar trail
=(UChar
) *source
;
2428 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2430 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2431 *err
= U_INVALID_CHAR_FOUND
;
2432 /* convert this surrogate code point */
2433 /* exit this condition tree */
2435 /* this is an unmatched lead code unit (1st surrogate) */
2436 /* callback(illegal) */
2437 *err
=U_ILLEGAL_CHAR_FOUND
;
2441 *err
= U_ZERO_ERROR
;
2444 /* this is an unmatched trail code unit (2nd surrogate) */
2445 /* callback(illegal) */
2446 *err
=U_ILLEGAL_CHAR_FOUND
;
2449 /* callback(unassigned) for a BMP code point */
2450 *err
= U_INVALID_CHAR_FOUND
;
/* remember the offending code point in the converter */
2453 args
->converter
->fromUChar32
=sourceChar
;
2456 } /* end if(myTargetIndex<myTargetLength) */
2458 *err
=U_BUFFER_OVERFLOW_ERROR
;
2462 }/* end while(mySourceIndex<mySourceLength) */
2465 * the end of the input stream and detection of truncated input
2466 * are handled by the framework, but for ISO-2022-KR conversion
2467 * we need to be in ASCII mode at the very end
2472 * end of input and no truncated input
2474 if( U_SUCCESS(*err
) &&
2476 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2478 int32_t sourceIndex
;
2480 /* we are switching to ASCII */
2481 isTargetByteDBCS
=FALSE
;
2483 /* get the source index of the last input character */
2485 * TODO this would be simpler and more reliable if we used a pair
2486 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2487 * so that we could simply use the prevSourceIndex here;
2488 * this code gives an incorrect result for the rare case of an unmatched
2489 * trail surrogate that is alone in the last buffer of the text stream
2491 sourceIndex
=(int32_t)(source
-args
->source
);
2494 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2495 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2506 &target
, (const char *)targetLimit
,
2507 &offsets
, sourceIndex
,
2511 /*save the state and return */
2512 args
->source
= source
;
2513 args
->target
= (char*)target
;
2514 args
->converter
->fromUnicodeStatus
= (uint32_t)isTargetByteDBCS
;
2517 /************************ To Unicode ***************************************/
/*
 * ISO-2022-KR toUnicode for converter version 1 (called from
 * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC when myData->version==1).
 *
 * A copy of the caller's arguments (subArgs) is pointed at
 * myData->currentConverter. In a loop, the input up to the limit returned by
 * getEndOfBuffer_2022() is converted with ucnv_MBCSToUnicodeWithOffsets();
 * afterwards source/target/offsets, the partial input bytes (toUBytes) and
 * any overflow UChars (UCharErrorBuffer) are copied back to the public
 * converter, and changeState_2022() is called for what follows.
 */
2520 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs
*args
,
2522 char const* sourceStart
;
2523 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2525 UConverterToUnicodeArgs subArgs
;
2526 int32_t minArgsSize
;
2528 /* set up the subconverter arguments */
/* copy no more than the smaller of args->size and sizeof(UConverterToUnicodeArgs) */
2529 if(args
->size
<sizeof(UConverterToUnicodeArgs
)) {
2530 minArgsSize
= args
->size
;
2532 minArgsSize
= (int32_t)sizeof(UConverterToUnicodeArgs
);
2535 uprv_memcpy(&subArgs
, args
, minArgsSize
);
2536 subArgs
.size
= (uint16_t)minArgsSize
;
2537 subArgs
.converter
= myData
->currentConverter
;
2539 /* remember the original start of the input for offsets */
2540 sourceStart
= args
->source
;
2542 if(myData
->key
!= 0) {
2543 /* continue with a partial escape sequence */
2547 while(U_SUCCESS(*err
) && args
->source
< args
->sourceLimit
) {
2548 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2549 subArgs
.source
= args
->source
;
2550 subArgs
.sourceLimit
= getEndOfBuffer_2022(&(args
->source
), args
->sourceLimit
, args
->flush
);
2551 if(subArgs
.source
!= subArgs
.sourceLimit
) {
2553 * get the current partial byte sequence
2555 * it needs to be moved between the public and the subconverter
2556 * so that the conversion framework, which only sees the public
2557 * converter, can handle truncated and illegal input etc.
2559 if(args
->converter
->toULength
> 0) {
2560 uprv_memcpy(subArgs
.converter
->toUBytes
, args
->converter
->toUBytes
, args
->converter
->toULength
);
2562 subArgs
.converter
->toULength
= args
->converter
->toULength
;
2565 * Convert up to the end of the input, or to before the next escape character.
2566 * Does not handle conversion extensions because the preToU[] state etc.
2569 ucnv_MBCSToUnicodeWithOffsets(&subArgs
, err
);
2571 if(args
->offsets
!= NULL
&& sourceStart
!= args
->source
) {
2572 /* update offsets to base them on the actual start of the input */
2573 int32_t *offsets
= args
->offsets
;
2574 UChar
*target
= args
->target
;
2575 int32_t delta
= (int32_t)(args
->source
- sourceStart
);
2576 while(target
< subArgs
.target
) {
/* take over the sub-converter's progress */
2584 args
->source
= subArgs
.source
;
2585 args
->target
= subArgs
.target
;
2586 args
->offsets
= subArgs
.offsets
;
2588 /* copy input/error/overflow buffers */
2589 if(subArgs
.converter
->toULength
> 0) {
2590 uprv_memcpy(args
->converter
->toUBytes
, subArgs
.converter
->toUBytes
, subArgs
.converter
->toULength
);
2592 args
->converter
->toULength
= subArgs
.converter
->toULength
;
2594 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2595 if(subArgs
.converter
->UCharErrorBufferLength
> 0) {
2596 uprv_memcpy(args
->converter
->UCharErrorBuffer
, subArgs
.converter
->UCharErrorBuffer
,
2597 subArgs
.converter
->UCharErrorBufferLength
);
2599 args
->converter
->UCharErrorBufferLength
=subArgs
.converter
->UCharErrorBufferLength
;
2600 subArgs
.converter
->UCharErrorBufferLength
= 0;
/* stop on error or when all input has been consumed */
2604 if (U_FAILURE(*err
) || (args
->source
== args
->sourceLimit
)) {
2609 changeState_2022(args
->converter
,
/*
 * ISO-2022-KR toUnicode with offsets.
 *
 * For converter version 1 the work is passed to
 * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 * Otherwise the input is read byte by byte:
 *  - UCNV_SI sets toU2022State.g to 0, UCNV_SO sets it to 1; an SI while
 *    isEmptySegment is set is reported as U_ILLEGAL_ESCAPE_SEQUENCE
 *  - ESC_2022 is handled by changeState_2022()
 *  - with g==1 a byte pair is converted if both bytes are in 0x21..0x7e
 *    (0x80 is added to each byte before the table lookup)
 *  - otherwise bytes up to 0x7f are looked up as single bytes
 * Results below 0xfffe are written to the target (with an offset if
 * requested); anything else goes to toUnicodeCallback().
 */
2618 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2621 const char *mySource
= ( char *) args
->source
;
2622 UChar
*myTarget
= args
->target
;
2623 const char *mySourceLimit
= args
->sourceLimit
;
2624 UChar32 targetUniChar
= 0x0000;
2625 UChar mySourceChar
= 0x0000;
2626 UConverterDataISO2022
* myData
;
2627 UConverterSharedData
* sharedData
;
2630 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
/* version 1 is handled by the _IBM variant */
2631 if(myData
->version
==1){
2632 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2636 /* initialize state */
2637 sharedData
= myData
->currentConverter
->sharedData
;
2638 useFallback
= args
->converter
->useFallback
;
2640 if(myData
->key
!= 0) {
2641 /* continue with a partial escape sequence */
2643 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2644 /* continue with a partial double-byte character */
2645 mySourceChar
= args
->converter
->toUBytes
[0];
2646 args
->converter
->toULength
= 0;
2650 while(mySource
< mySourceLimit
){
2652 if(myTarget
< args
->targetLimit
){
2654 mySourceChar
= (unsigned char) *mySource
++;
2656 if(mySourceChar
==UCNV_SI
){
2657 myData
->toU2022State
.g
= 0;
2658 if (myData
->isEmptySegment
) {
2659 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
2660 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
2661 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
2662 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2663 args
->converter
->toULength
= 1;
2664 args
->target
= myTarget
;
2665 args
->source
= mySource
;
2668 /*consume the source */
2670 }else if(mySourceChar
==UCNV_SO
){
2671 myData
->toU2022State
.g
= 1;
2672 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
2673 /*consume the source */
2675 }else if(mySourceChar
==ESC_2022
){
2678 myData
->isEmptySegment
= FALSE
; /* Any invalid ESC sequences will be detected separately, so just reset this */
2679 changeState_2022(args
->converter
,&(mySource
),
2680 mySourceLimit
, ISO_2022_KR
, err
);
2681 if(U_FAILURE(*err
)){
2682 args
->target
= myTarget
;
2683 args
->source
= mySource
;
2689 myData
->isEmptySegment
= FALSE
; /* Any invalid char errors will be detected separately, so just reset this */
/* g==1: double-byte mode, the current byte is the lead byte of a pair */
2690 if(myData
->toU2022State
.g
== 1) {
2691 if(mySource
< mySourceLimit
) {
2692 int leadIsOk
, trailIsOk
;
2695 targetUniChar
= missingCharMarker
;
2696 trailByte
= (uint8_t)*mySource
;
2698 * Ticket 5691: consistent illegal sequences:
2699 * - We include at least the first byte in the illegal sequence.
2700 * - If any of the non-initial bytes could be the start of a character,
2701 * we stop the illegal sequence before the first one of those.
2703 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2704 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2705 * Otherwise we convert or report the pair of bytes.
2707 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
2708 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
2709 if (leadIsOk
&& trailIsOk
) {
/* 0x80 is added to both bytes before the lookup in the shared data */
2711 tempBuf
[0] = (char)(mySourceChar
+ 0x80);
2712 tempBuf
[1] = (char)(trailByte
+ 0x80);
2713 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, tempBuf
, 2, useFallback
);
2714 mySourceChar
= (mySourceChar
<< 8) | trailByte
;
2715 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
2716 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2718 /* add another bit so that the code below writes 2 bytes in case of error */
2719 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
/* save the single byte read so far in toUBytes (length 1) */
2722 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2723 args
->converter
->toULength
= 1;
2727 else if(mySourceChar
<= 0x7f) {
2728 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, mySource
- 1, 1, useFallback
);
2730 targetUniChar
= 0xffff;
2732 if(targetUniChar
< 0xfffe){
/* the offset points at the first source byte: 1 back for a single byte, 2 back for a pair */
2734 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2736 *(myTarget
++)=(UChar
)targetUniChar
;
2739 /* Call the callback function*/
2740 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2745 *err
=U_BUFFER_OVERFLOW_ERROR
;
/* write the progress back to the caller's arguments */
2749 args
->target
= myTarget
;
2750 args
->source
= mySource
;
2753 /*************************** END ISO2022-KR *********************************/
2755 /*************************** ISO-2022-CN *********************************
2757 * Rules for ISO-2022-CN Encoding:
2758 * i) The designator sequence must appear once on a line before any instance
2759 * of character set it designates.
2760 * ii) If two lines contain characters from the same character set, both lines
2761 * must include the designator sequence.
2762 * iii) Once the designator sequence is known, a shifting sequence has to be found
2763 * to invoke the shifting
2764 * iv) All lines start in ASCII and end in ASCII.
2765 * v) Four shifting sequences are employed for this purpose:
2767 * Sequence ASCII Eq Charsets
2768 * ---------- ------- ---------
2770 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2771 * SS2 <ESC>N CNS-11643-1992 Plane 2
2772 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2775 * SOdesignator : ESC "$" ")" finalchar_for_SO
2776 * SS2designator : ESC "$" "*" finalchar_for_SS2
2777 * SS3designator : ESC "$" "+" finalchar_for_SS3
2779 * ESC $ ) A Indicates the bytes following SO are Chinese
2780 * characters as defined in GB 2312-80, until
2781 * another SOdesignation appears
2784 * ESC $ ) E Indicates the bytes following SO are as defined
2785 * in ISO-IR-165 (for details, see section 2.1),
2786 * until another SOdesignation appears
2788 * ESC $ ) G Indicates the bytes following SO are as defined
2789 * in CNS 11643-plane-1, until another
2790 * SOdesignation appears
2792 * ESC $ * H Indicates the two bytes immediately following
2793 * SS2 is a Chinese character as defined in CNS
2794 * 11643-plane-2, until another SS2designation
2796 * (Meaning <ESC>N must precede every 2 byte
2799 * ESC $ + I Indicates the immediate two bytes following SS3
2800 * is a Chinese character as defined in CNS
2801 * 11643-plane-3, until another SS3designation
2803 * (Meaning <ESC>O must precede every 2 byte
2806 * ESC $ + J Indicates the immediate two bytes following SS3
2807 * is a Chinese character as defined in CNS
2808 * 11643-plane-4, until another SS3designation
2810 * (In English: <ESC>O must precede every 2 byte
2813 * ESC $ + K Indicates the immediate two bytes following SS3
2814 * is a Chinese character as defined in CNS
2815 * 11643-plane-5, until another SS3designation
2818 * ESC $ + L Indicates the immediate two bytes following SS3
2819 * is a Chinese character as defined in CNS
2820 * 11643-plane-6, until another SS3designation
2823 * ESC $ + M Indicates the immediate two bytes following SS3
2824 * is a Chinese character as defined in CNS
2825 * 11643-plane-7, until another SS3designation
2828 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2829 * has its own designation information before any Chinese characters
2834 /* The following are defined this way to make the strings truly readonly */
/*
 * ISO-2022-CN designator escape sequences, 4 bytes each:
 * 0x1B 0x24 is ESC '$'; the third byte is ')' (0x29), '*' (0x2A) or '+' (0x2B)
 * for the SO, SS2 or SS3 designation; the fourth byte is the final character
 * listed in the table above.
 */
/* ESC $ ) A */
2835 static const char GB_2312_80_STR
[] = "\x1B\x24\x29\x41";
/* ESC $ ) E */
2836 static const char ISO_IR_165_STR
[] = "\x1B\x24\x29\x45";
/* ESC $ ) G */
2837 static const char CNS_11643_1992_Plane_1_STR
[] = "\x1B\x24\x29\x47";
/* ESC $ * H */
2838 static const char CNS_11643_1992_Plane_2_STR
[] = "\x1B\x24\x2A\x48";
/* ESC $ + I */
2839 static const char CNS_11643_1992_Plane_3_STR
[] = "\x1B\x24\x2B\x49";
/* ESC $ + J */
2840 static const char CNS_11643_1992_Plane_4_STR
[] = "\x1B\x24\x2B\x4A";
/* ESC $ + K */
2841 static const char CNS_11643_1992_Plane_5_STR
[] = "\x1B\x24\x2B\x4B";
/* ESC $ + L */
2842 static const char CNS_11643_1992_Plane_6_STR
[] = "\x1B\x24\x2B\x4C";
/* ESC $ + M */
2843 static const char CNS_11643_1992_Plane_7_STR
[] = "\x1B\x24\x2B\x4D";
2845 /********************** ISO2022-CN Data **************************/
/*
 * Table of the ISO-2022-CN shift/designator strings.
 * UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC indexes it with cs for
 * cs < CNS_11643 and with CNS_11643 + (cs - CNS_11643_1) for the CNS planes,
 * copying 4 bytes from the selected entry.
 */
2846 static const char* const escSeqCharsCN
[10] ={
2847 SHIFT_IN_STR
, /* ASCII */
2850 CNS_11643_1992_Plane_1_STR
,
2851 CNS_11643_1992_Plane_2_STR
,
2852 CNS_11643_1992_Plane_3_STR
,
2853 CNS_11643_1992_Plane_4_STR
,
2854 CNS_11643_1992_Plane_5_STR
,
2855 CNS_11643_1992_Plane_6_STR
,
2856 CNS_11643_1992_Plane_7_STR
/*
 * ISO-2022-CN / ISO-2022-CN-EXT fromUnicode with offsets.
 *
 * State is kept in converterData->fromU2022State (g: the active shift state,
 * cs[]: the charset designated for each g).
 *  - Code points up to U+007F are written as one byte, preceded by UCNV_SI if
 *    g is not 0; SO/SI/ESC are rejected; CR and LF reset the whole state.
 *  - Other code points are looked up in a list of candidate charsets
 *    (choices[]); the list depends on the current SO charset and on
 *    converterData->version (1 is ISO-2022-CN-EXT).
 *  - For a found mapping, the designator from escSeqCharsCN is written if cs
 *    differs from the one recorded for g, then the shift (UCNV_SO, or
 *    0x1b 0x4e / 0x1b 0x4f), then the two character bytes.
 *  - If no charset has a mapping, *err is set to U_INVALID_CHAR_FOUND.
 * At the end of flushed input the state is switched back to g=0.
 */
2860 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2861 UConverter
*cnv
= args
->converter
;
2862 UConverterDataISO2022
*converterData
;
2863 ISO2022State
*pFromU2022State
;
2864 uint8_t *target
= (uint8_t *) args
->target
;
2865 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
2866 const UChar
* source
= args
->source
;
2867 const UChar
* sourceLimit
= args
->sourceLimit
;
2868 int32_t* offsets
= args
->offsets
;
2873 int32_t choiceCount
;
2874 uint32_t targetValue
= 0;
2877 /* set up the state */
2878 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
2879 pFromU2022State
= &converterData
->fromU2022State
;
2883 /* check if the last codepoint of previous buffer was a lead surrogate*/
2884 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
2888 while( source
< sourceLimit
){
2889 if(target
< targetLimit
){
2891 sourceChar
= *(source
++);
2892 /*check if the char is a First surrogate*/
2893 if(UTF_IS_SURROGATE(sourceChar
)) {
2894 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2896 /*look ahead to find the trail surrogate*/
2897 if(source
< sourceLimit
) {
2898 /* test the following code unit */
2899 UChar trail
=(UChar
) *source
;
2900 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2902 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2903 cnv
->fromUChar32
=0x00;
2904 /* convert this supplementary code point */
2905 /* exit this condition tree */
2907 /* this is an unmatched lead code unit (1st surrogate) */
2908 /* callback(illegal) */
2909 *err
=U_ILLEGAL_CHAR_FOUND
;
2910 cnv
->fromUChar32
=sourceChar
;
2915 cnv
->fromUChar32
=sourceChar
;
2919 /* this is an unmatched trail code unit (2nd surrogate) */
2920 /* callback(illegal) */
2921 *err
=U_ILLEGAL_CHAR_FOUND
;
2922 cnv
->fromUChar32
=sourceChar
;
2927 /* do the conversion */
2928 if(sourceChar
<= 0x007f ){
2929 /* do not convert SO/SI/ESC */
2930 if(IS_2022_CONTROL(sourceChar
)) {
2931 /* callback(illegal) */
2932 *err
=U_ILLEGAL_CHAR_FOUND
;
2933 cnv
->fromUChar32
=sourceChar
;
/* g==0: write the byte as is; otherwise write UCNV_SI first and record g=0 */
2938 if(pFromU2022State
->g
== 0) {
2939 buffer
[0] = (char)sourceChar
;
2942 buffer
[0] = UCNV_SI
;
2943 buffer
[1] = (char)sourceChar
;
2945 pFromU2022State
->g
= 0;
2948 if(sourceChar
== CR
|| sourceChar
== LF
) {
2949 /* reset the state at the end of a line */
2950 uprv_memset(pFromU2022State
, 0, sizeof(ISO2022State
));
2955 /* convert U+0080..U+10ffff */
/* choiceCount==0 means the candidate list choices[] has to be (re)built */
2959 if(choiceCount
== 0) {
2960 /* try the current SO/G1 converter first */
2961 choices
[0] = pFromU2022State
->cs
[1];
2963 /* default to GB2312_1 if none is designated yet */
2964 if(choices
[0] == 0) {
2965 choices
[0] = GB2312_1
;
2968 if(converterData
->version
== 0) {
2971 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2972 if(choices
[0] == GB2312_1
) {
2973 choices
[1] = (int8_t)CNS_11643_1
;
2975 choices
[1] = (int8_t)GB2312_1
;
2979 } else if (converterData
->version
== 1) {
2980 /* ISO-2022-CN-EXT */
2982 /* try one of the other converters */
2983 switch(choices
[0]) {
2985 choices
[1] = (int8_t)CNS_11643_1
;
2986 choices
[2] = (int8_t)ISO_IR_165
;
2989 choices
[1] = (int8_t)GB2312_1
;
2990 choices
[2] = (int8_t)CNS_11643_1
;
2992 default: /* CNS_11643_x */
2993 choices
[1] = (int8_t)GB2312_1
;
2994 choices
[2] = (int8_t)ISO_IR_165
;
3000 choices
[0] = (int8_t)CNS_11643_1
;
3001 choices
[1] = (int8_t)GB2312_1
;
3007 * len==0: no mapping found yet
3008 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3009 * len>0: found a roundtrip result, done
3013 * We will turn off useFallback after finding a fallback,
3014 * but we still get fallbacks from PUA code points as usual.
3015 * Therefore, we will also need to check that we don't overwrite
3016 * an early fallback with a later one.
3018 useFallback
= cnv
->useFallback
;
3020 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
3021 int8_t cs0
= choices
[i
];
/* all CNS planes share one table (myConverterArray[CNS_11643]); the plane is taken from bits 16 and up of the result */
3025 if(cs0
>= CNS_11643_0
) {
3026 len2
= MBCS_FROM_UCHAR32_ISO2022(
3027 converterData
->myConverterArray
[CNS_11643
],
3032 if(len2
== 3 || (len2
== -3 && len
== 0)) {
3033 targetValue
= value
;
3034 cs
= (int8_t)(CNS_11643_0
+ (value
>> 16) - 0x80);
3039 useFallback
= FALSE
;
3041 if(cs
== CNS_11643_1
) {
3043 } else if(cs
== CNS_11643_2
) {
3045 } else /* plane 3..7 */ if(converterData
->version
== 1) {
3048 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3053 /* GB2312_1 or ISO-IR-165 */
3054 len2
= MBCS_FROM_UCHAR32_ISO2022(
3055 converterData
->myConverterArray
[cs0
],
3060 if(len2
== 2 || (len2
== -2 && len
== 0)) {
3061 targetValue
= value
;
3065 useFallback
= FALSE
;
3072 len
= 0; /* count output bytes; it must have been abs(len) == 2 */
3074 /* write the designation sequence if necessary */
3075 if(cs
!= pFromU2022State
->cs
[g
]) {
3076 if(cs
< CNS_11643
) {
3077 uprv_memcpy(buffer
, escSeqCharsCN
[cs
], 4);
3079 uprv_memcpy(buffer
, escSeqCharsCN
[CNS_11643
+ (cs
- CNS_11643_1
)], 4);
3082 pFromU2022State
->cs
[g
] = cs
;
3084 /* changing the SO/G1 charset invalidates the choices[] */
3089 /* write the shift sequence if necessary */
3090 if(g
!= pFromU2022State
->g
) {
3093 buffer
[len
++] = UCNV_SO
;
3095 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3096 pFromU2022State
->g
= 1;
/* 0x1b 0x4e is ESC N (listed as SS2 in the table above) */
3099 buffer
[len
++] = 0x1b;
3100 buffer
[len
++] = 0x4e;
3102 default: /* case 3 */
3103 buffer
[len
++] = 0x1b;
3104 buffer
[len
++] = 0x4f;
3109 /* write the two output bytes */
3110 buffer
[len
++] = (char)(targetValue
>> 8);
3111 buffer
[len
++] = (char)targetValue
;
3113 /* if we cannot find the character after checking all codepages
3114 * then this is an error
3116 *err
= U_INVALID_CHAR_FOUND
;
3117 cnv
->fromUChar32
=sourceChar
;
3122 /* output len>0 bytes in buffer[] */
3124 *target
++ = buffer
[0];
3126 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
3128 } else if(len
== 2 && (target
+ 2) <= targetLimit
) {
3129 *target
++ = buffer
[0];
3130 *target
++ = buffer
[1];
3132 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
3133 *offsets
++ = sourceIndex
;
3134 *offsets
++ = sourceIndex
;
3140 &target
, (const char *)targetLimit
,
3141 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
3143 if(U_FAILURE(*err
)) {
3147 } /* end if(myTargetIndex<myTargetLength) */
3149 *err
=U_BUFFER_OVERFLOW_ERROR
;
3153 }/* end while(mySourceIndex<mySourceLength) */
3156 * the end of the input stream and detection of truncated input
3157 * are handled by the framework, but for ISO-2022-CN conversion
3158 * we need to be in ASCII mode at the very end
3163 * end of input and no truncated input
3165 if( U_SUCCESS(*err
) &&
3166 pFromU2022State
->g
!=0 &&
3167 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
3169 int32_t sourceIndex
;
3171 /* we are switching to ASCII */
3172 pFromU2022State
->g
=0;
3174 /* get the source index of the last input character */
3176 * TODO this would be simpler and more reliable if we used a pair
3177 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3178 * so that we could simply use the prevSourceIndex here;
3179 * this code gives an incorrect result for the rare case of an unmatched
3180 * trail surrogate that is alone in the last buffer of the text stream
3182 sourceIndex
=(int32_t)(source
-args
->source
);
3185 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
3186 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
3197 &target
, (const char *)targetLimit
,
3198 &offsets
, sourceIndex
,
3202 /*save the state and return */
3203 args
->source
= source
;
3204 args
->target
= (char*)target
;
/*
 * ISO-2022-CN / ISO-2022-CN-EXT toUnicode with offsets.
 *
 * State is kept in myData->toU2022State (g: active shift state, cs[]: charset
 * per g, prevG: state to return to after a single shift).
 *  - Shift and escape bytes are dispatched in the switch on mySourceChar;
 *    escape sequences go through changeState_2022(). An empty SO segment
 *    (isEmptySegment still set) is reported as U_ILLEGAL_ESCAPE_SEQUENCE.
 *  - With g!=0 a byte pair is converted if both bytes are in 0x21..0x7e.
 *    For CNS planes the lookup uses a 3-byte sequence whose first byte is
 *    0x80+(tempState-CNS_11643_0); otherwise the 2 bytes are looked up in
 *    myConverterArray[tempState].
 *  - With g==0 bytes up to 0x7f map to themselves.
 * Results are written as one UChar or as a surrogate pair; a trail surrogate
 * that does not fit goes to UCharErrorBuffer. Everything else is passed to
 * toUnicodeCallback().
 */
3209 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
3212 const char *mySource
= (char *) args
->source
;
3213 UChar
*myTarget
= args
->target
;
3214 const char *mySourceLimit
= args
->sourceLimit
;
3215 uint32_t targetUniChar
= 0x0000;
3216 uint32_t mySourceChar
= 0x0000;
3217 UConverterDataISO2022
* myData
;
3218 ISO2022State
*pToU2022State
;
3220 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
3221 pToU2022State
= &myData
->toU2022State
;
3223 if(myData
->key
!= 0) {
3224 /* continue with a partial escape sequence */
3226 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
3227 /* continue with a partial double-byte character */
3228 mySourceChar
= args
->converter
->toUBytes
[0];
3229 args
->converter
->toULength
= 0;
3230 targetUniChar
= missingCharMarker
;
3234 while(mySource
< mySourceLimit
){
3236 targetUniChar
=missingCharMarker
;
3238 if(myTarget
< args
->targetLimit
){
3240 mySourceChar
= (unsigned char) *mySource
++;
3242 switch(mySourceChar
){
3245 if (myData
->isEmptySegment
) {
3246 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
3247 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
3248 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
3249 args
->converter
->toUBytes
[0] = mySourceChar
;
3250 args
->converter
->toULength
= 1;
3251 args
->target
= myTarget
;
3252 args
->source
= mySource
;
/* cs[1] != 0 means a charset has been designated for SO */
3258 if(pToU2022State
->cs
[1] != 0) {
3260 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
3263 /* illegal to have SO before a matching designator */
3264 myData
->isEmptySegment
= FALSE
; /* Handling a different error, reset this to avoid future spurious errs */
/* remember the position and pending length so the length of the escape sequence can be computed below */
3272 const char * mySourceBefore
= mySource
;
3273 int8_t toULengthBefore
= args
->converter
->toULength
;
3275 changeState_2022(args
->converter
,&(mySource
),
3276 mySourceLimit
, ISO_2022_CN
,err
);
3278 /* After SO there must be at least one character before a designator (designator error handled separately) */
3279 if(myData
->key
==0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
3280 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
3281 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
3282 args
->converter
->toULength
= (int8_t)(toULengthBefore
+ (mySource
- mySourceBefore
));
3286 /* invalid or illegal escape sequence */
3287 if(U_FAILURE(*err
)){
3288 args
->target
= myTarget
;
3289 args
->source
= mySource
;
3290 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
3295 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3300 uprv_memset(pToU2022State
, 0, sizeof(ISO2022State
));
3303 /* convert one or two bytes */
3304 myData
->isEmptySegment
= FALSE
;
3305 if(pToU2022State
->g
!= 0) {
3306 if(mySource
< mySourceLimit
) {
3307 UConverterSharedData
*cnv
;
3308 StateEnum tempState
;
3310 int leadIsOk
, trailIsOk
;
3313 trailByte
= (uint8_t)*mySource
;
3315 * Ticket 5691: consistent illegal sequences:
3316 * - We include at least the first byte in the illegal sequence.
3317 * - If any of the non-initial bytes could be the start of a character,
3318 * we stop the illegal sequence before the first one of those.
3320 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3321 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3322 * Otherwise we convert or report the pair of bytes.
3324 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
3325 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
3326 if (leadIsOk
&& trailIsOk
) {
3328 tempState
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
/* CNS planes: prefix the pair with a plane byte and use the shared CNS_11643 table */
3329 if(tempState
>= CNS_11643_0
) {
3330 cnv
= myData
->myConverterArray
[CNS_11643
];
3331 tempBuf
[0] = (char) (0x80+(tempState
-CNS_11643_0
));
3332 tempBuf
[1] = (char) (mySourceChar
);
3333 tempBuf
[2] = (char) trailByte
;
3337 cnv
= myData
->myConverterArray
[tempState
];
3338 tempBuf
[0] = (char) (mySourceChar
);
3339 tempBuf
[1] = (char) trailByte
;
3342 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(cnv
, tempBuf
, tempBufLen
, FALSE
);
3343 mySourceChar
= (mySourceChar
<< 8) | trailByte
;
3344 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
3345 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3347 /* add another bit so that the code below writes 2 bytes in case of error */
3348 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
3350 if(pToU2022State
->g
>=2) {
3351 /* return from a single-shift state to the previous one */
3352 pToU2022State
->g
=pToU2022State
->prevG
;
/* save the single byte read so far in toUBytes (length 1) */
3355 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
3356 args
->converter
->toULength
= 1;
3361 if(mySourceChar
<= 0x7f) {
3362 targetUniChar
= (UChar
) mySourceChar
;
3367 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
3369 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3371 *(myTarget
++)=(UChar
)targetUniChar
;
3373 else if(targetUniChar
> missingCharMarker
){
3374 /* disassemble the surrogate pair and write to output*/
3375 targetUniChar
-=0x0010000;
3376 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
3378 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3381 if(myTarget
< args
->targetLimit
){
3382 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
3384 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
/* the trail surrogate is stored in the converter's UCharErrorBuffer */
3388 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
3389 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
3394 /* Call the callback function*/
3395 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
3400 *err
=U_BUFFER_OVERFLOW_ERROR
;
/* write the progress back to the caller's arguments */
3405 args
->target
= myTarget
;
3406 args
->source
= mySource
;
/*
 * _ISO_2022_WriteSub: write the substitution character of an ISO-2022
 * variant converter during fromUnicode conversion.
 *
 * args        - fromUnicode arguments; args->converter is the ISO-2022 converter.
 * offsetIndex - passed in by the caller; its use is on lines not visible here.
 * err         - in/out ICU error code.
 *
 * The function reads the substitution bytes from cnv->subChars/subCharLen,
 * then, depending on the first character of the locale string, resets the
 * fromUnicode shift/designation state before the bytes are written.
 *
 * NOTE(review): this listing is missing several original lines (the case
 * labels, the statements that append escape/shift bytes to the local buffer,
 * closing braces and the end of the final call). Comments added below only
 * describe what the visible lines show.
 */
3410 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
3411 UConverter
*cnv
= args
->converter
;
3412 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
3413 ISO2022State
*pFromU2022State
=&myConverterData
->fromU2022State
;
/* Start from the substitution bytes configured on the top-level converter. */
3418 subchar
=(char *)cnv
->subChars
;
3419 length
=cnv
->subCharLen
; /* assume length==1 for most variants */
/* Dispatch on the first character of the locale string stored in the converter data. */
3422 switch(myConverterData
->locale
[0]){
3427 if(pFromU2022State
->g
== 1) {
3428 /* JIS7: switch from G1 to G0 */
3429 pFromU2022State
->g
= 0;
/* Inspect the charset currently designated in slot 0 of the state. */
3433 cs
= pFromU2022State
->cs
[0];
3434 if(cs
!= ASCII
&& cs
!= JISX201
) {
3435 /* not in ASCII or JIS X 0201: switch to ASCII */
3436 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
3446 if(pFromU2022State
->g
!= 0) {
3447 /* not in ASCII mode: switch to ASCII */
3448 pFromU2022State
->g
= 0;
/*
 * Version 0 path: fromUnicodeStatus is used as a boolean that records
 * DBCS (nonzero) versus SBCS (zero) mode, and is flipped to match the
 * length of the substitution sequence.
 */
3454 if(myConverterData
->version
== 0) {
3456 if((UBool
)args
->converter
->fromUnicodeStatus
) {
3457 /* in DBCS mode: switch to SBCS */
3458 args
->converter
->fromUnicodeStatus
= 0;
3462 } else /* length == 2*/ {
3463 if(!(UBool
)args
->converter
->fromUnicodeStatus
) {
3464 /* in SBCS mode: switch to DBCS */
3465 args
->converter
->fromUnicodeStatus
= 1;
/*
 * Delegating path: temporarily install this converter's substitution
 * string on currentConverter, let it write the substitution itself, and
 * then put its own settings back.
 */
3473 /* save the subconverter's substitution string */
3474 uint8_t *currentSubChars
= myConverterData
->currentConverter
->subChars
;
3475 int8_t currentSubCharLen
= myConverterData
->currentConverter
->subCharLen
;
3477 /* set our substitution string into the subconverter */
3478 myConverterData
->currentConverter
->subChars
= (uint8_t *)subchar
;
3479 myConverterData
->currentConverter
->subCharLen
= (int8_t)length
;
3481 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3482 args
->converter
= myConverterData
->currentConverter
;
3483 myConverterData
->currentConverter
->fromUChar32
= cnv
->fromUChar32
;
3484 ucnv_cbFromUWriteSub(args
, 0, err
);
3485 cnv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
3486 args
->converter
= cnv
;
3488 /* restore the subconverter's substitution string */
3489 myConverterData
->currentConverter
->subChars
= currentSubChars
;
3490 myConverterData
->currentConverter
->subCharLen
= currentSubCharLen
;
/*
 * On overflow, bytes the subconverter could not write are parked in its
 * charErrorBuffer; the visible arguments below transfer that content and
 * its length to the top-level converter and then empty the subconverter
 * buffer. NOTE(review): the name of the copy routine is on a missing line.
 */
3492 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
3493 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
3495 cnv
->charErrorBuffer
,
3496 myConverterData
->currentConverter
->charErrorBuffer
,
3497 myConverterData
->currentConverter
->charErrorBufferLength
);
3499 cnv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
3500 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
/* Write out the (p - buffer) bytes accumulated in the local buffer. */
3508 ucnv_cbFromUWriteBytes(args
,
3509 buffer
, (int32_t)(p
- buffer
),
3514 * Structure for cloning an ISO 2022 converter into a single memory block.
3515 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3516 * and then ucnv_safeClone() of the sub-converter may additionally align
3517 * currentConverter inside the cloneStruct, for which we need the deadSpace
3518 * after currentConverter.
3519 * This is because UAlignedMemory may be larger than the actually
3520 * necessary alignment size for the platform.
3521 * The other cloneStruct fields will not be moved around,
3522 * and are aligned properly with cloneStruct's alignment.
/*
 * Visible members of struct cloneStruct (the struct header and any earlier
 * members are on lines missing from this listing).
 * currentConverter: storage handed to ucnv_safeClone() for the sub-converter clone.
 * deadSpace:        padding after currentConverter so that the sub-converter clone
 *                   may be re-aligned inside the block.
 * mydata:           the copy of the ISO-2022 extra data that the cloned
 *                   converter points to through extraInfo.
 */
3527 UConverter currentConverter
;
3528 UAlignedMemory deadSpace
;
3529 UConverterDataISO2022 mydata
;
/*
 * _ISO_2022_SafeClone: finish cloning an ISO-2022 converter into a
 * caller-provided memory block laid out as a struct cloneStruct.
 *
 * cnv         - the converter being cloned; its extraInfo is the
 *               UConverterDataISO2022 to duplicate.
 * pBufferSize - if *pBufferSize is 0 this is a preflight request and the
 *               required size, sizeof(struct cloneStruct), is stored there.
 * Returns a pointer to the cnv member of the clone block on the visible
 * success path.
 *
 * NOTE(review): the stackBuffer and status parameter declarations, the local
 * declarations of i and size, and the early return statements are on lines
 * missing from this listing.
 */
3534 _ISO_2022_SafeClone(
3535 const UConverter
*cnv
,
3537 int32_t *pBufferSize
,
3540 struct cloneStruct
* localClone
;
3541 UConverterDataISO2022
*cnvData
;
3544 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3545 *pBufferSize
= (int32_t)sizeof(struct cloneStruct
);
/* Source extra data, and the caller buffer viewed as the clone layout. */
3549 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3550 localClone
= (struct cloneStruct
*)stackBuffer
;
3552 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* Bitwise-copy the ISO-2022 state and repoint the clone at its own copy. */
3554 uprv_memcpy(&localClone
->mydata
, cnvData
, sizeof(UConverterDataISO2022
));
3555 localClone
->cnv
.extraInfo
= &localClone
->mydata
; /* set pointer to extra data */
3556 localClone
->cnv
.isExtraLocal
= TRUE
;
3558 /* share the subconverters */
/*
 * currentConverter is cloned (not shared) into the storage reserved inside
 * the clone block; the size offered includes the deadSpace padding.
 */
3560 if(cnvData
->currentConverter
!= NULL
) {
3561 size
= (int32_t)(sizeof(UConverter
) + sizeof(UAlignedMemory
)); /* include size of padding */
3562 localClone
->mydata
.currentConverter
=
3563 ucnv_safeClone(cnvData
->currentConverter
,
3564 &localClone
->currentConverter
,
3566 if(U_FAILURE(*status
)) {
/*
 * The memcpy above duplicated the myConverterArray pointers, so each
 * non-NULL entry gets its reference count incremented for the clone.
 */
3571 for(i
=0; i
<UCNV_2022_MAX_CONVERTERS
; ++i
) {
3572 if(cnvData
->myConverterArray
[i
] != NULL
) {
3573 ucnv_incrementRefCount(cnvData
->myConverterArray
[i
]);
3577 return &localClone
->cnv
;
/*
 * _ISO_2022_GetUnicodeSet: add to the set behind sa the Unicode code points
 * that this ISO-2022 converter handles.
 *
 * cnv        - the ISO-2022 converter; its extraInfo holds locale, version
 *              and the sub-converter array.
 * sa         - set adder callbacks (add, addRange, remove, removeRange) and
 *              the target set.
 * which      - which set is requested (roundtrip only, or roundtrip plus
 *              fallback); forwarded to the sub-converters.
 * pErrorCode - in/out ICU error code; checked for failure on entry.
 *
 * Visible steps: add the ranges that are handled algorithmically for the
 * locale, add the filtered set of every opened sub-converter, then remove
 * SO, SI, ESC and the C1 controls.
 *
 * NOTE(review): case labels, else branches, braces and early returns are on
 * lines missing from this listing.
 */
3581 _ISO_2022_GetUnicodeSet(const UConverter
*cnv
,
3582 const USetAdder
*sa
,
3583 UConverterUnicodeSet which
,
3584 UErrorCode
*pErrorCode
)
3587 UConverterDataISO2022
* cnvData
;
3589 if (U_FAILURE(*pErrorCode
)) {
/*
 * Generic ISO-2022 (compiled in only with U_ENABLE_GENERIC_ISO_2022):
 * everything except the surrogate range D800..DFFF is added.
 */
3592 #ifdef U_ENABLE_GENERIC_ISO_2022
3593 if (cnv
->sharedData
== &_ISO2022Data
) {
3594 /* We use UTF-8 in this case */
3595 sa
->addRange(sa
->set
, 0, 0xd7FF);
3596 sa
->addRange(sa
->set
, 0xE000, 0x10FFFF);
3601 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3603 /* open a set and initialize it with code points that are algorithmically round-tripped */
3604 switch(cnvData
->locale
[0]){
3606 /* include JIS X 0201 which is hardcoded */
3607 sa
->add(sa
->set
, 0xa5);
3608 sa
->add(sa
->set
, 0x203e);
3609 if(jpCharsetMasks
[cnvData
->version
]&CSM(ISO8859_1
)) {
3610 /* include Latin-1 for some variants of JP */
3611 sa
->addRange(sa
->set
, 0, 0xff);
3613 /* include ASCII for JP */
3614 sa
->addRange(sa
->set
, 0, 0x7f);
3616 if(cnvData
->version
==3 || cnvData
->version
==4 || which
==UCNV_ROUNDTRIP_AND_FALLBACK_SET
) {
3618 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3619 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3620 * use half-width Katakana.
3621 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3622 * half-width Katakana via the ESC ( I sequence.
3623 * However, we only emit (fromUnicode) half-width Katakana according to the
3624 * definition of each variant.
3626 * When including fallbacks,
3627 * we need to include half-width Katakana Unicode code points for all JP variants because
3628 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3630 /* include half-width Katakana for JP */
3631 sa
->addRange(sa
->set
, HWKANA_START
, HWKANA_END
);
3636 /* include ASCII for CN */
3637 sa
->addRange(sa
->set
, 0, 0x7f);
3640 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3641 cnvData
->currentConverter
->sharedData
->impl
->getUnicodeSet(
3642 cnvData
->currentConverter
, sa
, which
, pErrorCode
);
3643 /* the loop over myConverterArray[] will simply not find another converter */
3649 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3650 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3651 cnvData
->version
==0 && i
==CNS_11643
3653 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3654 ucnv_MBCSGetUnicodeSetForBytes(
3655 cnvData
->myConverterArray
[i
],
3656 sa
, UCNV_ROUNDTRIP_SET
,
/*
 * Add the set of every opened sub-converter. A filter narrows the result
 * where the ISO-2022 variant uses only part of the underlying table.
 */
3662 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
3663 UConverterSetFilter filter
;
3664 if(cnvData
->myConverterArray
[i
]!=NULL
) {
3665 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3666 cnvData
->version
==0 && i
==CNS_11643
3669 * Version-specific for CN:
3670 * CN version 0 does not map CNS planes 3..7 although
3671 * they are all available in the CNS conversion table;
3672 * CN version 1 (-EXT) does map them all.
3673 * The two versions create different Unicode sets.
3675 filter
=UCNV_SET_FILTER_2022_CN
;
3676 } else if(cnvData
->locale
[0]=='j' && i
==JISX208
) {
3678 * Only add code points that map to Shift-JIS codes
3679 * corresponding to JIS X 0208.
3681 filter
=UCNV_SET_FILTER_SJIS
;
3682 } else if(i
==KSC5601
) {
3684 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3685 * are broader than GR94.
3687 filter
=UCNV_SET_FILTER_GR94DBCS
;
3689 filter
=UCNV_SET_FILTER_NONE
;
3691 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData
->myConverterArray
[i
], sa
, which
, filter
, pErrorCode
);
3696 * ISO 2022 converters must not convert SO/SI/ESC despite what
3697 * sub-converters do by themselves.
3698 * Remove these characters from the set.
3700 sa
->remove(sa
->set
, 0x0e);
3701 sa
->remove(sa
->set
, 0x0f);
3702 sa
->remove(sa
->set
, 0x1b);
3704 /* ISO 2022 converters do not convert C1 controls either */
3705 sa
->removeRange(sa
->set
, 0x80, 0x9f);
/*
 * Implementation table for the generic ISO-2022 converter.
 * Visible entries: when U_ENABLE_GENERIC_ISO_2022 is defined, the same
 * toUnicode routine is listed in two consecutive entries, followed by the
 * UTF-8 fromUnicode routines (plain and with offsets); the table also
 * registers _ISO_2022_SafeClone and _ISO_2022_GetUnicodeSet.
 * NOTE(review): the remaining entries, the #else/#endif and the closing
 * brace are on lines missing from this listing.
 */
3708 static const UConverterImpl _ISO2022Impl
={
3718 #ifdef U_ENABLE_GENERIC_ISO_2022
3719 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3720 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3721 ucnv_fromUnicode_UTF8
,
3722 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
3734 _ISO_2022_SafeClone
,
3735 _ISO_2022_GetUnicodeSet
/*
 * Static description of the generic ISO-2022 converter.
 * Visible fields: the structure size, the maximum of 3 output bytes per
 * UChar, and the zeroed reserved array.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3737 static const UConverterStaticData _ISO2022StaticData
={
3738 sizeof(UConverterStaticData
),
3744 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3751 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for the generic ISO-2022 converter (not static: it is
 * declared without the static keyword here). It points at _ISO2022StaticData.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3753 const UConverterSharedData _ISO2022Data
={
3754 sizeof(UConverterSharedData
),
3758 &_ISO2022StaticData
,
3764 /*************JP****************/
/*
 * Implementation table for the ISO-2022-JP converter.
 * Visible entries: the JP toUnicode routine and the JP fromUnicode routine,
 * each listed in two consecutive entries, plus _ISO_2022_SafeClone and
 * _ISO_2022_GetUnicodeSet.
 * NOTE(review): the remaining entries and the closing brace are on lines
 * missing from this listing.
 */
3765 static const UConverterImpl _ISO2022JPImpl
={
3775 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3776 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3777 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3778 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3784 _ISO_2022_SafeClone
,
3785 _ISO_2022_GetUnicodeSet
/*
 * Static description of the ISO-2022-JP converter.
 * Visible fields: the structure size, the maximum of 6 output bytes per
 * UChar, and the zeroed reserved array.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3787 static const UConverterStaticData _ISO2022JPStaticData
={
3788 sizeof(UConverterStaticData
),
3794 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3801 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for the ISO-2022-JP converter; it points at
 * _ISO2022JPStaticData.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3803 static const UConverterSharedData _ISO2022JPData
={
3804 sizeof(UConverterSharedData
),
3808 &_ISO2022JPStaticData
,
3814 /************* KR ***************/
/*
 * Implementation table for the ISO-2022-KR converter.
 * Visible entries: the KR toUnicode routine and the KR fromUnicode routine,
 * each listed in two consecutive entries, plus _ISO_2022_SafeClone and
 * _ISO_2022_GetUnicodeSet.
 * NOTE(review): the remaining entries and the closing brace are on lines
 * missing from this listing.
 */
3815 static const UConverterImpl _ISO2022KRImpl
={
3825 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3826 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3827 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3828 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3834 _ISO_2022_SafeClone
,
3835 _ISO_2022_GetUnicodeSet
/*
 * Static description of the ISO-2022-KR converter.
 * Visible fields: the structure size, the maximum of 3 output bytes per
 * UChar, and the zeroed reserved array.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3837 static const UConverterStaticData _ISO2022KRStaticData
={
3838 sizeof(UConverterStaticData
),
3844 3, /* max 3 bytes per UChar: SO+DBCS */
3851 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for the ISO-2022-KR converter; it points at
 * _ISO2022KRStaticData.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3853 static const UConverterSharedData _ISO2022KRData
={
3854 sizeof(UConverterSharedData
),
3858 &_ISO2022KRStaticData
,
3864 /*************** CN ***************/
/*
 * Implementation table for the ISO-2022-CN converter.
 * Visible entries: the CN toUnicode routine and the CN fromUnicode routine,
 * each listed in two consecutive entries, plus _ISO_2022_SafeClone and
 * _ISO_2022_GetUnicodeSet.
 * NOTE(review): the remaining entries and the closing brace are on lines
 * missing from this listing.
 */
3865 static const UConverterImpl _ISO2022CNImpl
={
3876 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3877 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3878 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3879 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3885 _ISO_2022_SafeClone
,
3886 _ISO_2022_GetUnicodeSet
/*
 * Static description of the ISO-2022-CN converter.
 * Visible fields: the structure size, the maximum of 8 output bytes per
 * UChar, and the zeroed reserved array.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3888 static const UConverterStaticData _ISO2022CNStaticData
={
3889 sizeof(UConverterStaticData
),
3895 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3902 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for the ISO-2022-CN converter; it points at
 * _ISO2022CNStaticData.
 * NOTE(review): the other fields are on lines missing from this listing.
 */
3904 static const UConverterSharedData _ISO2022CNData
={
3905 sizeof(UConverterSharedData
),
3909 &_ISO2022CNStaticData
,
3917 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */