1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2000-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv2022.cpp
10 * tab size: 8 (not used)
13 * created on: 2000feb03
14 * created by: Markus W. Scherer
18 * 06/29/2000 helena Major rewrite of the callback APIs.
19 * 08/08/2000 Ram Included support for ISO-2022-JP-2
20 * Changed implementation of toUnicode
22 * 08/21/2000 Ram Added support for ISO-2022-KR
23 * 08/29/2000 Ram Separated implementation of EBCDIC to
25 * 09/20/2000 Ram Added support for ISO-2022-CN
26 * Added implementations for getNextUChar()
27 * for specific 2022 country variants.
28 * 10/31/2000 Ram Implemented offsets logic functions
31 #include "unicode/utypes.h"
33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
35 #include "unicode/ucnv.h"
36 #include "unicode/uset.h"
37 #include "unicode/ucnv_err.h"
38 #include "unicode/ucnv_cb.h"
39 #include "unicode/utf16.h"
48 #ifdef U_ENABLE_GENERIC_ISO_2022
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
76 * Markus Scherer 2003-dec-03
80 #if !UCONFIG_ONLY_HTML_CONVERSION
81 static const char SHIFT_IN_STR
[] = "\x0F";
82 // static const char SHIFT_OUT_STR[] = "\x0E";
97 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98 * as bytes 21..7E. (Subtract 0x80.)
99 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100 * as bytes 20..7F. (Subtract 0x80.)
101 * Do not encode C1 control codes with native bytes 80..9F
102 * as bytes 00..1F (C0 control codes).
112 * ISO 2022 control codes must not be converted from Unicode
113 * because they would mess up the byte stream.
114 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115 * corresponding to SO, SI, and ESC.
/*
 * True if c is one of the ISO 2022 control codes SO (0x0e), SI (0x0f) or
 * ESC (0x1b); the mask 0x0800c000 has exactly those three bit positions set.
 * The range test is done on the unsigned value so that a negative c is
 * rejected before it could be used as a shift count (which would be undefined).
 * Note: c is evaluated more than once; do not pass an expression with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
119 /* for ISO-2022-JP and -CN implementations */
136 HWKANA_7BIT
=8, /* Halfwidth Katakana 7 bit */
139 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
145 * these are used in StateEnum and ISO2022State variables,
146 * but CNS_11643 must be used to index into myConverterArray[]
158 /* is the StateEnum charset value for a DBCS charset? */
159 #if UCONFIG_ONLY_HTML_CONVERSION
160 #define IS_JP_DBCS(cs) (JISX208==(cs))
162 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
/* Charset mask: the single bit at position cs (a charset number), built by shifting a uint16_t 1. */
165 #define CSM(cs) ((uint16_t)1<<(cs))
168 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
171 * Note: The converter uses some leniency:
172 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173 * all versions, not just JIS7 and JIS8.
174 * - ICU does not distinguish between different versions of JIS X 0208.
176 #if UCONFIG_ONLY_HTML_CONVERSION
177 enum { MAX_JA_VERSION
=0 };
179 enum { MAX_JA_VERSION
=4 };
181 static const uint16_t jpCharsetMasks
[MAX_JA_VERSION
+1]={
182 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
),
183 #if !UCONFIG_ONLY_HTML_CONVERSION
184 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
),
185 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
186 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
187 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
)
200 typedef struct ISO2022State
{
201 int8_t cs
[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
202 int8_t g
; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
203 int8_t prevG
; /* g before single shift (SS2 or SS3) */
206 #define UCNV_OPTIONS_VERSION_MASK 0xf
207 #define UCNV_2022_MAX_CONVERTERS 10
210 UConverterSharedData
*myConverterArray
[UCNV_2022_MAX_CONVERTERS
];
211 UConverter
*currentConverter
;
212 Cnv2022Type currentType
;
213 ISO2022State toU2022State
, fromU2022State
;
216 #ifdef U_ENABLE_GENERIC_ISO_2022
219 UBool isEmptySegment
;
222 }UConverterDataISO2022
;
225 /* ISO-2022 ----------------------------------------------------------------- */
227 /*Forward declaration */
228 U_CFUNC
void U_CALLCONV
229 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
* args
,
231 U_CFUNC
void U_CALLCONV
232 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
,
235 #define ESC_2022 0x1B /*ESC*/
239 INVALID_2022
= -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
240 VALID_NON_TERMINAL_2022
= 0, /*so far corresponds to a valid iso 2022 escape sequence*/
241 VALID_TERMINAL_2022
= 1, /*corresponds to a valid iso 2022 escape sequence*/
242 VALID_MAYBE_TERMINAL_2022
= 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
243 } UCNV_TableStates_2022
;
246 * The way these state transition arrays work is:
247 * ex : ESC$B is the sequence for JISX208
248 * a) First Iteration: char is ESC
249 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250 * int x = normalize_esq_chars_2022[27] which is equal to 1
251 * ii) Search for this value in escSeqStateTable_Key_2022[]
252 * value of x is stored at escSeqStateTable_Key_2022[0]
253 * iii) Save this index as offset
254 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256 * b) Switch on this state and continue to next char
257 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258 * which is normalize_esq_chars_2022[36] == 4
259 * ii) x is currently 1(from above)
260 * x<<=5 -- x is now 32
261 * x+=normalize_esq_chars_2022[36]
263 * iii) Search for this value in escSeqStateTable_Key_2022[]
264 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267 * c) Switch on this state and continue to next char
268 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269 * ii) x is currently 36 (from above)
270 * x<<=5 -- x is now 1152
271 * x+=normalize_esq_chars_2022[66]
273 * iii) Search for this value in escSeqStateTable_Key_2022[]
274 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
281 /*Below are the 3 arrays depicting a state transition table*/
282 static const int8_t normalize_esq_chars_2022
[256] = {
283 /* 0 1 2 3 4 5 6 7 8 9 */
285 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
289 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
292 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
293 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
304 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
305 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
306 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
307 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
308 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
309 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
313 #ifdef U_ENABLE_GENERIC_ISO_2022
315 * When the generic ISO-2022 converter is completely removed, not just disabled
316 * per #ifdef, then the following state table and the associated tables that are
317 * dimensioned with MAX_STATES_2022 should be trimmed.
319 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320 * the associated escape sequences starting with ESC ( B should be removed.
321 * This includes the ones with key values 1097 and all of the ones above 1000000.
323 * For the latter, the tables can simply be truncated.
324 * For the former, since the tables must be kept parallel, it is probably best
325 * to simply duplicate an adjacent table cell, parallel in all tables.
327 * It may make sense to restructure the tables, especially by using small search
328 * tables for the variants instead of indexing them parallel to the table here.
332 #define MAX_STATES_2022 74
333 static const int32_t escSeqStateTable_Key_2022
[MAX_STATES_2022
] = {
334 /* 0 1 2 3 4 5 6 7 8 9 */
336 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
337 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
338 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
339 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
340 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
341 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
342 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
343 ,35947631 ,35947635 ,35947636 ,35947638
346 #ifdef U_ENABLE_GENERIC_ISO_2022
348 static const char* const escSeqStateTable_Result_2022
[MAX_STATES_2022
] = {
349 /* 0 1 2 3 4 5 6 7 8 9 */
351 NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,"latin1" ,"latin1"
352 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
353 ,"latin1" ,NULL
,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL
,NULL
,NULL
,NULL
,"UTF8"
354 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL
,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
355 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
356 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
358 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
363 static const int8_t escSeqStateTable_Value_2022
[MAX_STATES_2022
] = {
364 /* 0 1 2 3 4 5 6 7 8 9 */
365 VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
366 ,VALID_MAYBE_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
371 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
372 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
375 /* Type def for refactoring changeState_2022 code*/
377 #ifdef U_ENABLE_GENERIC_ISO_2022
381 #if !UCONFIG_ONLY_HTML_CONVERSION
387 /*********** ISO 2022 Converter Protos ***********/
/*
 * Forward declaration of the open function.
 * The definition in this file allocates cnv->extraInfo as a UConverterDataISO2022
 * and configures it from pArgs->locale and the version bits of pArgs->options.
 * Problems are reported through *errorCode; the definition sets
 * U_MISSING_RESOURCE_ERROR and U_MEMORY_ALLOCATION_ERROR.
 */
388 static void U_CALLCONV
389 _ISO2022Open(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
);
/*
 * Forward declaration of the close function.
 * The definition in this file unloads the shared data cached in myConverterArray,
 * closes currentConverter, and frees converter->extraInfo unless isExtraLocal is set.
 */
391 static void U_CALLCONV
392 _ISO2022Close(UConverter
*converter
);
/*
 * Forward declaration of the reset function.
 * The definition in this file zeroes toU2022State when choice<=UCNV_RESET_TO_UNICODE
 * and fromU2022State when choice!=UCNV_RESET_TO_UNICODE; when the stored locale
 * starts with 'k' it also calls the corresponding KR initial-state helpers.
 */
394 static void U_CALLCONV
395 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
);
398 static const char * U_CALLCONV
399 _ISO2022getName(const UConverter
* cnv
);
402 static void U_CALLCONV
403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
);
406 static UConverter
* U_CALLCONV
407 _ISO_2022_SafeClone(const UConverter
*cnv
, void *stackBuffer
, int32_t *pBufferSize
, UErrorCode
*status
);
411 #ifdef U_ENABLE_GENERIC_ISO_2022
412 static void U_CALLCONV
413 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
, UErrorCode
* err
);
418 /*const UConverterSharedData _ISO2022Data;*/
419 extern const UConverterSharedData _ISO2022JPData
;
421 #if !UCONFIG_ONLY_HTML_CONVERSION
422 extern const UConverterSharedData _ISO2022KRData
;
423 extern const UConverterSharedData _ISO2022CNData
;
428 /*************** Converter implementations ******************/
430 /* The purpose of this function is to get around gcc compiler warnings. */
432 fromUWriteUInt8(UConverter
*cnv
,
433 const char *bytes
, int32_t length
,
434 uint8_t **target
, const char *targetLimit
,
437 UErrorCode
*pErrorCode
)
439 char *targetChars
= (char *)*target
;
440 ucnv_fromUWriteBytes(cnv
, bytes
, length
, &targetChars
, targetLimit
,
441 offsets
, sourceIndex
, pErrorCode
);
442 *target
= (uint8_t*)targetChars
;
447 setInitialStateToUnicodeKR(UConverter
* /*converter*/, UConverterDataISO2022
*myConverterData
){
448 if(myConverterData
->version
== 1) {
449 UConverter
*cnv
= myConverterData
->currentConverter
;
451 cnv
->toUnicodeStatus
=0; /* offset */
452 cnv
->mode
=0; /* state */
453 cnv
->toULength
=0; /* byteIndex */
458 setInitialStateFromUnicodeKR(UConverter
* converter
,UConverterDataISO2022
*myConverterData
){
459 /* in ISO-2022-KR the designator sequence appears only once
460 * in a file so we append it only once
462 if( converter
->charErrorBufferLength
==0){
464 converter
->charErrorBufferLength
= 4;
465 converter
->charErrorBuffer
[0] = 0x1b;
466 converter
->charErrorBuffer
[1] = 0x24;
467 converter
->charErrorBuffer
[2] = 0x29;
468 converter
->charErrorBuffer
[3] = 0x43;
470 if(myConverterData
->version
== 1) {
471 UConverter
*cnv
= myConverterData
->currentConverter
;
474 cnv
->fromUnicodeStatus
=1; /* prevLength */
478 static void U_CALLCONV
479 _ISO2022Open(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
){
481 char myLocale
[6]={' ',' ',' ',' ',' ',' '};
483 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataISO2022
));
484 if(cnv
->extraInfo
!= NULL
) {
485 UConverterNamePieces stackPieces
;
486 UConverterLoadArgs stackArgs
=UCNV_LOAD_ARGS_INITIALIZER
;
487 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
490 stackArgs
.onlyTestIsLoadable
= pArgs
->onlyTestIsLoadable
;
492 uprv_memset(myConverterData
, 0, sizeof(UConverterDataISO2022
));
493 myConverterData
->currentType
= ASCII1
;
494 cnv
->fromUnicodeStatus
=FALSE
;
496 uprv_strncpy(myLocale
, pArgs
->locale
, sizeof(myLocale
));
498 version
= pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
;
499 myConverterData
->version
= version
;
500 if(myLocale
[0]=='j' && (myLocale
[1]=='a'|| myLocale
[1]=='p') &&
501 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
503 /* open the required converters and cache them */
504 if(version
>MAX_JA_VERSION
) {
505 // ICU 55 fails to open a converter for an unsupported version.
506 // Previously, it fell back to version 0, but that would yield
507 // unexpected behavior.
508 *errorCode
= U_MISSING_RESOURCE_ERROR
;
511 if(jpCharsetMasks
[version
]&CSM(ISO8859_7
)) {
512 myConverterData
->myConverterArray
[ISO8859_7
] =
513 ucnv_loadSharedData("ISO8859_7", &stackPieces
, &stackArgs
, errorCode
);
515 myConverterData
->myConverterArray
[JISX208
] =
516 ucnv_loadSharedData("Shift-JIS", &stackPieces
, &stackArgs
, errorCode
);
517 if(jpCharsetMasks
[version
]&CSM(JISX212
)) {
518 myConverterData
->myConverterArray
[JISX212
] =
519 ucnv_loadSharedData("jisx-212", &stackPieces
, &stackArgs
, errorCode
);
521 if(jpCharsetMasks
[version
]&CSM(GB2312
)) {
522 myConverterData
->myConverterArray
[GB2312
] =
523 ucnv_loadSharedData("ibm-5478", &stackPieces
, &stackArgs
, errorCode
); /* gb_2312_80-1 */
525 if(jpCharsetMasks
[version
]&CSM(KSC5601
)) {
526 myConverterData
->myConverterArray
[KSC5601
] =
527 ucnv_loadSharedData("ksc_5601", &stackPieces
, &stackArgs
, errorCode
);
530 /* set the function pointers to appropriate functions */
531 cnv
->sharedData
=(UConverterSharedData
*)(&_ISO2022JPData
);
532 uprv_strcpy(myConverterData
->locale
,"ja");
534 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ja,version=");
535 size_t len
= uprv_strlen(myConverterData
->name
);
536 myConverterData
->name
[len
]=(char)(myConverterData
->version
+(int)'0');
537 myConverterData
->name
[len
+1]='\0';
539 #if !UCONFIG_ONLY_HTML_CONVERSION
540 else if(myLocale
[0]=='k' && (myLocale
[1]=='o'|| myLocale
[1]=='r') &&
541 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
544 // ICU 55 fails to open a converter for an unsupported version.
545 // Previously, it fell back to version 0, but that would yield
546 // unexpected behavior.
547 *errorCode
= U_MISSING_RESOURCE_ERROR
;
552 cnvName
="icu-internal-25546";
555 myConverterData
->version
=version
=0;
557 if(pArgs
->onlyTestIsLoadable
) {
558 ucnv_canCreateConverter(cnvName
, errorCode
); /* errorCode carries result */
559 uprv_free(cnv
->extraInfo
);
563 myConverterData
->currentConverter
=ucnv_open(cnvName
, errorCode
);
564 if (U_FAILURE(*errorCode
)) {
570 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=1");
571 uprv_memcpy(cnv
->subChars
, myConverterData
->currentConverter
->subChars
, 4);
572 cnv
->subCharLen
= myConverterData
->currentConverter
->subCharLen
;
574 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=0");
577 /* initialize the state variables */
578 setInitialStateToUnicodeKR(cnv
, myConverterData
);
579 setInitialStateFromUnicodeKR(cnv
, myConverterData
);
581 /* set the function pointers to appropriate functions */
582 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022KRData
;
583 uprv_strcpy(myConverterData
->locale
,"ko");
586 else if(((myLocale
[0]=='z' && myLocale
[1]=='h') || (myLocale
[0]=='c'&& myLocale
[1]=='n'))&&
587 (myLocale
[2]=='_' || myLocale
[2]=='\0'))
590 // ICU 55 fails to open a converter for an unsupported version.
591 // Previously, it fell back to version 0, but that would yield
592 // unexpected behavior.
593 *errorCode
= U_MISSING_RESOURCE_ERROR
;
597 /* open the required converters and cache them */
598 myConverterData
->myConverterArray
[GB2312_1
] =
599 ucnv_loadSharedData("ibm-5478", &stackPieces
, &stackArgs
, errorCode
);
601 myConverterData
->myConverterArray
[ISO_IR_165
] =
602 ucnv_loadSharedData("iso-ir-165", &stackPieces
, &stackArgs
, errorCode
);
604 myConverterData
->myConverterArray
[CNS_11643
] =
605 ucnv_loadSharedData("cns-11643-1992", &stackPieces
, &stackArgs
, errorCode
);
608 /* set the function pointers to appropriate functions */
609 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022CNData
;
610 uprv_strcpy(myConverterData
->locale
,"cn");
613 myConverterData
->version
= 0;
614 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=0");
615 }else if (version
==1){
616 myConverterData
->version
= 1;
617 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=1");
619 myConverterData
->version
= 2;
620 (void)uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=2");
623 #endif // !UCONFIG_ONLY_HTML_CONVERSION
625 #ifdef U_ENABLE_GENERIC_ISO_2022
626 myConverterData
->isFirstBuffer
= TRUE
;
628 /* append the UTF-8 escape sequence */
629 cnv
->charErrorBufferLength
= 3;
630 cnv
->charErrorBuffer
[0] = 0x1b;
631 cnv
->charErrorBuffer
[1] = 0x25;
632 cnv
->charErrorBuffer
[2] = 0x42;
634 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022Data
;
635 /* initialize the state variables */
636 uprv_strcpy(myConverterData
->name
,"ISO_2022");
638 *errorCode
= U_MISSING_RESOURCE_ERROR
;
639 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640 // data loading error code.
645 cnv
->maxBytesPerUChar
=cnv
->sharedData
->staticData
->maxBytesPerChar
;
647 if(U_FAILURE(*errorCode
) || pArgs
->onlyTestIsLoadable
) {
651 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
656 static void U_CALLCONV
657 _ISO2022Close(UConverter
*converter
) {
658 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
659 UConverterSharedData
**array
= myData
->myConverterArray
;
662 if (converter
->extraInfo
!= NULL
) {
663 /*close the array of converter pointers and free the memory*/
664 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
666 ucnv_unloadSharedDataIfReady(array
[i
]);
670 ucnv_close(myData
->currentConverter
);
672 if(!converter
->isExtraLocal
){
673 uprv_free (converter
->extraInfo
);
674 converter
->extraInfo
= NULL
;
679 static void U_CALLCONV
680 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
) {
681 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
682 if(choice
<=UCNV_RESET_TO_UNICODE
) {
683 uprv_memset(&myConverterData
->toU2022State
, 0, sizeof(ISO2022State
));
684 myConverterData
->key
= 0;
685 myConverterData
->isEmptySegment
= FALSE
;
687 if(choice
!=UCNV_RESET_TO_UNICODE
) {
688 uprv_memset(&myConverterData
->fromU2022State
, 0, sizeof(ISO2022State
));
690 #ifdef U_ENABLE_GENERIC_ISO_2022
691 if(myConverterData
->locale
[0] == 0){
692 if(choice
<=UCNV_RESET_TO_UNICODE
) {
693 myConverterData
->isFirstBuffer
= TRUE
;
694 myConverterData
->key
= 0;
695 if (converter
->mode
== UCNV_SO
){
696 ucnv_close (myConverterData
->currentConverter
);
697 myConverterData
->currentConverter
=NULL
;
699 converter
->mode
= UCNV_SI
;
701 if(choice
!=UCNV_RESET_TO_UNICODE
) {
702 /* re-append UTF-8 escape sequence */
703 converter
->charErrorBufferLength
= 3;
704 converter
->charErrorBuffer
[0] = 0x1b;
705 converter
->charErrorBuffer
[1] = 0x28;
706 converter
->charErrorBuffer
[2] = 0x42;
712 /* reset the state variables */
713 if(myConverterData
->locale
[0] == 'k'){
714 if(choice
<=UCNV_RESET_TO_UNICODE
) {
715 setInitialStateToUnicodeKR(converter
, myConverterData
);
717 if(choice
!=UCNV_RESET_TO_UNICODE
) {
718 setInitialStateFromUnicodeKR(converter
, myConverterData
);
726 static const char * U_CALLCONV
727 _ISO2022getName(const UConverter
* cnv
){
729 UConverterDataISO2022
* myData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
738 /*************** to unicode *******************/
739 /****************************************************************************
740 * Recognized escape sequences are
752 static const int8_t nextStateToUnicodeJP
[MAX_STATES_2022
]= {
753 /* 0 1 2 3 4 5 6 7 8 9 */
754 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
755 ,ASCII
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,JISX201
,HWKANA_7BIT
,JISX201
,INVALID_STATE
756 ,INVALID_STATE
,INVALID_STATE
,JISX208
,GB2312
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
757 ,ISO8859_1
,ISO8859_7
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,KSC5601
,JISX212
,INVALID_STATE
758 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
759 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
760 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
761 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
764 #if !UCONFIG_ONLY_HTML_CONVERSION
765 /*************** to unicode *******************/
766 static const int8_t nextStateToUnicodeCN
[MAX_STATES_2022
]= {
767 /* 0 1 2 3 4 5 6 7 8 9 */
768 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,SS3_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
769 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
770 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
771 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
772 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,GB2312_1
,INVALID_STATE
,ISO_IR_165
773 ,CNS_11643_1
,CNS_11643_2
,CNS_11643_3
,CNS_11643_4
,CNS_11643_5
,CNS_11643_6
,CNS_11643_7
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
774 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
775 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
780 static UCNV_TableStates_2022
781 getKey_2022(char c
,int32_t* key
,int32_t* offset
){
784 int32_t hi
= MAX_STATES_2022
;
787 togo
= normalize_esq_chars_2022
[(uint8_t)c
];
789 /* not a valid character anywhere in an escape sequence */
794 togo
= (*key
<< 5) + togo
;
796 while (hi
!= low
) /*binary search*/{
798 int32_t mid
= (hi
+low
) >> 1; /*Finds median*/
803 if (escSeqStateTable_Key_2022
[mid
] > togo
){
806 else if (escSeqStateTable_Key_2022
[mid
] < togo
){
809 else /*we found it*/{
812 return (UCNV_TableStates_2022
)escSeqStateTable_Value_2022
[mid
];
823 /*runs through a state machine to determine the escape sequence - codepage correspondence
826 changeState_2022(UConverter
* _this
,
828 const char* sourceLimit
,
831 UCNV_TableStates_2022 value
;
832 UConverterDataISO2022
* myData2022
= ((UConverterDataISO2022
*)_this
->extraInfo
);
833 uint32_t key
= myData2022
->key
;
835 int8_t initialToULength
= _this
->toULength
;
838 value
= VALID_NON_TERMINAL_2022
;
839 while (*source
< sourceLimit
) {
841 _this
->toUBytes
[_this
->toULength
++]=(uint8_t)c
;
842 value
= getKey_2022(c
,(int32_t *) &key
, &offset
);
846 case VALID_NON_TERMINAL_2022
:
847 /* continue with the loop */
850 case VALID_TERMINAL_2022
:
857 case VALID_MAYBE_TERMINAL_2022
:
858 #ifdef U_ENABLE_GENERIC_ISO_2022
859 /* ESC ( B is ambiguous only for ISO_2022 itself */
860 if(var
== ISO_2022
) {
861 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862 _this
->toULength
= 0;
864 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
866 /* continue with the loop */
867 value
= VALID_NON_TERMINAL_2022
;
872 /* not ISO_2022 itself, finish here */
873 value
= VALID_TERMINAL_2022
;
881 myData2022
->key
= key
;
883 if (value
== VALID_NON_TERMINAL_2022
) {
884 /* indicate that the escape sequence is incomplete: key!=0 */
886 } else if (value
== INVALID_2022
) {
887 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
888 } else /* value == VALID_TERMINAL_2022 */ {
890 #ifdef U_ENABLE_GENERIC_ISO_2022
893 const char *chosenConverterName
= escSeqStateTable_Result_2022
[offset
];
894 if(chosenConverterName
== NULL
) {
896 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
897 _this
->toUCallbackReason
= UCNV_UNASSIGNED
;
901 _this
->mode
= UCNV_SI
;
902 ucnv_close(myData2022
->currentConverter
);
903 myData2022
->currentConverter
= myUConverter
= ucnv_open(chosenConverterName
, err
);
904 if(U_SUCCESS(*err
)) {
905 myUConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
906 _this
->mode
= UCNV_SO
;
913 StateEnum tempState
=(StateEnum
)nextStateToUnicodeJP
[offset
];
916 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
919 if(myData2022
->toU2022State
.cs
[2]!=0) {
920 if(myData2022
->toU2022State
.g
<2) {
921 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
923 myData2022
->toU2022State
.g
=2;
925 /* illegal to have SS2 before a matching designator */
926 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
929 /* case SS3_STATE: not used in ISO-2022-JP-x */
932 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
933 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
935 /* G2 charset for SS2 */
936 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
940 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
941 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
944 myData2022
->toU2022State
.cs
[0]=(int8_t)tempState
;
950 #if !UCONFIG_ONLY_HTML_CONVERSION
953 StateEnum tempState
=(StateEnum
)nextStateToUnicodeCN
[offset
];
956 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
959 if(myData2022
->toU2022State
.cs
[2]!=0) {
960 if(myData2022
->toU2022State
.g
<2) {
961 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
963 myData2022
->toU2022State
.g
=2;
965 /* illegal to have SS2 before a matching designator */
966 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
970 if(myData2022
->toU2022State
.cs
[3]!=0) {
971 if(myData2022
->toU2022State
.g
<2) {
972 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
974 myData2022
->toU2022State
.g
=3;
976 /* illegal to have SS3 before a matching designator */
977 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
981 if(myData2022
->version
==0) {
982 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
989 myData2022
->toU2022State
.cs
[1]=(int8_t)tempState
;
992 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
995 /* other CNS 11643 planes */
996 if(myData2022
->version
==0) {
997 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
999 myData2022
->toU2022State
.cs
[3]=(int8_t)tempState
;
1007 /* nothing to be done, just accept this one escape sequence */
1009 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
1012 #endif // !UCONFIG_ONLY_HTML_CONVERSION
1015 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
1019 if(U_SUCCESS(*err
)) {
1020 _this
->toULength
= 0;
1021 } else if(*err
==U_ILLEGAL_ESCAPE_SEQUENCE
) {
1022 if(_this
->toULength
>1) {
1024 * Ticket 5691: consistent illegal sequences:
1025 * - We include at least the first byte (ESC) in the illegal sequence.
1026 * - If any of the non-initial bytes could be the start of a character,
1027 * we stop the illegal sequence before the first one of those.
1028 * In escape sequences, all following bytes are "printable", that is,
1029 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030 * they are valid single/lead bytes.
1031 * For simplicity, we always only report the initial ESC byte as the
1032 * illegal sequence and back out all other bytes we looked at.
1034 /* Back out some bytes. */
1035 int8_t backOutDistance
=_this
->toULength
-1;
1036 int8_t bytesFromThisBuffer
=_this
->toULength
-initialToULength
;
1037 if(backOutDistance
<=bytesFromThisBuffer
) {
1038 /* same as initialToULength<=1 */
1039 *source
-=backOutDistance
;
1041 /* Back out bytes from the previous buffer: Need to replay them. */
1042 _this
->preToULength
=(int8_t)(bytesFromThisBuffer
-backOutDistance
);
1043 /* same as -(initialToULength-1) */
1044 /* preToULength is negative! */
1045 uprv_memcpy(_this
->preToU
, _this
->toUBytes
+1, -_this
->preToULength
);
1046 *source
-=bytesFromThisBuffer
;
1050 } else if(*err
==U_UNSUPPORTED_ESCAPE_SEQUENCE
) {
1051 _this
->toUCallbackReason
= UCNV_UNASSIGNED
;
1055 #if !UCONFIG_ONLY_HTML_CONVERSION
1056 /*Checks the characters of the buffer against valid 2022 escape sequences
1057 *if they match we return a pointer to the initial start of the sequence; otherwise
1058 *we return sourceLimit
1060 /*for 2022 looks ahead in the stream
1061 *to determine the longest possible convertible
1064 static inline const char*
1065 getEndOfBuffer_2022(const char** source
,
1066 const char* sourceLimit
,
1069 const char* mySource
= *source
;
1071 #ifdef U_ENABLE_GENERIC_ISO_2022
1072 if (*source
>= sourceLimit
)
1077 if (*mySource
== ESC_2022
){
1081 UCNV_TableStates_2022 value
= VALID_NON_TERMINAL_2022
;
1083 /* Kludge: I could not
1084 * figure out the reason for validating an escape sequence
1085 * twice - once here and once in changeState_2022().
1086 * is it possible to have an ESC character in a ISO2022
1087 * byte stream which is valid in a code page? Is it legal?
1090 (mySource
+i
< sourceLimit
)&&(value
== VALID_NON_TERMINAL_2022
);
1092 value
= getKey_2022(*(mySource
+i
), &key
, &offset
);
1094 if (value
> 0 || *mySource
==ESC_2022
)
1097 if ((value
== VALID_NON_TERMINAL_2022
)&&(!flush
) )
1100 }while (++mySource
< sourceLimit
);
1104 while(mySource
< sourceLimit
&& *mySource
!= ESC_2022
) {
1112 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113 * any future change in _MBCSFromUChar32() function should be reflected here.
1114 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1116 static inline int32_t
1117 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData
* sharedData
,
1124 const uint16_t *table
;
1125 uint32_t stage2Entry
;
1130 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131 * Use internal version of ucnv_open() that verifies that the new structures are available,
1132 * else U_INTERNAL_PROGRAM_ERROR.
1134 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135 if(c
<0x10000 || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1136 table
=sharedData
->mbcs
.fromUnicodeTable
;
1137 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
1138 /* get the bytes and the length for the output */
1139 if(outputType
==MBCS_OUTPUT_2
){
1140 myValue
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1146 } else /* outputType==MBCS_OUTPUT_3 */ {
1147 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
1148 myValue
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
1151 } else if(myValue
<=0xffff) {
1157 /* is this code point assigned, or do we use fallbacks? */
1158 if((stage2Entry
&(1<<(16+(c
&0xf))))!=0) {
1162 } else if(FROM_U_USE_FALLBACK(useFallback
, c
) && myValue
!=0) {
1164 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165 * There is no way with this data structure for fallback output
1166 * to be a zero byte.
1173 cx
=sharedData
->mbcs
.extIndexes
;
1175 return ucnv_extSimpleMatchFromU(cx
, c
, value
, useFallback
);
1182 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184 * @param retval pointer to output byte
1185 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1187 static inline int32_t
1188 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData
* sharedData
,
1193 const uint16_t *table
;
1195 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1199 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200 table
=sharedData
->mbcs
.fromUnicodeTable
;
1201 /* get the byte for the output */
1202 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
1203 /* is this code point assigned, or do we use fallbacks? */
1204 *retval
=(uint32_t)(value
&0xff);
1206 return 1; /* roundtrip */
1207 } else if(useFallback
? value
>=0x800 : value
>=0xc00) {
1208 return -1; /* fallback taken */
1210 return 0; /* no mapping */
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /*
     * The 16-bit test bounds the byte pair as a whole (and thereby the lead
     * byte); the 8-bit test bounds the trail byte on its own. Unsigned
     * wrap-around turns each "lo <= x <= hi" check into one comparison.
     */
    if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1231 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1233 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1234 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1237 static inline uint32_t
1238 _2022ToGR94DBCS(uint32_t value
) {
1239 uint32_t returnValue
= value
+ 0x8080;
1240 if( (uint16_t)(returnValue
- 0xa1a1) <= (0xfefe - 0xa1a1) &&
1241 (uint8_t)(returnValue
- 0xa1) <= (0xfe - 0xa1)) {
1249 #ifdef U_ENABLE_GENERIC_ISO_2022
1251 /**********************************************************************************
1252 * ISO-2022 Converter
1257 static void U_CALLCONV
1258 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
,
1260 const char* mySourceLimit
, *realSourceLimit
;
1261 const char* sourceStart
;
1262 const UChar
* myTargetStart
;
1263 UConverter
* saveThis
;
1264 UConverterDataISO2022
* myData
;
1267 saveThis
= args
->converter
;
1268 myData
=((UConverterDataISO2022
*)(saveThis
->extraInfo
));
1270 realSourceLimit
= args
->sourceLimit
;
1271 while (args
->source
< realSourceLimit
) {
1272 if(myData
->key
== 0) { /* are we in the middle of an escape sequence? */
1273 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274 mySourceLimit
= getEndOfBuffer_2022(&(args
->source
), realSourceLimit
, args
->flush
);
1276 if(args
->source
< mySourceLimit
) {
1277 if(myData
->currentConverter
==NULL
) {
1278 myData
->currentConverter
= ucnv_open("ASCII",err
);
1279 if(U_FAILURE(*err
)){
1283 myData
->currentConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
1284 saveThis
->mode
= UCNV_SO
;
1287 /* convert to before the ESC or until the end of the buffer */
1288 myData
->isFirstBuffer
=FALSE
;
1289 sourceStart
= args
->source
;
1290 myTargetStart
= args
->target
;
1291 args
->converter
= myData
->currentConverter
;
1292 ucnv_toUnicode(args
->converter
,
1298 (UBool
)(args
->flush
&& mySourceLimit
== realSourceLimit
),
1300 args
->converter
= saveThis
;
1302 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
1303 /* move the overflow buffer */
1304 length
= saveThis
->UCharErrorBufferLength
= myData
->currentConverter
->UCharErrorBufferLength
;
1305 myData
->currentConverter
->UCharErrorBufferLength
= 0;
1307 uprv_memcpy(saveThis
->UCharErrorBuffer
,
1308 myData
->currentConverter
->UCharErrorBuffer
,
1309 length
*U_SIZEOF_UCHAR
);
1316 * -Error while converting
1317 * -Done with entire buffer
1318 * -Need to write offsets or update the current offset
1319 * (leave that up to the code in ucnv.c)
1321 * or else we just stopped at an ESC byte and continue with changeState_2022()
1323 if (U_FAILURE(*err
) ||
1324 (args
->source
== realSourceLimit
) ||
1325 (args
->offsets
!= NULL
&& (args
->target
!= myTargetStart
|| args
->source
!= sourceStart
) ||
1326 (mySourceLimit
< realSourceLimit
&& myData
->currentConverter
->toULength
> 0))
1328 /* copy partial or error input for truncated detection and error handling */
1329 if(U_FAILURE(*err
)) {
1330 length
= saveThis
->invalidCharLength
= myData
->currentConverter
->invalidCharLength
;
1332 uprv_memcpy(saveThis
->invalidCharBuffer
, myData
->currentConverter
->invalidCharBuffer
, length
);
1335 length
= saveThis
->toULength
= myData
->currentConverter
->toULength
;
1337 uprv_memcpy(saveThis
->toUBytes
, myData
->currentConverter
->toUBytes
, length
);
1338 if(args
->source
< mySourceLimit
) {
1339 *err
= U_TRUNCATED_CHAR_FOUND
; /* truncated input before ESC */
1348 sourceStart
= args
->source
;
1349 changeState_2022(args
->converter
,
1354 if (U_FAILURE(*err
) || (args
->source
!= sourceStart
&& args
->offsets
!= NULL
)) {
1355 /* let the ucnv.c code update its current offset */
1364 * To Unicode Callback helper function
1367 toUnicodeCallback(UConverter
*cnv
,
1368 const uint32_t sourceChar
, const uint32_t targetUniChar
,
1370 if(sourceChar
>0xff){
1371 cnv
->toUBytes
[0] = (uint8_t)(sourceChar
>>8);
1372 cnv
->toUBytes
[1] = (uint8_t)sourceChar
;
1376 cnv
->toUBytes
[0] =(char) sourceChar
;
1380 if(targetUniChar
== (missingCharMarker
-1/*0xfffe*/)){
1381 *err
= U_INVALID_CHAR_FOUND
;
1384 *err
= U_ILLEGAL_CHAR_FOUND
;
1388 /**************************************ISO-2022-JP*************************************************/
1390 /************************************** IMPORTANT **************************************************
1391 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393 * The converter iterates over each Unicode codepoint
1394 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1396 * would do as far as possible.
1398 * If the implementation of these macros or structure of sharedData struct change in the future, make
1399 * sure that ISO-2022 is also changed.
1400 ***************************************************************************************************
1403 /***************************************************************************************************
1404 * Rules for ISO-2022-jp encoding
1405 * (i) Escape sequences must be fully contained within a line they should not
1406 * span new lines or CRs
1407 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1408 * JIS-Roman character escape sequence should follow before the line terminates
1409 * (iii) If the first character on the line is represented by two bytes then a two
1410 * byte character escape sequence should precede it
1411 * (iv) If no escape sequence is encountered then the characters are ASCII
1412 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413 * and invoked with SS2 (ESC N).
1414 * (vi) If there is any G0 designation in text, there must be a switch to
1415 * ASCII or to JIS X 0201-Roman before a space character (but not
1416 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417 * characters such as tab or CRLF.
1418 * (vii) Supported encodings:
1419 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1423 * JISX201, JISX208,JISX212 : new .cnv data files created
1424 * KSC5601 : alias to ibm-949 mapping table
1425 * GB2312 : alias to ibm-1386 mapping table
1426 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427 * ISO-8859-7 : alias to ibm-9409 mapping table
1430 /* preference order of JP charsets */
1431 static const StateEnum jpCharsetPref
[]={
/*
 * Designation escape sequences, one per charset.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * Each entry is NUL-terminated; the longest sequence is 4 bytes, so 6 bytes
 * per row is sufficient.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};
/*
 * Byte lengths of the designation escape sequences, one per charset,
 * listed in the same order as the escape sequences themselves.
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1472 * The iteration over various code pages works this way:
1473 * i) Get the currentState from myConverterData->currentState
1474 * ii) Check if the character is mapped to a valid character in the currentState
1475 * Yes -> a) set the initIterState to currentState
1476 * b) remain in this state until an invalid character is found
1477 * No -> a) go to the next code page and find the character
1478 * iii) Before changing the state, increment the current state and check if the current state
1479 * is equal to initIterState
1480 * Yes -> A character that cannot be represented in any of the supported encodings
1481 * break and return a U_INVALID_CHARACTER error
1482 * No -> Continue and find the character in next code page
1485 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two positions differ from the identity mapping:
 * 0x5C maps to U+00A5 and 0x7E maps to U+203E.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    if(value < 0x5c) {
        return value;
    } else if(value == 0x5c) {
        return 0xa5;
    } else if(value == 0x7e) {
        return 0x203e;
    } else /* value <= 0x7f */ {
        return value;
    }
}
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte positions are taken
 * by U+00A5 and U+203E respectively.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    value &= 0xff00;  /* lead byte */
    /* Collapse the two Shift-JIS lead byte ranges into one contiguous range. */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    /* Each Shift-JIS lead byte covers two JIS rows. */
    value <<= 1;

    if(trail <= 0x9e) {
        /* first (odd) row of the pair */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            /* trail bytes skip 0x7f in Shift-JIS */
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        /* second (even) row of the pair */
        value |= trail - 0x7e;
    }
    return value;
}
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * c1 is the JIS lead byte and c2 the JIS trail byte; the resulting
 * Shift-JIS lead and trail bytes are written to bytes[0] and bytes[1].
 * An invalid byte is written as 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: lower half of the Shift-JIS trail byte range */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;  /* skip 0x7f */
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even row: upper half of the Shift-JIS trail byte range */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    /* Two JIS rows share one Shift-JIS lead byte. */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1591 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1593 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595 * These were the only fallbacks in ICU's jisx-208.ucm file.
1597 static const uint16_t hwkana_fb
[HWKANA_END
- HWKANA_START
+ 1] = {
1598 0x2123, /* U+FF61 */
1613 0x213C, /* U+FF70 */
1629 0x253F, /* U+FF80 */
1645 0x255F, /* U+FF90 */
1663 static void U_CALLCONV
1664 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
1665 UConverter
*cnv
= args
->converter
;
1666 UConverterDataISO2022
*converterData
;
1667 ISO2022State
*pFromU2022State
;
1668 uint8_t *target
= (uint8_t *) args
->target
;
1669 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
1670 const UChar
* source
= args
->source
;
1671 const UChar
* sourceLimit
= args
->sourceLimit
;
1672 int32_t* offsets
= args
->offsets
;
1675 int32_t len
, outLen
;
1677 int32_t choiceCount
;
1678 uint32_t targetValue
= 0;
1684 /* set up the state */
1685 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
1686 pFromU2022State
= &converterData
->fromU2022State
;
1690 /* check if the last codepoint of previous buffer was a lead surrogate*/
1691 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
1695 while(source
< sourceLimit
) {
1696 if(target
< targetLimit
) {
1698 sourceChar
= *(source
++);
1699 /*check if the char is a First surrogate*/
1700 if(U16_IS_SURROGATE(sourceChar
)) {
1701 if(U16_IS_SURROGATE_LEAD(sourceChar
)) {
1703 /*look ahead to find the trail surrogate*/
1704 if(source
< sourceLimit
) {
1705 /* test the following code unit */
1706 UChar trail
=(UChar
) *source
;
1707 if(U16_IS_TRAIL(trail
)) {
1709 sourceChar
=U16_GET_SUPPLEMENTARY(sourceChar
, trail
);
1710 cnv
->fromUChar32
=0x00;
1711 /* convert this supplementary code point */
1712 /* exit this condition tree */
1714 /* this is an unmatched lead code unit (1st surrogate) */
1715 /* callback(illegal) */
1716 *err
=U_ILLEGAL_CHAR_FOUND
;
1717 cnv
->fromUChar32
=sourceChar
;
1722 cnv
->fromUChar32
=sourceChar
;
1726 /* this is an unmatched trail code unit (2nd surrogate) */
1727 /* callback(illegal) */
1728 *err
=U_ILLEGAL_CHAR_FOUND
;
1729 cnv
->fromUChar32
=sourceChar
;
1734 /* do not convert SO/SI/ESC */
1735 if(IS_2022_CONTROL(sourceChar
)) {
1736 /* callback(illegal) */
1737 *err
=U_ILLEGAL_CHAR_FOUND
;
1738 cnv
->fromUChar32
=sourceChar
;
1742 /* do the conversion */
1744 if(choiceCount
== 0) {
1748 * The csm variable keeps track of which charsets are allowed
1749 * and not used yet while building the choices[].
1751 csm
= jpCharsetMasks
[converterData
->version
];
1754 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1755 if(converterData
->version
== 3 || converterData
->version
== 4) {
1756 choices
[choiceCount
++] = (int8_t)HWKANA_7BIT
;
1758 /* Do not try single-byte half-width Katakana for other versions. */
1759 csm
&= ~CSM(HWKANA_7BIT
);
1761 /* try the current G0 charset */
1762 choices
[choiceCount
++] = cs
= pFromU2022State
->cs
[0];
1765 /* try the current G2 charset */
1766 if((cs
= pFromU2022State
->cs
[2]) != 0) {
1767 choices
[choiceCount
++] = cs
;
1771 /* try all the other possible charsets */
1772 for(i
= 0; i
< UPRV_LENGTHOF(jpCharsetPref
); ++i
) {
1773 cs
= (int8_t)jpCharsetPref
[i
];
1775 choices
[choiceCount
++] = cs
;
1783 * len==0: no mapping found yet
1784 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1785 * len>0: found a roundtrip result, done
1789 * We will turn off useFallback after finding a fallback,
1790 * but we still get fallbacks from PUA code points as usual.
1791 * Therefore, we will also need to check that we don't overwrite
1792 * an early fallback with a later one.
1794 useFallback
= cnv
->useFallback
;
1796 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
1799 int8_t cs0
= choices
[i
];
1802 if(sourceChar
<= 0x7f) {
1803 targetValue
= (uint32_t)sourceChar
;
1810 if(GR96_START
<= sourceChar
&& sourceChar
<= GR96_END
) {
1811 targetValue
= (uint32_t)sourceChar
- 0x80;
1818 if((uint32_t)(sourceChar
- HWKANA_START
) <= (HWKANA_END
- HWKANA_START
)) {
1819 if(converterData
->version
==3) {
1820 /* JIS7: use G1 (SO) */
1821 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1822 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0x21));
1824 pFromU2022State
->cs
[1] = cs
= cs0
; /* do not output an escape sequence */
1826 } else if(converterData
->version
==4) {
1827 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1828 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1829 targetValue
= (uint32_t)(sourceChar
- (HWKANA_START
- 0xa1));
1832 cs
= pFromU2022State
->cs
[0];
1833 if(IS_JP_DBCS(cs
)) {
1834 /* switch from a DBCS charset to JISX201 */
1835 cs
= (int8_t)JISX201
;
1837 /* else stay in the current G0 charset */
1840 /* else do not use HWKANA_7BIT with other versions */
1845 value
= jisx201FromU(sourceChar
);
1847 targetValue
= value
;
1851 useFallback
= FALSE
;
1855 /* G0 DBCS from Shift-JIS table */
1856 len2
= MBCS_FROM_UCHAR32_ISO2022(
1857 converterData
->myConverterArray
[cs0
],
1859 useFallback
, MBCS_OUTPUT_2
);
1860 if(len2
== 2 || (len2
== -2 && len
== 0)) { /* only accept DBCS: abs(len)==2 */
1861 value
= _2022FromSJIS(value
);
1863 targetValue
= value
;
1867 useFallback
= FALSE
;
1869 } else if(len
== 0 && useFallback
&&
1870 (uint32_t)(sourceChar
- HWKANA_START
) <= (HWKANA_END
- HWKANA_START
)) {
1871 targetValue
= hwkana_fb
[sourceChar
- HWKANA_START
];
1875 useFallback
= FALSE
;
1879 /* G0 SBCS forced to 7-bit output */
1880 len2
= MBCS_SINGLE_FROM_UCHAR32(
1881 converterData
->myConverterArray
[cs0
],
1884 if(len2
!= 0 && !(len2
< 0 && len
!= 0) && GR96_START
<= value
&& value
<= GR96_END
) {
1885 targetValue
= value
- 0x80;
1889 useFallback
= FALSE
;
1894 len2
= MBCS_FROM_UCHAR32_ISO2022(
1895 converterData
->myConverterArray
[cs0
],
1897 useFallback
, MBCS_OUTPUT_2
);
1898 if(len2
== 2 || (len2
== -2 && len
== 0)) { /* only accept DBCS: abs(len)==2 */
1899 if(cs0
== KSC5601
) {
1901 * Check for valid bytes for the encoding scheme.
1902 * This is necessary because the sub-converter (windows-949)
1903 * has a broader encoding scheme than is valid for 2022.
1905 value
= _2022FromGR94DBCS(value
);
1910 targetValue
= value
;
1914 useFallback
= FALSE
;
1922 len
= -len
; /* fallback */
1924 outLen
= 0; /* count output bytes */
1926 /* write SI if necessary (only for JIS7) */
1927 if(pFromU2022State
->g
== 1 && g
== 0) {
1928 buffer
[outLen
++] = UCNV_SI
;
1929 pFromU2022State
->g
= 0;
1932 /* write the designation sequence if necessary */
1933 if(cs
!= pFromU2022State
->cs
[g
]) {
1934 int32_t escLen
= escSeqCharsLen
[cs
];
1935 uprv_memcpy(buffer
+ outLen
, escSeqChars
[cs
], escLen
);
1937 pFromU2022State
->cs
[g
] = cs
;
1939 /* invalidate the choices[] */
1943 /* write the shift sequence if necessary */
1944 if(g
!= pFromU2022State
->g
) {
1946 /* case 0 handled before writing escapes */
1948 buffer
[outLen
++] = UCNV_SO
;
1949 pFromU2022State
->g
= 1;
1951 default: /* case 2 */
1952 buffer
[outLen
++] = 0x1b;
1953 buffer
[outLen
++] = 0x4e;
1955 /* no case 3: no SS3 in ISO-2022-JP-x */
1959 /* write the output bytes */
1961 buffer
[outLen
++] = (char)targetValue
;
1962 } else /* len == 2 */ {
1963 buffer
[outLen
++] = (char)(targetValue
>> 8);
1964 buffer
[outLen
++] = (char)targetValue
;
1968 * if we cannot find the character after checking all codepages
1969 * then this is an error
1971 *err
= U_INVALID_CHAR_FOUND
;
1972 cnv
->fromUChar32
=sourceChar
;
1976 if(sourceChar
== CR
|| sourceChar
== LF
) {
1977 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1978 pFromU2022State
->cs
[2] = 0;
1982 /* output outLen>0 bytes in buffer[] */
1984 *target
++ = buffer
[0];
1986 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
1988 } else if(outLen
== 2 && (target
+ 2) <= targetLimit
) {
1989 *target
++ = buffer
[0];
1990 *target
++ = buffer
[1];
1992 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
1993 *offsets
++ = sourceIndex
;
1994 *offsets
++ = sourceIndex
;
2000 &target
, (const char *)targetLimit
,
2001 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
2003 if(U_FAILURE(*err
)) {
2007 } /* end if(myTargetIndex<myTargetLength) */
2009 *err
=U_BUFFER_OVERFLOW_ERROR
;
2013 }/* end while(mySourceIndex<mySourceLength) */
2016 * the end of the input stream and detection of truncated input
2017 * are handled by the framework, but for ISO-2022-JP conversion
2018 * we need to be in ASCII mode at the very end
2022 * in SO mode or not in ASCII mode
2023 * end of input and no truncated input
2025 if( U_SUCCESS(*err
) &&
2026 (pFromU2022State
->g
!=0 || pFromU2022State
->cs
[0]!=ASCII
) &&
2027 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
2029 int32_t sourceIndex
;
2033 if(pFromU2022State
->g
!= 0) {
2034 buffer
[outLen
++] = UCNV_SI
;
2035 pFromU2022State
->g
= 0;
2038 if(pFromU2022State
->cs
[0] != ASCII
) {
2039 int32_t escLen
= escSeqCharsLen
[ASCII
];
2040 uprv_memcpy(buffer
+ outLen
, escSeqChars
[ASCII
], escLen
);
2042 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
2045 /* get the source index of the last input character */
2047 * TODO this would be simpler and more reliable if we used a pair
2048 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2049 * so that we could simply use the prevSourceIndex here;
2050 * this code gives an incorrect result for the rare case of an unmatched
2051 * trail surrogate that is alone in the last buffer of the text stream
2053 sourceIndex
=(int32_t)(source
-args
->source
);
2056 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2057 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2068 &target
, (const char *)targetLimit
,
2069 &offsets
, sourceIndex
,
2073 /*save the state and return */
2074 args
->source
= source
;
2075 args
->target
= (char*)target
;
2078 /*************** to unicode *******************/
2080 static void U_CALLCONV
2081 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2084 const char *mySource
= (char *) args
->source
;
2085 UChar
*myTarget
= args
->target
;
2086 const char *mySourceLimit
= args
->sourceLimit
;
2087 uint32_t targetUniChar
= 0x0000;
2088 uint32_t mySourceChar
= 0x0000;
2089 uint32_t tmpSourceChar
= 0x0000;
2090 UConverterDataISO2022
* myData
;
2091 ISO2022State
*pToU2022State
;
2094 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2095 pToU2022State
= &myData
->toU2022State
;
2097 if(myData
->key
!= 0) {
2098 /* continue with a partial escape sequence */
2100 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2101 /* continue with a partial double-byte character */
2102 mySourceChar
= args
->converter
->toUBytes
[0];
2103 args
->converter
->toULength
= 0;
2104 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2105 targetUniChar
= missingCharMarker
;
2109 while(mySource
< mySourceLimit
){
2111 targetUniChar
=missingCharMarker
;
2113 if(myTarget
< args
->targetLimit
){
2115 mySourceChar
= (unsigned char) *mySource
++;
2117 switch(mySourceChar
) {
2119 if(myData
->version
==3) {
2123 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2124 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
2129 if(myData
->version
==3) {
2130 /* JIS7: switch to G1 half-width Katakana */
2131 pToU2022State
->cs
[1] = (int8_t)HWKANA_7BIT
;
2135 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2136 myData
->isEmptySegment
= FALSE
; /* reset this, we have a different error */
2144 const char * mySourceBefore
= mySource
;
2145 int8_t toULengthBefore
= args
->converter
->toULength
;
2147 changeState_2022(args
->converter
,&(mySource
),
2148 mySourceLimit
, ISO_2022_JP
,err
);
2150 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2151 if(myData
->version
==0 && myData
->key
==0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
2152 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
2153 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
2154 args
->converter
->toULength
= (int8_t)(toULengthBefore
+ (mySource
- mySourceBefore
));
2158 /* invalid or illegal escape sequence */
2159 if(U_FAILURE(*err
)){
2160 args
->target
= myTarget
;
2161 args
->source
= mySource
;
2162 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
2165 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2166 if(myData
->key
==0) {
2167 myData
->isEmptySegment
= TRUE
;
2171 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2175 /* automatically reset to single-byte mode */
2176 if((StateEnum
)pToU2022State
->cs
[0] != ASCII
&& (StateEnum
)pToU2022State
->cs
[0] != JISX201
) {
2177 pToU2022State
->cs
[0] = (int8_t)ASCII
;
2179 pToU2022State
->cs
[2] = 0;
2180 pToU2022State
->g
= 0;
2183 /* convert one or two bytes */
2184 myData
->isEmptySegment
= FALSE
;
2185 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2186 if( (uint8_t)(mySourceChar
- 0xa1) <= (0xdf - 0xa1) && myData
->version
==4 &&
2189 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2190 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0xa1);
2192 /* return from a single-shift state to the previous one */
2193 if(pToU2022State
->g
>= 2) {
2194 pToU2022State
->g
=pToU2022State
->prevG
;
2198 if(mySourceChar
<= 0x7f) {
2199 targetUniChar
= mySourceChar
;
2203 if(mySourceChar
<= 0x7f) {
2204 targetUniChar
= mySourceChar
+ 0x80;
2206 /* return from a single-shift state to the previous one */
2207 pToU2022State
->g
=pToU2022State
->prevG
;
2210 if(mySourceChar
<= 0x7f) {
2211 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2213 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2214 myData
->myConverterArray
[cs
],
2215 mySourceChar
+ 0x80);
2217 /* return from a single-shift state to the previous one */
2218 pToU2022State
->g
=pToU2022State
->prevG
;
2221 if(mySourceChar
<= 0x7f) {
2222 targetUniChar
= jisx201ToU(mySourceChar
);
2226 if((uint8_t)(mySourceChar
- 0x21) <= (0x5f - 0x21)) {
2227 /* 7-bit halfwidth Katakana */
2228 targetUniChar
= mySourceChar
+ (HWKANA_START
- 0x21);
2233 if(mySource
< mySourceLimit
) {
2234 int leadIsOk
, trailIsOk
;
2237 trailByte
= (uint8_t)*mySource
;
2239 * Ticket 5691: consistent illegal sequences:
2240 * - We include at least the first byte in the illegal sequence.
2241 * - If any of the non-initial bytes could be the start of a character,
2242 * we stop the illegal sequence before the first one of those.
2244 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2245 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2246 * Otherwise we convert or report the pair of bytes.
2248 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
2249 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
2250 if (leadIsOk
&& trailIsOk
) {
2252 tmpSourceChar
= (mySourceChar
<< 8) | trailByte
;
2254 _2022ToSJIS((uint8_t)mySourceChar
, trailByte
, tempBuf
);
2255 mySourceChar
= tmpSourceChar
;
2257 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2258 mySourceChar
= tmpSourceChar
;
2259 if (cs
== KSC5601
) {
2260 tmpSourceChar
+= 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2262 tempBuf
[0] = (char)(tmpSourceChar
>> 8);
2263 tempBuf
[1] = (char)(tmpSourceChar
);
2265 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->myConverterArray
[cs
], tempBuf
, 2, FALSE
);
2266 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
2267 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2269 /* add another bit so that the code below writes 2 bytes in case of error */
2270 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
2273 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2274 args
->converter
->toULength
= 1;
2277 } /* End of inner switch */
2279 } /* End of outer switch */
2280 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
2282 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2284 *(myTarget
++)=(UChar
)targetUniChar
;
2286 else if(targetUniChar
> missingCharMarker
){
2287 /* disassemble the surrogate pair and write to output*/
2288 targetUniChar
-=0x0010000;
2289 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
2291 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2294 if(myTarget
< args
->targetLimit
){
2295 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2297 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2301 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
2302 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2307 /* Call the callback function*/
2308 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2312 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2313 *err
=U_BUFFER_OVERFLOW_ERROR
;
2318 args
->target
= myTarget
;
2319 args
->source
= mySource
;
2323 #if !UCONFIG_ONLY_HTML_CONVERSION
2324 /***************************************************************
2325 * Rules for ISO-2022-KR encoding
2326 * i) The KSC5601 designator sequence should appear only once in a file,
2327 * at the beginning of a line before any KSC5601 characters. This usually
2328 * means that it appears by itself on the first line of the file
2329 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2330 * and SI to shift into single byte mode
2332 static void U_CALLCONV
2333 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2335 UConverter
* saveConv
= args
->converter
;
2336 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*)saveConv
->extraInfo
;
2337 args
->converter
=myConverterData
->currentConverter
;
2339 myConverterData
->currentConverter
->fromUChar32
= saveConv
->fromUChar32
;
2340 ucnv_MBCSFromUnicodeWithOffsets(args
,err
);
2341 saveConv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
2343 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2344 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
2346 saveConv
->charErrorBuffer
,
2347 myConverterData
->currentConverter
->charErrorBuffer
,
2348 myConverterData
->currentConverter
->charErrorBufferLength
);
2350 saveConv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
2351 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
2353 args
->converter
=saveConv
;
2356 static void U_CALLCONV
2357 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2359 const UChar
*source
= args
->source
;
2360 const UChar
*sourceLimit
= args
->sourceLimit
;
2361 unsigned char *target
= (unsigned char *) args
->target
;
2362 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
2363 int32_t* offsets
= args
->offsets
;
2364 uint32_t targetByteUnit
= 0x0000;
2365 UChar32 sourceChar
= 0x0000;
2366 UBool isTargetByteDBCS
;
2367 UBool oldIsTargetByteDBCS
;
2368 UConverterDataISO2022
*converterData
;
2369 UConverterSharedData
* sharedData
;
2373 converterData
=(UConverterDataISO2022
*)args
->converter
->extraInfo
;
2374 /* if the version is 1 then the user is requesting
2375 * conversion with ibm-25546 pass the arguments to
2376 * MBCS converter and return
2378 if(converterData
->version
==1){
2379 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2383 /* initialize data */
2384 sharedData
= converterData
->currentConverter
->sharedData
;
2385 useFallback
= args
->converter
->useFallback
;
2386 isTargetByteDBCS
=(UBool
)args
->converter
->fromUnicodeStatus
;
2387 oldIsTargetByteDBCS
= isTargetByteDBCS
;
2389 isTargetByteDBCS
= (UBool
) args
->converter
->fromUnicodeStatus
;
2390 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
<targetLimit
) {
2393 while(source
< sourceLimit
){
2395 targetByteUnit
= missingCharMarker
;
2397 if(target
< (unsigned char*) args
->targetLimit
){
2398 sourceChar
= *source
++;
2400 /* do not convert SO/SI/ESC */
2401 if(IS_2022_CONTROL(sourceChar
)) {
2402 /* callback(illegal) */
2403 *err
=U_ILLEGAL_CHAR_FOUND
;
2404 args
->converter
->fromUChar32
=sourceChar
;
2408 length
= MBCS_FROM_UCHAR32_ISO2022(sharedData
,sourceChar
,&targetByteUnit
,useFallback
,MBCS_OUTPUT_2
);
2410 length
= -length
; /* fallback */
2412 /* only DBCS or SBCS characters are expected*/
2413 /* DB characters with high bit set to 1 are expected */
2414 if( length
> 2 || length
==0 ||
2415 (length
== 1 && targetByteUnit
> 0x7f) ||
2417 ((uint16_t)(targetByteUnit
- 0xa1a1) > (0xfefe - 0xa1a1) ||
2418 (uint8_t)(targetByteUnit
- 0xa1) > (0xfe - 0xa1)))
2420 targetByteUnit
=missingCharMarker
;
2422 if (targetByteUnit
!= missingCharMarker
){
2424 oldIsTargetByteDBCS
= isTargetByteDBCS
;
2425 isTargetByteDBCS
= (UBool
)(targetByteUnit
>0x00FF);
2426 /* append the shift sequence */
2427 if (oldIsTargetByteDBCS
!= isTargetByteDBCS
){
2429 if (isTargetByteDBCS
)
2430 *target
++ = UCNV_SO
;
2432 *target
++ = UCNV_SI
;
2434 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2436 /* write the targetUniChar to target */
2437 if(targetByteUnit
<= 0x00FF){
2438 if( target
< targetLimit
){
2439 *(target
++) = (unsigned char) targetByteUnit
;
2441 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2445 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
);
2446 *err
= U_BUFFER_OVERFLOW_ERROR
;
2449 if(target
< targetLimit
){
2450 *(target
++) =(unsigned char) ((targetByteUnit
>>8) -0x80);
2452 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2454 if(target
< targetLimit
){
2455 *(target
++) =(unsigned char) (targetByteUnit
-0x80);
2457 *(offsets
++) = (int32_t)(source
- args
->source
-1);
2460 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2461 *err
= U_BUFFER_OVERFLOW_ERROR
;
2464 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) ((targetByteUnit
>>8) -0x80);
2465 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
2466 *err
= U_BUFFER_OVERFLOW_ERROR
;
2472 /* oops.. the code point is unassingned
2473 * set the error and reason
2476 /*check if the char is a First surrogate*/
2477 if(U16_IS_SURROGATE(sourceChar
)) {
2478 if(U16_IS_SURROGATE_LEAD(sourceChar
)) {
2480 /*look ahead to find the trail surrogate*/
2481 if(source
< sourceLimit
) {
2482 /* test the following code unit */
2483 UChar trail
=(UChar
) *source
;
2484 if(U16_IS_TRAIL(trail
)) {
2486 sourceChar
=U16_GET_SUPPLEMENTARY(sourceChar
, trail
);
2487 *err
= U_INVALID_CHAR_FOUND
;
2488 /* convert this surrogate code point */
2489 /* exit this condition tree */
2491 /* this is an unmatched lead code unit (1st surrogate) */
2492 /* callback(illegal) */
2493 *err
=U_ILLEGAL_CHAR_FOUND
;
2497 *err
= U_ZERO_ERROR
;
2500 /* this is an unmatched trail code unit (2nd surrogate) */
2501 /* callback(illegal) */
2502 *err
=U_ILLEGAL_CHAR_FOUND
;
2505 /* callback(unassigned) for a BMP code point */
2506 *err
= U_INVALID_CHAR_FOUND
;
2509 args
->converter
->fromUChar32
=sourceChar
;
2512 } /* end if(myTargetIndex<myTargetLength) */
2514 *err
=U_BUFFER_OVERFLOW_ERROR
;
2518 }/* end while(mySourceIndex<mySourceLength) */
2521 * the end of the input stream and detection of truncated input
2522 * are handled by the framework, but for ISO-2022-KR conversion
2523 * we need to be in ASCII mode at the very end
2528 * end of input and no truncated input
2530 if( U_SUCCESS(*err
) &&
2532 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2534 int32_t sourceIndex
;
2536 /* we are switching to ASCII */
2537 isTargetByteDBCS
=FALSE
;
2539 /* get the source index of the last input character */
2541 * TODO this would be simpler and more reliable if we used a pair
2542 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2543 * so that we could simply use the prevSourceIndex here;
2544 * this code gives an incorrect result for the rare case of an unmatched
2545 * trail surrogate that is alone in the last buffer of the text stream
2547 sourceIndex
=(int32_t)(source
-args
->source
);
2550 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2551 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2562 &target
, (const char *)targetLimit
,
2563 &offsets
, sourceIndex
,
2567 /*save the state and return */
2568 args
->source
= source
;
2569 args
->target
= (char*)target
;
2570 args
->converter
->fromUnicodeStatus
= (uint32_t)isTargetByteDBCS
;
2573 /************************ To Unicode ***************************************/
2575 static void U_CALLCONV
2576 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs
*args
,
2578 char const* sourceStart
;
2579 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2581 UConverterToUnicodeArgs subArgs
;
2582 int32_t minArgsSize
;
2584 /* set up the subconverter arguments */
2585 if(args
->size
<sizeof(UConverterToUnicodeArgs
)) {
2586 minArgsSize
= args
->size
;
2588 minArgsSize
= (int32_t)sizeof(UConverterToUnicodeArgs
);
2591 uprv_memcpy(&subArgs
, args
, minArgsSize
);
2592 subArgs
.size
= (uint16_t)minArgsSize
;
2593 subArgs
.converter
= myData
->currentConverter
;
2595 /* remember the original start of the input for offsets */
2596 sourceStart
= args
->source
;
2598 if(myData
->key
!= 0) {
2599 /* continue with a partial escape sequence */
2603 while(U_SUCCESS(*err
) && args
->source
< args
->sourceLimit
) {
2604 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2605 subArgs
.source
= args
->source
;
2606 subArgs
.sourceLimit
= getEndOfBuffer_2022(&(args
->source
), args
->sourceLimit
, args
->flush
);
2607 if(subArgs
.source
!= subArgs
.sourceLimit
) {
2609 * get the current partial byte sequence
2611 * it needs to be moved between the public and the subconverter
2612 * so that the conversion framework, which only sees the public
2613 * converter, can handle truncated and illegal input etc.
2615 if(args
->converter
->toULength
> 0) {
2616 uprv_memcpy(subArgs
.converter
->toUBytes
, args
->converter
->toUBytes
, args
->converter
->toULength
);
2618 subArgs
.converter
->toULength
= args
->converter
->toULength
;
2621 * Convert up to the end of the input, or to before the next escape character.
2622 * Does not handle conversion extensions because the preToU[] state etc.
2625 ucnv_MBCSToUnicodeWithOffsets(&subArgs
, err
);
2627 if(args
->offsets
!= NULL
&& sourceStart
!= args
->source
) {
2628 /* update offsets to base them on the actual start of the input */
2629 int32_t *offsets
= args
->offsets
;
2630 UChar
*target
= args
->target
;
2631 int32_t delta
= (int32_t)(args
->source
- sourceStart
);
2632 while(target
< subArgs
.target
) {
2640 args
->source
= subArgs
.source
;
2641 args
->target
= subArgs
.target
;
2642 args
->offsets
= subArgs
.offsets
;
2644 /* copy input/error/overflow buffers */
2645 if(subArgs
.converter
->toULength
> 0) {
2646 uprv_memcpy(args
->converter
->toUBytes
, subArgs
.converter
->toUBytes
, subArgs
.converter
->toULength
);
2648 args
->converter
->toULength
= subArgs
.converter
->toULength
;
2650 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2651 if(subArgs
.converter
->UCharErrorBufferLength
> 0) {
2652 uprv_memcpy(args
->converter
->UCharErrorBuffer
, subArgs
.converter
->UCharErrorBuffer
,
2653 subArgs
.converter
->UCharErrorBufferLength
);
2655 args
->converter
->UCharErrorBufferLength
=subArgs
.converter
->UCharErrorBufferLength
;
2656 subArgs
.converter
->UCharErrorBufferLength
= 0;
2660 if (U_FAILURE(*err
) || (args
->source
== args
->sourceLimit
)) {
2665 changeState_2022(args
->converter
,
2673 static void U_CALLCONV
2674 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2677 const char *mySource
= ( char *) args
->source
;
2678 UChar
*myTarget
= args
->target
;
2679 const char *mySourceLimit
= args
->sourceLimit
;
2680 UChar32 targetUniChar
= 0x0000;
2681 UChar mySourceChar
= 0x0000;
2682 UConverterDataISO2022
* myData
;
2683 UConverterSharedData
* sharedData
;
2686 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2687 if(myData
->version
==1){
2688 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2692 /* initialize state */
2693 sharedData
= myData
->currentConverter
->sharedData
;
2694 useFallback
= args
->converter
->useFallback
;
2696 if(myData
->key
!= 0) {
2697 /* continue with a partial escape sequence */
2699 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2700 /* continue with a partial double-byte character */
2701 mySourceChar
= args
->converter
->toUBytes
[0];
2702 args
->converter
->toULength
= 0;
2706 while(mySource
< mySourceLimit
){
2708 if(myTarget
< args
->targetLimit
){
2710 mySourceChar
= (unsigned char) *mySource
++;
2712 if(mySourceChar
==UCNV_SI
){
2713 myData
->toU2022State
.g
= 0;
2714 if (myData
->isEmptySegment
) {
2715 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
2716 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
2717 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
2718 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2719 args
->converter
->toULength
= 1;
2720 args
->target
= myTarget
;
2721 args
->source
= mySource
;
2724 /*consume the source */
2726 }else if(mySourceChar
==UCNV_SO
){
2727 myData
->toU2022State
.g
= 1;
2728 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
2729 /*consume the source */
2731 }else if(mySourceChar
==ESC_2022
){
2734 myData
->isEmptySegment
= FALSE
; /* Any invalid ESC sequences will be detected separately, so just reset this */
2735 changeState_2022(args
->converter
,&(mySource
),
2736 mySourceLimit
, ISO_2022_KR
, err
);
2737 if(U_FAILURE(*err
)){
2738 args
->target
= myTarget
;
2739 args
->source
= mySource
;
2745 myData
->isEmptySegment
= FALSE
; /* Any invalid char errors will be detected separately, so just reset this */
2746 if(myData
->toU2022State
.g
== 1) {
2747 if(mySource
< mySourceLimit
) {
2748 int leadIsOk
, trailIsOk
;
2751 targetUniChar
= missingCharMarker
;
2752 trailByte
= (uint8_t)*mySource
;
2754 * Ticket 5691: consistent illegal sequences:
2755 * - We include at least the first byte in the illegal sequence.
2756 * - If any of the non-initial bytes could be the start of a character,
2757 * we stop the illegal sequence before the first one of those.
2759 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2760 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2761 * Otherwise we convert or report the pair of bytes.
2763 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
2764 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
2765 if (leadIsOk
&& trailIsOk
) {
2767 tempBuf
[0] = (char)(mySourceChar
+ 0x80);
2768 tempBuf
[1] = (char)(trailByte
+ 0x80);
2769 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, tempBuf
, 2, useFallback
);
2770 mySourceChar
= (mySourceChar
<< 8) | trailByte
;
2771 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
2772 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2774 /* add another bit so that the code below writes 2 bytes in case of error */
2775 mySourceChar
= static_cast<UChar
>(0x10000 | (mySourceChar
<< 8) | trailByte
);
2778 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2779 args
->converter
->toULength
= 1;
2783 else if(mySourceChar
<= 0x7f) {
2784 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, mySource
- 1, 1, useFallback
);
2786 targetUniChar
= 0xffff;
2788 if(targetUniChar
< 0xfffe){
2790 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
2792 *(myTarget
++)=(UChar
)targetUniChar
;
2795 /* Call the callback function*/
2796 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2801 *err
=U_BUFFER_OVERFLOW_ERROR
;
2805 args
->target
= myTarget
;
2806 args
->source
= mySource
;
2809 /*************************** END ISO2022-KR *********************************/
2811 /*************************** ISO-2022-CN *********************************
2813 * Rules for ISO-2022-CN Encoding:
2814 * i) The designator sequence must appear once on a line before any instance
2815 * of character set it designates.
2816 * ii) If two lines contain characters from the same character set, both lines
2817 * must include the designator sequence.
2818 * iii) Once the designator sequence is known, a shifting sequence has to be found
2819 * to invoke the shifting
2820 * iv) All lines start in ASCII and end in ASCII.
2821 * v) Four shifting sequences are employed for this purpose:
2823 * Sequence ASCII Eq Charsets
2824 * ---------- ------- ---------
2826 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2827 * SS2 <ESC>N CNS-11643-1992 Plane 2
2828 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2831 * SOdesignator : ESC "$" ")" finalchar_for_SO
2832 * SS2designator : ESC "$" "*" finalchar_for_SS2
2833 * SS3designator : ESC "$" "+" finalchar_for_SS3
2835 * ESC $ ) A Indicates the bytes following SO are Chinese
2836 * characters as defined in GB 2312-80, until
2837 * another SOdesignation appears
2840 * ESC $ ) E Indicates the bytes following SO are as defined
2841 * in ISO-IR-165 (for details, see section 2.1),
2842 * until another SOdesignation appears
2844 * ESC $ ) G Indicates the bytes following SO are as defined
2845 * in CNS 11643-plane-1, until another
2846 * SOdesignation appears
2848 * ESC $ * H Indicates the two bytes immediately following
2849 * SS2 is a Chinese character as defined in CNS
2850 * 11643-plane-2, until another SS2designation
2852 * (Meaning <ESC>N must precede every 2 byte
2855 * ESC $ + I Indicates the immediate two bytes following SS3
2856 * is a Chinese character as defined in CNS
2857 * 11643-plane-3, until another SS3designation
2859 * (Meaning <ESC>O must precede every 2 byte
2862 * ESC $ + J Indicates the immediate two bytes following SS3
2863 * is a Chinese character as defined in CNS
2864 * 11643-plane-4, until another SS3designation
2866 * (In English: <ESC>O must precede every 2 byte
2869 * ESC $ + K Indicates the immediate two bytes following SS3
2870 * is a Chinese character as defined in CNS
2871 * 11643-plane-5, until another SS3designation
2874 * ESC $ + L Indicates the immediate two bytes following SS3
2875 * is a Chinese character as defined in CNS
2876 * 11643-plane-6, until another SS3designation
2879 * ESC $ + M Indicates the immediate two bytes following SS3
2880 * is a Chinese character as defined in CNS
2881 * 11643-plane-7, until another SS3designation
2884 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2885 * has its own designation information before any Chinese characters
2890 /* The following are defined this way to make the strings truly readonly */
/*
 * Each string is one 4-byte designator escape sequence, spelled in hex:
 * ESC (1B), '$' (24), an intermediate byte ')' (29), '*' (2A) or '+' (2B),
 * and the final byte that names the charset.
 */
2891 static const char GB_2312_80_STR
[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
2892 static const char ISO_IR_165_STR
[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
2893 static const char CNS_11643_1992_Plane_1_STR
[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2894 static const char CNS_11643_1992_Plane_2_STR
[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2895 static const char CNS_11643_1992_Plane_3_STR
[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2896 static const char CNS_11643_1992_Plane_4_STR
[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2897 static const char CNS_11643_1992_Plane_5_STR
[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2898 static const char CNS_11643_1992_Plane_6_STR
[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2899 static const char CNS_11643_1992_Plane_7_STR
[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2901 /********************** ISO2022-CN Data **************************/
/*
 * Designator sequences indexed by charset number. The fromUnicode code copies
 * 4 bytes from an entry when it has to announce a new charset, so entry 0
 * (the shift-in string) is not used as a designator there.
 */
2902 static const char* const escSeqCharsCN
[10] ={
2903 SHIFT_IN_STR
, /* 0 ASCII */
2904 GB_2312_80_STR
, /* 1 GB2312_1 */
2905 ISO_IR_165_STR
, /* 2 ISO_IR_165 */
2906 CNS_11643_1992_Plane_1_STR
, /* 3 CNS 11643 plane 1 */
2907 CNS_11643_1992_Plane_2_STR
, /* 4 CNS 11643 plane 2 */
2908 CNS_11643_1992_Plane_3_STR
, /* 5 CNS 11643 plane 3 */
2909 CNS_11643_1992_Plane_4_STR
, /* 6 CNS 11643 plane 4 */
2910 CNS_11643_1992_Plane_5_STR
, /* 7 CNS 11643 plane 5 */
2911 CNS_11643_1992_Plane_6_STR
, /* 8 CNS 11643 plane 6 */
2912 CNS_11643_1992_Plane_7_STR /* 9 CNS 11643 plane 7 */
/*
 * ISO-2022-CN / ISO-2022-CN-EXT fromUnicode, optionally writing offsets.
 *
 * Reads UTF-16 from args->source and writes bytes to args->target.
 *  - Unpaired surrogates and the controls SO/SI/ESC are reported with
 *    U_ILLEGAL_CHAR_FOUND; the offending code point is kept in cnv->fromUChar32.
 *  - Code points up to U+007F are written as is, preceded by SI when the
 *    current shift state is not G0; CR and LF clear the whole fromUnicode state.
 *  - Other code points are looked up in the candidate charsets in choices[].
 *    If the winning charset differs from the one designated for its G set, a
 *    4-byte designator from escSeqCharsCN is written first, then the shift
 *    (SO, ESC N or ESC O) if needed, then the two bytes of the mapping.
 *  - A code point with no mapping is reported with U_INVALID_CHAR_FOUND.
 * When the input ends with flush set while the state is not G0, the state is
 * set back to G0 (NOTE(review): the helper call that writes the shift byte is
 * only partly visible here; presumably it emits SI - confirm).
 */
2915 static void U_CALLCONV
2916 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2917 UConverter
*cnv
= args
->converter
;
2918 UConverterDataISO2022
*converterData
;
2919 ISO2022State
*pFromU2022State
;
2920 uint8_t *target
= (uint8_t *) args
->target
;
2921 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
2922 const UChar
* source
= args
->source
;
2923 const UChar
* sourceLimit
= args
->sourceLimit
;
2924 int32_t* offsets
= args
->offsets
;
2929 int32_t choiceCount
;
2930 uint32_t targetValue
= 0;
2933 /* set up the state */
2934 converterData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
2935 pFromU2022State
= &converterData
->fromU2022State
;
2939 /* check if the last codepoint of previous buffer was a lead surrogate*/
2940 if((sourceChar
= cnv
->fromUChar32
)!=0 && target
< targetLimit
) {
2944 while( source
< sourceLimit
){
2945 if(target
< targetLimit
){
2947 sourceChar
= *(source
++);
2948 /*check if the char is a First surrogate*/
2949 if(U16_IS_SURROGATE(sourceChar
)) {
2950 if(U16_IS_SURROGATE_LEAD(sourceChar
)) {
2952 /*look ahead to find the trail surrogate*/
2953 if(source
< sourceLimit
) {
2954 /* test the following code unit */
2955 UChar trail
=(UChar
) *source
;
2956 if(U16_IS_TRAIL(trail
)) {
2958 sourceChar
=U16_GET_SUPPLEMENTARY(sourceChar
, trail
);
2959 cnv
->fromUChar32
=0x00;
2960 /* convert this supplementary code point */
2961 /* exit this condition tree */
2963 /* this is an unmatched lead code unit (1st surrogate) */
2964 /* callback(illegal) */
2965 *err
=U_ILLEGAL_CHAR_FOUND
;
2966 cnv
->fromUChar32
=sourceChar
;
2971 cnv
->fromUChar32
=sourceChar
;
2975 /* this is an unmatched trail code unit (2nd surrogate) */
2976 /* callback(illegal) */
2977 *err
=U_ILLEGAL_CHAR_FOUND
;
2978 cnv
->fromUChar32
=sourceChar
;
2983 /* do the conversion */
2984 if(sourceChar
<= 0x007f ){
2985 /* do not convert SO/SI/ESC */
2986 if(IS_2022_CONTROL(sourceChar
)) {
2987 /* callback(illegal) */
2988 *err
=U_ILLEGAL_CHAR_FOUND
;
2989 cnv
->fromUChar32
=sourceChar
;
/* in G0 the character is written as is; otherwise SI is written first and the state returns to G0 */
2994 if(pFromU2022State
->g
== 0) {
2995 buffer
[0] = (char)sourceChar
;
2998 buffer
[0] = UCNV_SI
;
2999 buffer
[1] = (char)sourceChar
;
3001 pFromU2022State
->g
= 0;
3004 if(sourceChar
== CR
|| sourceChar
== LF
) {
3005 /* reset the state at the end of a line */
3006 uprv_memset(pFromU2022State
, 0, sizeof(ISO2022State
));
3011 /* convert U+0080..U+10ffff */
3015 if(choiceCount
== 0) {
3016 /* try the current SO/G1 converter first */
3017 choices
[0] = pFromU2022State
->cs
[1];
3019 /* default to GB2312_1 if none is designated yet */
3020 if(choices
[0] == 0) {
3021 choices
[0] = GB2312_1
;
3024 if(converterData
->version
== 0) {
3027 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3028 if(choices
[0] == GB2312_1
) {
3029 choices
[1] = (int8_t)CNS_11643_1
;
3031 choices
[1] = (int8_t)GB2312_1
;
3035 } else if (converterData
->version
== 1) {
3036 /* ISO-2022-CN-EXT */
3038 /* try one of the other converters */
3039 switch(choices
[0]) {
3041 choices
[1] = (int8_t)CNS_11643_1
;
3042 choices
[2] = (int8_t)ISO_IR_165
;
3045 choices
[1] = (int8_t)GB2312_1
;
3046 choices
[2] = (int8_t)CNS_11643_1
;
3048 default: /* CNS_11643_x */
3049 choices
[1] = (int8_t)GB2312_1
;
3050 choices
[2] = (int8_t)ISO_IR_165
;
/* NOTE(review): the two assignments below belong to the branch for a version that is neither 0 nor 1; no choiceCount assignment is visible for that branch here. Confirm it is set, otherwise the lookup loop below runs zero times and every non-ASCII code point is reported as unmappable. */
3056 choices
[0] = (int8_t)CNS_11643_1
;
3057 choices
[1] = (int8_t)GB2312_1
;
3063 * len==0: no mapping found yet
3064 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3065 * len>0: found a roundtrip result, done
3069 * We will turn off useFallback after finding a fallback,
3070 * but we still get fallbacks from PUA code points as usual.
3071 * Therefore, we will also need to check that we don't overwrite
3072 * an early fallback with a later one.
3074 useFallback
= cnv
->useFallback
;
/* walk the candidate charsets; the loop stops as soon as len becomes positive */
3076 for(i
= 0; i
< choiceCount
&& len
<= 0; ++i
) {
3077 int8_t cs0
= choices
[i
];
3081 if(cs0
>= CNS_11643_0
) {
3082 len2
= MBCS_FROM_UCHAR32_ISO2022(
3083 converterData
->myConverterArray
[CNS_11643
],
3088 if(len2
== 3 || (len2
== -3 && len
== 0)) {
3089 targetValue
= value
;
3090 cs
= (int8_t)(CNS_11643_0
+ (value
>> 16) - 0x80);
3095 useFallback
= FALSE
;
3097 if(cs
== CNS_11643_1
) {
3099 } else if(cs
== CNS_11643_2
) {
3101 } else /* plane 3..7 */ if(converterData
->version
== 1) {
3104 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3109 /* GB2312_1 or ISO-IR-165 */
3110 U_ASSERT(cs0
<UCNV_2022_MAX_CONVERTERS
);
3111 len2
= MBCS_FROM_UCHAR32_ISO2022(
3112 converterData
->myConverterArray
[cs0
],
3117 if(len2
== 2 || (len2
== -2 && len
== 0)) {
3118 targetValue
= value
;
3122 useFallback
= FALSE
;
3129 len
= 0; /* count output bytes; it must have been abs(len) == 2 */
3131 /* write the designation sequence if necessary */
3132 if(cs
!= pFromU2022State
->cs
[g
]) {
3133 if(cs
< CNS_11643
) {
3134 uprv_memcpy(buffer
, escSeqCharsCN
[cs
], 4);
3136 U_ASSERT(cs
>= CNS_11643_1
);
3137 uprv_memcpy(buffer
, escSeqCharsCN
[CNS_11643
+ (cs
- CNS_11643_1
)], 4);
3140 pFromU2022State
->cs
[g
] = cs
;
3142 /* changing the SO/G1 charset invalidates the choices[] */
3147 /* write the shift sequence if necessary */
3148 if(g
!= pFromU2022State
->g
) {
3151 buffer
[len
++] = UCNV_SO
;
3153 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3154 pFromU2022State
->g
= 1;
3157 buffer
[len
++] = 0x1b;
3158 buffer
[len
++] = 0x4e;
/* 0x1b 0x4e is ESC N, the SS2 single shift */
3160 default: /* case 3 */
3161 buffer
[len
++] = 0x1b;
3162 buffer
[len
++] = 0x4f;
/* 0x1b 0x4f is ESC O, the SS3 single shift */
3167 /* write the two output bytes */
3168 buffer
[len
++] = (char)(targetValue
>> 8);
3169 buffer
[len
++] = (char)targetValue
;
3171 /* if we cannot find the character after checking all codepages
3172 * then this is an error
3174 *err
= U_INVALID_CHAR_FOUND
;
3175 cnv
->fromUChar32
=sourceChar
;
3180 /* output len>0 bytes in buffer[] */
3182 *target
++ = buffer
[0];
3184 *offsets
++ = (int32_t)(source
- args
->source
- 1); /* -1: known to be ASCII */
3186 } else if(len
== 2 && (target
+ 2) <= targetLimit
) {
3187 *target
++ = buffer
[0];
3188 *target
++ = buffer
[1];
3190 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
3191 *offsets
++ = sourceIndex
;
3192 *offsets
++ = sourceIndex
;
3198 &target
, (const char *)targetLimit
,
3199 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
3201 if(U_FAILURE(*err
)) {
3205 } /* end if(myTargetIndex<myTargetLength) */
3207 *err
=U_BUFFER_OVERFLOW_ERROR
;
3211 }/* end while(mySourceIndex<mySourceLength) */
3214 * the end of the input stream and detection of truncated input
3215 * are handled by the framework, but for ISO-2022-CN conversion
3216 * we need to be in ASCII mode at the very end
3221 * end of input and no truncated input
3223 if( U_SUCCESS(*err
) &&
3224 pFromU2022State
->g
!=0 &&
3225 args
->flush
&& source
>=sourceLimit
&& cnv
->fromUChar32
==0
3227 int32_t sourceIndex
;
3229 /* we are switching to ASCII */
3230 pFromU2022State
->g
=0;
3232 /* get the source index of the last input character */
3234 * TODO this would be simpler and more reliable if we used a pair
3235 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3236 * so that we could simply use the prevSourceIndex here;
3237 * this code gives an incorrect result for the rare case of an unmatched
3238 * trail surrogate that is alone in the last buffer of the text stream
3240 sourceIndex
=(int32_t)(source
-args
->source
);
3243 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
3244 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
3255 &target
, (const char *)targetLimit
,
3256 &offsets
, sourceIndex
,
3260 /*save the state and return */
3261 args
->source
= source
;
3262 args
->target
= (char*)target
;
3266 static void U_CALLCONV
3267 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
3270 const char *mySource
= (char *) args
->source
;
3271 UChar
*myTarget
= args
->target
;
3272 const char *mySourceLimit
= args
->sourceLimit
;
3273 uint32_t targetUniChar
= 0x0000;
3274 uint32_t mySourceChar
= 0x0000;
3275 UConverterDataISO2022
* myData
;
3276 ISO2022State
*pToU2022State
;
3278 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
3279 pToU2022State
= &myData
->toU2022State
;
3281 if(myData
->key
!= 0) {
3282 /* continue with a partial escape sequence */
3284 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
3285 /* continue with a partial double-byte character */
3286 mySourceChar
= args
->converter
->toUBytes
[0];
3287 args
->converter
->toULength
= 0;
3288 targetUniChar
= missingCharMarker
;
3292 while(mySource
< mySourceLimit
){
3294 targetUniChar
=missingCharMarker
;
3296 if(myTarget
< args
->targetLimit
){
3298 mySourceChar
= (unsigned char) *mySource
++;
3300 switch(mySourceChar
){
3303 if (myData
->isEmptySegment
) {
3304 myData
->isEmptySegment
= FALSE
; /* we are handling it, reset to avoid future spurious errors */
3305 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
3306 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
3307 args
->converter
->toUBytes
[0] = static_cast<uint8_t>(mySourceChar
);
3308 args
->converter
->toULength
= 1;
3309 args
->target
= myTarget
;
3310 args
->source
= mySource
;
3316 if(pToU2022State
->cs
[1] != 0) {
3318 myData
->isEmptySegment
= TRUE
; /* Begin a new segment, empty so far */
3321 /* illegal to have SO before a matching designator */
3322 myData
->isEmptySegment
= FALSE
; /* Handling a different error, reset this to avoid future spurious errs */
3330 const char * mySourceBefore
= mySource
;
3331 int8_t toULengthBefore
= args
->converter
->toULength
;
3333 changeState_2022(args
->converter
,&(mySource
),
3334 mySourceLimit
, ISO_2022_CN
,err
);
3336 /* After SO there must be at least one character before a designator (designator error handled separately) */
3337 if(myData
->key
==0 && U_SUCCESS(*err
) && myData
->isEmptySegment
) {
3338 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
3339 args
->converter
->toUCallbackReason
= UCNV_IRREGULAR
;
3340 args
->converter
->toULength
= (int8_t)(toULengthBefore
+ (mySource
- mySourceBefore
));
3344 /* invalid or illegal escape sequence */
3345 if(U_FAILURE(*err
)){
3346 args
->target
= myTarget
;
3347 args
->source
= mySource
;
3348 myData
->isEmptySegment
= FALSE
; /* Reset to avoid future spurious errors */
3353 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3357 uprv_memset(pToU2022State
, 0, sizeof(ISO2022State
));
3360 /* convert one or two bytes */
3361 myData
->isEmptySegment
= FALSE
;
3362 if(pToU2022State
->g
!= 0) {
3363 if(mySource
< mySourceLimit
) {
3364 UConverterSharedData
*cnv
;
3365 StateEnum tempState
;
3367 int leadIsOk
, trailIsOk
;
3370 trailByte
= (uint8_t)*mySource
;
3372 * Ticket 5691: consistent illegal sequences:
3373 * - We include at least the first byte in the illegal sequence.
3374 * - If any of the non-initial bytes could be the start of a character,
3375 * we stop the illegal sequence before the first one of those.
3377 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379 * Otherwise we convert or report the pair of bytes.
3381 leadIsOk
= (uint8_t)(mySourceChar
- 0x21) <= (0x7e - 0x21);
3382 trailIsOk
= (uint8_t)(trailByte
- 0x21) <= (0x7e - 0x21);
3383 if (leadIsOk
&& trailIsOk
) {
3385 tempState
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
3386 if(tempState
>= CNS_11643_0
) {
3387 cnv
= myData
->myConverterArray
[CNS_11643
];
3388 tempBuf
[0] = (char) (0x80+(tempState
-CNS_11643_0
));
3389 tempBuf
[1] = (char) (mySourceChar
);
3390 tempBuf
[2] = (char) trailByte
;
3394 U_ASSERT(tempState
<UCNV_2022_MAX_CONVERTERS
);
3395 cnv
= myData
->myConverterArray
[tempState
];
3396 tempBuf
[0] = (char) (mySourceChar
);
3397 tempBuf
[1] = (char) trailByte
;
3400 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(cnv
, tempBuf
, tempBufLen
, FALSE
);
3401 mySourceChar
= (mySourceChar
<< 8) | trailByte
;
3402 } else if (!(trailIsOk
|| IS_2022_CONTROL(trailByte
))) {
3403 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3405 /* add another bit so that the code below writes 2 bytes in case of error */
3406 mySourceChar
= 0x10000 | (mySourceChar
<< 8) | trailByte
;
3408 if(pToU2022State
->g
>=2) {
3409 /* return from a single-shift state to the previous one */
3410 pToU2022State
->g
=pToU2022State
->prevG
;
3413 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
3414 args
->converter
->toULength
= 1;
3419 if(mySourceChar
<= 0x7f) {
3420 targetUniChar
= (UChar
) mySourceChar
;
3425 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
3427 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3429 *(myTarget
++)=(UChar
)targetUniChar
;
3431 else if(targetUniChar
> missingCharMarker
){
3432 /* disassemble the surrogate pair and write to output*/
3433 targetUniChar
-=0x0010000;
3434 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
3436 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3439 if(myTarget
< args
->targetLimit
){
3440 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
3442 args
->offsets
[myTarget
- args
->target
] = (int32_t)(mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2));
3446 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
3447 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
3452 /* Call the callback function*/
3453 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
3458 *err
=U_BUFFER_OVERFLOW_ERROR
;
3463 args
->target
= myTarget
;
3464 args
->source
= mySource
;
3466 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
/*
 * Writes the substitution character(s) for an ISO-2022 converter.
 *
 * The substitution bytes and their length are read from the converter in
 * args (subChars / subCharLen). Depending on the first character of the
 * locale stored in UConverterDataISO2022, the fromUnicode state is adjusted
 * before output: G1 is switched back to G0, cs[0] is reset to ASCII when it
 * is neither ASCII nor JISX201, a non-zero g is reset to 0, or
 * fromUnicodeStatus is toggled between SBCS (0) and DBCS (1).
 * Output is produced by ucnv_cbFromUWriteSub() on the sub-converter and by
 * ucnv_cbFromUWriteBytes().
 *
 * @param args        fromUnicode arguments; args->converter is temporarily
 *                    replaced by the sub-converter and then restored
 * @param offsetIndex offset index supplied by the caller
 * @param err         error code; U_BUFFER_OVERFLOW_ERROR from the
 *                    sub-converter is handled by taking over its pending
 *                    charErrorBuffer contents
 */
3468 static void U_CALLCONV
3469 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
3470 UConverter
*cnv
= args
->converter
;
3471 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
3472 ISO2022State
*pFromU2022State
=&myConverterData
->fromU2022State
;
/* take the substitution bytes and their length from the outer converter */
3477 subchar
=(char *)cnv
->subChars
;
3478 length
=cnv
->subCharLen
; /* assume length==1 for most variants */
/* variant-specific state handling, selected by the first character of the locale */
3481 switch(myConverterData
->locale
[0]){
3486 if(pFromU2022State
->g
== 1) {
3487 /* JIS7: switch from G1 to G0 */
3488 pFromU2022State
->g
= 0;
3492 cs
= pFromU2022State
->cs
[0];
3493 if(cs
!= ASCII
&& cs
!= JISX201
) {
3494 /* not in ASCII or JIS X 0201: switch to ASCII */
3495 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
3505 if(pFromU2022State
->g
!= 0) {
3506 /* not in ASCII mode: switch to ASCII */
3507 pFromU2022State
->g
= 0;
3513 if(myConverterData
->version
== 0) {
3515 if(args
->converter
->fromUnicodeStatus
) {
3516 /* in DBCS mode: switch to SBCS */
3517 args
->converter
->fromUnicodeStatus
= 0;
3521 } else /* length == 2*/ {
3522 if(!args
->converter
->fromUnicodeStatus
) {
3523 /* in SBCS mode: switch to DBCS */
3524 args
->converter
->fromUnicodeStatus
= 1;
3532 /* save the subconverter's substitution string */
3533 uint8_t *currentSubChars
= myConverterData
->currentConverter
->subChars
;
3534 int8_t currentSubCharLen
= myConverterData
->currentConverter
->subCharLen
;
3536 /* set our substitution string into the subconverter */
3537 myConverterData
->currentConverter
->subChars
= (uint8_t *)subchar
;
3538 myConverterData
->currentConverter
->subCharLen
= (int8_t)length
;
3540 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
/* args is redirected to the sub-converter for the call and pointed back at cnv afterwards */
3541 args
->converter
= myConverterData
->currentConverter
;
3542 myConverterData
->currentConverter
->fromUChar32
= cnv
->fromUChar32
;
/* NOTE(review): the literal 0 is passed here rather than offsetIndex - confirm this is intended */
3543 ucnv_cbFromUWriteSub(args
, 0, err
);
3544 cnv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
3545 args
->converter
= cnv
;
3547 /* restore the subconverter's substitution string */
3548 myConverterData
->currentConverter
->subChars
= currentSubChars
;
3549 myConverterData
->currentConverter
->subCharLen
= currentSubCharLen
;
/*
 * On overflow with bytes pending in the sub-converter, the outer converter
 * takes over charErrorBufferLength and the sub-converter count is reset to 0.
 */
3551 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
3552 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
3554 cnv
->charErrorBuffer
,
3555 myConverterData
->currentConverter
->charErrorBuffer
,
3556 myConverterData
->currentConverter
->charErrorBufferLength
);
3558 cnv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
3559 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
/* the number of bytes written is the distance from buffer to the write pointer p */
3567 ucnv_cbFromUWriteBytes(args
,
3568 buffer
, (int32_t)(p
- buffer
),
/*
 * Members visible below: currentConverter is the slot that receives the clone
 * of the sub-converter, deadSpace is the padding that leaves room for
 * re-aligning that clone, and mydata receives the copy of the
 * UConverterDataISO2022 made by _ISO_2022_SafeClone().
 */
3573 * Structure for cloning an ISO 2022 converter into a single memory block.
3574 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3575 * and then ucnv_safeClone() of the sub-converter may additionally align
3576 * currentConverter inside the cloneStruct, for which we need the deadSpace
3577 * after currentConverter.
3578 * This is because UAlignedMemory may be larger than the actually
3579 * necessary alignment size for the platform.
3580 * The other cloneStruct fields will not be moved around,
3581 * and are aligned properly with cloneStruct's alignment.
3586 UConverter currentConverter
;
3587 UAlignedMemory deadSpace
;
3588 UConverterDataISO2022 mydata
;
/*
 * Clone function listed in the implementation tables of the ISO-2022
 * converters.
 *
 * When *pBufferSize is 0 the call is a preflight and the required size,
 * sizeof(struct cloneStruct), is stored into *pBufferSize.
 * Otherwise stackBuffer is used as a struct cloneStruct: the
 * UConverterDataISO2022 of cnv is copied into its mydata member, the cloned
 * converter is pointed at that copy through extraInfo with isExtraLocal set,
 * a non-NULL currentConverter is cloned with ucnv_safeClone() into the
 * embedded currentConverter slot, and each non-NULL myConverterArray[] entry
 * has its reference count incremented because those sub-converters are
 * shared.
 *
 * @param cnv         converter being cloned; its extraInfo is read
 * @param pBufferSize in: 0 requests preflighting; out: the required size
 *                    in that case
 * @return address of the cnv member of the clone structure
 */
3594 static UConverter
* U_CALLCONV
3595 _ISO_2022_SafeClone(
3596 const UConverter
*cnv
,
3598 int32_t *pBufferSize
,
3601 struct cloneStruct
* localClone
;
3602 UConverterDataISO2022
*cnvData
;
3605 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3606 *pBufferSize
= (int32_t)sizeof(struct cloneStruct
);
3610 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3611 localClone
= (struct cloneStruct
*)stackBuffer
;
3613 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3615 uprv_memcpy(&localClone
->mydata
, cnvData
, sizeof(UConverterDataISO2022
));
3616 localClone
->cnv
.extraInfo
= &localClone
->mydata
; /* set pointer to extra data */
3617 localClone
->cnv
.isExtraLocal
= TRUE
;
3619 /* share the subconverters */
3621 if(cnvData
->currentConverter
!= NULL
) {
3622 size
= (int32_t)(sizeof(UConverter
) + sizeof(UAlignedMemory
)); /* include size of padding */
3623 localClone
->mydata
.currentConverter
=
3624 ucnv_safeClone(cnvData
->currentConverter
,
3625 &localClone
->currentConverter
,
3627 if(U_FAILURE(*status
)) {
/* the array entries are not copied; each existing one only gains a reference */
3632 for(i
=0; i
<UCNV_2022_MAX_CONVERTERS
; ++i
) {
3633 if(cnvData
->myConverterArray
[i
] != NULL
) {
3634 ucnv_incrementRefCount(cnvData
->myConverterArray
[i
]);
3638 return &localClone
->cnv
;
/*
 * Adds to the set behind sa the Unicode code points handled by an ISO-2022
 * converter.
 *
 * *pErrorCode is tested with U_FAILURE() before anything else.
 * With U_ENABLE_GENERIC_ISO_2022, a converter whose sharedData is
 * &_ISO2022Data gets 0..0xd7FF and 0xE000..0x10FFFF (UTF-8 is used there).
 * Otherwise the algorithmically handled code points are added according to
 * locale[0] of the UConverterDataISO2022:
 *  - JP: 0xa5 and 0x203e (JIS X 0201), Latin-1 (0..0xff) when the charset
 *    mask of the version contains ISO8859_1, ASCII (0..0x7f), and the
 *    HWKANA_START..HWKANA_END range for version 3 or 4 or when which is
 *    UCNV_ROUNDTRIP_AND_FALLBACK_SET
 *  - CN: ASCII (0..0x7f)
 *  - KR: delegated to getUnicodeSet of currentConverter
 * Then every non-NULL myConverterArray[] entry contributes through
 * ucnv_MBCSGetFilteredUnicodeSetForUnicode() with a filter chosen from the
 * array index, locale and version. Finally 0x0e, 0x0f, 0x1b and the range
 * 0x80..0x9f are removed again.
 *
 * @param cnv        converter whose set is requested
 * @param sa         set adder providing add, addRange, remove, removeRange
 * @param which      selects the kind of set, also passed on to sub-converters
 * @param pErrorCode error code, also passed on to sub-converters
 */
3643 static void U_CALLCONV
3644 _ISO_2022_GetUnicodeSet(const UConverter
*cnv
,
3645 const USetAdder
*sa
,
3646 UConverterUnicodeSet which
,
3647 UErrorCode
*pErrorCode
)
3650 UConverterDataISO2022
* cnvData
;
3652 if (U_FAILURE(*pErrorCode
)) {
3655 #ifdef U_ENABLE_GENERIC_ISO_2022
3656 if (cnv
->sharedData
== &_ISO2022Data
) {
3657 /* We use UTF-8 in this case */
3658 sa
->addRange(sa
->set
, 0, 0xd7FF);
3659 sa
->addRange(sa
->set
, 0xE000, 0x10FFFF);
3664 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3666 /* open a set and initialize it with code points that are algorithmically round-tripped */
3667 switch(cnvData
->locale
[0]){
3669 /* include JIS X 0201 which is hardcoded */
3670 sa
->add(sa
->set
, 0xa5);
3671 sa
->add(sa
->set
, 0x203e);
3672 if(jpCharsetMasks
[cnvData
->version
]&CSM(ISO8859_1
)) {
3673 /* include Latin-1 for some variants of JP */
3674 sa
->addRange(sa
->set
, 0, 0xff);
3676 /* include ASCII for JP */
3677 sa
->addRange(sa
->set
, 0, 0x7f);
3679 if(cnvData
->version
==3 || cnvData
->version
==4 || which
==UCNV_ROUNDTRIP_AND_FALLBACK_SET
) {
3681 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3682 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3683 * use half-width Katakana.
3684 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3685 * half-width Katakana via the ESC ( I sequence.
3686 * However, we only emit (fromUnicode) half-width Katakana according to the
3687 * definition of each variant.
3689 * When including fallbacks,
3690 * we need to include half-width Katakana Unicode code points for all JP variants because
3691 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3693 /* include half-width Katakana for JP */
3694 sa
->addRange(sa
->set
, HWKANA_START
, HWKANA_END
);
3697 #if !UCONFIG_ONLY_HTML_CONVERSION
3700 /* include ASCII for CN */
3701 sa
->addRange(sa
->set
, 0, 0x7f);
3704 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3705 cnvData
->currentConverter
->sharedData
->impl
->getUnicodeSet(
3706 cnvData
->currentConverter
, sa
, which
, pErrorCode
);
3707 /* the loop over myConverterArray[] will simply not find another converter */
3714 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3715 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3716 cnvData
->version
==0 && i
==CNS_11643
3718 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3719 ucnv_MBCSGetUnicodeSetForBytes(
3720 cnvData
->myConverterArray
[i
],
3721 sa
, UCNV_ROUNDTRIP_SET
,
/* each loaded sub-converter contributes its set, narrowed by a filter picked per array index, locale and version */
3727 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
3728 UConverterSetFilter filter
;
3729 if(cnvData
->myConverterArray
[i
]!=NULL
) {
3730 if(cnvData
->locale
[0]=='j' && i
==JISX208
) {
3732 * Only add code points that map to Shift-JIS codes
3733 * corresponding to JIS X 0208.
3735 filter
=UCNV_SET_FILTER_SJIS
;
3736 #if !UCONFIG_ONLY_HTML_CONVERSION
3737 } else if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3738 cnvData
->version
==0 && i
==CNS_11643
) {
3740 * Version-specific for CN:
3741 * CN version 0 does not map CNS planes 3..7 although
3742 * they are all available in the CNS conversion table;
3743 * CN version 1 (-EXT) does map them all.
3744 * The two versions create different Unicode sets.
3746 filter
=UCNV_SET_FILTER_2022_CN
;
3747 } else if(i
==KSC5601
) {
3749 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3750 * are broader than GR94.
3752 filter
=UCNV_SET_FILTER_GR94DBCS
;
3755 filter
=UCNV_SET_FILTER_NONE
;
3757 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData
->myConverterArray
[i
], sa
, which
, filter
, pErrorCode
);
3762 * ISO 2022 converters must not convert SO/SI/ESC despite what
3763 * sub-converters do by themselves.
3764 * Remove these characters from the set.
3766 sa
->remove(sa
->set
, 0x0e);
3767 sa
->remove(sa
->set
, 0x0f);
3768 sa
->remove(sa
->set
, 0x1b);
3770 /* ISO 2022 converters do not convert C1 controls either */
3771 sa
->removeRange(sa
->set
, 0x80, 0x9f);
/*
 * Implementation table of the generic ISO-2022 converter.
 * When U_ENABLE_GENERIC_ISO_2022 is defined it lists
 * T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC (twice) together with the
 * UTF-8 fromUnicode functions; cloning and Unicode-set retrieval are provided
 * by _ISO_2022_SafeClone and _ISO_2022_GetUnicodeSet.
 */
3774 static const UConverterImpl _ISO2022Impl
={
3784 #ifdef U_ENABLE_GENERIC_ISO_2022
3785 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3786 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3787 ucnv_fromUnicode_UTF8
,
3788 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
3800 _ISO_2022_SafeClone
,
3801 _ISO_2022_GetUnicodeSet
,
/*
 * Static data record of the generic ISO-2022 converter.
 * It begins with its own structure size and declares a maximum of 3 bytes
 * per UChar; the trailing array is reserved and zero-filled.
 */
3806 static const UConverterStaticData _ISO2022StaticData
={
3807 sizeof(UConverterStaticData
),
3813 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3820 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data object of the generic ISO-2022 converter, built from
 * _ISO2022StaticData and _ISO2022Impl. _ISO_2022_GetUnicodeSet compares a
 * converter sharedData pointer against the address of this object.
 */
3822 const UConverterSharedData _ISO2022Data
=
3823 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData
, &_ISO2022Impl
);
3825 /*************JP****************/
/*
 * Implementation table of the ISO-2022-JP converter.
 * UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC and
 * UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC are each listed twice;
 * cloning and Unicode-set retrieval use the functions shared with the other
 * ISO-2022 tables in this file.
 */
3826 static const UConverterImpl _ISO2022JPImpl
={
3836 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3837 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3838 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3839 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3845 _ISO_2022_SafeClone
,
3846 _ISO_2022_GetUnicodeSet
,
/*
 * Static data record of the ISO-2022-JP converter.
 * It begins with its own structure size and declares a maximum of 6 bytes
 * per UChar (a 4-byte escape sequence plus a DBCS character); the trailing
 * array is reserved and zero-filled.
 */
3851 static const UConverterStaticData _ISO2022JPStaticData
={
3852 sizeof(UConverterStaticData
),
3858 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3865 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data object of the ISO-2022-JP converter, built from
 * _ISO2022JPStaticData and _ISO2022JPImpl.
 */
3870 const UConverterSharedData _ISO2022JPData
=
3871 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData
, &_ISO2022JPImpl
);
3875 #if !UCONFIG_ONLY_HTML_CONVERSION
3876 /************* KR ***************/
/*
 * Implementation table of the ISO-2022-KR converter.
 * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC and
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC are each listed twice;
 * cloning and Unicode-set retrieval use the functions shared with the other
 * ISO-2022 tables in this file.
 */
3877 static const UConverterImpl _ISO2022KRImpl
={
3887 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3888 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3889 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3890 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3896 _ISO_2022_SafeClone
,
3897 _ISO_2022_GetUnicodeSet
,
/*
 * Static data record of the ISO-2022-KR converter.
 * It begins with its own structure size and declares a maximum of 8 bytes
 * per UChar; the trailing array is reserved and zero-filled.
 */
3902 static const UConverterStaticData _ISO2022KRStaticData
={
3903 sizeof(UConverterStaticData
),
3909 8, /* max 8 bytes per UChar */
3916 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data object of the ISO-2022-KR converter, built from
 * _ISO2022KRStaticData and _ISO2022KRImpl.
 */
3921 const UConverterSharedData _ISO2022KRData
=
3922 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData
, &_ISO2022KRImpl
);
3926 /*************** CN ***************/
/*
 * Implementation table of the ISO-2022-CN converter.
 * UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC and
 * UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC are each listed twice;
 * cloning and Unicode-set retrieval use the functions shared with the other
 * ISO-2022 tables in this file.
 */
3927 static const UConverterImpl _ISO2022CNImpl
={
3938 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3939 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3940 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3941 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3947 _ISO_2022_SafeClone
,
3948 _ISO_2022_GetUnicodeSet
,
/*
 * Static data record of the ISO-2022-CN converter.
 * It begins with its own structure size and declares a maximum of 8 bytes
 * per UChar (a 4-byte CNS designator, 2 bytes for SS2/SS3 and a DBCS
 * character); the trailing array is reserved and zero-filled.
 */
3953 static const UConverterStaticData _ISO2022CNStaticData
={
3954 sizeof(UConverterStaticData
),
3960 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3967 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data object of the ISO-2022-CN converter, built from
 * _ISO2022CNStaticData and _ISO2022CNImpl.
 */
3972 const UConverterSharedData _ISO2022CNData
=
3973 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData
, &_ISO2022CNImpl
);
3976 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3978 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */