2 **********************************************************************
3 * Copyright (C) 2000-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
8 * tab size: 8 (not used)
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
29 #include "unicode/utypes.h"
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
46 #ifdef U_ENABLE_GENERIC_ISO_2022
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
74 * Markus Scherer 2003-dec-03
78 static const char SHIFT_IN_STR
[] = "\x0F";
79 static const char SHIFT_OUT_STR
[] = "\x0E";
87 /* for ISO-2022-JP and -CN implementations */
104 HWKANA_7BIT
=8, /* Halfwidth Katakana 7 bit */
107 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
113 * these are used in StateEnum and ISO2022State variables,
114 * but CNS_11643 must be used to index into myConverterArray[]
126 /* is the StateEnum charset value for a DBCS charset? */
127 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
129 #define CSM(cs) ((uint16_t)1<<(cs))
132 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
133 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
135 * Note: The converter uses some leniency:
136 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
137 * all versions, not just JIS7 and JIS8.
138 * - ICU does not distinguish between different versions of JIS X 0208.
140 static const uint16_t jpCharsetMasks
[5]={
141 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
),
142 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
),
143 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
144 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
),
145 CSM(ASCII
)|CSM(JISX201
)|CSM(JISX208
)|CSM(HWKANA_7BIT
)|CSM(JISX212
)|CSM(GB2312
)|CSM(KSC5601
)|CSM(ISO8859_1
)|CSM(ISO8859_7
)
157 typedef struct ISO2022State
{
158 int8_t cs
[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
159 int8_t g
; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
160 int8_t prevG
; /* g before single shift (SS2 or SS3) */
163 #define UCNV_OPTIONS_VERSION_MASK 0xf
164 #define UCNV_2022_MAX_CONVERTERS 10
167 UConverter
*currentConverter
;
168 #ifdef U_ENABLE_GENERIC_ISO_2022
171 Cnv2022Type currentType
;
172 ISO2022State toU2022State
, fromU2022State
;
173 UConverterSharedData
*myConverterArray
[UCNV_2022_MAX_CONVERTERS
];
178 }UConverterDataISO2022
;
181 /* ISO-2022 ----------------------------------------------------------------- */
183 /*Forward declaration */
185 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs
* args
,
188 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
,
191 #define ESC_2022 0x1B /*ESC*/
195 INVALID_2022
= -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
196 VALID_NON_TERMINAL_2022
= 0, /*so far corresponds to a valid iso 2022 escape sequence*/
197 VALID_TERMINAL_2022
= 1, /*corresponds to a valid iso 2022 escape sequence*/
198 VALID_MAYBE_TERMINAL_2022
= 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
199 } UCNV_TableStates_2022
;
202 * The way these state transition arrays work is:
203 * ex : ESC$B is the sequence for JISX208
204 * a) First Iteration: char is ESC
205 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
206 * int x = normalize_esq_chars_2022[27] which is equal to 1
207 * ii) Search for this value in escSeqStateTable_Key_2022[]
208 * value of x is stored at escSeqStateTable_Key_2022[0]
209 * iii) Save this index as offset
210 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
211 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
212 * b) Switch on this state and continue to next char
213 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
214 * which is normalize_esq_chars_2022[36] == 4
215 * ii) x is currently 1(from above)
216 * x<<=5 -- x is now 32
217 * x+=normalize_esq_chars_2022[36]
219 * iii) Search for this value in escSeqStateTable_Key_2022[]
220 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
221 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
222 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
223 * c) Switch on this state and continue to next char
224 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
225 * ii) x is currently 36 (from above)
226 * x<<=5 -- x is now 1152
227 * x+=normalize_esq_chars_2022[66]
229 * iii) Search for this value in escSeqStateTable_Key_2022[]
230 * value of x (1161) is stored at escSeqStateTable_Key_2022[24], so offset is 24
231 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
232 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
233 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
237 /*Below are the 3 arrays depicting a state transition table*/
238 static const int8_t normalize_esq_chars_2022
[256] = {
239 /* 0 1 2 3 4 5 6 7 8 9 */
241 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
242 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
243 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
244 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
245 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
246 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
247 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
248 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
249 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
250 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
251 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
252 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
253 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
254 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
255 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
256 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
257 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
258 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
259 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
260 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
261 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
262 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
263 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
264 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
265 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
269 #ifdef U_ENABLE_GENERIC_ISO_2022
271 * When the generic ISO-2022 converter is completely removed, not just disabled
272 * per #ifdef, then the following state table and the associated tables that are
273 * dimensioned with MAX_STATES_2022 should be trimmed.
275 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
276 * the associated escape sequences starting with ESC ( B should be removed.
277 * This includes the ones with key values 1097 and all of the ones above 1000000.
279 * For the latter, the tables can simply be truncated.
280 * For the former, since the tables must be kept parallel, it is probably best
281 * to simply duplicate an adjacent table cell, parallel in all tables.
283 * It may make sense to restructure the tables, especially by using small search
284 * tables for the variants instead of indexing them parallel to the table here.
288 #define MAX_STATES_2022 74
/*
 * Accumulated escape-sequence keys, one per known (partial or complete)
 * sequence. The values are listed in ascending order, which is what the
 * binary search in getKey_2022() relies on. The index at which a key is found
 * is used to index the parallel tables dimensioned with MAX_STATES_2022
 * (escSeqStateTable_Value_2022, and in changeState_2022() also
 * escSeqStateTable_Result_2022, nextStateToUnicodeJP and nextStateToUnicodeCN),
 * so entries must never be reordered in one table only.
 */
289 static const int32_t escSeqStateTable_Key_2022
[MAX_STATES_2022
] = {
290 /* 0 1 2 3 4 5 6 7 8 9 */
292 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
293 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
294 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
295 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
296 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
297 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
298 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
299 ,35947631 ,35947635 ,35947636 ,35947638
302 #ifdef U_ENABLE_GENERIC_ISO_2022
304 static const char* const escSeqStateTable_Result_2022
[MAX_STATES_2022
] = {
305 /* 0 1 2 3 4 5 6 7 8 9 */
307 NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,NULL
,"latin1" ,"latin1"
308 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
309 ,"latin1" ,NULL
,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL
,NULL
,NULL
,NULL
,"UTF8"
310 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL
,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
311 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
312 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
313 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL
,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
314 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
319 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022
[MAX_STATES_2022
] = {
320 /* 0 1 2 3 4 5 6 7 8 9 */
321 VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
322 ,VALID_MAYBE_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
323 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
324 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
325 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
326 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
327 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
328 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
,VALID_TERMINAL_2022
332 /* Type def for refactoring changeState_2022 code*/
334 #ifdef U_ENABLE_GENERIC_ISO_2022
342 /*********** ISO 2022 Converter Protos ***********/
344 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
);
347 _ISO2022Close(UConverter
*converter
);
350 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
);
353 _ISO2022getName(const UConverter
* cnv
);
356 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
);
359 _ISO_2022_SafeClone(const UConverter
*cnv
, void *stackBuffer
, int32_t *pBufferSize
, UErrorCode
*status
);
361 #ifdef U_ENABLE_GENERIC_ISO_2022
363 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
, UErrorCode
* err
);
366 /*const UConverterSharedData _ISO2022Data;*/
367 static const UConverterSharedData _ISO2022JPData
;
368 static const UConverterSharedData _ISO2022KRData
;
369 static const UConverterSharedData _ISO2022CNData
;
371 /*************** Converter implementations ******************/
/*
 * Resets the to-Unicode state used by the ISO-2022-KR variant.
 * Acts only when myConverterData->version is 1: the toUnicodeStatus, mode and
 * toULength fields of myConverterData->currentConverter are set to 0
 * (offset, state and byteIndex respectively, per the inline comments).
 * The `converter` argument is not referenced in the lines visible here.
 */
374 setInitialStateToUnicodeKR(UConverter
* converter
, UConverterDataISO2022
*myConverterData
){
375 if(myConverterData
->version
== 1) {
376 UConverter
*cnv
= myConverterData
->currentConverter
;
378 cnv
->toUnicodeStatus
=0; /* offset */
379 cnv
->mode
=0; /* state */
380 cnv
->toULength
=0; /* byteIndex */
/*
 * Resets the from-Unicode state used by the ISO-2022-KR variant.
 * When converter->charErrorBufferLength is 0, the four bytes 1B 24 29 43
 * (ESC $ ) C, the ISO-2022-KR designator) are stored in
 * converter->charErrorBuffer and the length is set to 4.
 * When myConverterData->version is 1, fromUnicodeStatus of
 * myConverterData->currentConverter is set to 1 (prevLength).
 * NOTE(review): charErrorBuffer is presumably flushed ahead of the converted
 * output by the caller; that code is not part of this listing -- confirm.
 */
385 setInitialStateFromUnicodeKR(UConverter
* converter
,UConverterDataISO2022
*myConverterData
){
386 /* in ISO-2022-KR the designator sequence appears only once
387 * in a file so we append it only once
389 if( converter
->charErrorBufferLength
==0){
391 converter
->charErrorBufferLength
= 4;
392 converter
->charErrorBuffer
[0] = 0x1b;
393 converter
->charErrorBuffer
[1] = 0x24;
394 converter
->charErrorBuffer
[2] = 0x29;
395 converter
->charErrorBuffer
[3] = 0x43;
397 if(myConverterData
->version
== 1) {
398 UConverter
*cnv
= myConverterData
->currentConverter
;
401 cnv
->fromUnicodeStatus
=1; /* prevLength */
/*
 * Opens an ISO-2022 converter variant selected by `locale`.
 *
 * Allocates cnv->extraInfo as a zero-filled UConverterDataISO2022, copies up
 * to sizeof(myLocale) bytes of `locale` into a local buffer and takes the
 * requested version from options & UCNV_OPTIONS_VERSION_MASK. The locale must
 * be followed by '_' or end of string. Visible branches:
 *   - "ja"/"jp": loads the shared data of the sub-charsets enabled in
 *     jpCharsetMasks[version], points cnv->sharedData at _ISO2022JPData and
 *     builds the name "ISO_2022,locale=ja,version=<digit>".
 *   - "ko"/"kr": opens "icu-internal-25546" (version 1) or "ibm-949"
 *     (version 0) as currentConverter, initialises the KR state in both
 *     directions and points cnv->sharedData at _ISO2022KRData. For version 1
 *     the sub-converter's subChar/subCharLen are copied into cnv.
 *   - "zh"/"cn": loads "ibm-5478", "iso-ir-165" and "cns-11643-1992", points
 *     cnv->sharedData at _ISO2022CNData; version becomes 1 or 0.
 *   - otherwise: generic ISO_2022 setup under U_ENABLE_GENERIC_ISO_2022;
 *     U_UNSUPPORTED_ERROR is assigned in the same region (presumably the
 *     #else path -- the preprocessor lines are not visible here).
 * U_MEMORY_ALLOCATION_ERROR is assigned near the end, presumably on the path
 * where uprv_malloc returned NULL (the `else` line is not visible here).
 * `name` is not referenced in the lines visible here.
 */
406 _ISO2022Open(UConverter
*cnv
, const char *name
, const char *locale
,uint32_t options
, UErrorCode
*errorCode
){
408 char myLocale
[6]={' ',' ',' ',' ',' ',' '};
410 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataISO2022
));
411 if(cnv
->extraInfo
!= NULL
) {
412 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
415 uprv_memset(myConverterData
, 0, sizeof(UConverterDataISO2022
));
416 myConverterData
->currentConverter
= NULL
;
417 myConverterData
->currentType
= ASCII1
;
418 myConverterData
->key
=0;
419 #ifdef U_ENABLE_GENERIC_ISO_2022
420 myConverterData
->isFirstBuffer
= TRUE
;
422 cnv
->fromUnicodeStatus
=FALSE
;
424 uprv_strncpy(myLocale
, locale
, sizeof(myLocale
));
426 myConverterData
->version
= 0;
427 version
= options
& UCNV_OPTIONS_VERSION_MASK
;
428 if(myLocale
[0]=='j' && (myLocale
[1]=='a'|| myLocale
[1]=='p') &&
429 (myLocale
[2]=='_' || myLocale
[2]=='\0')){
431 /* open the required converters and cache them */
/* NOTE(review): version comes from options & UCNV_OPTIONS_VERSION_MASK, i.e.
 * 0..15, but jpCharsetMasks[] has only 5 entries. No range check is visible
 * in this listing before the indexing below -- confirm one exists, or clamp
 * or reject version > 4 to avoid reading past the end of the array. */
432 if(jpCharsetMasks
[version
]&CSM(ISO8859_7
)) {
433 myConverterData
->myConverterArray
[ISO8859_7
]= ucnv_loadSharedData("ISO8859_7", NULL
, errorCode
);
435 myConverterData
->myConverterArray
[JISX201
] = ucnv_loadSharedData("JISX0201", NULL
, errorCode
);
436 myConverterData
->myConverterArray
[JISX208
] = ucnv_loadSharedData("jisx-208", NULL
, errorCode
);
437 if(jpCharsetMasks
[version
]&CSM(JISX212
)) {
438 myConverterData
->myConverterArray
[JISX212
] = ucnv_loadSharedData("jisx-212", NULL
, errorCode
);
440 if(jpCharsetMasks
[version
]&CSM(GB2312
)) {
441 myConverterData
->myConverterArray
[GB2312
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
); /* gb_2312_80-1 */
443 if(jpCharsetMasks
[version
]&CSM(KSC5601
)) {
444 myConverterData
->myConverterArray
[KSC5601
] = ucnv_loadSharedData("ksc_5601", NULL
, errorCode
);
447 /* set the function pointers to appropriate functions */
448 cnv
->sharedData
=(UConverterSharedData
*)(&_ISO2022JPData
);
449 uprv_strcpy(myConverterData
->locale
,"ja");
451 myConverterData
->version
= version
;
452 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ja,version=");
453 len
= uprv_strlen(myConverterData
->name
);
454 myConverterData
->name
[len
]=(char)(myConverterData
->version
+(int)'0');
455 myConverterData
->name
[len
+1]='\0';
457 else if(myLocale
[0]=='k' && (myLocale
[1]=='o'|| myLocale
[1]=='r') &&
458 (myLocale
[2]=='_' || myLocale
[2]=='\0')){
460 if ((options
& UCNV_OPTIONS_VERSION_MASK
)==1){
461 myConverterData
->version
= 1;
462 myConverterData
->currentConverter
=
463 ucnv_open("icu-internal-25546",errorCode
);
465 if (U_FAILURE(*errorCode
)) {
470 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=1");
471 uprv_memcpy(cnv
->subChar
, myConverterData
->currentConverter
->subChar
, 4);
472 cnv
->subCharLen
= myConverterData
->currentConverter
->subCharLen
;
474 myConverterData
->currentConverter
=ucnv_open("ibm-949",errorCode
);
476 if (U_FAILURE(*errorCode
)) {
481 myConverterData
->version
= 0;
482 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=ko,version=0");
485 /* initialize the state variables */
486 setInitialStateToUnicodeKR(cnv
, myConverterData
);
487 setInitialStateFromUnicodeKR(cnv
,myConverterData
);
489 /* set the function pointers to appropriate functions */
490 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022KRData
;
491 uprv_strcpy(myConverterData
->locale
,"ko");
493 else if(((myLocale
[0]=='z' && myLocale
[1]=='h') || (myLocale
[0]=='c'&& myLocale
[1]=='n'))&&
494 (myLocale
[2]=='_' || myLocale
[2]=='\0')){
496 /* open the required converters and cache them */
497 myConverterData
->myConverterArray
[GB2312_1
] = ucnv_loadSharedData("ibm-5478", NULL
, errorCode
);
499 myConverterData
->myConverterArray
[ISO_IR_165
] = ucnv_loadSharedData("iso-ir-165", NULL
, errorCode
);
501 myConverterData
->myConverterArray
[CNS_11643
] = ucnv_loadSharedData("cns-11643-1992", NULL
, errorCode
);
504 /* set the function pointers to appropriate functions */
505 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022CNData
;
506 uprv_strcpy(myConverterData
->locale
,"cn");
508 if ((options
& UCNV_OPTIONS_VERSION_MASK
)==1){
509 myConverterData
->version
= 1;
510 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=1");
512 uprv_strcpy(myConverterData
->name
,"ISO_2022,locale=zh,version=0");
513 myConverterData
->version
= 0;
517 #ifdef U_ENABLE_GENERIC_ISO_2022
518 /* append the UTF-8 escape sequence */
519 cnv
->charErrorBufferLength
= 3;
520 cnv
->charErrorBuffer
[0] = 0x1b;
521 cnv
->charErrorBuffer
[1] = 0x25;
522 cnv
->charErrorBuffer
[2] = 0x42;
524 cnv
->sharedData
=(UConverterSharedData
*)&_ISO2022Data
;
525 /* initialize the state variables */
526 uprv_strcpy(myConverterData
->name
,"ISO_2022");
528 *errorCode
= U_UNSUPPORTED_ERROR
;
533 cnv
->maxBytesPerUChar
=cnv
->sharedData
->staticData
->maxBytesPerChar
;
535 if(U_FAILURE(*errorCode
)) {
539 *errorCode
= U_MEMORY_ALLOCATION_ERROR
;
/*
 * Releases everything hanging off converter->extraInfo.
 * When extraInfo is non-NULL: walks all UCNV_2022_MAX_CONVERTERS slots of
 * myConverterArray and hands them to ucnv_unloadSharedDataIfReady(), closes
 * currentConverter with ucnv_close(), and -- unless converter->isExtraLocal
 * is set -- frees extraInfo and resets the pointer to NULL.
 * NOTE(review): `array` is computed from myData->myConverterArray before the
 * extraInfo NULL check. That is only an address computation, but it is
 * formally performed on a possibly-NULL pointer; consider moving it inside
 * the check.
 */
545 _ISO2022Close(UConverter
*converter
) {
546 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
547 UConverterSharedData
**array
= myData
->myConverterArray
;
550 if (converter
->extraInfo
!= NULL
) {
551 /*close the array of converter pointers and free the memory*/
552 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
554 ucnv_unloadSharedDataIfReady(array
[i
]);
558 ucnv_close(myData
->currentConverter
);
560 if(!converter
->isExtraLocal
){
561 uprv_free (converter
->extraInfo
);
562 converter
->extraInfo
= NULL
;
/*
 * Resets converter state in one or both directions according to `choice`.
 *   - choice <= UCNV_RESET_TO_UNICODE: zeroes toU2022State and key.
 *   - choice != UCNV_RESET_TO_UNICODE: zeroes fromU2022State.
 * Under U_ENABLE_GENERIC_ISO_2022, when locale[0] is 0 (generic converter):
 * the to-Unicode reset sets isFirstBuffer, clears key, closes
 * currentConverter if converter->mode is UCNV_SO and sets mode to UCNV_SI;
 * the from-Unicode reset queues a 3-byte escape sequence in charErrorBuffer.
 * When locale[0] is 'k', the KR-specific state initialisers are called for
 * the selected direction(s).
 */
568 _ISO2022Reset(UConverter
*converter
, UConverterResetChoice choice
) {
569 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) (converter
->extraInfo
);
570 if(choice
<=UCNV_RESET_TO_UNICODE
) {
571 uprv_memset(&myConverterData
->toU2022State
, 0, sizeof(ISO2022State
));
572 myConverterData
->key
= 0;
574 if(choice
!=UCNV_RESET_TO_UNICODE
) {
575 uprv_memset(&myConverterData
->fromU2022State
, 0, sizeof(ISO2022State
));
577 #ifdef U_ENABLE_GENERIC_ISO_2022
578 if(myConverterData
->locale
[0] == 0){
579 if(choice
<=UCNV_RESET_TO_UNICODE
) {
580 myConverterData
->isFirstBuffer
= TRUE
;
581 myConverterData
->key
= 0;
582 if (converter
->mode
== UCNV_SO
){
583 ucnv_close (myConverterData
->currentConverter
);
584 myConverterData
->currentConverter
=NULL
;
586 converter
->mode
= UCNV_SI
;
588 if(choice
!=UCNV_RESET_TO_UNICODE
) {
/* NOTE(review): the bytes queued below are 1B 28 42 (ESC ( B), whereas
 * _ISO2022Open queues 1B 25 42 (ESC % B) under the same "UTF-8 escape
 * sequence" description. One of the two is presumably wrong -- confirm
 * which sequence the generic converter is meant to emit. */
589 /* re-append UTF-8 escape sequence */
590 converter
->charErrorBufferLength
= 3;
591 converter
->charErrorBuffer
[0] = 0x1b;
592 converter
->charErrorBuffer
[1] = 0x28;
593 converter
->charErrorBuffer
[2] = 0x42;
599 /* reset the state variables */
600 if(myConverterData
->locale
[0] == 'k'){
601 if(choice
<=UCNV_RESET_TO_UNICODE
) {
602 setInitialStateToUnicodeKR(converter
, myConverterData
);
604 if(choice
!=UCNV_RESET_TO_UNICODE
) {
605 setInitialStateFromUnicodeKR(converter
, myConverterData
);
/*
 * Name accessor for an ISO-2022 converter.
 * The visible part only casts cnv->extraInfo to UConverterDataISO2022*.
 * NOTE(review): presumably returns the name string built in _ISO2022Open
 * (myData->name); the return statement is not part of this listing -- confirm.
 */
612 _ISO2022getName(const UConverter
* cnv
){
614 UConverterDataISO2022
* myData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
621 /*************** to unicode *******************/
622 /****************************************************************************
623 * Recognized escape sequences are
635 static const StateEnum nextStateToUnicodeJP
[MAX_STATES_2022
]= {
636 /* 0 1 2 3 4 5 6 7 8 9 */
637 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
638 ,ASCII
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,JISX201
,HWKANA_7BIT
,JISX201
,INVALID_STATE
639 ,INVALID_STATE
,INVALID_STATE
,JISX208
,GB2312
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
640 ,ISO8859_1
,ISO8859_7
,JISX208
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,KSC5601
,JISX212
,INVALID_STATE
641 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
642 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
643 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
644 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
647 /*************** to unicode *******************/
648 static const StateEnum nextStateToUnicodeCN
[MAX_STATES_2022
]= {
649 /* 0 1 2 3 4 5 6 7 8 9 */
650 INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,SS2_STATE
,SS3_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
651 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
652 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
653 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
654 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,GB2312_1
,INVALID_STATE
,ISO_IR_165
655 ,CNS_11643_1
,CNS_11643_2
,CNS_11643_3
,CNS_11643_4
,CNS_11643_5
,CNS_11643_6
,CNS_11643_7
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
656 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
657 ,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
,INVALID_STATE
/*
 * Advances the escape-sequence recogniser by one byte.
 *
 * c      - next byte of the candidate escape sequence
 * key    - in: key accumulated so far; combined with the byte as
 *          (*key << 5) + normalize_esq_chars_2022[(uint8_t)c]
 * offset - index output (the store itself is not visible in this listing)
 *
 * The combined value is looked up in escSeqStateTable_Key_2022 with a binary
 * search over [low, hi), hi starting at MAX_STATES_2022; on a hit the
 * matching escSeqStateTable_Value_2022[mid] is returned.
 * NOTE(review): the early exit for an unmapped byte, the updates of
 * *key / *offset and the not-found return are in lines missing from this
 * listing; presumably INVALID_2022 is returned in those cases -- confirm.
 */
661 static UCNV_TableStates_2022
662 getKey_2022(char c
,int32_t* key
,int32_t* offset
){
665 int32_t hi
= MAX_STATES_2022
;
668 togo
= normalize_esq_chars_2022
[(uint8_t)c
];
670 /* not a valid character anywhere in an escape sequence */
675 togo
= (*key
<< 5) + togo
;
677 while (hi
!= low
) /*binary search*/{
679 register int32_t mid
= (hi
+low
) >> 1; /*Finds median*/
684 if (escSeqStateTable_Key_2022
[mid
] > togo
){
687 else if (escSeqStateTable_Key_2022
[mid
] < togo
){
690 else /*we found it*/{
693 return escSeqStateTable_Value_2022
[mid
];
704 /*runs through a state machine to determine the escape sequence - codepage correspondence
/*
 * Consumes an escape sequence from the input and updates the converter state.
 *
 * Bytes are read while *source < sourceLimit; each one is appended to
 * _this->toUBytes (bumping toULength) and fed to getKey_2022() together with
 * the key saved in the converter data, which lets a sequence continue across
 * buffer boundaries. The resulting key is written back to myData2022->key.
 *
 * Outcome by final table state:
 *   - VALID_NON_TERMINAL_2022: sequence still incomplete (key stays non-zero).
 *   - INVALID_2022: *err = U_ILLEGAL_ESCAPE_SEQUENCE.
 *   - terminal: dispatch on the converter variant. The generic path (only
 *     with U_ENABLE_GENERIC_ISO_2022) opens the converter named in
 *     escSeqStateTable_Result_2022[offset]; the JP path maps offset through
 *     nextStateToUnicodeJP and checks the charset against
 *     jpCharsetMasks[version]; the CN path maps offset through
 *     nextStateToUnicodeCN. Both update toU2022State (cs[], g, prevG) and
 *     report U_UNSUPPORTED_ESCAPE_SEQUENCE / U_ILLEGAL_ESCAPE_SEQUENCE for
 *     sequences that are unknown or not allowed in the current state.
 * On success toULength is reset to 0, discarding the buffered sequence bytes.
 *
 * NOTE(review): `key` is a uint32_t local that is passed to getKey_2022() as
 * (int32_t *)&key. The remaining parameters, the switch/case labels and
 * several break/else lines are missing from this listing.
 */
707 changeState_2022(UConverter
* _this
,
709 const char* sourceLimit
,
712 UCNV_TableStates_2022 value
;
713 UConverterDataISO2022
* myData2022
= ((UConverterDataISO2022
*)_this
->extraInfo
);
714 uint32_t key
= myData2022
->key
;
718 value
= VALID_NON_TERMINAL_2022
;
719 while (*source
< sourceLimit
) {
721 _this
->toUBytes
[_this
->toULength
++]=(uint8_t)c
;
722 value
= getKey_2022(c
,(int32_t *) &key
, &offset
);
726 case VALID_NON_TERMINAL_2022
:
727 /* continue with the loop */
730 case VALID_TERMINAL_2022
:
737 case VALID_MAYBE_TERMINAL_2022
:
738 #ifdef U_ENABLE_GENERIC_ISO_2022
739 /* ESC ( B is ambiguous only for ISO_2022 itself */
740 if(var
== ISO_2022
) {
741 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
742 _this
->toULength
= 0;
744 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
746 /* continue with the loop */
747 value
= VALID_NON_TERMINAL_2022
;
752 /* not ISO_2022 itself, finish here */
753 value
= VALID_TERMINAL_2022
;
761 myData2022
->key
= key
;
763 if (value
== VALID_NON_TERMINAL_2022
) {
764 /* indicate that the escape sequence is incomplete: key!=0 */
766 } else if (value
== INVALID_2022
) {
767 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
769 } else /* value == VALID_TERMINAL_2022 */ {
771 #ifdef U_ENABLE_GENERIC_ISO_2022
774 const char *chosenConverterName
= escSeqStateTable_Result_2022
[offset
];
775 if(chosenConverterName
== NULL
) {
777 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
781 _this
->mode
= UCNV_SI
;
782 ucnv_close(myData2022
->currentConverter
);
783 myData2022
->currentConverter
= myUConverter
= ucnv_open(chosenConverterName
, err
);
784 if(U_SUCCESS(*err
)) {
785 myUConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
786 _this
->mode
= UCNV_SO
;
793 StateEnum tempState
=nextStateToUnicodeJP
[offset
];
796 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
799 if(myData2022
->toU2022State
.cs
[2]!=0) {
800 if(myData2022
->toU2022State
.g
<2) {
801 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
803 myData2022
->toU2022State
.g
=2;
805 /* illegal to have SS2 before a matching designator */
806 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
809 /* case SS3_STATE: not used in ISO-2022-JP-x */
812 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
813 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
815 /* G2 charset for SS2 */
816 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
820 if((jpCharsetMasks
[myData2022
->version
] & CSM(tempState
)) == 0) {
821 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
824 myData2022
->toU2022State
.cs
[0]=(int8_t)tempState
;
832 StateEnum tempState
=nextStateToUnicodeCN
[offset
];
835 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
838 if(myData2022
->toU2022State
.cs
[2]!=0) {
839 if(myData2022
->toU2022State
.g
<2) {
840 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
842 myData2022
->toU2022State
.g
=2;
844 /* illegal to have SS2 before a matching designator */
845 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
849 if(myData2022
->toU2022State
.cs
[3]!=0) {
850 if(myData2022
->toU2022State
.g
<2) {
851 myData2022
->toU2022State
.prevG
=myData2022
->toU2022State
.g
;
853 myData2022
->toU2022State
.g
=3;
855 /* illegal to have SS3 before a matching designator */
856 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
860 if(myData2022
->version
==0) {
861 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
866 myData2022
->toU2022State
.cs
[1]=(int8_t)tempState
;
869 myData2022
->toU2022State
.cs
[2]=(int8_t)tempState
;
872 /* other CNS 11643 planes */
873 if(myData2022
->version
==0) {
874 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
876 myData2022
->toU2022State
.cs
[3]=(int8_t)tempState
;
884 /* nothing to be done, just accept this one escape sequence */
886 *err
= U_UNSUPPORTED_ESCAPE_SEQUENCE
;
891 *err
= U_ILLEGAL_ESCAPE_SEQUENCE
;
895 if(U_SUCCESS(*err
)) {
896 _this
->toULength
= 0;
900 /*Checks the characters of the buffer against valid 2022 escape sequences
901 *if they match we return a pointer to the initial start of the sequence otherwise
902 *we return sourceLimit
904 /*for 2022 looks ahead in the stream
905 *to determine the longest possible convertible
/*
 * Finds how far the current sub-charset run extends in the input.
 * Starts from *source and scans towards sourceLimit for the next ESC_2022
 * byte. With U_ENABLE_GENERIC_ISO_2022 the candidate escape sequence is also
 * validated byte by byte through getKey_2022(), and the `flush` argument is
 * consulted when the sequence is still incomplete at the end of the buffer.
 * The final loop shown here simply advances mySource until it reaches
 * sourceLimit or an ESC_2022 byte.
 * NOTE(review): the return statements and the #else/#endif lines are missing
 * from this listing; presumably the simple scan is the non-generic path and
 * the function returns mySource -- confirm against the full file.
 */
908 static U_INLINE
const char*
909 getEndOfBuffer_2022(const char** source
,
910 const char* sourceLimit
,
913 const char* mySource
= *source
;
915 #ifdef U_ENABLE_GENERIC_ISO_2022
916 if (*source
>= sourceLimit
)
921 if (*mySource
== ESC_2022
){
925 UCNV_TableStates_2022 value
= VALID_NON_TERMINAL_2022
;
927 /* Kludge: I could not
928 * figure out the reason for validating an escape sequence
929 * twice - once here and once in changeState_2022().
930 * is it possible to have an ESC character in a ISO2022
931 * byte stream which is valid in a code page? Is it legal?
934 (mySource
+i
< sourceLimit
)&&(value
== VALID_NON_TERMINAL_2022
);
936 value
= getKey_2022(*(mySource
+i
), &key
, &offset
);
938 if (value
> 0 || *mySource
==ESC_2022
)
941 if ((value
== VALID_NON_TERMINAL_2022
)&&(!flush
) )
944 }while (++mySource
< sourceLimit
);
948 while(mySource
< sourceLimit
&& *mySource
!= ESC_2022
) {
956 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
957 * any future change in _MBCSFromUChar32() function should be reflected in
961 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData
* sharedData
,
969 const uint16_t *table
;
970 uint32_t stage2Entry
;
973 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
974 if(c
<0x10000 || (sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
975 table
=sharedData
->mbcs
.fromUnicodeTable
;
976 stage2Entry
=MBCS_STAGE_2_FROM_U(table
, c
);
977 /* get the bytes and the length for the output */
978 if(outputType
==MBCS_OUTPUT_2
){
979 myValue
=MBCS_VALUE_2_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
985 } else /* outputType==MBCS_OUTPUT_3 */ {
986 p
=MBCS_POINTER_3_FROM_STAGE_2(sharedData
->mbcs
.fromUnicodeBytes
, stage2Entry
, c
);
987 myValue
=((uint32_t)*p
<<16)|((uint32_t)p
[1]<<8)|p
[2];
990 } else if(myValue
<=0xffff) {
996 /* is this code point assigned, or do we use fallbacks? */
997 if( (stage2Entry
&(1<<(16+(c
&0xf))))!=0 ||
998 (FROM_U_USE_FALLBACK(useFallback
, c
) && myValue
!=0)
1001 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1002 * There is no way with this data structure for fallback output
1003 * to be a zero byte.
1011 cx
=sharedData
->mbcs
.extIndexes
;
1013 *length
=ucnv_extSimpleMatchFromU(cx
, c
, value
, useFallback
);
1021 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1022 * any future change in _MBCSSingleFromUChar32() function should be reflected in
1025 static U_INLINE
void
1026 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData
* sharedData
,
1031 const uint16_t *table
;
1033 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1034 if(c
>=0x10000 && !(sharedData
->mbcs
.unicodeMask
&UCNV_HAS_SUPPLEMENTARY
)) {
1035 *retval
=(uint16_t)-1;
1038 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1039 table
=sharedData
->mbcs
.fromUnicodeTable
;
1040 /* get the byte for the output */
1041 value
=MBCS_SINGLE_RESULT_FROM_U(table
, (uint16_t *)sharedData
->mbcs
.fromUnicodeBytes
, c
);
1042 /* is this code point assigned, or do we use fallbacks? */
1043 if(useFallback
? value
>=0x800 : value
>=0xc00) {
1048 *retval
=(uint16_t) value
;
1051 #ifdef U_ENABLE_GENERIC_ISO_2022
1053 /**********************************************************************************
1054 * ISO-2022 Converter
1060 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs
* args
,
1062 const char* mySourceLimit
, *realSourceLimit
;
1063 const char* sourceStart
;
1064 const UChar
* myTargetStart
;
1065 UConverter
* saveThis
;
1066 UConverterDataISO2022
* myData
;
1069 saveThis
= args
->converter
;
1070 myData
=((UConverterDataISO2022
*)(saveThis
->extraInfo
));
1072 realSourceLimit
= args
->sourceLimit
;
1073 while (args
->source
< realSourceLimit
) {
1074 if(myData
->key
== 0) { /* are we in the middle of an escape sequence? */
1075 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1076 mySourceLimit
= getEndOfBuffer_2022(&(args
->source
), realSourceLimit
, args
->flush
);
1078 if(args
->source
< mySourceLimit
) {
1079 if(myData
->currentConverter
==NULL
) {
1080 myData
->currentConverter
= ucnv_open("ASCII",err
);
1081 if(U_FAILURE(*err
)){
1085 myData
->currentConverter
->fromCharErrorBehaviour
= UCNV_TO_U_CALLBACK_STOP
;
1086 saveThis
->mode
= UCNV_SO
;
1089 /* convert to before the ESC or until the end of the buffer */
1090 myData
->isFirstBuffer
=FALSE
;
1091 sourceStart
= args
->source
;
1092 myTargetStart
= args
->target
;
1093 args
->converter
= myData
->currentConverter
;
1094 ucnv_toUnicode(args
->converter
,
1100 (UBool
)(args
->flush
&& mySourceLimit
== realSourceLimit
),
1102 args
->converter
= saveThis
;
1104 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
1105 /* move the overflow buffer */
1106 length
= saveThis
->UCharErrorBufferLength
= myData
->currentConverter
->UCharErrorBufferLength
;
1107 myData
->currentConverter
->UCharErrorBufferLength
= 0;
1109 uprv_memcpy(saveThis
->UCharErrorBuffer
,
1110 myData
->currentConverter
->UCharErrorBuffer
,
1111 length
*U_SIZEOF_UCHAR
);
1118 * -Error while converting
1119 * -Done with entire buffer
1120 * -Need to write offsets or update the current offset
1121 * (leave that up to the code in ucnv.c)
1123 * or else we just stopped at an ESC byte and continue with changeState_2022()
1125 if (U_FAILURE(*err
) ||
1126 (args
->source
== realSourceLimit
) ||
1127 (args
->offsets
!= NULL
&& (args
->target
!= myTargetStart
|| args
->source
!= sourceStart
) ||
1128 (mySourceLimit
< realSourceLimit
&& myData
->currentConverter
->toULength
> 0))
1130 /* copy partial or error input for truncated detection and error handling */
1131 if(U_FAILURE(*err
)) {
1132 length
= saveThis
->invalidCharLength
= myData
->currentConverter
->invalidCharLength
;
1134 uprv_memcpy(saveThis
->invalidCharBuffer
, myData
->currentConverter
->invalidCharBuffer
, length
);
1137 length
= saveThis
->toULength
= myData
->currentConverter
->toULength
;
1139 uprv_memcpy(saveThis
->toUBytes
, myData
->currentConverter
->toUBytes
, length
);
1140 if(args
->source
< mySourceLimit
) {
1141 *err
= U_TRUNCATED_CHAR_FOUND
; /* truncated input before ESC */
1150 sourceStart
= args
->source
;
1151 changeState_2022(args
->converter
,
1156 if (U_FAILURE(*err
) || (args
->source
!= sourceStart
&& args
->offsets
!= NULL
)) {
1157 /* let the ucnv.c code update its current offset */
1166 * To Unicode Callback helper function
1169 toUnicodeCallback(UConverter
*cnv
,
1170 const uint32_t sourceChar
, const uint32_t targetUniChar
,
1172 if(sourceChar
>0xff){
1173 cnv
->toUBytes
[0] = (uint8_t)(sourceChar
>>8);
1174 cnv
->toUBytes
[1] = (uint8_t)sourceChar
;
1178 cnv
->toUBytes
[0] =(char) sourceChar
;
1182 if(targetUniChar
== (missingCharMarker
-1/*0xfffe*/)){
1183 *err
= U_INVALID_CHAR_FOUND
;
1186 *err
= U_ILLEGAL_CHAR_FOUND
;
1190 /**************************************ISO-2022-JP*************************************************/
1192 /************************************** IMPORTANT **************************************************
1193 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1194 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1195 * The converter iterates over each Unicode codepoint
1196 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1197 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1198 * would do as far as possible.
1200 * If the implementation of these macros or structure of sharedData struct change in the future, make
1201 * sure that ISO-2022 is also changed.
1202 ***************************************************************************************************
1205 /***************************************************************************************************
1206 * Rules for ISO-2022-jp encoding
1207 * (i) Escape sequences must be fully contained within a line; they should not
1208 * span new lines or CRs
1209 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1210 * JIS-Roman character escape sequence should follow before the line terminates
1211 * (iii) If the first character on the line is represented by two bytes then a two
1212 * byte character escape sequence should precede it
1213 * (iv) If no escape sequence is encountered then the characters are ASCII
1214 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1215 * and invoked with SS2 (ESC N).
1216 * (vi) If there is any G0 designation in text, there must be a switch to
1217 * ASCII or to JIS X 0201-Roman before a space character (but not
1218 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1219 * characters such as tab or CRLF.
1220 * (vii) Supported encodings:
1221 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1225 * JISX201, JISX208,JISX212 : new .cnv data files created
1226 * KSC5601 : alias to ibm-949 mapping table
1227 * GB2312 : alias to ibm-1386 mapping table
1228 * ISO-8859-1 : Algorithmically implemented as LATIN1 case
1229 * ISO-8859-7 : alias to ibm-9409 mapping table
1232 /* preference order of JP charsets */
1233 static const StateEnum jpCharsetPref
[]={
1245 static const char escSeqChars
[][6] ={
1246 "\x1B\x28\x42", /* <ESC>(B ASCII */
1247 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
1248 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
1249 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
1250 "\x1B\x24\x42", /* <ESC>$B JISX-208 */
1251 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
1252 "\x1B\x24\x41", /* <ESC>$A GB2312 */
1253 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
1254 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
1257 static const int32_t escSeqCharsLen
[] ={
1258 3, /* length of <ESC>(B ASCII */
1259 3, /* length of <ESC>.A ISO-8859-1 */
1260 3, /* length of <ESC>.F ISO-8859-7 */
1261 3, /* length of <ESC>(J JISX-201 */
1262 3, /* length of <ESC>$B JISX-208 */
1263 4, /* length of <ESC>$(D JISX-212 */
1264 3, /* length of <ESC>$A GB2312 */
1265 4, /* length of <ESC>$(C KSC5601 */
1266 3 /* length of <ESC>(I HWKANA_7BIT */
1270 * The iteration over various code pages works this way:
1271 * i) Get the currentState from myConverterData->currentState
1272 * ii) Check if the character is mapped to a valid character in the currentState
1273 * Yes -> a) set the initIterState to currentState
1274 * b) remain in this state until an invalid character is found
1275 * No -> a) go to the next code page and find the character
1276 * iii) Before changing the state, increment the current state and check if the current state
1277 * is equal to the initIterState
1278 * Yes -> A character that cannot be represented in any of the supported encodings;
1279 * break and return a U_INVALID_CHAR_FOUND error
1280 * No -> Continue and find the character in next code page
1283 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1287 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
1288 UConverterDataISO2022
*converterData
;
1289 ISO2022State
*pFromU2022State
;
1290 uint8_t *target
= (uint8_t *) args
->target
;
1291 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
1292 const UChar
* source
= args
->source
;
1293 const UChar
* sourceLimit
= args
->sourceLimit
;
1294 int32_t* offsets
= args
->offsets
;
1297 int32_t len
, outLen
;
1299 int32_t choiceCount
;
1300 uint32_t targetValue
;
1306 /* set up the state */
1307 converterData
= (UConverterDataISO2022
*)args
->converter
->extraInfo
;
1308 pFromU2022State
= &converterData
->fromU2022State
;
1309 useFallback
= args
->converter
->useFallback
;
1313 /* check if the last codepoint of previous buffer was a lead surrogate*/
1314 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
< targetLimit
) {
1318 while(source
< sourceLimit
) {
1319 if(target
< targetLimit
) {
1321 sourceChar
= *(source
++);
1322 /*check if the char is a First surrogate*/
1323 if(UTF_IS_SURROGATE(sourceChar
)) {
1324 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
1326 /*look ahead to find the trail surrogate*/
1327 if(source
< sourceLimit
) {
1328 /* test the following code unit */
1329 UChar trail
=(UChar
) *source
;
1330 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1332 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
1333 args
->converter
->fromUChar32
=0x00;
1334 /* convert this supplementary code point */
1335 /* exit this condition tree */
1337 /* this is an unmatched lead code unit (1st surrogate) */
1338 /* callback(illegal) */
1339 *err
=U_ILLEGAL_CHAR_FOUND
;
1340 args
->converter
->fromUChar32
=sourceChar
;
1345 args
->converter
->fromUChar32
=sourceChar
;
1349 /* this is an unmatched trail code unit (2nd surrogate) */
1350 /* callback(illegal) */
1351 *err
=U_ILLEGAL_CHAR_FOUND
;
1352 args
->converter
->fromUChar32
=sourceChar
;
1357 /* do the conversion */
1359 if(choiceCount
== 0) {
1363 * The csm variable keeps track of which charsets are allowed
1364 * and not used yet while building the choices[].
1366 csm
= jpCharsetMasks
[converterData
->version
];
1369 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1370 if(converterData
->version
== 3 || converterData
->version
== 4) {
1371 choices
[choiceCount
++] = cs
= (int8_t)HWKANA_7BIT
;
1375 /* try the current G0 charset */
1376 choices
[choiceCount
++] = cs
= pFromU2022State
->cs
[0];
1379 /* try the current G2 charset */
1380 if((cs
= pFromU2022State
->cs
[2]) != 0) {
1381 choices
[choiceCount
++] = cs
;
1385 /* try all the other possible charsets */
1386 for(i
= 0; i
< LENGTHOF(jpCharsetPref
); ++i
) {
1387 cs
= (int8_t)jpCharsetPref
[i
];
1389 choices
[choiceCount
++] = cs
;
1398 for(i
= 0; i
< choiceCount
&& len
== 0; ++i
) {
1402 if(sourceChar
<= 0x7f) {
1403 targetValue
= (uint32_t)sourceChar
;
1408 if(0x80 <= sourceChar
&& sourceChar
<= 0xff) {
1409 targetValue
= (uint32_t)sourceChar
- 0x80;
1415 if((uint32_t)(0xff9f-sourceChar
)<=(0xff9f-0xff61)) {
1416 targetValue
= (uint32_t)(sourceChar
- (0xff61 - 0x21));
1419 if(converterData
->version
==3) {
1420 /* JIS7: use G1 (SO) */
1421 pFromU2022State
->cs
[1] = cs
; /* do not output an escape sequence */
1423 } else if(converterData
->version
==4) {
1424 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1427 targetValue
+= 0x80;
1429 cs0
= pFromU2022State
->cs
[0];
1430 if(IS_JP_DBCS(cs0
)) {
1431 /* switch from a DBCS charset to JISX201 */
1432 cs
= (int8_t)JISX201
;
1434 /* stay in the current G0 charset */
1442 MBCS_SINGLE_FROM_UCHAR32(
1443 converterData
->myConverterArray
[cs
],
1444 sourceChar
, &targetValue
,
1446 if(targetValue
<= 0x7f) {
1451 /* G0 SBCS forced to 7-bit output */
1452 MBCS_SINGLE_FROM_UCHAR32(
1453 converterData
->myConverterArray
[cs
],
1454 sourceChar
, &targetValue
,
1456 if(0x80 <= targetValue
&& targetValue
<= 0xff) {
1457 targetValue
-= 0x80;
1464 MBCS_FROM_UCHAR32_ISO2022(
1465 converterData
->myConverterArray
[cs
],
1466 sourceChar
, &targetValue
,
1467 useFallback
, &len
, MBCS_OUTPUT_2
);
1476 outLen
= 0; /* count output bytes */
1478 /* write SI if necessary (only for JIS7) */
1479 if(pFromU2022State
->g
== 1 && g
== 0) {
1480 buffer
[outLen
++] = UCNV_SI
;
1481 pFromU2022State
->g
= 0;
1484 /* write the designation sequence if necessary */
1485 if(cs
!= pFromU2022State
->cs
[g
]) {
1486 int32_t escLen
= escSeqCharsLen
[cs
];
1487 uprv_memcpy(buffer
+ outLen
, escSeqChars
[cs
], escLen
);
1489 pFromU2022State
->cs
[g
] = cs
;
1491 /* invalidate the choices[] */
1495 /* write the shift sequence if necessary */
1496 if(g
!= pFromU2022State
->g
) {
1498 /* case 0 handled before writing escapes */
1500 buffer
[outLen
++] = UCNV_SO
;
1501 pFromU2022State
->g
= 1;
1503 default: /* case 2 */
1504 buffer
[outLen
++] = 0x1b;
1505 buffer
[outLen
++] = 0x4e;
1507 /* no case 3: no SS3 in ISO-2022-JP-x */
1511 /* write the output bytes */
1513 buffer
[outLen
++] = (char)targetValue
;
1514 } else /* len == 2 */ {
1515 buffer
[outLen
++] = (char)(targetValue
>> 8);
1516 buffer
[outLen
++] = (char)targetValue
;
1520 * if we cannot find the character after checking all codepages
1521 * then this is an error
1523 *err
= U_INVALID_CHAR_FOUND
;
1524 args
->converter
->fromUChar32
=sourceChar
;
1528 if(sourceChar
== CR
|| sourceChar
== LF
) {
1529 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1530 pFromU2022State
->cs
[2] = 0;
1534 /* output outLen>0 bytes in buffer[] */
1536 *target
++ = buffer
[0];
1538 *offsets
++ = source
- args
->source
- 1; /* -1: known to be ASCII */
1540 } else if(outLen
== 2 && (target
+ 2) <= targetLimit
) {
1541 *target
++ = buffer
[0];
1542 *target
++ = buffer
[1];
1544 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
1545 *offsets
++ = sourceIndex
;
1546 *offsets
++ = sourceIndex
;
1549 ucnv_fromUWriteBytes(
1552 (char **)&target
, (const char *)targetLimit
,
1553 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
1555 if(U_FAILURE(*err
)) {
1559 } /* end if(myTargetIndex<myTargetLength) */
1561 *err
=U_BUFFER_OVERFLOW_ERROR
;
1565 }/* end while(mySourceIndex<mySourceLength) */
1568 * the end of the input stream and detection of truncated input
1569 * are handled by the framework, but for ISO-2022-JP conversion
1570 * we need to be in ASCII mode at the very end
1574 * in SO mode or not in ASCII mode
1575 * end of input and no truncated input
1577 if( U_SUCCESS(*err
) &&
1578 (pFromU2022State
->g
!=0 || pFromU2022State
->cs
[0]!=ASCII
) &&
1579 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
1581 int32_t sourceIndex
;
1585 if(pFromU2022State
->g
!= 0) {
1586 buffer
[outLen
++] = UCNV_SI
;
1587 pFromU2022State
->g
= 0;
1590 if(pFromU2022State
->cs
[0] != ASCII
) {
1591 int32_t escLen
= escSeqCharsLen
[ASCII
];
1592 uprv_memcpy(buffer
+ outLen
, escSeqChars
[ASCII
], escLen
);
1594 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
1597 /* get the source index of the last input character */
1599 * TODO this would be simpler and more reliable if we used a pair
1600 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1601 * so that we could simply use the prevSourceIndex here;
1602 * this code gives an incorrect result for the rare case of an unmatched
1603 * trail surrogate that is alone in the last buffer of the text stream
1605 sourceIndex
=(int32_t)(source
-args
->source
);
1608 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
1609 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
1617 ucnv_fromUWriteBytes(
1620 (char **)&target
, (const char *)targetLimit
,
1621 &offsets
, sourceIndex
,
1625 /*save the state and return */
1626 args
->source
= source
;
1627 args
->target
= (char*)target
;
1630 /*************** to unicode *******************/
1633 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
1636 const char *mySource
= (char *) args
->source
;
1637 UChar
*myTarget
= args
->target
;
1638 const char *mySourceLimit
= args
->sourceLimit
;
1639 uint32_t targetUniChar
= 0x0000;
1640 uint32_t mySourceChar
= 0x0000;
1641 UConverterDataISO2022
* myData
;
1642 ISO2022State
*pToU2022State
;
1645 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
1646 pToU2022State
= &myData
->toU2022State
;
1648 if(myData
->key
!= 0) {
1649 /* continue with a partial escape sequence */
1651 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
1652 /* continue with a partial double-byte character */
1653 mySourceChar
= args
->converter
->toUBytes
[0];
1654 args
->converter
->toULength
= 0;
1655 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
1659 while(mySource
< mySourceLimit
){
1661 targetUniChar
=missingCharMarker
;
1663 if(myTarget
< args
->targetLimit
){
1665 mySourceChar
= (unsigned char) *mySource
++;
1667 switch(mySourceChar
) {
1669 if(myData
->version
==3) {
1673 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1678 if(myData
->version
==3) {
1679 /* JIS7: switch to G1 half-width Katakana */
1680 pToU2022State
->cs
[1] = (int8_t)HWKANA_7BIT
;
1684 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1691 changeState_2022(args
->converter
,&(mySource
),
1692 mySourceLimit
, ISO_2022_JP
,err
);
1694 /* invalid or illegal escape sequence */
1695 if(U_FAILURE(*err
)){
1696 args
->target
= myTarget
;
1697 args
->source
= mySource
;
1702 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1707 /* automatically reset to single-byte mode */
1708 if((StateEnum
)pToU2022State
->cs
[0] != ASCII
&& (StateEnum
)pToU2022State
->cs
[0] != JISX201
) {
1709 pToU2022State
->cs
[0] = (int8_t)ASCII
;
1711 pToU2022State
->cs
[2] = 0;
1712 pToU2022State
->g
= 0;
1715 /* convert one or two bytes */
1716 cs
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
1717 if( (uint8_t)(mySourceChar
- 0xa1) <= (0xdf - 0xa1) && myData
->version
==4 &&
1720 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1721 targetUniChar
= mySourceChar
+ (0xff61 - 0xa1);
1723 /* return from a single-shift state to the previous one */
1724 if(pToU2022State
->g
>= 2) {
1725 pToU2022State
->g
=pToU2022State
->prevG
;
1729 if(mySourceChar
<= 0x7f) {
1730 targetUniChar
= mySourceChar
;
1734 if(mySourceChar
<= 0x7f) {
1735 targetUniChar
= mySourceChar
+ 0x80;
1737 /* return from a single-shift state to the previous one */
1738 pToU2022State
->g
=pToU2022State
->prevG
;
1741 if(mySourceChar
<= 0x7f) {
1742 /* convert mySourceChar+0x80 to use a normal 8-bit table */
1744 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1745 myData
->myConverterArray
[cs
],
1746 mySourceChar
+ 0x80);
1748 /* return from a single-shift state to the previous one */
1749 pToU2022State
->g
=pToU2022State
->prevG
;
1752 if(mySourceChar
<= 0x7f) {
1754 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1755 myData
->myConverterArray
[cs
],
1760 if((uint8_t)(mySourceChar
- 0x21) <= (0x5f - 0x21)) {
1761 /* 7-bit halfwidth Katakana */
1762 targetUniChar
= mySourceChar
+ (0xff61 - 0x21);
1767 if(mySource
< mySourceLimit
) {
1770 tempBuf
[0] = (char) (mySourceChar
);
1771 tempBuf
[1] = trailByte
= *mySource
++;
1772 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
1773 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(myData
->myConverterArray
[cs
], tempBuf
, 2, FALSE
);
1775 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
1776 args
->converter
->toULength
= 1;
1782 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
1784 args
->offsets
[myTarget
- args
->target
]= mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2);
1786 *(myTarget
++)=(UChar
)targetUniChar
;
1788 else if(targetUniChar
> missingCharMarker
){
1789 /* disassemble the surrogate pair and write to output*/
1790 targetUniChar
-=0x0010000;
1791 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
1793 args
->offsets
[myTarget
- args
->target
]= mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2);
1796 if(myTarget
< args
->targetLimit
){
1797 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
1799 args
->offsets
[myTarget
- args
->target
]= mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2);
1803 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
1804 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
1809 /* Call the callback function*/
1810 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
1815 *err
=U_BUFFER_OVERFLOW_ERROR
;
1820 args
->target
= myTarget
;
1821 args
->source
= mySource
;
1825 /***************************************************************
1826 * Rules for ISO-2022-KR encoding
1827 * i) The KSC5601 designator sequence should appear only once in a file,
1828 * at the beginning of a line before any KSC5601 characters. This usually
1829 * means that it appears by itself on the first line of the file
1830 * ii) There are only 2 shifting sequences SO to shift into double byte mode
1831 * and SI to shift into single byte mode
1834 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
1836 UConverter
* saveConv
= args
->converter
;
1837 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*)saveConv
->extraInfo
;
1838 args
->converter
=myConverterData
->currentConverter
;
1840 myConverterData
->currentConverter
->fromUChar32
= saveConv
->fromUChar32
;
1841 ucnv_MBCSFromUnicodeWithOffsets(args
,err
);
1842 saveConv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
1844 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
1845 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
1847 saveConv
->charErrorBuffer
,
1848 myConverterData
->currentConverter
->charErrorBuffer
,
1849 myConverterData
->currentConverter
->charErrorBufferLength
);
1851 saveConv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
1852 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
1854 args
->converter
=saveConv
;
1858 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
1860 const UChar
*source
= args
->source
;
1861 const UChar
*sourceLimit
= args
->sourceLimit
;
1862 unsigned char *target
= (unsigned char *) args
->target
;
1863 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
1864 int32_t* offsets
= args
->offsets
;
1865 uint32_t targetByteUnit
= 0x0000;
1866 UChar32 sourceChar
= 0x0000;
1867 UBool isTargetByteDBCS
;
1868 UBool oldIsTargetByteDBCS
;
1869 UConverterDataISO2022
*converterData
;
1870 UConverterSharedData
* sharedData
;
1874 converterData
=(UConverterDataISO2022
*)args
->converter
->extraInfo
;
1875 /* if the version is 1 then the user is requesting
1876 * conversion with ibm-25546 pass the arguments to
1877 * MBCS converter and return
1879 if(converterData
->version
==1){
1880 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
1884 /* initialize data */
1885 sharedData
= converterData
->currentConverter
->sharedData
;
1886 useFallback
= args
->converter
->useFallback
;
1887 isTargetByteDBCS
=(UBool
)args
->converter
->fromUnicodeStatus
;
1888 oldIsTargetByteDBCS
= isTargetByteDBCS
;
1890 isTargetByteDBCS
= (UBool
) args
->converter
->fromUnicodeStatus
;
1891 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
<targetLimit
) {
1894 while(source
< sourceLimit
){
1896 targetByteUnit
= missingCharMarker
;
1898 if(target
< (unsigned char*) args
->targetLimit
){
1899 sourceChar
= *source
++;
1900 /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
1901 sourceChar,&targetByteUnit,args->converter->useFallback);*/
1902 MBCS_FROM_UCHAR32_ISO2022(sharedData
,sourceChar
,&targetByteUnit
,useFallback
,&length
,MBCS_OUTPUT_2
);
1903 /* only DBCS or SBCS characters are expected*/
1904 /* DB characters with high bit set to 1 are expected */
1905 if(length
> 2 || length
==0 ||(((targetByteUnit
& 0x8080) != 0x8080)&& length
==2)){
1906 targetByteUnit
=missingCharMarker
;
1908 if (targetByteUnit
!= missingCharMarker
){
1910 oldIsTargetByteDBCS
= isTargetByteDBCS
;
1911 isTargetByteDBCS
= (UBool
)(targetByteUnit
>0x00FF);
1912 /* append the shift sequence */
1913 if (oldIsTargetByteDBCS
!= isTargetByteDBCS
){
1915 if (isTargetByteDBCS
)
1916 *target
++ = UCNV_SO
;
1918 *target
++ = UCNV_SI
;
1920 *(offsets
++)= source
- args
->source
-1;
1922 /* write the targetUniChar to target */
1923 if(targetByteUnit
<= 0x00FF){
1924 if( target
< targetLimit
){
1925 *(target
++) = (unsigned char) targetByteUnit
;
1927 *(offsets
++) = source
- args
->source
-1;
1931 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
);
1932 *err
= U_BUFFER_OVERFLOW_ERROR
;
1935 if(target
< targetLimit
){
1936 *(target
++) =(unsigned char) ((targetByteUnit
>>8) -0x80);
1938 *(offsets
++) = source
- args
->source
-1;
1940 if(target
< targetLimit
){
1941 *(target
++) =(unsigned char) (targetByteUnit
-0x80);
1943 *(offsets
++) = source
- args
->source
-1;
1946 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
1947 *err
= U_BUFFER_OVERFLOW_ERROR
;
1950 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) ((targetByteUnit
>>8) -0x80);
1951 args
->converter
->charErrorBuffer
[args
->converter
->charErrorBufferLength
++] = (unsigned char) (targetByteUnit
-0x80);
1952 *err
= U_BUFFER_OVERFLOW_ERROR
;
1958 /* oops.. the code point is unassigned
1959 * set the error and reason
1962 /*check if the char is a First surrogate*/
1963 if(UTF_IS_SURROGATE(sourceChar
)) {
1964 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
1966 /*look ahead to find the trail surrogate*/
1967 if(source
< sourceLimit
) {
1968 /* test the following code unit */
1969 UChar trail
=(UChar
) *source
;
1970 if(UTF_IS_SECOND_SURROGATE(trail
)) {
1972 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
1973 *err
= U_INVALID_CHAR_FOUND
;
1974 /* convert this surrogate code point */
1975 /* exit this condition tree */
1977 /* this is an unmatched lead code unit (1st surrogate) */
1978 /* callback(illegal) */
1979 *err
=U_ILLEGAL_CHAR_FOUND
;
1983 *err
= U_ZERO_ERROR
;
1986 /* this is an unmatched trail code unit (2nd surrogate) */
1987 /* callback(illegal) */
1988 *err
=U_ILLEGAL_CHAR_FOUND
;
1991 /* callback(unassigned) for a BMP code point */
1992 *err
= U_INVALID_CHAR_FOUND
;
1995 args
->converter
->fromUChar32
=sourceChar
;
1996 args
->converter
->fromUnicodeStatus
= (int32_t)isTargetByteDBCS
;
1999 } /* end if(myTargetIndex<myTargetLength) */
2001 *err
=U_BUFFER_OVERFLOW_ERROR
;
2005 }/* end while(mySourceIndex<mySourceLength) */
2008 * the end of the input stream and detection of truncated input
2009 * are handled by the framework, but for ISO-2022-KR conversion
2010 * we need to be in ASCII mode at the very end
2015 * end of input and no truncated input
2017 if( U_SUCCESS(*err
) &&
2019 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2021 int32_t sourceIndex
;
2023 /* we are switching to ASCII */
2024 isTargetByteDBCS
=FALSE
;
2026 /* get the source index of the last input character */
2028 * TODO this would be simpler and more reliable if we used a pair
2029 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2030 * so that we could simply use the prevSourceIndex here;
2031 * this code gives an incorrect result for the rare case of an unmatched
2032 * trail surrogate that is alone in the last buffer of the text stream
2034 sourceIndex
=(int32_t)(source
-args
->source
);
2037 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2038 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2046 ucnv_fromUWriteBytes(
2049 (char **)&target
, (const char *)targetLimit
,
2050 &offsets
, sourceIndex
,
2054 /*save the state and return */
2055 args
->source
= source
;
2056 args
->target
= (char*)target
;
2057 args
->converter
->fromUnicodeStatus
= (uint32_t)isTargetByteDBCS
;
2060 /************************ To Unicode ***************************************/
/*
 * ISO-2022-KR to-Unicode conversion that lets the subconverter in
 * myData->currentConverter do the actual byte-to-UChar mapping.
 * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC() calls this function when
 * myData->version==1.
 *
 * Outline, as far as the code here shows:
 *  - args is copied into a local subArgs whose converter is the subconverter;
 *  - each loop pass asks getEndOfBuffer_2022() where the current run of
 *    input ends and converts that run with ucnv_MBCSToUnicodeWithOffsets();
 *  - pending toUBytes travel to the subconverter before that call and back
 *    to the public converter afterwards;
 *  - offsets written during the run are rebased onto sourceStart;
 *  - on U_BUFFER_OVERFLOW_ERROR the UCharErrorBuffer contents are moved to
 *    the public converter;
 *  - changeState_2022() is called on the public converter
 *    (NOTE(review): the branch structure around that call is not visible
 *    in this excerpt).
 *
 * args: conversion arguments; source, target and offsets are advanced.
 * err:  in/out error code; the loop stops once it holds a failure.
 */
2063 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs
*args
,
2065 char const* sourceStart
;
2066 UConverterDataISO2022
* myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2068 UConverterToUnicodeArgs subArgs
;
2069 int32_t minArgsSize
;
2071 /* set up the subconverter arguments */
/* copy no more than the caller-declared args->size and no more than the struct holds */
2072 if(args
->size
<sizeof(UConverterToUnicodeArgs
)) {
2073 minArgsSize
= args
->size
;
2075 minArgsSize
= (int32_t)sizeof(UConverterToUnicodeArgs
);
2078 uprv_memcpy(&subArgs
, args
, minArgsSize
);
2079 subArgs
.size
= (uint16_t)minArgsSize
;
/* the copy still names the public converter; point it at the subconverter */
2080 subArgs
.converter
= myData
->currentConverter
;
2082 /* remember the original start of the input for offsets */
2083 sourceStart
= args
->source
;
2085 if(myData
->key
!= 0) {
2086 /* continue with a partial escape sequence */
2090 while(U_SUCCESS(*err
) && args
->source
< args
->sourceLimit
) {
2091 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2092 subArgs
.source
= args
->source
;
2093 subArgs
.sourceLimit
= getEndOfBuffer_2022(&(args
->source
), args
->sourceLimit
, args
->flush
);
2094 if(subArgs
.source
!= subArgs
.sourceLimit
) {
2096 * get the current partial byte sequence
2098 * it needs to be moved between the public and the subconverter
2099 * so that the conversion framework, which only sees the public
2100 * converter, can handle truncated and illegal input etc.
2102 if(args
->converter
->toULength
> 0) {
2103 uprv_memcpy(subArgs
.converter
->toUBytes
, args
->converter
->toUBytes
, args
->converter
->toULength
);
2105 subArgs
.converter
->toULength
= args
->converter
->toULength
;
2108 * Convert up to the end of the input, or to before the next escape character.
2109 * Does not handle conversion extensions because the preToU[] state etc.
2112 ucnv_MBCSToUnicodeWithOffsets(&subArgs
, err
);
2114 if(args
->offsets
!= NULL
&& sourceStart
!= args
->source
) {
2115 /* update offsets to base them on the actual start of the input */
2116 int32_t *offsets
= args
->offsets
;
2117 UChar
*target
= args
->target
;
2118 int32_t delta
= (int32_t)(args
->source
- sourceStart
);
2119 while(target
< subArgs
.target
) {
2127 args
->source
= subArgs
.source
;
2128 args
->target
= subArgs
.target
;
2129 args
->offsets
= subArgs
.offsets
;
2131 /* copy input/error/overflow buffers */
2132 if(subArgs
.converter
->toULength
> 0) {
2133 uprv_memcpy(args
->converter
->toUBytes
, subArgs
.converter
->toUBytes
, subArgs
.converter
->toULength
);
2135 args
->converter
->toULength
= subArgs
.converter
->toULength
;
2137 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2138 if(subArgs
.converter
->UCharErrorBufferLength
> 0) {
2139 uprv_memcpy(args
->converter
->UCharErrorBuffer
, subArgs
.converter
->UCharErrorBuffer
,
2140 subArgs
.converter
->UCharErrorBufferLength
);
2142 args
->converter
->UCharErrorBufferLength
=subArgs
.converter
->UCharErrorBufferLength
;
/* the pending overflow UChars were copied to the public converter, so empty the subconverter buffer */
2143 subArgs
.converter
->UCharErrorBufferLength
= 0;
2147 if (U_FAILURE(*err
) || (args
->source
== args
->sourceLimit
)) {
2152 changeState_2022(args
->converter
,
/*
 * ISO-2022-KR to-Unicode conversion with offsets.
 *
 * Converters with myData->version==1 are passed to
 * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(). Otherwise the input
 * is read byte by byte:
 *  - UCNV_SI sets toU2022State.g to 0, UCNV_SO sets it to 1;
 *  - ESC_2022 hands the escape sequence to changeState_2022(); on failure
 *    the target and source positions are stored back into args;
 *  - with g==1 a trail byte is read and the pair is looked up with
 *    ucnv_MBCSSimpleGetNextUChar(); a pair with a byte above 0x7f gets
 *    missingCharMarker instead;
 *  - other bytes are looked up as single bytes.
 * Results below 0xfffe are stored in the target together with their offset;
 * toUnicodeCallback() and U_BUFFER_OVERFLOW_ERROR cover the other outcomes
 * (NOTE(review): the exact branch conditions are not visible in this excerpt).
 * A lead byte saved in toUBytes[0] (toULength==1) by an earlier call is
 * picked up again at the start.
 *
 * args: conversion arguments; source and target are stored back at the end.
 * err:  in/out error code.
 */
2161 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2164 const char *mySource
= ( char *) args
->source
;
2165 UChar
*myTarget
= args
->target
;
2166 const char *mySourceLimit
= args
->sourceLimit
;
2167 UChar32 targetUniChar
= 0x0000;
2168 UChar mySourceChar
= 0x0000;
2169 UConverterDataISO2022
* myData
;
2170 UConverterSharedData
* sharedData
;
2173 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
/* version 1 is routed to the subconverter-based _IBM variant */
2174 if(myData
->version
==1){
2175 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args
,err
);
2179 /* initialize state */
2180 sharedData
= myData
->currentConverter
->sharedData
;
2181 useFallback
= args
->converter
->useFallback
;
2183 if(myData
->key
!= 0) {
2184 /* continue with a partial escape sequence */
2186 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2187 /* continue with a partial double-byte character */
2188 mySourceChar
= args
->converter
->toUBytes
[0];
2189 args
->converter
->toULength
= 0;
2193 while(mySource
< mySourceLimit
){
2195 if(myTarget
< args
->targetLimit
){
2197 mySourceChar
= (unsigned char) *mySource
++;
2199 if(mySourceChar
==UCNV_SI
){
2200 myData
->toU2022State
.g
= 0;
2201 /*consume the source */
2203 }else if(mySourceChar
==UCNV_SO
){
2204 myData
->toU2022State
.g
= 1;
2205 /*consume the source */
2207 }else if(mySourceChar
==ESC_2022
){
2210 changeState_2022(args
->converter
,&(mySource
),
2211 mySourceLimit
, ISO_2022_KR
, err
);
2212 if(U_FAILURE(*err
)){
2213 args
->target
= myTarget
;
2214 args
->source
= mySource
;
2220 if(myData
->toU2022State
.g
== 1) {
2221 if(mySource
< mySourceLimit
) {
2224 trailByte
= *mySource
++;
/* tempBuf receives both bytes with 0x80 added; it is what the table lookup below is given */
2225 tempBuf
[0] = (char)(mySourceChar
+ 0x80);
2226 tempBuf
[1] = (char)(trailByte
+ 0x80);
2227 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
2228 if((mySourceChar
& 0x8080) == 0) {
2229 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, tempBuf
, 2, useFallback
);
2231 /* illegal bytes > 0x7f */
2232 targetUniChar
= missingCharMarker
;
/* NOTE(review): presumably the case without a trail byte in this buffer; the lead byte is kept for the next call */
2235 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2236 args
->converter
->toULength
= 1;
2241 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(sharedData
, mySource
- 1, 1, useFallback
);
2243 if(targetUniChar
< 0xfffe){
/* the offset points back 1 byte for a single-byte character and 2 bytes once mySourceChar holds a byte pair */
2245 args
->offsets
[myTarget
- args
->target
]= mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2);
2247 *(myTarget
++)=(UChar
)targetUniChar
;
2250 /* Call the callback function*/
2251 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2256 *err
=U_BUFFER_OVERFLOW_ERROR
;
2260 args
->target
= myTarget
;
2261 args
->source
= mySource
;
2264 /*************************** END ISO2022-KR *********************************/
2266 /*************************** ISO-2022-CN *********************************
2268 * Rules for ISO-2022-CN Encoding:
2269 * i) The designator sequence must appear once on a line before any instance
2270 * of character set it designates.
2271 * ii) If two lines contain characters from the same character set, both lines
2272 * must include the designator sequence.
2273 * iii) Once the designator sequence is known, a shifting sequence has to be found
2274 * to invoke the shifting
2275 * iv) All lines start in ASCII and end in ASCII.
2276 * v) Four shifting sequences are employed for this purpose:
2278 * Sequence ASCII Eq Charsets
2279 * ---------- ------- ---------
2281 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2282 * SS2 <ESC>N CNS-11643-1992 Plane 2
2283 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2286 * SOdesignator : ESC "$" ")" finalchar_for_SO
2287 * SS2designator : ESC "$" "*" finalchar_for_SS2
2288 * SS3designator : ESC "$" "+" finalchar_for_SS3
2290 * ESC $ ) A Indicates the bytes following SO are Chinese
2291 * characters as defined in GB 2312-80, until
2292 * another SOdesignation appears
2295 * ESC $ ) E Indicates the bytes following SO are as defined
2296 * in ISO-IR-165 (for details, see section 2.1),
2297 * until another SOdesignation appears
2299 * ESC $ ) G Indicates the bytes following SO are as defined
2300 * in CNS 11643-plane-1, until another
2301 * SOdesignation appears
2303 * ESC $ * H Indicates the two bytes immediately following
2304 * SS2 is a Chinese character as defined in CNS
2305 * 11643-plane-2, until another SS2designation
2307 * (Meaning <ESC>N must precede every 2 byte
2310 * ESC $ + I Indicates the immediate two bytes following SS3
2311 * is a Chinese character as defined in CNS
2312 * 11643-plane-3, until another SS3designation
2314 * (Meaning <ESC>O must precede every 2 byte
2317 * ESC $ + J Indicates the immediate two bytes following SS3
2318 * is a Chinese character as defined in CNS
2319 * 11643-plane-4, until another SS3designation
2321 * (In English: <ESC>O must precede every 2 byte
2324 * ESC $ + K Indicates the immediate two bytes following SS3
2325 * is a Chinese character as defined in CNS
2326 * 11643-plane-5, until another SS3designation
2329 * ESC $ + L Indicates the immediate two bytes following SS3
2330 * is a Chinese character as defined in CNS
2331 * 11643-plane-6, until another SS3designation
2334 * ESC $ + M Indicates the immediate two bytes following SS3
2335 * is a Chinese character as defined in CNS
2336 * 11643-plane-7, until another SS3designation
2339 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2340 * has its own designation information before any Chinese characters
2345 /* The following are defined this way to make the strings truly read-only */
/*
 * Designator escape sequences for ISO-2022-CN, spelled as explicit byte
 * values: 0x1B is ESC, 0x24 is the dollar sign, 0x29, 0x2A and 0x2B are the
 * right parenthesis, asterisk and plus sign that select the SO, SS2 and SS3
 * designators, and the last byte is the final character listed in the
 * ISO-2022-CN rules comment earlier in this file.
 */
/* ESC $ ) A: GB 2312-80 for SO */
2346 static const char GB_2312_80_STR
[] = "\x1B\x24\x29\x41";
/* ESC $ ) E: ISO-IR-165 for SO */
2347 static const char ISO_IR_165_STR
[] = "\x1B\x24\x29\x45";
/* ESC $ ) G: CNS 11643 plane 1 for SO */
2348 static const char CNS_11643_1992_Plane_1_STR
[] = "\x1B\x24\x29\x47";
/* ESC $ * H: CNS 11643 plane 2 for SS2 */
2349 static const char CNS_11643_1992_Plane_2_STR
[] = "\x1B\x24\x2A\x48";
/* ESC $ + I: CNS 11643 plane 3 for SS3 */
2350 static const char CNS_11643_1992_Plane_3_STR
[] = "\x1B\x24\x2B\x49";
/* ESC $ + J: CNS 11643 plane 4 for SS3 */
2351 static const char CNS_11643_1992_Plane_4_STR
[] = "\x1B\x24\x2B\x4A";
/* ESC $ + K: CNS 11643 plane 5 for SS3 */
2352 static const char CNS_11643_1992_Plane_5_STR
[] = "\x1B\x24\x2B\x4B";
/* ESC $ + L: CNS 11643 plane 6 for SS3 */
2353 static const char CNS_11643_1992_Plane_6_STR
[] = "\x1B\x24\x2B\x4C";
/* ESC $ + M: CNS 11643 plane 7 for SS3 */
2354 static const char CNS_11643_1992_Plane_7_STR
[] = "\x1B\x24\x2B\x4D";
2356 /********************** ISO2022-CN Data **************************/
/*
 * Escape sequences used when writing ISO-2022-CN output.
 * UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC() copies 4 bytes from
 * escSeqCharsCN[cs] for charsets below CNS_11643 and from
 * escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)] for the CNS planes, so the
 * plane 1..7 entries have to follow each other in ascending order starting
 * at index CNS_11643.
 * NOTE(review): two entries between the ASCII entry and the plane 1 entry
 * are not visible in this excerpt.
 */
2357 static const char* const escSeqCharsCN
[10] ={
2358 SHIFT_IN_STR
, /* ASCII */
2361 CNS_11643_1992_Plane_1_STR
,
2362 CNS_11643_1992_Plane_2_STR
,
2363 CNS_11643_1992_Plane_3_STR
,
2364 CNS_11643_1992_Plane_4_STR
,
2365 CNS_11643_1992_Plane_5_STR
,
2366 CNS_11643_1992_Plane_6_STR
,
2367 CNS_11643_1992_Plane_7_STR
/*
 * Converts UTF-16 text from args->source into ISO-2022-CN bytes at
 * args->target, optionally recording offsets.
 *
 * What the visible code does:
 *  - a code point left in converter->fromUChar32 by an earlier call is
 *    picked up first when there is room in the target;
 *  - a lead surrogate followed by a trail surrogate is combined with
 *    UTF16_GET_PAIR_VALUE(); unmatched surrogates set U_ILLEGAL_CHAR_FOUND
 *    and are stored in fromUChar32;
 *  - code points up to 0x7f are written as they are, preceded by UCNV_SI
 *    when the state is not g==0; CR and LF clear the whole fromU2022State;
 *  - other code points are tried against the charsets in choices[];
 *    a designator from escSeqCharsCN and a shift (UCNV_SO, ESC 0x4e or
 *    ESC 0x4f) are written when the state requires them, followed by the
 *    two character bytes;
 *  - a code point found in no charset sets U_INVALID_CHAR_FOUND;
 *  - output of 1 or 2 bytes that fits is stored directly, other output goes
 *    through ucnv_fromUWriteBytes(); U_BUFFER_OVERFLOW_ERROR is set as well
 *    (NOTE(review): its exact condition is not visible in this excerpt);
 *  - at a flushed, complete end of input with g!=0 the state is switched
 *    back to ASCII.
 * Finally source and target are stored back into args.
 *
 * args: conversion arguments.
 * err:  in/out error code.
 */
2371 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs
* args
, UErrorCode
* err
){
2373 UConverterDataISO2022
*converterData
;
2374 ISO2022State
*pFromU2022State
;
2375 uint8_t *target
= (uint8_t *) args
->target
;
2376 const uint8_t *targetLimit
= (const uint8_t *) args
->targetLimit
;
2377 const UChar
* source
= args
->source
;
2378 const UChar
* sourceLimit
= args
->sourceLimit
;
2379 int32_t* offsets
= args
->offsets
;
2384 int32_t choiceCount
;
2385 uint32_t targetValue
;
2388 /* set up the state */
2389 converterData
= (UConverterDataISO2022
*)args
->converter
->extraInfo
;
2390 pFromU2022State
= &converterData
->fromU2022State
;
2391 useFallback
= args
->converter
->useFallback
;
2395 /* check if the last codepoint of previous buffer was a lead surrogate*/
2396 if((sourceChar
= args
->converter
->fromUChar32
)!=0 && target
< targetLimit
) {
2400 while( source
< sourceLimit
){
2401 if(target
< targetLimit
){
2403 sourceChar
= *(source
++);
2404 /*check if the char is a First surrogate*/
2405 if(UTF_IS_SURROGATE(sourceChar
)) {
2406 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
2408 /*look ahead to find the trail surrogate*/
2409 if(source
< sourceLimit
) {
2410 /* test the following code unit */
2411 UChar trail
=(UChar
) *source
;
2412 if(UTF_IS_SECOND_SURROGATE(trail
)) {
2414 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
2415 args
->converter
->fromUChar32
=0x00;
2416 /* convert this supplementary code point */
2417 /* exit this condition tree */
2419 /* this is an unmatched lead code unit (1st surrogate) */
2420 /* callback(illegal) */
2421 *err
=U_ILLEGAL_CHAR_FOUND
;
2422 args
->converter
->fromUChar32
=sourceChar
;
2427 args
->converter
->fromUChar32
=sourceChar
;
2431 /* this is an unmatched trail code unit (2nd surrogate) */
2432 /* callback(illegal) */
2433 *err
=U_ILLEGAL_CHAR_FOUND
;
2434 args
->converter
->fromUChar32
=sourceChar
;
2439 /* do the conversion */
2440 if(sourceChar
<= 0x007f ){
2442 if(pFromU2022State
->g
== 0) {
2443 buffer
[0] = (char)sourceChar
;
2446 buffer
[0] = UCNV_SI
;
2447 buffer
[1] = (char)sourceChar
;
2449 pFromU2022State
->g
= 0;
2452 if(sourceChar
== CR
|| sourceChar
== LF
) {
2453 /* reset the state at the end of a line */
2454 uprv_memset(pFromU2022State
, 0, sizeof(ISO2022State
));
2459 /* convert U+0080..U+10ffff */
2460 UConverterSharedData
*cnv
;
/* choices[] is only (re)built while choiceCount is 0 */
2464 if(choiceCount
== 0) {
2465 /* try the current SO/G1 converter first */
2466 choices
[0] = pFromU2022State
->cs
[1];
2468 /* default to GB2312_1 if none is designated yet */
2469 if(choices
[0] == 0) {
2470 choices
[0] = GB2312_1
;
2473 if(converterData
->version
== 0) {
2476 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2477 if(choices
[0] == GB2312_1
) {
2478 choices
[1] = (int8_t)CNS_11643_1
;
2480 choices
[1] = (int8_t)GB2312_1
;
2485 /* ISO-2022-CN-EXT */
2487 /* try one of the other converters */
2488 switch(choices
[0]) {
2490 choices
[1] = (int8_t)CNS_11643_1
;
2491 choices
[2] = (int8_t)ISO_IR_165
;
2494 choices
[1] = (int8_t)GB2312_1
;
2495 choices
[2] = (int8_t)CNS_11643_1
;
2497 default: /* CNS_11643_x */
2498 choices
[1] = (int8_t)GB2312_1
;
2499 choices
[2] = (int8_t)ISO_IR_165
;
2510 for(i
= 0; i
< choiceCount
&& len
== 0; ++i
) {
2513 if(cs
> CNS_11643_0
) {
2514 cnv
= converterData
->myConverterArray
[CNS_11643
];
2515 MBCS_FROM_UCHAR32_ISO2022(cnv
,sourceChar
,&targetValue
,useFallback
,&len
,MBCS_OUTPUT_3
);
/* bits 16 and up of the 3-byte result select the charset: 0x81 yields CNS_11643_0 + 1, and so on */
2517 cs
= (int8_t)(CNS_11643_0
+ (targetValue
>> 16) - 0x80);
2519 if(cs
== CNS_11643_1
) {
2521 } else if(cs
== CNS_11643_2
) {
2523 } else /* plane 3..7 */ if(converterData
->version
== 1) {
2526 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2531 /* GB2312_1 or ISO-IR-165 */
2532 cnv
= converterData
->myConverterArray
[cs
];
2533 MBCS_FROM_UCHAR32_ISO2022(cnv
,sourceChar
,&targetValue
,useFallback
,&len
,MBCS_OUTPUT_2
);
2534 g
= 1; /* used if len == 2 */
2540 len
= 0; /* count output bytes; it must have been len == 2 */
2542 /* write the designation sequence if necessary */
2543 if(cs
!= pFromU2022State
->cs
[g
]) {
2544 if(cs
< CNS_11643
) {
2545 uprv_memcpy(buffer
, escSeqCharsCN
[cs
], 4);
2547 uprv_memcpy(buffer
, escSeqCharsCN
[CNS_11643
+ (cs
- CNS_11643_1
)], 4);
2550 pFromU2022State
->cs
[g
] = cs
;
2552 /* changing the SO/G1 charset invalidates the choices[] */
2557 /* write the shift sequence if necessary */
2558 if(g
!= pFromU2022State
->g
) {
2561 buffer
[len
++] = UCNV_SO
;
2563 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2564 pFromU2022State
->g
= 1;
2567 buffer
[len
++] = 0x1b;
2568 buffer
[len
++] = 0x4e;
2570 default: /* case 3 */
2571 buffer
[len
++] = 0x1b;
2572 buffer
[len
++] = 0x4f;
2577 /* write the two output bytes */
2578 buffer
[len
++] = (char)(targetValue
>> 8);
2579 buffer
[len
++] = (char)targetValue
;
2581 /* if we cannot find the character after checking all codepages
2582 * then this is an error
2584 *err
= U_INVALID_CHAR_FOUND
;
2585 args
->converter
->fromUChar32
=sourceChar
;
2590 /* output len>0 bytes in buffer[] */
2592 *target
++ = buffer
[0];
2594 *offsets
++ = source
- args
->source
- 1; /* -1: known to be ASCII */
2596 } else if(len
== 2 && (target
+ 2) <= targetLimit
) {
2597 *target
++ = buffer
[0];
2598 *target
++ = buffer
[1];
/* both output bytes get the index where the source character (1 or 2 UChars long) starts */
2600 int32_t sourceIndex
= (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
));
2601 *offsets
++ = sourceIndex
;
2602 *offsets
++ = sourceIndex
;
2605 ucnv_fromUWriteBytes(
2608 (char **)&target
, (const char *)targetLimit
,
2609 &offsets
, (int32_t)(source
- args
->source
- U16_LENGTH(sourceChar
)),
2611 if(U_FAILURE(*err
)) {
2615 } /* end if(myTargetIndex<myTargetLength) */
2617 *err
=U_BUFFER_OVERFLOW_ERROR
;
2621 }/* end while(mySourceIndex<mySourceLength) */
2624 * the end of the input stream and detection of truncated input
2625 * are handled by the framework, but for ISO-2022-CN conversion
2626 * we need to be in ASCII mode at the very end
2631 * end of input and no truncated input
2633 if( U_SUCCESS(*err
) &&
2634 pFromU2022State
->g
!=0 &&
2635 args
->flush
&& source
>=sourceLimit
&& args
->converter
->fromUChar32
==0
2637 int32_t sourceIndex
;
2639 /* we are switching to ASCII */
2640 pFromU2022State
->g
=0;
2642 /* get the source index of the last input character */
2644 * TODO this would be simpler and more reliable if we used a pair
2645 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2646 * so that we could simply use the prevSourceIndex here;
2647 * this code gives an incorrect result for the rare case of an unmatched
2648 * trail surrogate that is alone in the last buffer of the text stream
2650 sourceIndex
=(int32_t)(source
-args
->source
);
2653 if( U16_IS_TRAIL(args
->source
[sourceIndex
]) &&
2654 (sourceIndex
==0 || U16_IS_LEAD(args
->source
[sourceIndex
-1]))
2662 ucnv_fromUWriteBytes(
2665 (char **)&target
, (const char *)targetLimit
,
2666 &offsets
, sourceIndex
,
2670 /*save the state and return */
2671 args
->source
= source
;
2672 args
->target
= (char*)target
;
/*
 * Converts ISO-2022-CN bytes from args->source into UTF-16 at args->target,
 * optionally recording offsets.
 *
 * What the visible code does:
 *  - a lead byte saved in toUBytes[0] (toULength==1) by an earlier call is
 *    picked up first;
 *  - escape sequences are passed to changeState_2022() with ISO_2022_CN;
 *    on failure the positions are stored back into args;
 *  - one statement clears the whole toU2022State with uprv_memset()
 *    (NOTE(review): presumably for CR/LF as in the from-Unicode direction;
 *    the case labels are not visible in this excerpt);
 *  - with g!=0 two bytes are looked up in the charset cs[g]; charsets above
 *    CNS_11643_0 share the myConverterArray[CNS_11643] table;
 *  - after a single shift (g>=2) the state returns to prevG;
 *  - bytes up to 0x7f map to themselves (apparently the g==0 case);
 *  - results below 0xfffe are stored directly, results above
 *    missingCharMarker are stored as a surrogate pair, and
 *    toUnicodeCallback() covers the remaining outcomes.
 * Finally target and source are stored back into args.
 *
 * args: conversion arguments.
 * err:  in/out error code.
 */
2677 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
2680 const char *mySource
= (char *) args
->source
;
2681 UChar
*myTarget
= args
->target
;
2682 const char *mySourceLimit
= args
->sourceLimit
;
2683 uint32_t targetUniChar
= 0x0000;
2684 uint32_t mySourceChar
= 0x0000;
2685 UConverterDataISO2022
* myData
;
2686 ISO2022State
*pToU2022State
;
2688 myData
=(UConverterDataISO2022
*)(args
->converter
->extraInfo
);
2689 pToU2022State
= &myData
->toU2022State
;
2691 if(myData
->key
!= 0) {
2692 /* continue with a partial escape sequence */
2694 } else if(args
->converter
->toULength
== 1 && mySource
< mySourceLimit
&& myTarget
< args
->targetLimit
) {
2695 /* continue with a partial double-byte character */
2696 mySourceChar
= args
->converter
->toUBytes
[0];
2697 args
->converter
->toULength
= 0;
2701 while(mySource
< mySourceLimit
){
2703 targetUniChar
=missingCharMarker
;
2705 if(myTarget
< args
->targetLimit
){
2707 mySourceChar
= (unsigned char) *mySource
++;
2709 switch(mySourceChar
){
2715 if(pToU2022State
->cs
[1] != 0) {
2719 /* illegal to have SO before a matching designator */
2726 changeState_2022(args
->converter
,&(mySource
),
2727 mySourceLimit
, ISO_2022_CN
,err
);
2729 /* invalid or illegal escape sequence */
2730 if(U_FAILURE(*err
)){
2731 args
->target
= myTarget
;
2732 args
->source
= mySource
;
2737 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2742 uprv_memset(pToU2022State
, 0, sizeof(ISO2022State
));
2745 /* convert one or two bytes */
2746 if(pToU2022State
->g
!= 0) {
2747 if(mySource
< mySourceLimit
) {
2748 UConverterSharedData
*cnv
;
2749 StateEnum tempState
;
2753 trailByte
= *mySource
++;
2754 tempState
= (StateEnum
)pToU2022State
->cs
[pToU2022State
->g
];
2755 if(tempState
> CNS_11643_0
) {
2756 cnv
= myData
->myConverterArray
[CNS_11643
];
/* the shared CNS table takes a leading byte of 0x80 + (tempState - CNS_11643_0) before the two input bytes */
2757 tempBuf
[0] = (char) (0x80+(tempState
-CNS_11643_0
));
2758 tempBuf
[1] = (char) (mySourceChar
);
2759 tempBuf
[2] = trailByte
;
2763 cnv
= myData
->myConverterArray
[tempState
];
2764 tempBuf
[0] = (char) (mySourceChar
);
2765 tempBuf
[1] = trailByte
;
2768 mySourceChar
= (mySourceChar
<< 8) | (uint8_t)(trailByte
);
2769 if(pToU2022State
->g
>=2) {
2770 /* return from a single-shift state to the previous one */
2771 pToU2022State
->g
=pToU2022State
->prevG
;
2773 targetUniChar
= ucnv_MBCSSimpleGetNextUChar(cnv
, tempBuf
, tempBufLen
, FALSE
);
/* NOTE(review): presumably the case without a trail byte in this buffer; the lead byte is kept for the next call */
2775 args
->converter
->toUBytes
[0] = (uint8_t)mySourceChar
;
2776 args
->converter
->toULength
= 1;
2781 if(mySourceChar
<= 0x7f) {
2782 targetUniChar
= (UChar
) mySourceChar
;
2787 if(targetUniChar
< (missingCharMarker
-1/*0xfffe*/)){
2789 args
->offsets
[myTarget
- args
->target
]= mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2);
2791 *(myTarget
++)=(UChar
)targetUniChar
;
2793 else if(targetUniChar
> missingCharMarker
){
2794 /* disassemble the surrogate pair and write to output*/
2795 targetUniChar
-=0x0010000;
2796 *myTarget
= (UChar
)(0xd800+(UChar
)(targetUniChar
>>10));
2798 args
->offsets
[myTarget
- args
->target
]= mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2);
2801 if(myTarget
< args
->targetLimit
){
2802 *myTarget
= (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2804 args
->offsets
[myTarget
- args
->target
]= mySource
- args
->source
- (mySourceChar
<= 0xff ? 1 : 2);
/* NOTE(review): apparently the branch without room in the target; the trail surrogate is parked in UCharErrorBuffer */
2808 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]=
2809 (UChar
)(0xdc00+(UChar
)(targetUniChar
&0x3ff));
2814 /* Call the callback function*/
2815 toUnicodeCallback(args
->converter
,mySourceChar
,targetUniChar
,err
);
2820 *err
=U_BUFFER_OVERFLOW_ERROR
;
2825 args
->target
= myTarget
;
2826 args
->source
= mySource
;
/*
 * Writes the substitution character for the ISO-2022 variants.
 *
 * The variant is selected by myConverterData->locale[0]. The visible code
 * first brings the from-Unicode state back to a single-byte mode:
 *  - one branch switches from g==1 to g==0 (JIS7) and resets cs[0] to ASCII
 *    unless it is already ASCII or JISX201;
 *  - one branch sets g to 0 when it is not 0 already;
 *  - one branch, for version 0, flips converter->fromUnicodeStatus between
 *    SBCS (0) and DBCS (1); the else-comment ties the DBCS case to length==2.
 * Another path lets the subconverter write the subchar: args->converter is
 * temporarily replaced by currentConverter, the subchar bytes, its length
 * and fromUChar32 are handed over, ucnv_cbFromUWriteSub() is called, and
 * overflow output in charErrorBuffer is taken back into the public converter.
 * One path ends with ucnv_cbFromUWriteBytes() on a local buffer.
 * NOTE(review): case labels and several statements are not visible in this
 * excerpt, so the mapping of branches to variants is not spelled out here.
 *
 * args:        callback arguments; args->converter is restored before return
 *              on the subconverter path.
 * offsetIndex: declared in the signature; in the visible code the call to
 *              ucnv_cbFromUWriteSub() passes 0 rather than offsetIndex, and
 *              the trailing arguments of ucnv_cbFromUWriteBytes() are cut off.
 * err:         in/out error code.
 */
2830 _ISO_2022_WriteSub(UConverterFromUnicodeArgs
*args
, int32_t offsetIndex
, UErrorCode
*err
) {
2831 UConverter
*cnv
= args
->converter
;
2832 UConverterDataISO2022
*myConverterData
=(UConverterDataISO2022
*) cnv
->extraInfo
;
2833 ISO2022State
*pFromU2022State
=&myConverterData
->fromU2022State
;
2838 subchar
=(char *)cnv
->subChar
;
2839 length
=cnv
->subCharLen
; /* assume length==1 for most variants */
2842 switch(myConverterData
->locale
[0]){
2847 if(pFromU2022State
->g
== 1) {
2848 /* JIS7: switch from G1 to G0 */
2849 pFromU2022State
->g
= 0;
2853 cs
= pFromU2022State
->cs
[0];
2854 if(cs
!= ASCII
&& cs
!= JISX201
) {
2855 /* not in ASCII or JIS X 0201: switch to ASCII */
2856 pFromU2022State
->cs
[0] = (int8_t)ASCII
;
2866 if(pFromU2022State
->g
!= 0) {
2867 /* not in ASCII mode: switch to ASCII */
2868 pFromU2022State
->g
= 0;
2874 if(myConverterData
->version
== 0) {
2876 if((UBool
)args
->converter
->fromUnicodeStatus
) {
2877 /* in DBCS mode: switch to SBCS */
2878 args
->converter
->fromUnicodeStatus
= 0;
2882 } else /* length == 2*/ {
2883 if(!(UBool
)args
->converter
->fromUnicodeStatus
) {
2884 /* in SBCS mode: switch to DBCS */
2885 args
->converter
->fromUnicodeStatus
= 1;
2893 /* let the subconverter write the subchar */
2894 args
->converter
= myConverterData
->currentConverter
;
/* pass subchar and length on to the subconverter */
2895 uprv_memcpy(myConverterData
->currentConverter
->subChar
, subchar
, 4);
2896 myConverterData
->currentConverter
->subCharLen
= (int8_t)length
;
/* lend the pending fromUChar32 to the subconverter for the call and take it back afterwards */
2898 myConverterData
->currentConverter
->fromUChar32
= cnv
->fromUChar32
;
2899 ucnv_cbFromUWriteSub(args
, 0, err
);
2900 cnv
->fromUChar32
= myConverterData
->currentConverter
->fromUChar32
;
2902 if(*err
== U_BUFFER_OVERFLOW_ERROR
) {
2903 if(myConverterData
->currentConverter
->charErrorBufferLength
> 0) {
2905 cnv
->charErrorBuffer
,
2906 myConverterData
->currentConverter
->charErrorBuffer
,
2907 myConverterData
->currentConverter
->charErrorBufferLength
);
2909 cnv
->charErrorBufferLength
= myConverterData
->currentConverter
->charErrorBufferLength
;
2910 myConverterData
->currentConverter
->charErrorBufferLength
= 0;
/* put back the public converter that was swapped out above */
2912 args
->converter
= cnv
;
2919 ucnv_cbFromUWriteBytes(args
,
2920 buffer
, (int32_t)(p
- buffer
),
2924 /* structure for SafeClone calculations */
/*
 * _ISO_2022_SafeClone() lays this struct over the caller-supplied buffer and
 * reports its size for preflighting. It also uses a member named cnv, which
 * is not visible in this excerpt.
 */
/* receives a copy of the source converter extraInfo (UConverterDataISO2022) */
2928 UConverterDataISO2022 mydata
;
/* storage handed to ucnv_safeClone() for the clone of cnvData->currentConverter */
2929 UConverter currentConverter
;
/*
 * Clone callback for the ISO-2022 converters.
 *
 * With *pBufferSize==0 this is a preflighting request: the needed size,
 * sizeof(struct cloneStruct), is written to *pBufferSize.
 * Otherwise stackBuffer is treated as a struct cloneStruct:
 *  - the UConverterDataISO2022 in cnv->extraInfo is copied into mydata;
 *  - a non-NULL currentConverter is cloned with ucnv_safeClone() into the
 *    currentConverter member;
 *  - every non-NULL myConverterArray entry gets its reference count
 *    incremented;
 *  - the clone has extraInfo pointed at mydata and isExtraLocal set to TRUE.
 * Returns &localClone->cnv on the visible success path.
 * NOTE(review): the return values for preflighting and for a failed
 * ucnv_safeClone() are not visible in this excerpt.
 */
2934 _ISO_2022_SafeClone(
2935 const UConverter
*cnv
,
2937 int32_t *pBufferSize
,
2940 struct cloneStruct
* localClone
;
2941 UConverterDataISO2022
*cnvData
;
2944 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
2945 *pBufferSize
= (int32_t)sizeof(struct cloneStruct
);
2949 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
2950 localClone
= (struct cloneStruct
*)stackBuffer
;
2952 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
2954 uprv_memcpy(&localClone
->mydata
, cnvData
, sizeof(UConverterDataISO2022
));
2956 /* share the subconverters */
2958 if(cnvData
->currentConverter
!= NULL
) {
2959 size
= (int32_t)sizeof(UConverter
);
2960 localClone
->mydata
.currentConverter
=
2961 ucnv_safeClone(cnvData
->currentConverter
,
2962 &localClone
->currentConverter
,
2964 if(U_FAILURE(*status
)) {
/* the copied mydata refers to the same myConverterArray entries, so each non-NULL one gets one more reference */
2969 for(i
=0; i
<UCNV_2022_MAX_CONVERTERS
; ++i
) {
2970 if(cnvData
->myConverterArray
[i
] != NULL
) {
2971 ucnv_incrementRefCount(cnvData
->myConverterArray
[i
]);
2975 localClone
->cnv
.extraInfo
= &localClone
->mydata
; /* set pointer to extra data */
2976 localClone
->cnv
.isExtraLocal
= TRUE
;
2977 return &localClone
->cnv
;
/*
 * Adds to the set behind sa the code points that an ISO-2022 converter
 * handles.
 *
 * Visible steps:
 *  - *pErrorCode is checked for failure first;
 *  - with U_ENABLE_GENERIC_ISO_2022 and cnv->sharedData==&_ISO2022Data the
 *    ranges 0..0xd7FF and 0xE000..0x10FFFF are added (UTF-8 is used there);
 *  - depending on cnvData->locale[0]:
 *      JP: 0..0xff when the version mask includes ISO8859_1, else 0..0x7f,
 *          plus 0xff61..0xff9f when the mask includes HWKANA_7BIT;
 *      CN: 0..0x7f;
 *      KR: the getUnicodeSet function of currentConverter is called;
 *  - every non-NULL myConverterArray entry contributes its set through
 *    ucnv_MBCSGetUnicodeSetForUnicode(), except the CNS_11643 entry of a
 *    version 0 converter whose locale[0] is c or z, which goes through
 *    ucnv_MBCSGetUnicodeSetForBytes() to add only CNS planes 1 and 2.
 * NOTE(review): case labels and early returns are not visible in this
 * excerpt; the JP/CN/KR attribution follows the existing comments.
 *
 * cnv:        converter to describe.
 * which:      requested set kind, passed on to the subconverter calls.
 * pErrorCode: in/out error code.
 */
2981 _ISO_2022_GetUnicodeSet(const UConverter
*cnv
,
2983 UConverterUnicodeSet which
,
2984 UErrorCode
*pErrorCode
)
2987 UConverterDataISO2022
* cnvData
;
2989 if (U_FAILURE(*pErrorCode
)) {
2992 #ifdef U_ENABLE_GENERIC_ISO_2022
2993 if (cnv
->sharedData
== &_ISO2022Data
) {
2994 /* We use UTF-8 in this case */
2995 sa
->addRange(sa
->set
, 0, 0xd7FF);
2996 sa
->addRange(sa
->set
, 0xE000, 0x10FFFF);
3001 cnvData
= (UConverterDataISO2022
*)cnv
->extraInfo
;
3003 /* open a set and initialize it with code points that are algorithmically round-tripped */
3004 switch(cnvData
->locale
[0]){
3006 if(jpCharsetMasks
[cnvData
->version
]&CSM(ISO8859_1
)) {
3007 /* include Latin-1 for some variants of JP */
3008 sa
->addRange(sa
->set
, 0, 0xff);
3010 /* include ASCII for JP */
3011 sa
->addRange(sa
->set
, 0, 0x7f);
3013 if(jpCharsetMasks
[cnvData
->version
]&CSM(HWKANA_7BIT
)) {
3014 /* include half-width Katakana for JP */
3015 sa
->addRange(sa
->set
, 0xff61, 0xff9f);
3020 /* include ASCII for CN */
3021 sa
->addRange(sa
->set
, 0, 0x7f);
3024 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3025 cnvData
->currentConverter
->sharedData
->impl
->getUnicodeSet(
3026 cnvData
->currentConverter
, sa
, which
, pErrorCode
);
3033 * TODO: need to make this version-specific for CN.
3034 * CN version 0 does not map CNS planes 3..7 although
3035 * they are all available in the CNS conversion table;
3036 * CN version 1 does map them all.
3037 * The two versions need to create different Unicode sets.
3039 for (i
=0; i
<UCNV_2022_MAX_CONVERTERS
; i
++) {
3040 if(cnvData
->myConverterArray
[i
]!=NULL
) {
3041 if( (cnvData
->locale
[0]=='c' || cnvData
->locale
[0]=='z') &&
3042 cnvData
->version
==0 && i
==CNS_11643
3044 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3045 ucnv_MBCSGetUnicodeSetForBytes(
3046 cnvData
->myConverterArray
[i
],
3047 sa
, UCNV_ROUNDTRIP_SET
,
3051 ucnv_MBCSGetUnicodeSetForUnicode(cnvData
->myConverterArray
[i
], sa
, which
, pErrorCode
);
/*
 * Implementation table of the generic ISO-2022 converter.
 * The conversion entries visible here follow an #ifdef
 * U_ENABLE_GENERIC_ISO_2022: the same to-Unicode function fills two
 * consecutive slots, and the from-Unicode slots use the UTF-8 functions.
 * The table ends with the shared _ISO_2022_SafeClone and
 * _ISO_2022_GetUnicodeSet entries.
 * NOTE(review): the remaining entries are not visible in this excerpt.
 */
3057 static const UConverterImpl _ISO2022Impl
={
3067 #ifdef U_ENABLE_GENERIC_ISO_2022
3068 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3069 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC
,
3070 ucnv_fromUnicode_UTF8
,
3071 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC
,
3083 _ISO_2022_SafeClone
,
3084 _ISO_2022_GetUnicodeSet
/*
 * Static data of the generic ISO-2022 converter. The first member is the
 * struct size; the visible maximum is 3 bytes per UChar, and the reserved
 * bytes are all zero.
 */
3086 static const UConverterStaticData _ISO2022StaticData
={
3087 sizeof(UConverterStaticData
),
3093 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3100 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data of the generic ISO-2022 converter; refers to
 * _ISO2022StaticData. This object is not declared static, unlike the JP, KR
 * and CN shared data in this file, and _ISO_2022_GetUnicodeSet() compares
 * cnv->sharedData against its address.
 */
3102 const UConverterSharedData _ISO2022Data
={
3103 sizeof(UConverterSharedData
),
3107 &_ISO2022StaticData
,
3113 /*************JP****************/
/*
 * Implementation table for ISO-2022-JP. Each OFFSETS_LOGIC function fills
 * two consecutive slots (NOTE(review): presumably the plain and the
 * with-offsets entry points; the struct layout is not visible here).
 * Clone and Unicode-set handling are shared with the other variants.
 */
3114 static const UConverterImpl _ISO2022JPImpl
={
3124 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3125 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3126 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3127 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC
,
3133 _ISO_2022_SafeClone
,
3134 _ISO_2022_GetUnicodeSet
/*
 * Static data for ISO-2022-JP. The first member is the struct size; the
 * visible maximum is 6 bytes per UChar, and the reserved bytes are all zero.
 */
3136 static const UConverterStaticData _ISO2022JPStaticData
={
3137 sizeof(UConverterStaticData
),
3143 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3150 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared data for ISO-2022-JP; refers to _ISO2022JPStaticData. */
3152 static const UConverterSharedData _ISO2022JPData
={
3153 sizeof(UConverterSharedData
),
3157 &_ISO2022JPStaticData
,
3163 /************* KR ***************/
/*
 * Implementation table for ISO-2022-KR. Each OFFSETS_LOGIC function fills
 * two consecutive slots (NOTE(review): presumably the plain and the
 * with-offsets entry points; the struct layout is not visible here).
 * Clone and Unicode-set handling are shared with the other variants.
 */
3164 static const UConverterImpl _ISO2022KRImpl
={
3174 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3175 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3176 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3177 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC
,
3183 _ISO_2022_SafeClone
,
3184 _ISO_2022_GetUnicodeSet
/*
 * Static data for ISO-2022-KR. The first member is the struct size; the
 * visible maximum is 3 bytes per UChar, and the reserved bytes are all zero.
 */
3186 static const UConverterStaticData _ISO2022KRStaticData
={
3187 sizeof(UConverterStaticData
),
3193 3, /* max 3 bytes per UChar: SO+DBCS */
3200 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared data for ISO-2022-KR; refers to _ISO2022KRStaticData. */
3202 static const UConverterSharedData _ISO2022KRData
={
3203 sizeof(UConverterSharedData
),
3207 &_ISO2022KRStaticData
,
3213 /*************** CN ***************/
/*
 * Implementation table for ISO-2022-CN. Each OFFSETS_LOGIC function fills
 * two consecutive slots (NOTE(review): presumably the plain and the
 * with-offsets entry points; the struct layout is not visible here).
 * Clone and Unicode-set handling are shared with the other variants.
 */
3214 static const UConverterImpl _ISO2022CNImpl
={
3225 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3226 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3227 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3228 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC
,
3234 _ISO_2022_SafeClone
,
3235 _ISO_2022_GetUnicodeSet
/*
 * Static data for ISO-2022-CN. The first member is the struct size; the
 * visible maximum is 8 bytes per UChar, and the reserved bytes are all zero.
 */
3237 static const UConverterStaticData _ISO2022CNStaticData
={
3238 sizeof(UConverterStaticData
),
3244 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3251 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared data for ISO-2022-CN; refers to _ISO2022CNStaticData. */
3253 static const UConverterSharedData _ISO2022CNData
={
3254 sizeof(UConverterSharedData
),
3258 &_ISO2022CNStaticData
,
3266 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */