]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv2022.c
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.c
1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * operand in any surrounding expression (e.g. after sizeof or before []).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
/* Shift-In (0x0F) and Shift-Out (0x0E) as NUL-terminated one-byte strings. */
static const char SHIFT_IN_STR[]  = { 0x0F, 0 };
static const char SHIFT_OUT_STR[] = { 0x0E, 0 };

/* ASCII control/whitespace byte values, in ascending order. */
#define H_TAB 0x09
#define LF    0x0A
#define V_TAB 0x0B
#define CR    0x0D
#define SPACE 0x20
86
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 *
 * The JP and CN groups deliberately reuse the small values 1..3: each group is
 * only meaningful for its own variant. Every constant is given an explicit
 * value here so that the aliasing is visible.
 */
typedef enum {
    /* shared values */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these first constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The per-plane values below are used in StateEnum and ISO2022State
     * variables, but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
125
/*
 * Is the StateEnum charset value for a DBCS charset?
 * True for the contiguous range JISX208..KSC5601.
 */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)

/* Charset mask bit: the value 1 shifted left by the charset number. */
#define CSM(cs) ((uint16_t)1<<(cs))
130
131 /*
132 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
133 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
134 *
135 * Note: The converter uses some leniency:
136 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
137 * all versions, not just JIS7 and JIS8.
138 * - ICU does not distinguish between different versions of JIS X 0208.
139 */
140 static const uint16_t jpCharsetMasks[5]={
141 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
142 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
143 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
144 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
145 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
146 };
147
/* Kind of charset/converter currently selected (see UConverterDataISO2022.currentType). */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
156
/* Designation and shift state of one conversion direction. */
typedef struct ISO2022State {
    /* charset number designated to each of SI (G0), SO (G1), SS2 (G2), SS3 (G3) */
    int8_t cs[4];
    /* currently invoked set: 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
162
163 #define UCNV_OPTIONS_VERSION_MASK 0xf
164 #define UCNV_2022_MAX_CONVERTERS 10
165
166 typedef struct{
167 UConverter *currentConverter;
168 #ifdef U_ENABLE_GENERIC_ISO_2022
169 UBool isFirstBuffer;
170 #endif
171 Cnv2022Type currentType;
172 ISO2022State toU2022State, fromU2022State;
173 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
174 uint32_t key;
175 uint32_t version;
176 char locale[3];
177 char name[30];
178 }UConverterDataISO2022;
179
180 /* Protos */
181 /* ISO-2022 ----------------------------------------------------------------- */
182
183 /*Forward declaration */
184 U_CFUNC void
185 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
186 UErrorCode * err);
187 U_CFUNC void
188 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
189 UErrorCode * err);
190
/* the ESC byte that starts every escape sequence */
#define ESC_2022 0x1B

/* Classification of the bytes collected so far for an escape sequence. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /*
     * so far matches one ISO 2022 escape sequence, but by adding more
     * characters might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
200
201 /*
202 * The way these state transition arrays work is:
203 * ex : ESC$B is the sequence for JISX208
204 * a) First Iteration: char is ESC
205 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
206 * int x = normalize_esq_chars_2022[27] which is equal to 1
207 * ii) Search for this value in escSeqStateTable_Key_2022[]
208 * value of x is stored at escSeqStateTable_Key_2022[0]
209 * iii) Save this index as offset
210 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
211 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
212 * b) Switch on this state and continue to next char
213 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
214 * which is normalize_esq_chars_2022[36] == 4
215 * ii) x is currently 1(from above)
216 * x<<=5 -- x is now 32
217 * x+=normalize_esq_chars_2022[36]
218 * now x is 36
219 * iii) Search for this value in escSeqStateTable_Key_2022[]
220 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
221 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
222 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
223 * c) Switch on this state and continue to next char
224 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
225 * ii) x is currently 36 (from above)
226 * x<<=5 -- x is now 1152
227 * x+=normalize_esq_chars_2022[66]
228 * now x is 1161
229 * iii) Search for this value in escSeqStateTable_Key_2022[]
230 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
231 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
232 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
233 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
234 */
235
236
/* Below are the arrays depicting a state transition table. */

/*
 * Maps a byte to the small number it contributes to an escape-sequence key.
 * 0 means the byte cannot occur anywhere in a recognized escape sequence.
 * Laid out 16 entries per row; the row comment gives the first byte value.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
    /* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
    /* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
    /* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xA0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xB0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xC0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xD0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xE0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xF0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
268
269 #ifdef U_ENABLE_GENERIC_ISO_2022
270 /*
271 * When the generic ISO-2022 converter is completely removed, not just disabled
272 * per #ifdef, then the following state table and the associated tables that are
273 * dimensioned with MAX_STATES_2022 should be trimmed.
274 *
275 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
276 * the associated escape sequences starting with ESC ( B should be removed.
277 * This includes the ones with key values 1097 and all of the ones above 1000000.
278 *
279 * For the latter, the tables can simply be truncated.
280 * For the former, since the tables must be kept parallel, it is probably best
281 * to simply duplicate an adjacent table cell, parallel in all tables.
282 *
283 * It may make sense to restructure the tables, especially by using small search
284 * tables for the variants instead of indexing them parallel to the table here.
285 */
286 #endif
287
/* number of entries in each of the parallel escape-sequence tables */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape-sequence prefixes, in strictly ascending
 * order so that getKey_2022() can binary-search them. The index of a key is
 * the offset used with the parallel tables. Ten entries per row; the row
 * comment gives the index of the first entry.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,       34,       36,       39,       55,       57,       60,       61,       1093,     1096,
    /* 10 */ 1097,    1098,     1099,     1100,     1101,     1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,    1154,     1157,     1160,     1161,     1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,    1773,     1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,   36943,    36944,    36945,    36946,    36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,   37711,    37744,    37745,    37746,    37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,   40140,    40141,    1123363,  35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
301
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each escape sequence, parallel to
 * escSeqStateTable_Key_2022[]; NULL where no converter name is associated.
 * Only used by the generic ISO-2022 converter. Five entries per row.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL,                   NULL,                   NULL,                   NULL,                   NULL,
    /*  5 */ NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
    /* 10 */ "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
    /* 15 */ "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
    /* 20 */ "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
    /* 25 */ NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
    /* 30 */ "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
    /* 35 */ "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
    /* 40 */ "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
    /* 45 */ "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
    /* 65 */ "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
    /* 70 */ "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
318
319 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
320 /* 0 1 2 3 4 5 6 7 8 9 */
321 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
322 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
323 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
324 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
325 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
326 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
327 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
328 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
329 };
330
331
/*
 * Identifies the ISO-2022 variant on whose behalf changeState_2022() runs.
 * The numeric values are fixed; ISO_2022 (0) exists only when the generic
 * converter is compiled in.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
341
342 /*********** ISO 2022 Converter Protos ***********/
343 static void
344 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
345
346 static void
347 _ISO2022Close(UConverter *converter);
348
349 static void
350 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
351
352 static const char*
353 _ISO2022getName(const UConverter* cnv);
354
355 static void
356 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
357
358 static UConverter *
359 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
360
361 #ifdef U_ENABLE_GENERIC_ISO_2022
362 static void
363 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
364 #endif
365
366 /*const UConverterSharedData _ISO2022Data;*/
367 static const UConverterSharedData _ISO2022JPData;
368 static const UConverterSharedData _ISO2022KRData;
369 static const UConverterSharedData _ISO2022CNData;
370
371 /*************** Converter implementations ******************/
372
373 static void
374 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
375 if(myConverterData->version == 1) {
376 UConverter *cnv = myConverterData->currentConverter;
377
378 cnv->toUnicodeStatus=0; /* offset */
379 cnv->mode=0; /* state */
380 cnv->toULength=0; /* byteIndex */
381 }
382 }
383
384 static void
385 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
386 /* in ISO-2022-KR the designator sequence appears only once
387 * in a file so we append it only once
388 */
389 if( converter->charErrorBufferLength==0){
390
391 converter->charErrorBufferLength = 4;
392 converter->charErrorBuffer[0] = 0x1b;
393 converter->charErrorBuffer[1] = 0x24;
394 converter->charErrorBuffer[2] = 0x29;
395 converter->charErrorBuffer[3] = 0x43;
396 }
397 if(myConverterData->version == 1) {
398 UConverter *cnv = myConverterData->currentConverter;
399
400 cnv->fromUChar32=0;
401 cnv->fromUnicodeStatus=1; /* prevLength */
402 }
403 }
404
405 static void
406 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
407
408 char myLocale[6]={' ',' ',' ',' ',' ',' '};
409
410 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
411 if(cnv->extraInfo != NULL) {
412 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
413 uint32_t version;
414
415 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
416 myConverterData->currentConverter = NULL;
417 myConverterData->currentType = ASCII1;
418 myConverterData->key =0;
419 #ifdef U_ENABLE_GENERIC_ISO_2022
420 myConverterData->isFirstBuffer = TRUE;
421 #endif
422 cnv->fromUnicodeStatus =FALSE;
423 if(locale){
424 uprv_strncpy(myLocale, locale, sizeof(myLocale));
425 }
426 myConverterData->version= 0;
427 version = options & UCNV_OPTIONS_VERSION_MASK;
428 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
429 (myLocale[2]=='_' || myLocale[2]=='\0')){
430 int len=0;
431 /* open the required converters and cache them */
432 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
433 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
434 }
435 myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
436 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
437 if(jpCharsetMasks[version]&CSM(JISX212)) {
438 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
439 }
440 if(jpCharsetMasks[version]&CSM(GB2312)) {
441 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
442 }
443 if(jpCharsetMasks[version]&CSM(KSC5601)) {
444 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
445 }
446
447 /* set the function pointers to appropriate funtions */
448 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
449 uprv_strcpy(myConverterData->locale,"ja");
450
451 myConverterData->version = version;
452 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
453 len = uprv_strlen(myConverterData->name);
454 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
455 myConverterData->name[len+1]='\0';
456 }
457 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
458 (myLocale[2]=='_' || myLocale[2]=='\0')){
459
460 if ((options & UCNV_OPTIONS_VERSION_MASK)==1){
461 myConverterData->version = 1;
462 myConverterData->currentConverter=
463 ucnv_open("icu-internal-25546",errorCode);
464
465 if (U_FAILURE(*errorCode)) {
466 _ISO2022Close(cnv);
467 return;
468 }
469
470 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
471 uprv_memcpy(cnv->subChar, myConverterData->currentConverter->subChar, 4);
472 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
473 }else{
474 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
475
476 if (U_FAILURE(*errorCode)) {
477 _ISO2022Close(cnv);
478 return;
479 }
480
481 myConverterData->version = 0;
482 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
483 }
484
485 /* initialize the state variables */
486 setInitialStateToUnicodeKR(cnv, myConverterData);
487 setInitialStateFromUnicodeKR(cnv,myConverterData);
488
489 /* set the function pointers to appropriate funtions */
490 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
491 uprv_strcpy(myConverterData->locale,"ko");
492 }
493 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
494 (myLocale[2]=='_' || myLocale[2]=='\0')){
495
496 /* open the required converters and cache them */
497 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
498 if(version==1) {
499 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
500 }
501 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
502
503
504 /* set the function pointers to appropriate funtions */
505 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
506 uprv_strcpy(myConverterData->locale,"cn");
507
508 if ((options & UCNV_OPTIONS_VERSION_MASK)==1){
509 myConverterData->version = 1;
510 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
511 }else{
512 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
513 myConverterData->version = 0;
514 }
515 }
516 else{
517 #ifdef U_ENABLE_GENERIC_ISO_2022
518 /* append the UTF-8 escape sequence */
519 cnv->charErrorBufferLength = 3;
520 cnv->charErrorBuffer[0] = 0x1b;
521 cnv->charErrorBuffer[1] = 0x25;
522 cnv->charErrorBuffer[2] = 0x42;
523
524 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
525 /* initialize the state variables */
526 uprv_strcpy(myConverterData->name,"ISO_2022");
527 #else
528 *errorCode = U_UNSUPPORTED_ERROR;
529 return;
530 #endif
531 }
532
533 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
534
535 if(U_FAILURE(*errorCode)) {
536 _ISO2022Close(cnv);
537 }
538 } else {
539 *errorCode = U_MEMORY_ALLOCATION_ERROR;
540 }
541 }
542
543
544 static void
545 _ISO2022Close(UConverter *converter) {
546 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
547 UConverterSharedData **array = myData->myConverterArray;
548 int32_t i;
549
550 if (converter->extraInfo != NULL) {
551 /*close the array of converter pointers and free the memory*/
552 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
553 if(array[i]!=NULL) {
554 ucnv_unloadSharedDataIfReady(array[i]);
555 }
556 }
557
558 ucnv_close(myData->currentConverter);
559
560 if(!converter->isExtraLocal){
561 uprv_free (converter->extraInfo);
562 converter->extraInfo = NULL;
563 }
564 }
565 }
566
567 static void
568 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
569 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
570 if(choice<=UCNV_RESET_TO_UNICODE) {
571 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
572 myConverterData->key = 0;
573 }
574 if(choice!=UCNV_RESET_TO_UNICODE) {
575 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
576 }
577 #ifdef U_ENABLE_GENERIC_ISO_2022
578 if(myConverterData->locale[0] == 0){
579 if(choice<=UCNV_RESET_TO_UNICODE) {
580 myConverterData->isFirstBuffer = TRUE;
581 myConverterData->key = 0;
582 if (converter->mode == UCNV_SO){
583 ucnv_close (myConverterData->currentConverter);
584 myConverterData->currentConverter=NULL;
585 }
586 converter->mode = UCNV_SI;
587 }
588 if(choice!=UCNV_RESET_TO_UNICODE) {
589 /* re-append UTF-8 escape sequence */
590 converter->charErrorBufferLength = 3;
591 converter->charErrorBuffer[0] = 0x1b;
592 converter->charErrorBuffer[1] = 0x28;
593 converter->charErrorBuffer[2] = 0x42;
594 }
595 }
596 else
597 #endif
598 {
599 /* reset the state variables */
600 if(myConverterData->locale[0] == 'k'){
601 if(choice<=UCNV_RESET_TO_UNICODE) {
602 setInitialStateToUnicodeKR(converter, myConverterData);
603 }
604 if(choice!=UCNV_RESET_TO_UNICODE) {
605 setInitialStateFromUnicodeKR(converter, myConverterData);
606 }
607 }
608 }
609 }
610
611 static const char*
612 _ISO2022getName(const UConverter* cnv){
613 if(cnv->extraInfo){
614 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
615 return myData->name;
616 }
617 return NULL;
618 }
619
620
621 /*************** to unicode *******************/
622 /****************************************************************************
623 * Recognized escape sequences are
624 * <ESC>(B ASCII
625 * <ESC>.A ISO-8859-1
626 * <ESC>.F ISO-8859-7
627 * <ESC>(J JISX-201
628 * <ESC>(I JISX-201
629 * <ESC>$B JISX-208
630 * <ESC>$@ JISX-208
631 * <ESC>$(D JISX-212
632 * <ESC>$A GB2312
633 * <ESC>$(C KSC5601
634 */
635 static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
636 /* 0 1 2 3 4 5 6 7 8 9 */
637 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
638 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
639 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
640 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
641 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
642 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
643 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
644 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
645 };
646
647 /*************** to unicode *******************/
648 static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
649 /* 0 1 2 3 4 5 6 7 8 9 */
650 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
651 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
652 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
653 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
654 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
655 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
656 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
657 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
658 };
659
660
661 static UCNV_TableStates_2022
662 getKey_2022(char c,int32_t* key,int32_t* offset){
663 int32_t togo;
664 int32_t low = 0;
665 int32_t hi = MAX_STATES_2022;
666 int32_t oldmid=0;
667
668 togo = normalize_esq_chars_2022[(uint8_t)c];
669 if(togo == 0) {
670 /* not a valid character anywhere in an escape sequence */
671 *key = 0;
672 *offset = 0;
673 return INVALID_2022;
674 }
675 togo = (*key << 5) + togo;
676
677 while (hi != low) /*binary search*/{
678
679 register int32_t mid = (hi+low) >> 1; /*Finds median*/
680
681 if (mid == oldmid)
682 break;
683
684 if (escSeqStateTable_Key_2022[mid] > togo){
685 hi = mid;
686 }
687 else if (escSeqStateTable_Key_2022[mid] < togo){
688 low = mid;
689 }
690 else /*we found it*/{
691 *key = togo;
692 *offset = mid;
693 return escSeqStateTable_Value_2022[mid];
694 }
695 oldmid = mid;
696
697 }
698
699 *key = 0;
700 *offset = 0;
701 return INVALID_2022;
702 }
703
704 /*runs through a state machine to determine the escape sequence - codepage correspondance
705 */
706 static void
707 changeState_2022(UConverter* _this,
708 const char** source,
709 const char* sourceLimit,
710 Variant2022 var,
711 UErrorCode* err){
712 UCNV_TableStates_2022 value;
713 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
714 uint32_t key = myData2022->key;
715 int32_t offset;
716 char c;
717
718 value = VALID_NON_TERMINAL_2022;
719 while (*source < sourceLimit) {
720 c = *(*source)++;
721 _this->toUBytes[_this->toULength++]=(uint8_t)c;
722 value = getKey_2022(c,(int32_t *) &key, &offset);
723
724 switch (value){
725
726 case VALID_NON_TERMINAL_2022 :
727 /* continue with the loop */
728 break;
729
730 case VALID_TERMINAL_2022:
731 key = 0;
732 goto DONE;
733
734 case INVALID_2022:
735 goto DONE;
736
737 case VALID_MAYBE_TERMINAL_2022:
738 #ifdef U_ENABLE_GENERIC_ISO_2022
739 /* ESC ( B is ambiguous only for ISO_2022 itself */
740 if(var == ISO_2022) {
741 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
742 _this->toULength = 0;
743
744 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
745
746 /* continue with the loop */
747 value = VALID_NON_TERMINAL_2022;
748 break;
749 } else
750 #endif
751 {
752 /* not ISO_2022 itself, finish here */
753 value = VALID_TERMINAL_2022;
754 key = 0;
755 goto DONE;
756 }
757 }
758 }
759
760 DONE:
761 myData2022->key = key;
762
763 if (value == VALID_NON_TERMINAL_2022) {
764 /* indicate that the escape sequence is incomplete: key!=0 */
765 return;
766 } else if (value == INVALID_2022 ) {
767 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
768 return;
769 } else /* value == VALID_TERMINAL_2022 */ {
770 switch(var){
771 #ifdef U_ENABLE_GENERIC_ISO_2022
772 case ISO_2022:
773 {
774 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
775 if(chosenConverterName == NULL) {
776 /* SS2 or SS3 */
777 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
778 return;
779 }
780
781 _this->mode = UCNV_SI;
782 ucnv_close(myData2022->currentConverter);
783 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
784 if(U_SUCCESS(*err)) {
785 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
786 _this->mode = UCNV_SO;
787 }
788 break;
789 }
790 #endif
791 case ISO_2022_JP:
792 {
793 StateEnum tempState=nextStateToUnicodeJP[offset];
794 switch(tempState) {
795 case INVALID_STATE:
796 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
797 break;
798 case SS2_STATE:
799 if(myData2022->toU2022State.cs[2]!=0) {
800 if(myData2022->toU2022State.g<2) {
801 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
802 }
803 myData2022->toU2022State.g=2;
804 } else {
805 /* illegal to have SS2 before a matching designator */
806 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
807 }
808 break;
809 /* case SS3_STATE: not used in ISO-2022-JP-x */
810 case ISO8859_1:
811 case ISO8859_7:
812 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
813 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
814 } else {
815 /* G2 charset for SS2 */
816 myData2022->toU2022State.cs[2]=(int8_t)tempState;
817 }
818 break;
819 default:
820 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
821 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
822 } else {
823 /* G0 charset */
824 myData2022->toU2022State.cs[0]=(int8_t)tempState;
825 }
826 break;
827 }
828 }
829 break;
830 case ISO_2022_CN:
831 {
832 StateEnum tempState=nextStateToUnicodeCN[offset];
833 switch(tempState) {
834 case INVALID_STATE:
835 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
836 break;
837 case SS2_STATE:
838 if(myData2022->toU2022State.cs[2]!=0) {
839 if(myData2022->toU2022State.g<2) {
840 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
841 }
842 myData2022->toU2022State.g=2;
843 } else {
844 /* illegal to have SS2 before a matching designator */
845 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
846 }
847 break;
848 case SS3_STATE:
849 if(myData2022->toU2022State.cs[3]!=0) {
850 if(myData2022->toU2022State.g<2) {
851 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
852 }
853 myData2022->toU2022State.g=3;
854 } else {
855 /* illegal to have SS3 before a matching designator */
856 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
857 }
858 break;
859 case ISO_IR_165:
860 if(myData2022->version==0) {
861 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
862 break;
863 }
864 case GB2312_1:
865 case CNS_11643_1:
866 myData2022->toU2022State.cs[1]=(int8_t)tempState;
867 break;
868 case CNS_11643_2:
869 myData2022->toU2022State.cs[2]=(int8_t)tempState;
870 break;
871 default:
872 /* other CNS 11643 planes */
873 if(myData2022->version==0) {
874 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
875 } else {
876 myData2022->toU2022State.cs[3]=(int8_t)tempState;
877 }
878 break;
879 }
880 }
881 break;
882 case ISO_2022_KR:
883 if(offset==0x30){
884 /* nothing to be done, just accept this one escape sequence */
885 } else {
886 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
887 }
888 break;
889
890 default:
891 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
892 break;
893 }
894 }
895 if(U_SUCCESS(*err)) {
896 _this->toULength = 0;
897 }
898 }
899
900 /*Checks the characters of the buffer against valid 2022 escape sequences
901 *if the match we return a pointer to the initial start of the sequence otherwise
902 *we return sourceLimit
903 */
904 /*for 2022 looks ahead in the stream
905 *to determine the longest possible convertible
906 *data stream
907 */
908 static U_INLINE const char*
909 getEndOfBuffer_2022(const char** source,
910 const char* sourceLimit,
911 UBool flush){
912
913 const char* mySource = *source;
914
915 #ifdef U_ENABLE_GENERIC_ISO_2022
916 if (*source >= sourceLimit)
917 return sourceLimit;
918
919 do{
920
921 if (*mySource == ESC_2022){
922 int8_t i;
923 int32_t key = 0;
924 int32_t offset;
925 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
926
927 /* Kludge: I could not
928 * figure out the reason for validating an escape sequence
929 * twice - once here and once in changeState_2022().
930 * is it possible to have an ESC character in a ISO2022
931 * byte stream which is valid in a code page? Is it legal?
932 */
933 for (i=0;
934 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
935 i++) {
936 value = getKey_2022(*(mySource+i), &key, &offset);
937 }
938 if (value > 0 || *mySource==ESC_2022)
939 return mySource;
940
941 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
942 return sourceLimit;
943 }
944 }while (++mySource < sourceLimit);
945
946 return sourceLimit;
947 #else
948 while(mySource < sourceLimit && *mySource != ESC_2022) {
949 ++mySource;
950 }
951 return mySource;
952 #endif
953 }
954
955
956 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
957 * any future change in _MBCSFromUChar32() function should be reflected in
958 * this macro
959 */
960 static U_INLINE void
961 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
962 UChar32 c,
963 uint32_t* value,
964 UBool useFallback,
965 int32_t *length,
966 int outputType)
967 {
968 const int32_t *cx;
969 const uint16_t *table;
970 uint32_t stage2Entry;
971 uint32_t myValue;
972 const uint8_t *p;
973 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
974 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
975 table=sharedData->mbcs.fromUnicodeTable;
976 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
977 /* get the bytes and the length for the output */
978 if(outputType==MBCS_OUTPUT_2){
979 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
980 if(myValue<=0xff) {
981 *length=1;
982 } else {
983 *length=2;
984 }
985 } else /* outputType==MBCS_OUTPUT_3 */ {
986 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
987 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
988 if(myValue<=0xff) {
989 *length=1;
990 } else if(myValue<=0xffff) {
991 *length=2;
992 } else {
993 *length=3;
994 }
995 }
996 /* is this code point assigned, or do we use fallbacks? */
997 if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
998 (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
999 ) {
1000 /*
1001 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1002 * There is no way with this data structure for fallback output
1003 * to be a zero byte.
1004 */
1005 /* assigned */
1006 *value=myValue;
1007 return;
1008 }
1009 }
1010
1011 cx=sharedData->mbcs.extIndexes;
1012 if(cx!=NULL) {
1013 *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1014 return;
1015 }
1016
1017 /* unassigned */
1018 *length=0;
1019 }
1020
1021 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1022 * any future change in _MBCSSingleFromUChar32() function should be reflected in
1023 * this macro
1024 */
1025 static U_INLINE void
1026 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1027 UChar32 c,
1028 uint32_t* retval,
1029 UBool useFallback)
1030 {
1031 const uint16_t *table;
1032 int32_t value;
1033 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1034 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1035 *retval=(uint16_t)-1;
1036 return;
1037 }
1038 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1039 table=sharedData->mbcs.fromUnicodeTable;
1040 /* get the byte for the output */
1041 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1042 /* is this code point assigned, or do we use fallbacks? */
1043 if(useFallback ? value>=0x800 : value>=0xc00) {
1044 value &=0xff;
1045 } else {
1046 value= -1;
1047 }
1048 *retval=(uint16_t) value;
1049 }
1050
1051 #ifdef U_ENABLE_GENERIC_ISO_2022
1052
1053 /**********************************************************************************
1054 * ISO-2022 Converter
1055 *
1056 *
1057 */
1058
1059 static void
1060 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1061 UErrorCode* err){
1062 const char* mySourceLimit, *realSourceLimit;
1063 const char* sourceStart;
1064 const UChar* myTargetStart;
1065 UConverter* saveThis;
1066 UConverterDataISO2022* myData;
1067 int8_t length;
1068
1069 saveThis = args->converter;
1070 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1071
1072 realSourceLimit = args->sourceLimit;
1073 while (args->source < realSourceLimit) {
1074 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1075 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1076 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1077
1078 if(args->source < mySourceLimit) {
1079 if(myData->currentConverter==NULL) {
1080 myData->currentConverter = ucnv_open("ASCII",err);
1081 if(U_FAILURE(*err)){
1082 return;
1083 }
1084
1085 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1086 saveThis->mode = UCNV_SO;
1087 }
1088
1089 /* convert to before the ESC or until the end of the buffer */
1090 myData->isFirstBuffer=FALSE;
1091 sourceStart = args->source;
1092 myTargetStart = args->target;
1093 args->converter = myData->currentConverter;
1094 ucnv_toUnicode(args->converter,
1095 &args->target,
1096 args->targetLimit,
1097 &args->source,
1098 mySourceLimit,
1099 args->offsets,
1100 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1101 err);
1102 args->converter = saveThis;
1103
1104 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1105 /* move the overflow buffer */
1106 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1107 myData->currentConverter->UCharErrorBufferLength = 0;
1108 if(length > 0) {
1109 uprv_memcpy(saveThis->UCharErrorBuffer,
1110 myData->currentConverter->UCharErrorBuffer,
1111 length*U_SIZEOF_UCHAR);
1112 }
1113 return;
1114 }
1115
1116 /*
1117 * At least one of:
1118 * -Error while converting
1119 * -Done with entire buffer
1120 * -Need to write offsets or update the current offset
1121 * (leave that up to the code in ucnv.c)
1122 *
1123 * or else we just stopped at an ESC byte and continue with changeState_2022()
1124 */
1125 if (U_FAILURE(*err) ||
1126 (args->source == realSourceLimit) ||
1127 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1128 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1129 ) {
1130 /* copy partial or error input for truncated detection and error handling */
1131 if(U_FAILURE(*err)) {
1132 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1133 if(length > 0) {
1134 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1135 }
1136 } else {
1137 length = saveThis->toULength = myData->currentConverter->toULength;
1138 if(length > 0) {
1139 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1140 if(args->source < mySourceLimit) {
1141 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1142 }
1143 }
1144 }
1145 return;
1146 }
1147 }
1148 }
1149
1150 sourceStart = args->source;
1151 changeState_2022(args->converter,
1152 &(args->source),
1153 realSourceLimit,
1154 ISO_2022,
1155 err);
1156 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1157 /* let the ucnv.c code update its current offset */
1158 return;
1159 }
1160 }
1161 }
1162
1163 #endif
1164
1165 /*
1166 * To Unicode Callback helper function
1167 */
1168 static void
1169 toUnicodeCallback(UConverter *cnv,
1170 const uint32_t sourceChar, const uint32_t targetUniChar,
1171 UErrorCode* err){
1172 if(sourceChar>0xff){
1173 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1174 cnv->toUBytes[1] = (uint8_t)sourceChar;
1175 cnv->toULength = 2;
1176 }
1177 else{
1178 cnv->toUBytes[0] =(char) sourceChar;
1179 cnv->toULength = 2;
1180 }
1181
1182 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1183 *err = U_INVALID_CHAR_FOUND;
1184 }
1185 else{
1186 *err = U_ILLEGAL_CHAR_FOUND;
1187 }
1188 }
1189
1190 /**************************************ISO-2022-JP*************************************************/
1191
1192 /************************************** IMPORTANT **************************************************
1193 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1194 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1195 * The converter iterates over each Unicode codepoint
1196 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1197 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1198 * would do as far as possible.
1199 *
1200 * If the implementation of these macros or structure of sharedData struct change in the future, make
1201 * sure that ISO-2022 is also changed.
1202 ***************************************************************************************************
1203 */
1204
1205 /***************************************************************************************************
1206 * Rules for ISO-2022-jp encoding
1207 * (i) Escape sequences must be fully contained within a line they should not
1208 * span new lines or CRs
1209 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1210 * JIS-Roman character escape sequence should follow before the line terminates
1211 * (iii) If the first character on the line is represented by two bytes then a two
1212 * byte character escape sequence should precede it
1213 * (iv) If no escape sequence is encountered then the characters are ASCII
1214 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1215 * and invoked with SS2 (ESC N).
1216 * (vi) If there is any G0 designation in text, there must be a switch to
1217 * ASCII or to JIS X 0201-Roman before a space character (but not
1218 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1219 * characters such as tab or CRLF.
 * (vii) Supported encodings:
1221 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1222 *
1223 * source : RFC-1554
1224 *
1225 * JISX201, JISX208,JISX212 : new .cnv data files created
1226 * KSC5601 : alias to ibm-949 mapping table
1227 * GB2312 : alias to ibm-1386 mapping table
1228 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
 * ISO-8859-7 : alias to ibm-9409 mapping table
1230 */
1231
/*
 * Preference order of JP charsets.
 * The fromUnicode code walks this list to append every charset that is
 * allowed for the converter version and not already chosen to its list of
 * candidates, so earlier entries are tried first.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * Designation escape sequences, looked up by charset value
 * (the fromUnicode code uses escSeqChars[cs]).
 * NOTE(review): the row order presumably has to match the numeric values of
 * the charset constants in StateEnum - confirm against the enum definition.
 * Each row is NUL-padded to 6 bytes; the number of bytes actually written is
 * taken from escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};

/* Lengths in bytes of the escape sequences in escSeqChars[], in the same order. */
static const int32_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1268
/*
 * fromUnicode implementation for ISO-2022-JP and its variants
 * (with optional offsets output).
 *
 * How the charset for each code point is chosen:
 *   i) A list of candidate charsets, choices[], is built whenever it is empty:
 *        - for converter versions 3 and 4 (JIS7/JIS8): HWKANA_7BIT first
 *        - the current G0 charset
 *        - the current G2 charset, if one is designated
 *        - all remaining charsets allowed by jpCharsetMasks[version],
 *          in jpCharsetPref[] order
 *  ii) The candidates are tried in order until one of them can encode the
 *      code point.
 * iii) If none can, the code point is stored in fromUChar32 and
 *      U_INVALID_CHAR_FOUND is set.
 *  iv) choices[] is invalidated (rebuilt for the next code point) after a new
 *      designation sequence was written and after CR or LF.
 *
 * For each converted code point the function emits, as needed: SI, the
 * designation escape sequence, SO or SS2 (ESC N), and then the 1 or 2
 * charset bytes.
 *
 * On return args->source and args->target are advanced past what was consumed
 * and written.
 *
 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
 */

static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    char buffer[8];             /* staging area for the bytes of one code point */
    int32_t len, outLen;        /* len: charset bytes for the code point; outLen: bytes used in buffer[] */
    int8_t choices[10];         /* candidate charsets, in the order in which they are tried */
    int32_t choiceCount;        /* number of valid entries in choices[]; 0 = must be rebuilt */
    uint32_t targetValue;
    UBool useFallback;

    int32_t i;
    int8_t cs, g;               /* charset and G-set index selected for the current code point */

    /* set up the state */
    converterData     = (UConverterDataISO2022*)args->converter->extraInfo;
    pFromU2022State   = &converterData->fromU2022State;
    useFallback       = args->converter->useFallback;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
        /* jumps into the loop body below, skipping the fetch of the next code unit */
        goto getTrail;
    }

    while(source < sourceLimit) {
        if(target < targetLimit) {

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
            if(UTF_IS_SURROGATE(sourceChar)) {
                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(UTF_IS_SECOND_SURROGATE(trail)) {
                            source++;
                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                            args->converter->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            args->converter->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input: remember the lead surrogate for the next call */
                        args->converter->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    args->converter->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do the conversion */

            /* (re)build the list of candidate charsets if it is empty */
            if(choiceCount == 0) {
                uint16_t csm;

                /*
                 * The csm variable keeps track of which charsets are allowed
                 * and not used yet while building the choices[].
                 */
                csm = jpCharsetMasks[converterData->version];
                choiceCount = 0;

                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
                if(converterData->version == 3 || converterData->version == 4) {
                    choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
                    csm &= ~CSM(cs);
                }

                /* try the current G0 charset */
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
                csm &= ~CSM(cs);

                /* try the current G2 charset */
                if((cs = pFromU2022State->cs[2]) != 0) {
                    choices[choiceCount++] = cs;
                    csm &= ~CSM(cs);
                }

                /* try all the other possible charsets */
                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
                    cs = (int8_t)jpCharsetPref[i];
                    if(CSM(cs) & csm) {
                        choices[choiceCount++] = cs;
                        csm &= ~CSM(cs);
                    }
                }
            }

            cs = g = 0;
            len = 0;

            /* try the candidates in order; the loop stops at the first one that sets len */
            for(i = 0; i < choiceCount && len == 0; ++i) {
                cs = choices[i];
                switch(cs) {
                case ASCII:
                    if(sourceChar <= 0x7f) {
                        targetValue = (uint32_t)sourceChar;
                        len = 1;
                    }
                    break;
                case ISO8859_1:
                    /* U+0080..U+00FF are output as 7-bit bytes via G2 */
                    if(0x80 <= sourceChar && sourceChar <= 0xff) {
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        g = 2;
                    }
                    break;
                case HWKANA_7BIT:
                    /* range check for U+FF61..U+FF9F with a single unsigned comparison */
                    if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
                        /* map U+FF61.. to bytes starting at 0x21 */
                        targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
                        len = 1;

                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            int8_t cs0;

                            targetValue += 0x80;

                            cs0 = pFromU2022State->cs[0];
                            if(IS_JP_DBCS(cs0)) {
                                /* switch from a DBCS charset to JISX201 */
                                cs = (int8_t)JISX201;
                            } else {
                                /* stay in the current G0 charset */
                                cs = cs0;
                            }
                        }
                    }
                    break;
                case JISX201:
                    /* G0 SBCS */
                    MBCS_SINGLE_FROM_UCHAR32(
                        converterData->myConverterArray[cs],
                        sourceChar, &targetValue,
                        useFallback);
                    /* only the 7-bit half of the table is used here */
                    if(targetValue <= 0x7f) {
                        len = 1;
                    }
                    break;
                case ISO8859_7:
                    /* G0 SBCS forced to 7-bit output */
                    MBCS_SINGLE_FROM_UCHAR32(
                        converterData->myConverterArray[cs],
                        sourceChar, &targetValue,
                        useFallback);
                    if(0x80 <= targetValue && targetValue <= 0xff) {
                        targetValue -= 0x80;
                        len = 1;
                        g = 2;
                    }
                    break;
                default:
                    /* G0 DBCS */
                    MBCS_FROM_UCHAR32_ISO2022(
                        converterData->myConverterArray[cs],
                        sourceChar, &targetValue,
                        useFallback, &len, MBCS_OUTPUT_2);
                    /* only double-byte results are usable from a DBCS charset */
                    if(len != 2) {
                        len = 0;
                    }
                    break;
                }
            }

            if(len > 0) {
                outLen = 0; /* count output bytes */

                /* write SI if necessary (only for JIS7) */
                if(pFromU2022State->g == 1 && g == 0) {
                    buffer[outLen++] = UCNV_SI;
                    pFromU2022State->g = 0;
                }

                /* write the designation sequence if necessary */
                if(cs != pFromU2022State->cs[g]) {
                    int32_t escLen = escSeqCharsLen[cs];
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
                    outLen += escLen;
                    pFromU2022State->cs[g] = cs;

                    /* invalidate the choices[] */
                    choiceCount = 0;
                }

                /* write the shift sequence if necessary */
                if(g != pFromU2022State->g) {
                    switch(g) {
                    /* case 0 handled before writing escapes */
                    case 1:
                        buffer[outLen++] = UCNV_SO;
                        pFromU2022State->g = 1;
                        break;
                    default: /* case 2 */
                        /* SS2 (ESC N); the stored g is not changed for a single shift */
                        buffer[outLen++] = 0x1b;
                        buffer[outLen++] = 0x4e;
                        break;
                    /* no case 3: no SS3 in ISO-2022-JP-x */
                    }
                }

                /* write the output bytes */
                if(len == 1) {
                    buffer[outLen++] = (char)targetValue;
                } else /* len == 2 */ {
                    buffer[outLen++] = (char)(targetValue >> 8);
                    buffer[outLen++] = (char)targetValue;
                }
            } else {
                /*
                 * if we cannot find the character after checking all codepages
                 * then this is an error
                 */
                *err = U_INVALID_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            if(sourceChar == CR || sourceChar == LF) {
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
                pFromU2022State->cs[2] = 0;
                choiceCount = 0;
            }

            /* output outLen>0 bytes in buffer[] */
            if(outLen == 1) {
                /* fast path: target<targetLimit was checked at the top of the loop body */
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = source - args->source - 1; /* -1: known to be ASCII */
                }
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
                /* fast path for two bytes that fit into the target */
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* general case: ucnv_fromUWriteBytes() does the writing and reports errors through err */
                ucnv_fromUWriteBytes(
                    args->converter,
                    buffer, outLen,
                    (char **)&target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-JP conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   in SO mode or not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        outLen = 0;

        /* shift back in to G0 */
        if(pFromU2022State->g != 0) {
            buffer[outLen++] = UCNV_SI;
            pFromU2022State->g = 0;
        }

        /* designate ASCII to G0 */
        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        ucnv_fromUWriteBytes(
            args->converter,
            buffer, outLen,
            (char **)&target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}
1629
1630 /*************** to unicode *******************/
1631
/*
 * toUnicode implementation for ISO-2022-JP and its variants
 * (with optional offsets output).
 *
 * Reads the input byte by byte:
 *   - SI/SO switch between G0 and G1 for converter version 3 (JIS7) only
 *   - ESC starts an escape sequence, which is handled by changeState_2022()
 *   - CR/LF reset the state to single-byte mode
 *   - every other byte is converted according to the charset currently
 *     invoked, pToU2022State->cs[pToU2022State->g]
 *
 * A partial escape sequence (myData->key!=0) or a stored lead byte of a
 * double-byte character (toULength==1) from a previous call is continued
 * first. On return args->source and args->target are advanced past what was
 * consumed and written.
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[3];                        /* holds the two bytes of a DBCS character for the table lookup */
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* stays at missingCharMarker unless one of the cases below finds a mapping */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    /* targetUniChar is still missingCharMarker, so this ends in the error branch below */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    /* targetUniChar is still missingCharMarker, so this ends in the error branch below */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                changeState_2022(args->converter,&(mySource),
                    mySourceLimit, ISO_2022_JP,err);

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                /* range check for bytes 0xa1..0xdf with a single unsigned comparison */
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (0xff61 - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    /* 7-bit byte stands for the Latin-1 character 0x80 higher */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    /* range check for bytes 0x21..0x5f with a single unsigned comparison */
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (0xff61 - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        char trailByte;
getTrailByte:
                        /* combine lead and trail byte and look the pair up in the charset's table */
                        tempBuf[0] = (char) (mySourceChar);
                        tempBuf[1] = trailByte = *mySource++;
                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                    } else {
                        /* the trail byte is not available yet: store the lead byte for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* a BMP code point: write one code unit */
                if(args->offsets){
                    /* the offset is the index of the first input byte: 1 or 2 bytes before mySource */
                    args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: put it into the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* write the advanced positions back for the caller */
    args->target = myTarget;
    args->source = mySource;
}
1823
1824
1825 /***************************************************************
1826 * Rules for ISO-2022-KR encoding
1827 * i) The KSC5601 designator sequence should appear only once in a file,
 * at the beginning of a line before any KSC5601 characters. This usually
1829 * means that it appears by itself on the first line of the file
1830 * ii) There are only 2 shifting sequences SO to shift into double byte mode
1831 * and SI to shift into single byte mode
1832 */
1833 static void
1834 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
1835
1836 UConverter* saveConv = args->converter;
1837 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
1838 args->converter=myConverterData->currentConverter;
1839
1840 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
1841 ucnv_MBCSFromUnicodeWithOffsets(args,err);
1842 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
1843
1844 if(*err == U_BUFFER_OVERFLOW_ERROR) {
1845 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
1846 uprv_memcpy(
1847 saveConv->charErrorBuffer,
1848 myConverterData->currentConverter->charErrorBuffer,
1849 myConverterData->currentConverter->charErrorBufferLength);
1850 }
1851 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
1852 myConverterData->currentConverter->charErrorBufferLength = 0;
1853 }
1854 args->converter=saveConv;
1855 }
1856
1857 static void
1858 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
1859
1860 const UChar *source = args->source;
1861 const UChar *sourceLimit = args->sourceLimit;
1862 unsigned char *target = (unsigned char *) args->target;
1863 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
1864 int32_t* offsets = args->offsets;
1865 uint32_t targetByteUnit = 0x0000;
1866 UChar32 sourceChar = 0x0000;
1867 UBool isTargetByteDBCS;
1868 UBool oldIsTargetByteDBCS;
1869 UConverterDataISO2022 *converterData;
1870 UConverterSharedData* sharedData;
1871 UBool useFallback;
1872 int32_t length =0;
1873
1874 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
1875 /* if the version is 1 then the user is requesting
1876 * conversion with ibm-25546 pass the arguments to
1877 * MBCS converter and return
1878 */
1879 if(converterData->version==1){
1880 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
1881 return;
1882 }
1883
1884 /* initialize data */
1885 sharedData = converterData->currentConverter->sharedData;
1886 useFallback = args->converter->useFallback;
1887 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
1888 oldIsTargetByteDBCS = isTargetByteDBCS;
1889
1890 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
1891 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
1892 goto getTrail;
1893 }
1894 while(source < sourceLimit){
1895
1896 targetByteUnit = missingCharMarker;
1897
1898 if(target < (unsigned char*) args->targetLimit){
1899 sourceChar = *source++;
1900 /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
1901 sourceChar,&targetByteUnit,args->converter->useFallback);*/
1902 MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
1903 /* only DBCS or SBCS characters are expected*/
1904 /* DB characters with high bit set to 1 are expected */
1905 if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
1906 targetByteUnit=missingCharMarker;
1907 }
1908 if (targetByteUnit != missingCharMarker){
1909
1910 oldIsTargetByteDBCS = isTargetByteDBCS;
1911 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
1912 /* append the shift sequence */
1913 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
1914
1915 if (isTargetByteDBCS)
1916 *target++ = UCNV_SO;
1917 else
1918 *target++ = UCNV_SI;
1919 if(offsets)
1920 *(offsets++)= source - args->source-1;
1921 }
1922 /* write the targetUniChar to target */
1923 if(targetByteUnit <= 0x00FF){
1924 if( target < targetLimit){
1925 *(target++) = (unsigned char) targetByteUnit;
1926 if(offsets){
1927 *(offsets++) = source - args->source-1;
1928 }
1929
1930 }else{
1931 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
1932 *err = U_BUFFER_OVERFLOW_ERROR;
1933 }
1934 }else{
1935 if(target < targetLimit){
1936 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
1937 if(offsets){
1938 *(offsets++) = source - args->source-1;
1939 }
1940 if(target < targetLimit){
1941 *(target++) =(unsigned char) (targetByteUnit -0x80);
1942 if(offsets){
1943 *(offsets++) = source - args->source-1;
1944 }
1945 }else{
1946 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
1947 *err = U_BUFFER_OVERFLOW_ERROR;
1948 }
1949 }else{
1950 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
1951 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
1952 *err = U_BUFFER_OVERFLOW_ERROR;
1953 }
1954 }
1955
1956 }
1957 else{
1958 /* oops.. the code point is unassingned
1959 * set the error and reason
1960 */
1961
1962 /*check if the char is a First surrogate*/
1963 if(UTF_IS_SURROGATE(sourceChar)) {
1964 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1965 getTrail:
1966 /*look ahead to find the trail surrogate*/
1967 if(source < sourceLimit) {
1968 /* test the following code unit */
1969 UChar trail=(UChar) *source;
1970 if(UTF_IS_SECOND_SURROGATE(trail)) {
1971 source++;
1972 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1973 *err = U_INVALID_CHAR_FOUND;
1974 /* convert this surrogate code point */
1975 /* exit this condition tree */
1976 } else {
1977 /* this is an unmatched lead code unit (1st surrogate) */
1978 /* callback(illegal) */
1979 *err=U_ILLEGAL_CHAR_FOUND;
1980 }
1981 } else {
1982 /* no more input */
1983 *err = U_ZERO_ERROR;
1984 }
1985 } else {
1986 /* this is an unmatched trail code unit (2nd surrogate) */
1987 /* callback(illegal) */
1988 *err=U_ILLEGAL_CHAR_FOUND;
1989 }
1990 } else {
1991 /* callback(unassigned) for a BMP code point */
1992 *err = U_INVALID_CHAR_FOUND;
1993 }
1994
1995 args->converter->fromUChar32=sourceChar;
1996 args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
1997 break;
1998 }
1999 } /* end if(myTargetIndex<myTargetLength) */
2000 else{
2001 *err =U_BUFFER_OVERFLOW_ERROR;
2002 break;
2003 }
2004
2005 }/* end while(mySourceIndex<mySourceLength) */
2006
2007 /*
2008 * the end of the input stream and detection of truncated input
2009 * are handled by the framework, but for ISO-2022-KR conversion
2010 * we need to be in ASCII mode at the very end
2011 *
2012 * conditions:
2013 * successful
2014 * not in ASCII mode
2015 * end of input and no truncated input
2016 */
2017 if( U_SUCCESS(*err) &&
2018 isTargetByteDBCS &&
2019 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2020 ) {
2021 int32_t sourceIndex;
2022
2023 /* we are switching to ASCII */
2024 isTargetByteDBCS=FALSE;
2025
2026 /* get the source index of the last input character */
2027 /*
2028 * TODO this would be simpler and more reliable if we used a pair
2029 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2030 * so that we could simply use the prevSourceIndex here;
2031 * this code gives an incorrect result for the rare case of an unmatched
2032 * trail surrogate that is alone in the last buffer of the text stream
2033 */
2034 sourceIndex=(int32_t)(source-args->source);
2035 if(sourceIndex>0) {
2036 --sourceIndex;
2037 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2038 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2039 ) {
2040 --sourceIndex;
2041 }
2042 } else {
2043 sourceIndex=-1;
2044 }
2045
2046 ucnv_fromUWriteBytes(
2047 args->converter,
2048 SHIFT_IN_STR, 1,
2049 (char **)&target, (const char *)targetLimit,
2050 &offsets, sourceIndex,
2051 err);
2052 }
2053
2054 /*save the state and return */
2055 args->source = source;
2056 args->target = (char*)target;
2057 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2058 }
2059
2060 /************************ To Unicode ***************************************/
2061
2062 static void
2063 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2064 UErrorCode* err){
2065 char const* sourceStart;
2066 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2067
2068 UConverterToUnicodeArgs subArgs;
2069 int32_t minArgsSize;
2070
2071 /* set up the subconverter arguments */
2072 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2073 minArgsSize = args->size;
2074 } else {
2075 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2076 }
2077
2078 uprv_memcpy(&subArgs, args, minArgsSize);
2079 subArgs.size = (uint16_t)minArgsSize;
2080 subArgs.converter = myData->currentConverter;
2081
2082 /* remember the original start of the input for offsets */
2083 sourceStart = args->source;
2084
2085 if(myData->key != 0) {
2086 /* continue with a partial escape sequence */
2087 goto escape;
2088 }
2089
2090 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2091 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2092 subArgs.source = args->source;
2093 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2094 if(subArgs.source != subArgs.sourceLimit) {
2095 /*
2096 * get the current partial byte sequence
2097 *
2098 * it needs to be moved between the public and the subconverter
2099 * so that the conversion framework, which only sees the public
2100 * converter, can handle truncated and illegal input etc.
2101 */
2102 if(args->converter->toULength > 0) {
2103 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2104 }
2105 subArgs.converter->toULength = args->converter->toULength;
2106
2107 /*
2108 * Convert up to the end of the input, or to before the next escape character.
2109 * Does not handle conversion extensions because the preToU[] state etc.
2110 * is not copied.
2111 */
2112 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2113
2114 if(args->offsets != NULL && sourceStart != args->source) {
2115 /* update offsets to base them on the actual start of the input */
2116 int32_t *offsets = args->offsets;
2117 UChar *target = args->target;
2118 int32_t delta = (int32_t)(args->source - sourceStart);
2119 while(target < subArgs.target) {
2120 if(*offsets >= 0) {
2121 *offsets += delta;
2122 }
2123 ++offsets;
2124 ++target;
2125 }
2126 }
2127 args->source = subArgs.source;
2128 args->target = subArgs.target;
2129 args->offsets = subArgs.offsets;
2130
2131 /* copy input/error/overflow buffers */
2132 if(subArgs.converter->toULength > 0) {
2133 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2134 }
2135 args->converter->toULength = subArgs.converter->toULength;
2136
2137 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2138 if(subArgs.converter->UCharErrorBufferLength > 0) {
2139 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2140 subArgs.converter->UCharErrorBufferLength);
2141 }
2142 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2143 subArgs.converter->UCharErrorBufferLength = 0;
2144 }
2145 }
2146
2147 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2148 return;
2149 }
2150
2151 escape:
2152 changeState_2022(args->converter,
2153 &(args->source),
2154 args->sourceLimit,
2155 ISO_2022_KR,
2156 err);
2157 }
2158 }
2159
2160 static void
2161 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2162 UErrorCode* err){
2163 char tempBuf[2];
2164 const char *mySource = ( char *) args->source;
2165 UChar *myTarget = args->target;
2166 const char *mySourceLimit = args->sourceLimit;
2167 UChar32 targetUniChar = 0x0000;
2168 UChar mySourceChar = 0x0000;
2169 UConverterDataISO2022* myData;
2170 UConverterSharedData* sharedData ;
2171 UBool useFallback;
2172
2173 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2174 if(myData->version==1){
2175 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2176 return;
2177 }
2178
2179 /* initialize state */
2180 sharedData = myData->currentConverter->sharedData;
2181 useFallback = args->converter->useFallback;
2182
2183 if(myData->key != 0) {
2184 /* continue with a partial escape sequence */
2185 goto escape;
2186 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2187 /* continue with a partial double-byte character */
2188 mySourceChar = args->converter->toUBytes[0];
2189 args->converter->toULength = 0;
2190 goto getTrailByte;
2191 }
2192
2193 while(mySource< mySourceLimit){
2194
2195 if(myTarget < args->targetLimit){
2196
2197 mySourceChar= (unsigned char) *mySource++;
2198
2199 if(mySourceChar==UCNV_SI){
2200 myData->toU2022State.g = 0;
2201 /*consume the source */
2202 continue;
2203 }else if(mySourceChar==UCNV_SO){
2204 myData->toU2022State.g = 1;
2205 /*consume the source */
2206 continue;
2207 }else if(mySourceChar==ESC_2022){
2208 mySource--;
2209 escape:
2210 changeState_2022(args->converter,&(mySource),
2211 mySourceLimit, ISO_2022_KR, err);
2212 if(U_FAILURE(*err)){
2213 args->target = myTarget;
2214 args->source = mySource;
2215 return;
2216 }
2217 continue;
2218 }
2219
2220 if(myData->toU2022State.g == 1) {
2221 if(mySource < mySourceLimit) {
2222 char trailByte;
2223 getTrailByte:
2224 trailByte = *mySource++;
2225 tempBuf[0] = (char)(mySourceChar + 0x80);
2226 tempBuf[1] = (char)(trailByte + 0x80);
2227 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2228 if((mySourceChar & 0x8080) == 0) {
2229 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2230 } else {
2231 /* illegal bytes > 0x7f */
2232 targetUniChar = missingCharMarker;
2233 }
2234 } else {
2235 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2236 args->converter->toULength = 1;
2237 break;
2238 }
2239 }
2240 else{
2241 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2242 }
2243 if(targetUniChar < 0xfffe){
2244 if(args->offsets) {
2245 args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2246 }
2247 *(myTarget++)=(UChar)targetUniChar;
2248 }
2249 else {
2250 /* Call the callback function*/
2251 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2252 break;
2253 }
2254 }
2255 else{
2256 *err =U_BUFFER_OVERFLOW_ERROR;
2257 break;
2258 }
2259 }
2260 args->target = myTarget;
2261 args->source = mySource;
2262 }
2263
2264 /*************************** END ISO2022-KR *********************************/
2265
2266 /*************************** ISO-2022-CN *********************************
2267 *
2268 * Rules for ISO-2022-CN Encoding:
2269 * i) The designator sequence must appear once on a line before any instance
2270 * of character set it designates.
2271 * ii) If two lines contain characters from the same character set, both lines
2272 * must include the designator sequence.
2273 * iii) Once the designator sequence is known, a shifting sequence has to be found
2274 * to invoke the shifting
2275 * iv) All lines start in ASCII and end in ASCII.
2276 * v) Four shifting sequences are employed for this purpose:
2277 *
2278 * Sequence ASCII Eq Charsets
2279 * ---------- ------- ---------
2280 * SI <SI> US-ASCII
2281 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2282 * SS2 <ESC>N CNS-11643-1992 Plane 2
2283 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2284 *
2285 * vi)
2286 * SOdesignator : ESC "$" ")" finalchar_for_SO
2287 * SS2designator : ESC "$" "*" finalchar_for_SS2
2288 * SS3designator : ESC "$" "+" finalchar_for_SS3
2289 *
2290 * ESC $ ) A Indicates the bytes following SO are Chinese
2291 * characters as defined in GB 2312-80, until
2292 * another SOdesignation appears
2293 *
2294 *
2295 * ESC $ ) E Indicates the bytes following SO are as defined
2296 * in ISO-IR-165 (for details, see section 2.1),
2297 * until another SOdesignation appears
2298 *
2299 * ESC $ ) G Indicates the bytes following SO are as defined
2300 * in CNS 11643-plane-1, until another
2301 * SOdesignation appears
2302 *
2303 * ESC $ * H Indicates the two bytes immediately following
2304 * SS2 is a Chinese character as defined in CNS
2305 * 11643-plane-2, until another SS2designation
2306 * appears
2307 * (Meaning <ESC>N must precede every 2 byte
2308 * sequence.)
2309 *
2310 * ESC $ + I Indicates the immediate two bytes following SS3
2311 * is a Chinese character as defined in CNS
2312 * 11643-plane-3, until another SS3designation
2313 * appears
2314 * (Meaning <ESC>O must precede every 2 byte
2315 * sequence.)
2316 *
2317 * ESC $ + J Indicates the immediate two bytes following SS3
2318 * is a Chinese character as defined in CNS
2319 * 11643-plane-4, until another SS3designation
2320 * appears
2321 * (In English: <ESC>O must precede every 2 byte
2322 * sequence.)
2323 *
2324 * ESC $ + K Indicates the immediate two bytes following SS3
2325 * is a Chinese character as defined in CNS
2326 * 11643-plane-5, until another SS3designation
2327 * appears
2328 *
2329 * ESC $ + L Indicates the immediate two bytes following SS3
2330 * is a Chinese character as defined in CNS
2331 * 11643-plane-6, until another SS3designation
2332 * appears
2333 *
2334 * ESC $ + M Indicates the immediate two bytes following SS3
2335 * is a Chinese character as defined in CNS
2336 * 11643-plane-7, until another SS3designation
2337 * appears
2338 *
2339 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2340 * has its own designation information before any Chinese characters
2341 * appear
2342 *
2343 */
2344
/*
 * ISO-2022-CN designation sequences: ESC '$' followed by ')', '*' or '+' and
 * a final byte, plus a terminating NUL.  They are defined as const arrays
 * (not pointers) so that the data is truly read-only, and spelled as numeric
 * byte values so that they do not depend on the compiler's character set.
 */
static const char GB_2312_80_STR[]             = { 0x1B, 0x24, 0x29, 0x41, 0 }; /* ESC $ ) A */
static const char ISO_IR_165_STR[]             = { 0x1B, 0x24, 0x29, 0x45, 0 }; /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = { 0x1B, 0x24, 0x29, 0x47, 0 }; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = { 0x1B, 0x24, 0x2A, 0x48, 0 }; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = { 0x1B, 0x24, 0x2B, 0x49, 0 }; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = { 0x1B, 0x24, 0x2B, 0x4A, 0 }; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = { 0x1B, 0x24, 0x2B, 0x4B, 0 }; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = { 0x1B, 0x24, 0x2B, 0x4C, 0 }; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = { 0x1B, 0x24, 0x2B, 0x4D, 0 }; /* ESC $ + M */
2355
2356 /********************** ISO2022-CN Data **************************/
2357 static const char* const escSeqCharsCN[10] ={
2358 SHIFT_IN_STR, /* ASCII */
2359 GB_2312_80_STR,
2360 ISO_IR_165_STR,
2361 CNS_11643_1992_Plane_1_STR,
2362 CNS_11643_1992_Plane_2_STR,
2363 CNS_11643_1992_Plane_3_STR,
2364 CNS_11643_1992_Plane_4_STR,
2365 CNS_11643_1992_Plane_5_STR,
2366 CNS_11643_1992_Plane_6_STR,
2367 CNS_11643_1992_Plane_7_STR
2368 };
2369
2370 static void
2371 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2372
2373 UConverterDataISO2022 *converterData;
2374 ISO2022State *pFromU2022State;
2375 uint8_t *target = (uint8_t *) args->target;
2376 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2377 const UChar* source = args->source;
2378 const UChar* sourceLimit = args->sourceLimit;
2379 int32_t* offsets = args->offsets;
2380 UChar32 sourceChar;
2381 char buffer[8];
2382 int32_t len;
2383 int8_t choices[3];
2384 int32_t choiceCount;
2385 uint32_t targetValue;
2386 UBool useFallback;
2387
2388 /* set up the state */
2389 converterData = (UConverterDataISO2022*)args->converter->extraInfo;
2390 pFromU2022State = &converterData->fromU2022State;
2391 useFallback = args->converter->useFallback;
2392
2393 choiceCount = 0;
2394
2395 /* check if the last codepoint of previous buffer was a lead surrogate*/
2396 if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
2397 goto getTrail;
2398 }
2399
2400 while( source < sourceLimit){
2401 if(target < targetLimit){
2402
2403 sourceChar = *(source++);
2404 /*check if the char is a First surrogate*/
2405 if(UTF_IS_SURROGATE(sourceChar)) {
2406 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2407 getTrail:
2408 /*look ahead to find the trail surrogate*/
2409 if(source < sourceLimit) {
2410 /* test the following code unit */
2411 UChar trail=(UChar) *source;
2412 if(UTF_IS_SECOND_SURROGATE(trail)) {
2413 source++;
2414 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2415 args->converter->fromUChar32=0x00;
2416 /* convert this supplementary code point */
2417 /* exit this condition tree */
2418 } else {
2419 /* this is an unmatched lead code unit (1st surrogate) */
2420 /* callback(illegal) */
2421 *err=U_ILLEGAL_CHAR_FOUND;
2422 args->converter->fromUChar32=sourceChar;
2423 break;
2424 }
2425 } else {
2426 /* no more input */
2427 args->converter->fromUChar32=sourceChar;
2428 break;
2429 }
2430 } else {
2431 /* this is an unmatched trail code unit (2nd surrogate) */
2432 /* callback(illegal) */
2433 *err=U_ILLEGAL_CHAR_FOUND;
2434 args->converter->fromUChar32=sourceChar;
2435 break;
2436 }
2437 }
2438
2439 /* do the conversion */
2440 if(sourceChar <= 0x007f ){
2441 /* US-ASCII */
2442 if(pFromU2022State->g == 0) {
2443 buffer[0] = (char)sourceChar;
2444 len = 1;
2445 } else {
2446 buffer[0] = UCNV_SI;
2447 buffer[1] = (char)sourceChar;
2448 len = 2;
2449 pFromU2022State->g = 0;
2450 choiceCount = 0;
2451 }
2452 if(sourceChar == CR || sourceChar == LF) {
2453 /* reset the state at the end of a line */
2454 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2455 choiceCount = 0;
2456 }
2457 }
2458 else{
2459 /* convert U+0080..U+10ffff */
2460 UConverterSharedData *cnv;
2461 int32_t i;
2462 int8_t cs, g;
2463
2464 if(choiceCount == 0) {
2465 /* try the current SO/G1 converter first */
2466 choices[0] = pFromU2022State->cs[1];
2467
2468 /* default to GB2312_1 if none is designated yet */
2469 if(choices[0] == 0) {
2470 choices[0] = GB2312_1;
2471 }
2472
2473 if(converterData->version == 0) {
2474 /* ISO-2022-CN */
2475
2476 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2477 if(choices[0] == GB2312_1) {
2478 choices[1] = (int8_t)CNS_11643_1;
2479 } else {
2480 choices[1] = (int8_t)GB2312_1;
2481 }
2482
2483 choiceCount = 2;
2484 } else {
2485 /* ISO-2022-CN-EXT */
2486
2487 /* try one of the other converters */
2488 switch(choices[0]) {
2489 case GB2312_1:
2490 choices[1] = (int8_t)CNS_11643_1;
2491 choices[2] = (int8_t)ISO_IR_165;
2492 break;
2493 case ISO_IR_165:
2494 choices[1] = (int8_t)GB2312_1;
2495 choices[2] = (int8_t)CNS_11643_1;
2496 break;
2497 default: /* CNS_11643_x */
2498 choices[1] = (int8_t)GB2312_1;
2499 choices[2] = (int8_t)ISO_IR_165;
2500 break;
2501 }
2502
2503 choiceCount = 3;
2504 }
2505 }
2506
2507 cs = g = 0;
2508 len = 0;
2509
2510 for(i = 0; i < choiceCount && len == 0; ++i) {
2511 cs = choices[i];
2512 if(cs > 0) {
2513 if(cs > CNS_11643_0) {
2514 cnv = converterData->myConverterArray[CNS_11643];
2515 MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
2516 if(len==3) {
2517 cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
2518 len = 2;
2519 if(cs == CNS_11643_1) {
2520 g = 1;
2521 } else if(cs == CNS_11643_2) {
2522 g = 2;
2523 } else /* plane 3..7 */ if(converterData->version == 1) {
2524 g = 3;
2525 } else {
2526 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2527 len = 0;
2528 }
2529 }
2530 } else {
2531 /* GB2312_1 or ISO-IR-165 */
2532 cnv = converterData->myConverterArray[cs];
2533 MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
2534 g = 1; /* used if len == 2 */
2535 }
2536 }
2537 }
2538
2539 if(len > 0) {
2540 len = 0; /* count output bytes; it must have been len == 2 */
2541
2542 /* write the designation sequence if necessary */
2543 if(cs != pFromU2022State->cs[g]) {
2544 if(cs < CNS_11643) {
2545 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2546 } else {
2547 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2548 }
2549 len = 4;
2550 pFromU2022State->cs[g] = cs;
2551 if(g == 1) {
2552 /* changing the SO/G1 charset invalidates the choices[] */
2553 choiceCount = 0;
2554 }
2555 }
2556
2557 /* write the shift sequence if necessary */
2558 if(g != pFromU2022State->g) {
2559 switch(g) {
2560 case 1:
2561 buffer[len++] = UCNV_SO;
2562
2563 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2564 pFromU2022State->g = 1;
2565 break;
2566 case 2:
2567 buffer[len++] = 0x1b;
2568 buffer[len++] = 0x4e;
2569 break;
2570 default: /* case 3 */
2571 buffer[len++] = 0x1b;
2572 buffer[len++] = 0x4f;
2573 break;
2574 }
2575 }
2576
2577 /* write the two output bytes */
2578 buffer[len++] = (char)(targetValue >> 8);
2579 buffer[len++] = (char)targetValue;
2580 } else {
2581 /* if we cannot find the character after checking all codepages
2582 * then this is an error
2583 */
2584 *err = U_INVALID_CHAR_FOUND;
2585 args->converter->fromUChar32=sourceChar;
2586 break;
2587 }
2588 }
2589
2590 /* output len>0 bytes in buffer[] */
2591 if(len == 1) {
2592 *target++ = buffer[0];
2593 if(offsets) {
2594 *offsets++ = source - args->source - 1; /* -1: known to be ASCII */
2595 }
2596 } else if(len == 2 && (target + 2) <= targetLimit) {
2597 *target++ = buffer[0];
2598 *target++ = buffer[1];
2599 if(offsets) {
2600 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2601 *offsets++ = sourceIndex;
2602 *offsets++ = sourceIndex;
2603 }
2604 } else {
2605 ucnv_fromUWriteBytes(
2606 args->converter,
2607 buffer, len,
2608 (char **)&target, (const char *)targetLimit,
2609 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2610 err);
2611 if(U_FAILURE(*err)) {
2612 break;
2613 }
2614 }
2615 } /* end if(myTargetIndex<myTargetLength) */
2616 else{
2617 *err =U_BUFFER_OVERFLOW_ERROR;
2618 break;
2619 }
2620
2621 }/* end while(mySourceIndex<mySourceLength) */
2622
2623 /*
2624 * the end of the input stream and detection of truncated input
2625 * are handled by the framework, but for ISO-2022-CN conversion
2626 * we need to be in ASCII mode at the very end
2627 *
2628 * conditions:
2629 * successful
2630 * not in ASCII mode
2631 * end of input and no truncated input
2632 */
2633 if( U_SUCCESS(*err) &&
2634 pFromU2022State->g!=0 &&
2635 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2636 ) {
2637 int32_t sourceIndex;
2638
2639 /* we are switching to ASCII */
2640 pFromU2022State->g=0;
2641
2642 /* get the source index of the last input character */
2643 /*
2644 * TODO this would be simpler and more reliable if we used a pair
2645 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2646 * so that we could simply use the prevSourceIndex here;
2647 * this code gives an incorrect result for the rare case of an unmatched
2648 * trail surrogate that is alone in the last buffer of the text stream
2649 */
2650 sourceIndex=(int32_t)(source-args->source);
2651 if(sourceIndex>0) {
2652 --sourceIndex;
2653 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2654 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2655 ) {
2656 --sourceIndex;
2657 }
2658 } else {
2659 sourceIndex=-1;
2660 }
2661
2662 ucnv_fromUWriteBytes(
2663 args->converter,
2664 SHIFT_IN_STR, 1,
2665 (char **)&target, (const char *)targetLimit,
2666 &offsets, sourceIndex,
2667 err);
2668 }
2669
2670 /*save the state and return */
2671 args->source = source;
2672 args->target = (char*)target;
2673 }
2674
2675
2676 static void
2677 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2678 UErrorCode* err){
2679 char tempBuf[3];
2680 const char *mySource = (char *) args->source;
2681 UChar *myTarget = args->target;
2682 const char *mySourceLimit = args->sourceLimit;
2683 uint32_t targetUniChar = 0x0000;
2684 uint32_t mySourceChar = 0x0000;
2685 UConverterDataISO2022* myData;
2686 ISO2022State *pToU2022State;
2687
2688 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2689 pToU2022State = &myData->toU2022State;
2690
2691 if(myData->key != 0) {
2692 /* continue with a partial escape sequence */
2693 goto escape;
2694 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2695 /* continue with a partial double-byte character */
2696 mySourceChar = args->converter->toUBytes[0];
2697 args->converter->toULength = 0;
2698 goto getTrailByte;
2699 }
2700
2701 while(mySource < mySourceLimit){
2702
2703 targetUniChar =missingCharMarker;
2704
2705 if(myTarget < args->targetLimit){
2706
2707 mySourceChar= (unsigned char) *mySource++;
2708
2709 switch(mySourceChar){
2710 case UCNV_SI:
2711 pToU2022State->g=0;
2712 continue;
2713
2714 case UCNV_SO:
2715 if(pToU2022State->cs[1] != 0) {
2716 pToU2022State->g=1;
2717 continue;
2718 } else {
2719 /* illegal to have SO before a matching designator */
2720 break;
2721 }
2722
2723 case ESC_2022:
2724 mySource--;
2725 escape:
2726 changeState_2022(args->converter,&(mySource),
2727 mySourceLimit, ISO_2022_CN,err);
2728
2729 /* invalid or illegal escape sequence */
2730 if(U_FAILURE(*err)){
2731 args->target = myTarget;
2732 args->source = mySource;
2733 return;
2734 }
2735 continue;
2736
2737 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2738
2739 case CR:
2740 /*falls through*/
2741 case LF:
2742 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
2743 /* falls through */
2744 default:
2745 /* convert one or two bytes */
2746 if(pToU2022State->g != 0) {
2747 if(mySource < mySourceLimit) {
2748 UConverterSharedData *cnv;
2749 StateEnum tempState;
2750 int32_t tempBufLen;
2751 char trailByte;
2752 getTrailByte:
2753 trailByte = *mySource++;
2754 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2755 if(tempState > CNS_11643_0) {
2756 cnv = myData->myConverterArray[CNS_11643];
2757 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2758 tempBuf[1] = (char) (mySourceChar);
2759 tempBuf[2] = trailByte;
2760 tempBufLen = 3;
2761
2762 }else{
2763 cnv = myData->myConverterArray[tempState];
2764 tempBuf[0] = (char) (mySourceChar);
2765 tempBuf[1] = trailByte;
2766 tempBufLen = 2;
2767 }
2768 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2769 if(pToU2022State->g>=2) {
2770 /* return from a single-shift state to the previous one */
2771 pToU2022State->g=pToU2022State->prevG;
2772 }
2773 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
2774 } else {
2775 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2776 args->converter->toULength = 1;
2777 goto endloop;
2778 }
2779 }
2780 else{
2781 if(mySourceChar <= 0x7f) {
2782 targetUniChar = (UChar) mySourceChar;
2783 }
2784 }
2785 break;
2786 }
2787 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2788 if(args->offsets){
2789 args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2790 }
2791 *(myTarget++)=(UChar)targetUniChar;
2792 }
2793 else if(targetUniChar > missingCharMarker){
2794 /* disassemble the surrogate pair and write to output*/
2795 targetUniChar-=0x0010000;
2796 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2797 if(args->offsets){
2798 args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2799 }
2800 ++myTarget;
2801 if(myTarget< args->targetLimit){
2802 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2803 if(args->offsets){
2804 args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2805 }
2806 ++myTarget;
2807 }else{
2808 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2809 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2810 }
2811
2812 }
2813 else{
2814 /* Call the callback function*/
2815 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2816 break;
2817 }
2818 }
2819 else{
2820 *err =U_BUFFER_OVERFLOW_ERROR;
2821 break;
2822 }
2823 }
2824 endloop:
2825 args->target = myTarget;
2826 args->source = mySource;
2827 }
2828
2829 static void
2830 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
2831 UConverter *cnv = args->converter;
2832 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
2833 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
2834 char *p, *subchar;
2835 char buffer[8];
2836 int32_t length;
2837
2838 subchar=(char *)cnv->subChar;
2839 length=cnv->subCharLen; /* assume length==1 for most variants */
2840
2841 p = buffer;
2842 switch(myConverterData->locale[0]){
2843 case 'j':
2844 {
2845 int8_t cs;
2846
2847 if(pFromU2022State->g == 1) {
2848 /* JIS7: switch from G1 to G0 */
2849 pFromU2022State->g = 0;
2850 *p++ = UCNV_SI;
2851 }
2852
2853 cs = pFromU2022State->cs[0];
2854 if(cs != ASCII && cs != JISX201) {
2855 /* not in ASCII or JIS X 0201: switch to ASCII */
2856 pFromU2022State->cs[0] = (int8_t)ASCII;
2857 *p++ = '\x1b';
2858 *p++ = '\x28';
2859 *p++ = '\x42';
2860 }
2861
2862 *p++ = subchar[0];
2863 break;
2864 }
2865 case 'c':
2866 if(pFromU2022State->g != 0) {
2867 /* not in ASCII mode: switch to ASCII */
2868 pFromU2022State->g = 0;
2869 *p++ = UCNV_SI;
2870 }
2871 *p++ = subchar[0];
2872 break;
2873 case 'k':
2874 if(myConverterData->version == 0) {
2875 if(length == 1) {
2876 if((UBool)args->converter->fromUnicodeStatus) {
2877 /* in DBCS mode: switch to SBCS */
2878 args->converter->fromUnicodeStatus = 0;
2879 *p++ = UCNV_SI;
2880 }
2881 *p++ = subchar[0];
2882 } else /* length == 2*/ {
2883 if(!(UBool)args->converter->fromUnicodeStatus) {
2884 /* in SBCS mode: switch to DBCS */
2885 args->converter->fromUnicodeStatus = 1;
2886 *p++ = UCNV_SO;
2887 }
2888 *p++ = subchar[0];
2889 *p++ = subchar[1];
2890 }
2891 break;
2892 } else {
2893 /* let the subconverter write the subchar */
2894 args->converter = myConverterData->currentConverter;
2895 uprv_memcpy(myConverterData->currentConverter->subChar, subchar, 4);
2896 myConverterData->currentConverter->subCharLen = (int8_t)length;
2897
2898 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
2899 ucnv_cbFromUWriteSub(args, 0, err);
2900 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2901
2902 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2903 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2904 uprv_memcpy(
2905 cnv->charErrorBuffer,
2906 myConverterData->currentConverter->charErrorBuffer,
2907 myConverterData->currentConverter->charErrorBufferLength);
2908 }
2909 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2910 myConverterData->currentConverter->charErrorBufferLength = 0;
2911 }
2912 args->converter = cnv;
2913 return;
2914 }
2915 default:
2916 /* not expected */
2917 break;
2918 }
2919 ucnv_cbFromUWriteBytes(args,
2920 buffer, (int32_t)(p - buffer),
2921 offsetIndex, err);
2922 }
2923
2924 /* structure for SafeClone calculations */
2925 struct cloneStruct
2926 {
2927 UConverter cnv;
2928 UConverterDataISO2022 mydata;
2929 UConverter currentConverter;
2930 };
2931
2932
2933 static UConverter *
2934 _ISO_2022_SafeClone(
2935 const UConverter *cnv,
2936 void *stackBuffer,
2937 int32_t *pBufferSize,
2938 UErrorCode *status)
2939 {
2940 struct cloneStruct * localClone;
2941 UConverterDataISO2022 *cnvData;
2942 int32_t i, size;
2943
2944 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
2945 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
2946 return NULL;
2947 }
2948
2949 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
2950 localClone = (struct cloneStruct *)stackBuffer;
2951
2952 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
2953
2954 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
2955
2956 /* share the subconverters */
2957
2958 if(cnvData->currentConverter != NULL) {
2959 size = (int32_t)sizeof(UConverter);
2960 localClone->mydata.currentConverter =
2961 ucnv_safeClone(cnvData->currentConverter,
2962 &localClone->currentConverter,
2963 &size, status);
2964 if(U_FAILURE(*status)) {
2965 return NULL;
2966 }
2967 }
2968
2969 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
2970 if(cnvData->myConverterArray[i] != NULL) {
2971 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
2972 }
2973 }
2974
2975 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
2976 localClone->cnv.isExtraLocal = TRUE;
2977 return &localClone->cnv;
2978 }
2979
2980 static void
2981 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
2982 USetAdder *sa,
2983 UConverterUnicodeSet which,
2984 UErrorCode *pErrorCode)
2985 {
2986 int32_t i;
2987 UConverterDataISO2022* cnvData;
2988
2989 if (U_FAILURE(*pErrorCode)) {
2990 return;
2991 }
2992 #ifdef U_ENABLE_GENERIC_ISO_2022
2993 if (cnv->sharedData == &_ISO2022Data) {
2994 /* We use UTF-8 in this case */
2995 sa->addRange(sa->set, 0, 0xd7FF);
2996 sa->addRange(sa->set, 0xE000, 0x10FFFF);
2997 return;
2998 }
2999 #endif
3000
3001 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3002
3003 /* open a set and initialize it with code points that are algorithmically round-tripped */
3004 switch(cnvData->locale[0]){
3005 case 'j':
3006 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3007 /* include Latin-1 for some variants of JP */
3008 sa->addRange(sa->set, 0, 0xff);
3009 } else {
3010 /* include ASCII for JP */
3011 sa->addRange(sa->set, 0, 0x7f);
3012 }
3013 if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
3014 /* include half-width Katakana for JP */
3015 sa->addRange(sa->set, 0xff61, 0xff9f);
3016 }
3017 break;
3018 case 'c':
3019 case 'z':
3020 /* include ASCII for CN */
3021 sa->addRange(sa->set, 0, 0x7f);
3022 break;
3023 case 'k':
3024 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3025 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3026 cnvData->currentConverter, sa, which, pErrorCode);
3027 return;
3028 default:
3029 break;
3030 }
3031
3032 /*
3033 * TODO: need to make this version-specific for CN.
3034 * CN version 0 does not map CNS planes 3..7 although
3035 * they are all available in the CNS conversion table;
3036 * CN version 1 does map them all.
3037 * The two versions need to create different Unicode sets.
3038 */
3039 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3040 if(cnvData->myConverterArray[i]!=NULL) {
3041 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3042 cnvData->version==0 && i==CNS_11643
3043 ) {
3044 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3045 ucnv_MBCSGetUnicodeSetForBytes(
3046 cnvData->myConverterArray[i],
3047 sa, UCNV_ROUNDTRIP_SET,
3048 0, 0x81, 0x82,
3049 pErrorCode);
3050 } else {
3051 ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
3052 }
3053 }
3054 }
3055 }
3056
3057 static const UConverterImpl _ISO2022Impl={
3058 UCNV_ISO_2022,
3059
3060 NULL,
3061 NULL,
3062
3063 _ISO2022Open,
3064 _ISO2022Close,
3065 _ISO2022Reset,
3066
3067 #ifdef U_ENABLE_GENERIC_ISO_2022
3068 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3069 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3070 ucnv_fromUnicode_UTF8,
3071 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3072 #else
3073 NULL,
3074 NULL,
3075 NULL,
3076 NULL,
3077 #endif
3078 NULL,
3079
3080 NULL,
3081 _ISO2022getName,
3082 _ISO_2022_WriteSub,
3083 _ISO_2022_SafeClone,
3084 _ISO_2022_GetUnicodeSet
3085 };
3086 static const UConverterStaticData _ISO2022StaticData={
3087 sizeof(UConverterStaticData),
3088 "ISO_2022",
3089 2022,
3090 UCNV_IBM,
3091 UCNV_ISO_2022,
3092 1,
3093 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3094 { 0x1a, 0, 0, 0 },
3095 1,
3096 FALSE,
3097 FALSE,
3098 0,
3099 0,
3100 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3101 };
3102 const UConverterSharedData _ISO2022Data={
3103 sizeof(UConverterSharedData),
3104 ~((uint32_t) 0),
3105 NULL,
3106 NULL,
3107 &_ISO2022StaticData,
3108 FALSE,
3109 &_ISO2022Impl,
3110 0
3111 };
3112
3113 /*************JP****************/
3114 static const UConverterImpl _ISO2022JPImpl={
3115 UCNV_ISO_2022,
3116
3117 NULL,
3118 NULL,
3119
3120 _ISO2022Open,
3121 _ISO2022Close,
3122 _ISO2022Reset,
3123
3124 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3125 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3126 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3127 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3128 NULL,
3129
3130 NULL,
3131 _ISO2022getName,
3132 _ISO_2022_WriteSub,
3133 _ISO_2022_SafeClone,
3134 _ISO_2022_GetUnicodeSet
3135 };
3136 static const UConverterStaticData _ISO2022JPStaticData={
3137 sizeof(UConverterStaticData),
3138 "ISO_2022_JP",
3139 0,
3140 UCNV_IBM,
3141 UCNV_ISO_2022,
3142 1,
3143 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3144 { 0x1a, 0, 0, 0 },
3145 1,
3146 FALSE,
3147 FALSE,
3148 0,
3149 0,
3150 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3151 };
3152 static const UConverterSharedData _ISO2022JPData={
3153 sizeof(UConverterSharedData),
3154 ~((uint32_t) 0),
3155 NULL,
3156 NULL,
3157 &_ISO2022JPStaticData,
3158 FALSE,
3159 &_ISO2022JPImpl,
3160 0
3161 };
3162
3163 /************* KR ***************/
3164 static const UConverterImpl _ISO2022KRImpl={
3165 UCNV_ISO_2022,
3166
3167 NULL,
3168 NULL,
3169
3170 _ISO2022Open,
3171 _ISO2022Close,
3172 _ISO2022Reset,
3173
3174 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3175 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3176 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3177 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3178 NULL,
3179
3180 NULL,
3181 _ISO2022getName,
3182 _ISO_2022_WriteSub,
3183 _ISO_2022_SafeClone,
3184 _ISO_2022_GetUnicodeSet
3185 };
3186 static const UConverterStaticData _ISO2022KRStaticData={
3187 sizeof(UConverterStaticData),
3188 "ISO_2022_KR",
3189 0,
3190 UCNV_IBM,
3191 UCNV_ISO_2022,
3192 1,
3193 3, /* max 3 bytes per UChar: SO+DBCS */
3194 { 0x1a, 0, 0, 0 },
3195 1,
3196 FALSE,
3197 FALSE,
3198 0,
3199 0,
3200 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3201 };
3202 static const UConverterSharedData _ISO2022KRData={
3203 sizeof(UConverterSharedData),
3204 ~((uint32_t) 0),
3205 NULL,
3206 NULL,
3207 &_ISO2022KRStaticData,
3208 FALSE,
3209 &_ISO2022KRImpl,
3210 0
3211 };
3212
3213 /*************** CN ***************/
3214 static const UConverterImpl _ISO2022CNImpl={
3215
3216 UCNV_ISO_2022,
3217
3218 NULL,
3219 NULL,
3220
3221 _ISO2022Open,
3222 _ISO2022Close,
3223 _ISO2022Reset,
3224
3225 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3226 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3227 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3228 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3229 NULL,
3230
3231 NULL,
3232 _ISO2022getName,
3233 _ISO_2022_WriteSub,
3234 _ISO_2022_SafeClone,
3235 _ISO_2022_GetUnicodeSet
3236 };
3237 static const UConverterStaticData _ISO2022CNStaticData={
3238 sizeof(UConverterStaticData),
3239 "ISO_2022_CN",
3240 0,
3241 UCNV_IBM,
3242 UCNV_ISO_2022,
3243 2,
3244 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3245 { 0x1a, 0, 0, 0 },
3246 1,
3247 FALSE,
3248 FALSE,
3249 0,
3250 0,
3251 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3252 };
3253 static const UConverterSharedData _ISO2022CNData={
3254 sizeof(UConverterSharedData),
3255 ~((uint32_t) 0),
3256 NULL,
3257 NULL,
3258 &_ISO2022CNStaticData,
3259 FALSE,
3260 &_ISO2022CNImpl,
3261 0
3262 };
3263
3264
3265
3266 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */