]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ucnv2022.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv2022.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "unicode/utf16.h"
38 #include "ucnv_imp.h"
39 #include "ucnv_bld.h"
40 #include "ucnv_cnv.h"
41 #include "ucnvmbcs.h"
42 #include "cstring.h"
43 #include "cmemory.h"
44 #include "uassert.h"
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
#if !UCONFIG_ONLY_HTML_CONVERSION
/* The single byte 0x0F followed by the terminating NUL. */
static const char SHIFT_IN_STR[] = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
#endif

/* Numeric values of the control/whitespace characters referred to by name. */
#define H_TAB 0x09
#define LF    0x0A
#define V_TAB 0x0B
#define CR    0x0D
#define SPACE 0x20

/* Inclusive bounds of the HWKANA code point range (presumably halfwidth Katakana). */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
93
/*
 * ISO 2022 carries "GR" charsets in the 7-bit range by subtracting 0x80:
 *  - 94-character sets: native bytes A1..FE become 21..7E.
 *  - 96-character sets: native bytes A0..FF become 20..7F.
 * C1 control codes (native bytes 80..9F) must not be encoded as
 * bytes 00..1F, because those are the C0 control codes.
 */
enum {
    GR96_START = 0xa0,
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_END   = 0xff
};
108
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The test mask has exactly the bits 0xe (SO), 0xf (SI) and 0x1b (ESC) set,
 * which is the value 0x0800c000.
 * Note: the argument is evaluated twice.
 */
#define IS_2022_CONTROL(c) \
    (((c)<0x20) && \
     (((uint32_t)1<<(c)) & (((uint32_t)1<<0x0e)|((uint32_t)1<<0x0f)|((uint32_t)1<<0x1b)))!=0)
116
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups reuse the same small values (1..3); every value is
 * spelled out explicitly here so the overlaps are visible.
 */
typedef enum {
    /* shared values */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,   /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these first constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The per-plane values below are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
155
/*
 * IS_JP_DBCS(cs): is the StateEnum charset value cs a double-byte charset?
 * With UCONFIG_ONLY_HTML_CONVERSION only JISX208 qualifies; otherwise every
 * value in the inclusive range JISX208..KSC5601 does.
 * Note: the argument may be evaluated twice.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) ((cs)==JISX208)
#else
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && KSC5601>=(cs))
#endif

/* CSM(cs): the single-bit "charset mask" for charset number cs. */
#define CSM(cs) ((uint16_t)1 << (cs))
164
165 /*
166 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
167 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
168 *
169 * Note: The converter uses some leniency:
170 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
171 * all versions, not just JIS7 and JIS8.
172 * - ICU does not distinguish between different versions of JIS X 0208.
173 */
174 #if UCONFIG_ONLY_HTML_CONVERSION
175 enum { MAX_JA_VERSION=0 };
176 #else
177 enum { MAX_JA_VERSION=4 };
178 #endif
179 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
180 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
181 #if !UCONFIG_ONLY_HTML_CONVERSION
182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
183 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
186 #endif
187 };
188
/* Kinds of converter type tracked in UConverterDataISO2022::currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
197
/* Designation/shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
203
204 #define UCNV_OPTIONS_VERSION_MASK 0xf
205 #define UCNV_2022_MAX_CONVERTERS 10
206
207 typedef struct{
208 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
209 UConverter *currentConverter;
210 Cnv2022Type currentType;
211 ISO2022State toU2022State, fromU2022State;
212 uint32_t key;
213 uint32_t version;
214 #ifdef U_ENABLE_GENERIC_ISO_2022
215 UBool isFirstBuffer;
216 #endif
217 UBool isEmptySegment;
218 char name[30];
219 char locale[3];
220 }UConverterDataISO2022;
221
222 /* Protos */
223 /* ISO-2022 ----------------------------------------------------------------- */
224
225 /*Forward declaration */
226 U_CFUNC void
227 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
228 UErrorCode * err);
229 U_CFUNC void
230 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
231 UErrorCode * err);
232
#define ESC_2022 0x1B /*ESC*/

/* Result of matching the bytes seen so far against the escape sequence tables. */
typedef enum
{
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
242
243 /*
244 * The way these state transition arrays work is:
245 * ex : ESC$B is the sequence for JISX208
246 * a) First Iteration: char is ESC
247 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
248 * int x = normalize_esq_chars_2022[27] which is equal to 1
249 * ii) Search for this value in escSeqStateTable_Key_2022[]
250 * value of x is stored at escSeqStateTable_Key_2022[0]
251 * iii) Save this index as offset
252 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
253 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
254 * b) Switch on this state and continue to next char
255 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
256 * which is normalize_esq_chars_2022[36] == 4
257 * ii) x is currently 1(from above)
258 * x<<=5 -- x is now 32
259 * x+=normalize_esq_chars_2022[36]
260 * now x is 36
261 * iii) Search for this value in escSeqStateTable_Key_2022[]
262 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
263 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
264 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
265 * c) Switch on this state and continue to next char
266 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
267 * ii) x is currently 36 (from above)
268 * x<<=5 -- x is now 1152
269 * x+=normalize_esq_chars_2022[66]
270 * now x is 1161
271 * iii) Search for this value in escSeqStateTable_Key_2022[]
272 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
273 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
274 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
275 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
276 */
277
278
/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte to its small "normalized" code (1..29) for building escape
 * sequence keys; 0 means the byte cannot occur anywhere in an escape sequence.
 * Laid out 16 entries per row; the row comment gives the first byte value.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10: ESC (0x1b) */
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
    /* 0x20: $ % & ( ) * + - . / */
                0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
    /* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40: @ A..O */
                5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /* 0x50: R Z */
                0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
    /* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xa0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xb0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xc0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xd0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xe0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xf0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
310
311 #ifdef U_ENABLE_GENERIC_ISO_2022
312 /*
313 * When the generic ISO-2022 converter is completely removed, not just disabled
314 * per #ifdef, then the following state table and the associated tables that are
315 * dimensioned with MAX_STATES_2022 should be trimmed.
316 *
317 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
318 * the associated escape sequences starting with ESC ( B should be removed.
319 * This includes the ones with key values 1097 and all of the ones above 1000000.
320 *
321 * For the latter, the tables can simply be truncated.
322 * For the former, since the tables must be kept parallel, it is probably best
323 * to simply duplicate an adjacent table cell, parallel in all tables.
324 *
325 * It may make sense to restructure the tables, especially by using small search
326 * tables for the variants instead of indexing them parallel to the table here.
327 */
328 #endif
329
330 #define MAX_STATES_2022 74
331 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
332 /* 0 1 2 3 4 5 6 7 8 9 */
333
334 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
335 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
336 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
337 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
338 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
339 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
340 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
341 ,35947631 ,35947635 ,35947636 ,35947638
342 };
343
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each entry of escSeqStateTable_Key_2022 (same index);
 * NULL where no converter name is associated with the key.
 * The row comment gives the index of the first entry in the row.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ NULL,                   NULL,                   NULL,                   NULL,                   NULL,
    /*  5 */ NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
    /* 10 */ "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
    /* 15 */ "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
    /* 20 */ "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
    /* 25 */ NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
    /* 30 */ "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
    /* 35 */ "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
    /* 40 */ "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
    /* 45 */ "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
    /* 55 */ "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
    /* 65 */ "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
    /* 70 */ "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
360
361 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
362 /* 0 1 2 3 4 5 6 7 8 9 */
363 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
364 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
365 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
366 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371 };
372
/*
 * Identifies which ISO-2022 variant a caller of changeState_2022 is handling.
 * The numeric values are fixed; which enumerators exist depends on the build
 * configuration.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
384
385 /*********** ISO 2022 Converter Protos ***********/
386 static void
387 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
388
389 static void
390 _ISO2022Close(UConverter *converter);
391
392 static void
393 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
394
395 static const char*
396 _ISO2022getName(const UConverter* cnv);
397
398 static void
399 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
400
401 static UConverter *
402 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
403
404 #ifdef U_ENABLE_GENERIC_ISO_2022
405 static void
406 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
407 #endif
408
409 namespace {
410
411 /*const UConverterSharedData _ISO2022Data;*/
412 extern const UConverterSharedData _ISO2022JPData;
413
414 #if !UCONFIG_ONLY_HTML_CONVERSION
415 extern const UConverterSharedData _ISO2022KRData;
416 extern const UConverterSharedData _ISO2022CNData;
417 #endif
418
419 } // namespace
420
421 /*************** Converter implementations ******************/
422
423 /* The purpose of this function is to get around gcc compiler warnings. */
424 static inline void
425 fromUWriteUInt8(UConverter *cnv,
426 const char *bytes, int32_t length,
427 uint8_t **target, const char *targetLimit,
428 int32_t **offsets,
429 int32_t sourceIndex,
430 UErrorCode *pErrorCode)
431 {
432 char *targetChars = (char *)*target;
433 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
434 offsets, sourceIndex, pErrorCode);
435 *target = (uint8_t*)targetChars;
436
437 }
438
439 static inline void
440 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
441 if(myConverterData->version == 1) {
442 UConverter *cnv = myConverterData->currentConverter;
443
444 cnv->toUnicodeStatus=0; /* offset */
445 cnv->mode=0; /* state */
446 cnv->toULength=0; /* byteIndex */
447 }
448 }
449
450 static inline void
451 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
452 /* in ISO-2022-KR the designator sequence appears only once
453 * in a file so we append it only once
454 */
455 if( converter->charErrorBufferLength==0){
456
457 converter->charErrorBufferLength = 4;
458 converter->charErrorBuffer[0] = 0x1b;
459 converter->charErrorBuffer[1] = 0x24;
460 converter->charErrorBuffer[2] = 0x29;
461 converter->charErrorBuffer[3] = 0x43;
462 }
463 if(myConverterData->version == 1) {
464 UConverter *cnv = myConverterData->currentConverter;
465
466 cnv->fromUChar32=0;
467 cnv->fromUnicodeStatus=1; /* prevLength */
468 }
469 }
470
471 static void
472 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
473
474 char myLocale[6]={' ',' ',' ',' ',' ',' '};
475
476 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
477 if(cnv->extraInfo != NULL) {
478 UConverterNamePieces stackPieces;
479 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
480 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
481 uint32_t version;
482
483 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
484
485 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
486 myConverterData->currentType = ASCII1;
487 cnv->fromUnicodeStatus =FALSE;
488 if(pArgs->locale){
489 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
490 }
491 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
492 myConverterData->version = version;
493 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
494 (myLocale[2]=='_' || myLocale[2]=='\0'))
495 {
496 /* open the required converters and cache them */
497 if(version>MAX_JA_VERSION) {
498 // ICU 55 fails to open a converter for an unsupported version.
499 // Previously, it fell back to version 0, but that would yield
500 // unexpected behavior.
501 *errorCode = U_MISSING_RESOURCE_ERROR;
502 return;
503 }
504 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
505 myConverterData->myConverterArray[ISO8859_7] =
506 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
507 }
508 myConverterData->myConverterArray[JISX208] =
509 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
510 if(jpCharsetMasks[version]&CSM(JISX212)) {
511 myConverterData->myConverterArray[JISX212] =
512 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
513 }
514 if(jpCharsetMasks[version]&CSM(GB2312)) {
515 myConverterData->myConverterArray[GB2312] =
516 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
517 }
518 if(jpCharsetMasks[version]&CSM(KSC5601)) {
519 myConverterData->myConverterArray[KSC5601] =
520 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
521 }
522
523 /* set the function pointers to appropriate funtions */
524 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
525 uprv_strcpy(myConverterData->locale,"ja");
526
527 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
528 size_t len = uprv_strlen(myConverterData->name);
529 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
530 myConverterData->name[len+1]='\0';
531 }
532 #if !UCONFIG_ONLY_HTML_CONVERSION
533 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
534 (myLocale[2]=='_' || myLocale[2]=='\0'))
535 {
536 if(version>1) {
537 // ICU 55 fails to open a converter for an unsupported version.
538 // Previously, it fell back to version 0, but that would yield
539 // unexpected behavior.
540 *errorCode = U_MISSING_RESOURCE_ERROR;
541 return;
542 }
543 const char *cnvName;
544 if(version==1) {
545 cnvName="icu-internal-25546";
546 } else {
547 cnvName="ibm-949";
548 myConverterData->version=version=0;
549 }
550 if(pArgs->onlyTestIsLoadable) {
551 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
552 uprv_free(cnv->extraInfo);
553 cnv->extraInfo=NULL;
554 return;
555 } else {
556 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
557 if (U_FAILURE(*errorCode)) {
558 _ISO2022Close(cnv);
559 return;
560 }
561
562 if(version==1) {
563 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
564 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
565 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
566 }else{
567 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
568 }
569
570 /* initialize the state variables */
571 setInitialStateToUnicodeKR(cnv, myConverterData);
572 setInitialStateFromUnicodeKR(cnv, myConverterData);
573
574 /* set the function pointers to appropriate funtions */
575 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
576 uprv_strcpy(myConverterData->locale,"ko");
577 }
578 }
579 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
580 (myLocale[2]=='_' || myLocale[2]=='\0'))
581 {
582 if(version>2) {
583 // ICU 55 fails to open a converter for an unsupported version.
584 // Previously, it fell back to version 0, but that would yield
585 // unexpected behavior.
586 *errorCode = U_MISSING_RESOURCE_ERROR;
587 return;
588 }
589
590 /* open the required converters and cache them */
591 myConverterData->myConverterArray[GB2312_1] =
592 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
593 if(version==1) {
594 myConverterData->myConverterArray[ISO_IR_165] =
595 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
596 }
597 myConverterData->myConverterArray[CNS_11643] =
598 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
599
600
601 /* set the function pointers to appropriate funtions */
602 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
603 uprv_strcpy(myConverterData->locale,"cn");
604
605 if (version==0){
606 myConverterData->version = 0;
607 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
608 }else if (version==1){
609 myConverterData->version = 1;
610 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
611 }else {
612 myConverterData->version = 2;
613 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
614 }
615 }
616 #endif // !UCONFIG_ONLY_HTML_CONVERSION
617 else{
618 #ifdef U_ENABLE_GENERIC_ISO_2022
619 myConverterData->isFirstBuffer = TRUE;
620
621 /* append the UTF-8 escape sequence */
622 cnv->charErrorBufferLength = 3;
623 cnv->charErrorBuffer[0] = 0x1b;
624 cnv->charErrorBuffer[1] = 0x25;
625 cnv->charErrorBuffer[2] = 0x42;
626
627 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
628 /* initialize the state variables */
629 uprv_strcpy(myConverterData->name,"ISO_2022");
630 #else
631 *errorCode = U_MISSING_RESOURCE_ERROR;
632 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
633 // data loading error code.
634 return;
635 #endif
636 }
637
638 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
639
640 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
641 _ISO2022Close(cnv);
642 }
643 } else {
644 *errorCode = U_MEMORY_ALLOCATION_ERROR;
645 }
646 }
647
648
649 static void
650 _ISO2022Close(UConverter *converter) {
651 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
652 UConverterSharedData **array = myData->myConverterArray;
653 int32_t i;
654
655 if (converter->extraInfo != NULL) {
656 /*close the array of converter pointers and free the memory*/
657 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
658 if(array[i]!=NULL) {
659 ucnv_unloadSharedDataIfReady(array[i]);
660 }
661 }
662
663 ucnv_close(myData->currentConverter);
664
665 if(!converter->isExtraLocal){
666 uprv_free (converter->extraInfo);
667 converter->extraInfo = NULL;
668 }
669 }
670 }
671
672 static void
673 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
674 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
675 if(choice<=UCNV_RESET_TO_UNICODE) {
676 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
677 myConverterData->key = 0;
678 myConverterData->isEmptySegment = FALSE;
679 }
680 if(choice!=UCNV_RESET_TO_UNICODE) {
681 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
682 }
683 #ifdef U_ENABLE_GENERIC_ISO_2022
684 if(myConverterData->locale[0] == 0){
685 if(choice<=UCNV_RESET_TO_UNICODE) {
686 myConverterData->isFirstBuffer = TRUE;
687 myConverterData->key = 0;
688 if (converter->mode == UCNV_SO){
689 ucnv_close (myConverterData->currentConverter);
690 myConverterData->currentConverter=NULL;
691 }
692 converter->mode = UCNV_SI;
693 }
694 if(choice!=UCNV_RESET_TO_UNICODE) {
695 /* re-append UTF-8 escape sequence */
696 converter->charErrorBufferLength = 3;
697 converter->charErrorBuffer[0] = 0x1b;
698 converter->charErrorBuffer[1] = 0x28;
699 converter->charErrorBuffer[2] = 0x42;
700 }
701 }
702 else
703 #endif
704 {
705 /* reset the state variables */
706 if(myConverterData->locale[0] == 'k'){
707 if(choice<=UCNV_RESET_TO_UNICODE) {
708 setInitialStateToUnicodeKR(converter, myConverterData);
709 }
710 if(choice!=UCNV_RESET_TO_UNICODE) {
711 setInitialStateFromUnicodeKR(converter, myConverterData);
712 }
713 }
714 }
715 }
716
717 static const char*
718 _ISO2022getName(const UConverter* cnv){
719 if(cnv->extraInfo){
720 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
721 return myData->name;
722 }
723 return NULL;
724 }
725
726
727 /*************** to unicode *******************/
728 /****************************************************************************
729 * Recognized escape sequences are
730 * <ESC>(B ASCII
731 * <ESC>.A ISO-8859-1
732 * <ESC>.F ISO-8859-7
733 * <ESC>(J JISX-201
734 * <ESC>(I JISX-201
735 * <ESC>$B JISX-208
736 * <ESC>$@ JISX-208
737 * <ESC>$(D JISX-212
738 * <ESC>$A GB2312
739 * <ESC>$(C KSC5601
740 */
741 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
742 /* 0 1 2 3 4 5 6 7 8 9 */
743 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
744 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
745 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
746 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
747 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
748 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
749 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
750 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
751 };
752
753 #if !UCONFIG_ONLY_HTML_CONVERSION
754 /*************** to unicode *******************/
755 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
756 /* 0 1 2 3 4 5 6 7 8 9 */
757 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
760 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
761 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
762 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
763 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
764 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
765 };
766 #endif
767
768
769 static UCNV_TableStates_2022
770 getKey_2022(char c,int32_t* key,int32_t* offset){
771 int32_t togo;
772 int32_t low = 0;
773 int32_t hi = MAX_STATES_2022;
774 int32_t oldmid=0;
775
776 togo = normalize_esq_chars_2022[(uint8_t)c];
777 if(togo == 0) {
778 /* not a valid character anywhere in an escape sequence */
779 *key = 0;
780 *offset = 0;
781 return INVALID_2022;
782 }
783 togo = (*key << 5) + togo;
784
785 while (hi != low) /*binary search*/{
786
787 int32_t mid = (hi+low) >> 1; /*Finds median*/
788
789 if (mid == oldmid)
790 break;
791
792 if (escSeqStateTable_Key_2022[mid] > togo){
793 hi = mid;
794 }
795 else if (escSeqStateTable_Key_2022[mid] < togo){
796 low = mid;
797 }
798 else /*we found it*/{
799 *key = togo;
800 *offset = mid;
801 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
802 }
803 oldmid = mid;
804
805 }
806
807 *key = 0;
808 *offset = 0;
809 return INVALID_2022;
810 }
811
812 /*runs through a state machine to determine the escape sequence - codepage correspondence
813 */
814 static void
815 changeState_2022(UConverter* _this,
816 const char** source,
817 const char* sourceLimit,
818 Variant2022 var,
819 UErrorCode* err){
820 UCNV_TableStates_2022 value;
821 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
822 uint32_t key = myData2022->key;
823 int32_t offset = 0;
824 int8_t initialToULength = _this->toULength;
825 char c;
826
827 value = VALID_NON_TERMINAL_2022;
828 while (*source < sourceLimit) {
829 c = *(*source)++;
830 _this->toUBytes[_this->toULength++]=(uint8_t)c;
831 value = getKey_2022(c,(int32_t *) &key, &offset);
832
833 switch (value){
834
835 case VALID_NON_TERMINAL_2022 :
836 /* continue with the loop */
837 break;
838
839 case VALID_TERMINAL_2022:
840 key = 0;
841 goto DONE;
842
843 case INVALID_2022:
844 goto DONE;
845
846 case VALID_MAYBE_TERMINAL_2022:
847 #ifdef U_ENABLE_GENERIC_ISO_2022
848 /* ESC ( B is ambiguous only for ISO_2022 itself */
849 if(var == ISO_2022) {
850 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
851 _this->toULength = 0;
852
853 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
854
855 /* continue with the loop */
856 value = VALID_NON_TERMINAL_2022;
857 break;
858 } else
859 #endif
860 {
861 /* not ISO_2022 itself, finish here */
862 value = VALID_TERMINAL_2022;
863 key = 0;
864 goto DONE;
865 }
866 }
867 }
868
869 DONE:
870 myData2022->key = key;
871
872 if (value == VALID_NON_TERMINAL_2022) {
873 /* indicate that the escape sequence is incomplete: key!=0 */
874 return;
875 } else if (value == INVALID_2022 ) {
876 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
877 } else /* value == VALID_TERMINAL_2022 */ {
878 switch(var){
879 #ifdef U_ENABLE_GENERIC_ISO_2022
880 case ISO_2022:
881 {
882 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
883 if(chosenConverterName == NULL) {
884 /* SS2 or SS3 */
885 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
886 _this->toUCallbackReason = UCNV_UNASSIGNED;
887 return;
888 }
889
890 _this->mode = UCNV_SI;
891 ucnv_close(myData2022->currentConverter);
892 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
893 if(U_SUCCESS(*err)) {
894 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
895 _this->mode = UCNV_SO;
896 }
897 break;
898 }
899 #endif
900 case ISO_2022_JP:
901 {
902 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
903 switch(tempState) {
904 case INVALID_STATE:
905 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
906 break;
907 case SS2_STATE:
908 if(myData2022->toU2022State.cs[2]!=0) {
909 if(myData2022->toU2022State.g<2) {
910 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
911 }
912 myData2022->toU2022State.g=2;
913 } else {
914 /* illegal to have SS2 before a matching designator */
915 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
916 }
917 break;
918 /* case SS3_STATE: not used in ISO-2022-JP-x */
919 case ISO8859_1:
920 case ISO8859_7:
921 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
922 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
923 } else {
924 /* G2 charset for SS2 */
925 myData2022->toU2022State.cs[2]=(int8_t)tempState;
926 }
927 break;
928 default:
929 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
930 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
931 } else {
932 /* G0 charset */
933 myData2022->toU2022State.cs[0]=(int8_t)tempState;
934 }
935 break;
936 }
937 }
938 break;
939 #if !UCONFIG_ONLY_HTML_CONVERSION
940 case ISO_2022_CN:
941 {
942 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
943 switch(tempState) {
944 case INVALID_STATE:
945 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
946 break;
947 case SS2_STATE:
948 if(myData2022->toU2022State.cs[2]!=0) {
949 if(myData2022->toU2022State.g<2) {
950 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
951 }
952 myData2022->toU2022State.g=2;
953 } else {
954 /* illegal to have SS2 before a matching designator */
955 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
956 }
957 break;
958 case SS3_STATE:
959 if(myData2022->toU2022State.cs[3]!=0) {
960 if(myData2022->toU2022State.g<2) {
961 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962 }
963 myData2022->toU2022State.g=3;
964 } else {
965 /* illegal to have SS3 before a matching designator */
966 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967 }
968 break;
969 case ISO_IR_165:
970 if(myData2022->version==0) {
971 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
972 break;
973 }
974 U_FALLTHROUGH;
975 case GB2312_1:
976 U_FALLTHROUGH;
977 case CNS_11643_1:
978 myData2022->toU2022State.cs[1]=(int8_t)tempState;
979 break;
980 case CNS_11643_2:
981 myData2022->toU2022State.cs[2]=(int8_t)tempState;
982 break;
983 default:
984 /* other CNS 11643 planes */
985 if(myData2022->version==0) {
986 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
987 } else {
988 myData2022->toU2022State.cs[3]=(int8_t)tempState;
989 }
990 break;
991 }
992 }
993 break;
994 case ISO_2022_KR:
995 if(offset==0x30){
996 /* nothing to be done, just accept this one escape sequence */
997 } else {
998 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
999 }
1000 break;
1001 #endif // !UCONFIG_ONLY_HTML_CONVERSION
1002
1003 default:
1004 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1005 break;
1006 }
1007 }
1008 if(U_SUCCESS(*err)) {
1009 _this->toULength = 0;
1010 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1011 if(_this->toULength>1) {
1012 /*
1013 * Ticket 5691: consistent illegal sequences:
1014 * - We include at least the first byte (ESC) in the illegal sequence.
1015 * - If any of the non-initial bytes could be the start of a character,
1016 * we stop the illegal sequence before the first one of those.
1017 * In escape sequences, all following bytes are "printable", that is,
1018 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1019 * they are valid single/lead bytes.
1020 * For simplicity, we always only report the initial ESC byte as the
1021 * illegal sequence and back out all other bytes we looked at.
1022 */
1023 /* Back out some bytes. */
1024 int8_t backOutDistance=_this->toULength-1;
1025 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1026 if(backOutDistance<=bytesFromThisBuffer) {
1027 /* same as initialToULength<=1 */
1028 *source-=backOutDistance;
1029 } else {
1030 /* Back out bytes from the previous buffer: Need to replay them. */
1031 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1032 /* same as -(initialToULength-1) */
1033 /* preToULength is negative! */
1034 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1035 *source-=bytesFromThisBuffer;
1036 }
1037 _this->toULength=1;
1038 }
1039 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1040 _this->toUCallbackReason = UCNV_UNASSIGNED;
1041 }
1042 }
1043
1044 #if !UCONFIG_ONLY_HTML_CONVERSION
1045 /*Checks the characters of the buffer against valid 2022 escape sequences
1046 *if the match we return a pointer to the initial start of the sequence otherwise
1047 *we return sourceLimit
1048 */
1049 /*for 2022 looks ahead in the stream
1050 *to determine the longest possible convertible
1051 *data stream
1052 */
1053 static inline const char*
1054 getEndOfBuffer_2022(const char** source,
1055 const char* sourceLimit,
1056 UBool /*flush*/){
1057
1058 const char* mySource = *source;
1059
1060 #ifdef U_ENABLE_GENERIC_ISO_2022
1061 if (*source >= sourceLimit)
1062 return sourceLimit;
1063
1064 do{
1065
1066 if (*mySource == ESC_2022){
1067 int8_t i;
1068 int32_t key = 0;
1069 int32_t offset;
1070 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1071
1072 /* Kludge: I could not
1073 * figure out the reason for validating an escape sequence
1074 * twice - once here and once in changeState_2022().
1075 * is it possible to have an ESC character in a ISO2022
1076 * byte stream which is valid in a code page? Is it legal?
1077 */
1078 for (i=0;
1079 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1080 i++) {
1081 value = getKey_2022(*(mySource+i), &key, &offset);
1082 }
1083 if (value > 0 || *mySource==ESC_2022)
1084 return mySource;
1085
1086 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1087 return sourceLimit;
1088 }
1089 }while (++mySource < sourceLimit);
1090
1091 return sourceLimit;
1092 #else
1093 while(mySource < sourceLimit && *mySource != ESC_2022) {
1094 ++mySource;
1095 }
1096 return mySource;
1097 #endif
1098 }
1099 #endif
1100
1101 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1102 * any future change in _MBCSFromUChar32() function should be reflected here.
1103 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1104 */
1105 static inline int32_t
1106 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1107 UChar32 c,
1108 uint32_t* value,
1109 UBool useFallback,
1110 int outputType)
1111 {
1112 const int32_t *cx;
1113 const uint16_t *table;
1114 uint32_t stage2Entry;
1115 uint32_t myValue;
1116 int32_t length;
1117 const uint8_t *p;
1118 /*
1119 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1120 * Use internal version of ucnv_open() that verifies that the new structures are available,
1121 * else U_INTERNAL_PROGRAM_ERROR.
1122 */
1123 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1124 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1125 table=sharedData->mbcs.fromUnicodeTable;
1126 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1127 /* get the bytes and the length for the output */
1128 if(outputType==MBCS_OUTPUT_2){
1129 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1130 if(myValue<=0xff) {
1131 length=1;
1132 } else {
1133 length=2;
1134 }
1135 } else /* outputType==MBCS_OUTPUT_3 */ {
1136 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1137 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1138 if(myValue<=0xff) {
1139 length=1;
1140 } else if(myValue<=0xffff) {
1141 length=2;
1142 } else {
1143 length=3;
1144 }
1145 }
1146 /* is this code point assigned, or do we use fallbacks? */
1147 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1148 /* assigned */
1149 *value=myValue;
1150 return length;
1151 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1152 /*
1153 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1154 * There is no way with this data structure for fallback output
1155 * to be a zero byte.
1156 */
1157 *value=myValue;
1158 return -length;
1159 }
1160 }
1161
1162 cx=sharedData->mbcs.extIndexes;
1163 if(cx!=NULL) {
1164 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1165 }
1166
1167 /* unassigned */
1168 return 0;
1169 }
1170
1171 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1172 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1173 * @param retval pointer to output byte
1174 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1175 */
1176 static inline int32_t
1177 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1178 UChar32 c,
1179 uint32_t* retval,
1180 UBool useFallback)
1181 {
1182 const uint16_t *table;
1183 int32_t value;
1184 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1185 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1186 return 0;
1187 }
1188 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1189 table=sharedData->mbcs.fromUnicodeTable;
1190 /* get the byte for the output */
1191 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1192 /* is this code point assigned, or do we use fallbacks? */
1193 *retval=(uint32_t)(value&0xff);
1194 if(value>=0xf00) {
1195 return 1; /* roundtrip */
1196 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1197 return -1; /* fallback taken */
1198 } else {
1199 return 0; /* no mapping */
1200 }
1201 }
1202
/*
 * Accept only 2-byte values whose bytes are both in the range A1..FE
 * (strict EUC DBCS) and move them to the ISO 2022 range 21..7E by
 * subtracting 0x80 from each byte.
 * Only the low 16 bits of the value take part in the range check.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1219
1220 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the 2022 code point
 * and returns the result if both of its bytes are in the range A1..FE.
 * Otherwise the 2022 code point is returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairOffset = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(shifted - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1236 #endif
1237
1238 #ifdef U_ENABLE_GENERIC_ISO_2022
1239
1240 /**********************************************************************************
1241 * ISO-2022 Converter
1242 *
1243 *
1244 */
1245
1246 static void
1247 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1248 UErrorCode* err){
1249 const char* mySourceLimit, *realSourceLimit;
1250 const char* sourceStart;
1251 const UChar* myTargetStart;
1252 UConverter* saveThis;
1253 UConverterDataISO2022* myData;
1254 int8_t length;
1255
1256 saveThis = args->converter;
1257 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1258
1259 realSourceLimit = args->sourceLimit;
1260 while (args->source < realSourceLimit) {
1261 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1262 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1263 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1264
1265 if(args->source < mySourceLimit) {
1266 if(myData->currentConverter==NULL) {
1267 myData->currentConverter = ucnv_open("ASCII",err);
1268 if(U_FAILURE(*err)){
1269 return;
1270 }
1271
1272 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1273 saveThis->mode = UCNV_SO;
1274 }
1275
1276 /* convert to before the ESC or until the end of the buffer */
1277 myData->isFirstBuffer=FALSE;
1278 sourceStart = args->source;
1279 myTargetStart = args->target;
1280 args->converter = myData->currentConverter;
1281 ucnv_toUnicode(args->converter,
1282 &args->target,
1283 args->targetLimit,
1284 &args->source,
1285 mySourceLimit,
1286 args->offsets,
1287 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1288 err);
1289 args->converter = saveThis;
1290
1291 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1292 /* move the overflow buffer */
1293 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1294 myData->currentConverter->UCharErrorBufferLength = 0;
1295 if(length > 0) {
1296 uprv_memcpy(saveThis->UCharErrorBuffer,
1297 myData->currentConverter->UCharErrorBuffer,
1298 length*U_SIZEOF_UCHAR);
1299 }
1300 return;
1301 }
1302
1303 /*
1304 * At least one of:
1305 * -Error while converting
1306 * -Done with entire buffer
1307 * -Need to write offsets or update the current offset
1308 * (leave that up to the code in ucnv.c)
1309 *
1310 * or else we just stopped at an ESC byte and continue with changeState_2022()
1311 */
1312 if (U_FAILURE(*err) ||
1313 (args->source == realSourceLimit) ||
1314 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1315 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1316 ) {
1317 /* copy partial or error input for truncated detection and error handling */
1318 if(U_FAILURE(*err)) {
1319 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1320 if(length > 0) {
1321 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1322 }
1323 } else {
1324 length = saveThis->toULength = myData->currentConverter->toULength;
1325 if(length > 0) {
1326 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1327 if(args->source < mySourceLimit) {
1328 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1329 }
1330 }
1331 }
1332 return;
1333 }
1334 }
1335 }
1336
1337 sourceStart = args->source;
1338 changeState_2022(args->converter,
1339 &(args->source),
1340 realSourceLimit,
1341 ISO_2022,
1342 err);
1343 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1344 /* let the ucnv.c code update its current offset */
1345 return;
1346 }
1347 }
1348 }
1349
1350 #endif
1351
1352 /*
1353 * To Unicode Callback helper function
1354 */
1355 static void
1356 toUnicodeCallback(UConverter *cnv,
1357 const uint32_t sourceChar, const uint32_t targetUniChar,
1358 UErrorCode* err){
1359 if(sourceChar>0xff){
1360 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1361 cnv->toUBytes[1] = (uint8_t)sourceChar;
1362 cnv->toULength = 2;
1363 }
1364 else{
1365 cnv->toUBytes[0] =(char) sourceChar;
1366 cnv->toULength = 1;
1367 }
1368
1369 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1370 *err = U_INVALID_CHAR_FOUND;
1371 }
1372 else{
1373 *err = U_ILLEGAL_CHAR_FOUND;
1374 }
1375 }
1376
1377 /**************************************ISO-2022-JP*************************************************/
1378
1379 /************************************** IMPORTANT **************************************************
1380 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1381 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1382 * The converter iterates over each Unicode codepoint
1383 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1384 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1385 * would do as far as possible.
1386 *
1387 * If the implementation of these macros or structure of sharedData struct change in the future, make
1388 * sure that ISO-2022 is also changed.
1389 ***************************************************************************************************
1390 */
1391
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*       source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmically implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
1418
1419 /* preference order of JP charsets */
1420 static const StateEnum jpCharsetPref[]={
1421 ASCII,
1422 JISX201,
1423 ISO8859_1,
1424 JISX208,
1425 ISO8859_7,
1426 JISX212,
1427 GB2312,
1428 KSC5601,
1429 HWKANA_7BIT
1430 };
1431
/*
 * Designation escape sequences, one per charset.
 * The entries must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    /* ASCII       <ESC>(B  */ "\x1B\x28\x42",
    /* ISO-8859-1  <ESC>.A  */ "\x1B\x2E\x41",
    /* ISO-8859-7  <ESC>.F  */ "\x1B\x2E\x46",
    /* JISX-201    <ESC>(J  */ "\x1B\x28\x4A",
    /* JISX-208    <ESC>$B  */ "\x1B\x24\x42",
    /* JISX-212    <ESC>$(D */ "\x1B\x24\x28\x44",
    /* GB2312      <ESC>$A  */ "\x1B\x24\x41",
    /* KSC5601     <ESC>$(C */ "\x1B\x24\x28\x43",
    /* HWKANA_7BIT <ESC>(I  */ "\x1B\x28\x49"
};

/* Byte lengths of the escSeqChars[] entries, in the same order. */
static const int8_t escSeqCharsLen[] ={
    /* ASCII       */ 3,
    /* ISO-8859-1  */ 3,
    /* ISO-8859-7  */ 3,
    /* JISX-201    */ 3,
    /* JISX-208    */ 3,
    /* JISX-212    */ 4,
    /* GB2312      */ 3,
    /* KSC5601     */ 4,
    /* HWKANA_7BIT */ 3
};
1459
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings;
*              break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in the next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1476
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * 5C maps to U+00A5 and 7E to U+203E; every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1490
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E themselves are unmappable because their byte values
 * are taken by U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1505
/*
 * Convert a valid Shift-JIS byte pair to the corresponding pair of
 * JIS X 0208 21..7E bytes.
 * Return 0 if the byte pair is beyond the range corresponding to JIS X 0208.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;
    uint32_t lead;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* lead bytes up to 9F and lead bytes E0..EF use different offsets */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    /* each Shift-JIS lead byte covers two JIS rows */
    lead <<= 1;

    if(trail > 0x9e) {
        /* second row of the pair; trail <= 0xfc */
        return lead | (trail - 0x7e);
    }

    /* first row of the pair; the trail byte offset changes after 7E */
    lead -= 0x100;
    return lead | (trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}
1541
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1) == 0) {
        /* even row */
        lead = c1;
        trail = ((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) ? (uint8_t)(c2 + 0x7e) : 0 /* invalid */;
    } else {
        /* odd row: shares its lead byte with the following even row */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    }

    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1578
1579 /*
1580 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1581 * Katakana.
1582 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1583 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1584 * These were the only fallbacks in ICU's jisx-208.ucm file.
1585 */
1586 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1587 0x2123, /* U+FF61 */
1588 0x2156,
1589 0x2157,
1590 0x2122,
1591 0x2126,
1592 0x2572,
1593 0x2521,
1594 0x2523,
1595 0x2525,
1596 0x2527,
1597 0x2529,
1598 0x2563,
1599 0x2565,
1600 0x2567,
1601 0x2543,
1602 0x213C, /* U+FF70 */
1603 0x2522,
1604 0x2524,
1605 0x2526,
1606 0x2528,
1607 0x252A,
1608 0x252B,
1609 0x252D,
1610 0x252F,
1611 0x2531,
1612 0x2533,
1613 0x2535,
1614 0x2537,
1615 0x2539,
1616 0x253B,
1617 0x253D,
1618 0x253F, /* U+FF80 */
1619 0x2541,
1620 0x2544,
1621 0x2546,
1622 0x2548,
1623 0x254A,
1624 0x254B,
1625 0x254C,
1626 0x254D,
1627 0x254E,
1628 0x254F,
1629 0x2552,
1630 0x2555,
1631 0x2558,
1632 0x255B,
1633 0x255E,
1634 0x255F, /* U+FF90 */
1635 0x2560,
1636 0x2561,
1637 0x2562,
1638 0x2564,
1639 0x2566,
1640 0x2568,
1641 0x2569,
1642 0x256A,
1643 0x256B,
1644 0x256C,
1645 0x256D,
1646 0x256F,
1647 0x2573,
1648 0x212B,
1649 0x212C /* U+FF9F */
1650 };
1651
1652 static void
1653 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1654 UConverter *cnv = args->converter;
1655 UConverterDataISO2022 *converterData;
1656 ISO2022State *pFromU2022State;
1657 uint8_t *target = (uint8_t *) args->target;
1658 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1659 const UChar* source = args->source;
1660 const UChar* sourceLimit = args->sourceLimit;
1661 int32_t* offsets = args->offsets;
1662 UChar32 sourceChar;
1663 char buffer[8];
1664 int32_t len, outLen;
1665 int8_t choices[10];
1666 int32_t choiceCount;
1667 uint32_t targetValue = 0;
1668 UBool useFallback;
1669
1670 int32_t i;
1671 int8_t cs, g;
1672
1673 /* set up the state */
1674 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1675 pFromU2022State = &converterData->fromU2022State;
1676
1677 choiceCount = 0;
1678
1679 /* check if the last codepoint of previous buffer was a lead surrogate*/
1680 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1681 goto getTrail;
1682 }
1683
1684 while(source < sourceLimit) {
1685 if(target < targetLimit) {
1686
1687 sourceChar = *(source++);
1688 /*check if the char is a First surrogate*/
1689 if(U16_IS_SURROGATE(sourceChar)) {
1690 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1691 getTrail:
1692 /*look ahead to find the trail surrogate*/
1693 if(source < sourceLimit) {
1694 /* test the following code unit */
1695 UChar trail=(UChar) *source;
1696 if(U16_IS_TRAIL(trail)) {
1697 source++;
1698 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1699 cnv->fromUChar32=0x00;
1700 /* convert this supplementary code point */
1701 /* exit this condition tree */
1702 } else {
1703 /* this is an unmatched lead code unit (1st surrogate) */
1704 /* callback(illegal) */
1705 *err=U_ILLEGAL_CHAR_FOUND;
1706 cnv->fromUChar32=sourceChar;
1707 break;
1708 }
1709 } else {
1710 /* no more input */
1711 cnv->fromUChar32=sourceChar;
1712 break;
1713 }
1714 } else {
1715 /* this is an unmatched trail code unit (2nd surrogate) */
1716 /* callback(illegal) */
1717 *err=U_ILLEGAL_CHAR_FOUND;
1718 cnv->fromUChar32=sourceChar;
1719 break;
1720 }
1721 }
1722
1723 /* do not convert SO/SI/ESC */
1724 if(IS_2022_CONTROL(sourceChar)) {
1725 /* callback(illegal) */
1726 *err=U_ILLEGAL_CHAR_FOUND;
1727 cnv->fromUChar32=sourceChar;
1728 break;
1729 }
1730
1731 /* do the conversion */
1732
1733 if(choiceCount == 0) {
1734 uint16_t csm;
1735
1736 /*
1737 * The csm variable keeps track of which charsets are allowed
1738 * and not used yet while building the choices[].
1739 */
1740 csm = jpCharsetMasks[converterData->version];
1741 choiceCount = 0;
1742
1743 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1744 if(converterData->version == 3 || converterData->version == 4) {
1745 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1746 }
1747 /* Do not try single-byte half-width Katakana for other versions. */
1748 csm &= ~CSM(HWKANA_7BIT);
1749
1750 /* try the current G0 charset */
1751 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1752 csm &= ~CSM(cs);
1753
1754 /* try the current G2 charset */
1755 if((cs = pFromU2022State->cs[2]) != 0) {
1756 choices[choiceCount++] = cs;
1757 csm &= ~CSM(cs);
1758 }
1759
1760 /* try all the other possible charsets */
1761 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1762 cs = (int8_t)jpCharsetPref[i];
1763 if(CSM(cs) & csm) {
1764 choices[choiceCount++] = cs;
1765 csm &= ~CSM(cs);
1766 }
1767 }
1768 }
1769
1770 cs = g = 0;
1771 /*
1772 * len==0: no mapping found yet
1773 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1774 * len>0: found a roundtrip result, done
1775 */
1776 len = 0;
1777 /*
1778 * We will turn off useFallback after finding a fallback,
1779 * but we still get fallbacks from PUA code points as usual.
1780 * Therefore, we will also need to check that we don't overwrite
1781 * an early fallback with a later one.
1782 */
1783 useFallback = cnv->useFallback;
1784
1785 for(i = 0; i < choiceCount && len <= 0; ++i) {
1786 uint32_t value;
1787 int32_t len2;
1788 int8_t cs0 = choices[i];
1789 switch(cs0) {
1790 case ASCII:
1791 if(sourceChar <= 0x7f) {
1792 targetValue = (uint32_t)sourceChar;
1793 len = 1;
1794 cs = cs0;
1795 g = 0;
1796 }
1797 break;
1798 case ISO8859_1:
1799 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1800 targetValue = (uint32_t)sourceChar - 0x80;
1801 len = 1;
1802 cs = cs0;
1803 g = 2;
1804 }
1805 break;
1806 case HWKANA_7BIT:
1807 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1808 if(converterData->version==3) {
1809 /* JIS7: use G1 (SO) */
1810 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1811 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1812 len = 1;
1813 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1814 g = 1;
1815 } else if(converterData->version==4) {
1816 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1817 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1818 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1819 len = 1;
1820
1821 cs = pFromU2022State->cs[0];
1822 if(IS_JP_DBCS(cs)) {
1823 /* switch from a DBCS charset to JISX201 */
1824 cs = (int8_t)JISX201;
1825 }
1826 /* else stay in the current G0 charset */
1827 g = 0;
1828 }
1829 /* else do not use HWKANA_7BIT with other versions */
1830 }
1831 break;
1832 case JISX201:
1833 /* G0 SBCS */
1834 value = jisx201FromU(sourceChar);
1835 if(value <= 0x7f) {
1836 targetValue = value;
1837 len = 1;
1838 cs = cs0;
1839 g = 0;
1840 useFallback = FALSE;
1841 }
1842 break;
1843 case JISX208:
1844 /* G0 DBCS from Shift-JIS table */
1845 len2 = MBCS_FROM_UCHAR32_ISO2022(
1846 converterData->myConverterArray[cs0],
1847 sourceChar, &value,
1848 useFallback, MBCS_OUTPUT_2);
1849 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1850 value = _2022FromSJIS(value);
1851 if(value != 0) {
1852 targetValue = value;
1853 len = len2;
1854 cs = cs0;
1855 g = 0;
1856 useFallback = FALSE;
1857 }
1858 } else if(len == 0 && useFallback &&
1859 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1860 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1861 len = -2;
1862 cs = cs0;
1863 g = 0;
1864 useFallback = FALSE;
1865 }
1866 break;
1867 case ISO8859_7:
1868 /* G0 SBCS forced to 7-bit output */
1869 len2 = MBCS_SINGLE_FROM_UCHAR32(
1870 converterData->myConverterArray[cs0],
1871 sourceChar, &value,
1872 useFallback);
1873 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1874 targetValue = value - 0x80;
1875 len = len2;
1876 cs = cs0;
1877 g = 2;
1878 useFallback = FALSE;
1879 }
1880 break;
1881 default:
1882 /* G0 DBCS */
1883 len2 = MBCS_FROM_UCHAR32_ISO2022(
1884 converterData->myConverterArray[cs0],
1885 sourceChar, &value,
1886 useFallback, MBCS_OUTPUT_2);
1887 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1888 if(cs0 == KSC5601) {
1889 /*
1890 * Check for valid bytes for the encoding scheme.
1891 * This is necessary because the sub-converter (windows-949)
1892 * has a broader encoding scheme than is valid for 2022.
1893 */
1894 value = _2022FromGR94DBCS(value);
1895 if(value == 0) {
1896 break;
1897 }
1898 }
1899 targetValue = value;
1900 len = len2;
1901 cs = cs0;
1902 g = 0;
1903 useFallback = FALSE;
1904 }
1905 break;
1906 }
1907 }
1908
1909 if(len != 0) {
1910 if(len < 0) {
1911 len = -len; /* fallback */
1912 }
1913 outLen = 0; /* count output bytes */
1914
1915 /* write SI if necessary (only for JIS7) */
1916 if(pFromU2022State->g == 1 && g == 0) {
1917 buffer[outLen++] = UCNV_SI;
1918 pFromU2022State->g = 0;
1919 }
1920
1921 /* write the designation sequence if necessary */
1922 if(cs != pFromU2022State->cs[g]) {
1923 int32_t escLen = escSeqCharsLen[cs];
1924 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1925 outLen += escLen;
1926 pFromU2022State->cs[g] = cs;
1927
1928 /* invalidate the choices[] */
1929 choiceCount = 0;
1930 }
1931
1932 /* write the shift sequence if necessary */
1933 if(g != pFromU2022State->g) {
1934 switch(g) {
1935 /* case 0 handled before writing escapes */
1936 case 1:
1937 buffer[outLen++] = UCNV_SO;
1938 pFromU2022State->g = 1;
1939 break;
1940 default: /* case 2 */
1941 buffer[outLen++] = 0x1b;
1942 buffer[outLen++] = 0x4e;
1943 break;
1944 /* no case 3: no SS3 in ISO-2022-JP-x */
1945 }
1946 }
1947
1948 /* write the output bytes */
1949 if(len == 1) {
1950 buffer[outLen++] = (char)targetValue;
1951 } else /* len == 2 */ {
1952 buffer[outLen++] = (char)(targetValue >> 8);
1953 buffer[outLen++] = (char)targetValue;
1954 }
1955 } else {
1956 /*
1957 * if we cannot find the character after checking all codepages
1958 * then this is an error
1959 */
1960 *err = U_INVALID_CHAR_FOUND;
1961 cnv->fromUChar32=sourceChar;
1962 break;
1963 }
1964
1965 if(sourceChar == CR || sourceChar == LF) {
1966 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1967 pFromU2022State->cs[2] = 0;
1968 choiceCount = 0;
1969 }
1970
1971 /* output outLen>0 bytes in buffer[] */
1972 if(outLen == 1) {
1973 *target++ = buffer[0];
1974 if(offsets) {
1975 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1976 }
1977 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1978 *target++ = buffer[0];
1979 *target++ = buffer[1];
1980 if(offsets) {
1981 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1982 *offsets++ = sourceIndex;
1983 *offsets++ = sourceIndex;
1984 }
1985 } else {
1986 fromUWriteUInt8(
1987 cnv,
1988 buffer, outLen,
1989 &target, (const char *)targetLimit,
1990 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1991 err);
1992 if(U_FAILURE(*err)) {
1993 break;
1994 }
1995 }
1996 } /* end if(myTargetIndex<myTargetLength) */
1997 else{
1998 *err =U_BUFFER_OVERFLOW_ERROR;
1999 break;
2000 }
2001
2002 }/* end while(mySourceIndex<mySourceLength) */
2003
2004 /*
2005 * the end of the input stream and detection of truncated input
2006 * are handled by the framework, but for ISO-2022-JP conversion
2007 * we need to be in ASCII mode at the very end
2008 *
2009 * conditions:
2010 * successful
2011 * in SO mode or not in ASCII mode
2012 * end of input and no truncated input
2013 */
2014 if( U_SUCCESS(*err) &&
2015 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2016 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2017 ) {
2018 int32_t sourceIndex;
2019
2020 outLen = 0;
2021
2022 if(pFromU2022State->g != 0) {
2023 buffer[outLen++] = UCNV_SI;
2024 pFromU2022State->g = 0;
2025 }
2026
2027 if(pFromU2022State->cs[0] != ASCII) {
2028 int32_t escLen = escSeqCharsLen[ASCII];
2029 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2030 outLen += escLen;
2031 pFromU2022State->cs[0] = (int8_t)ASCII;
2032 }
2033
2034 /* get the source index of the last input character */
2035 /*
2036 * TODO this would be simpler and more reliable if we used a pair
2037 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2038 * so that we could simply use the prevSourceIndex here;
2039 * this code gives an incorrect result for the rare case of an unmatched
2040 * trail surrogate that is alone in the last buffer of the text stream
2041 */
2042 sourceIndex=(int32_t)(source-args->source);
2043 if(sourceIndex>0) {
2044 --sourceIndex;
2045 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2046 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2047 ) {
2048 --sourceIndex;
2049 }
2050 } else {
2051 sourceIndex=-1;
2052 }
2053
2054 fromUWriteUInt8(
2055 cnv,
2056 buffer, outLen,
2057 &target, (const char *)targetLimit,
2058 &offsets, sourceIndex,
2059 err);
2060 }
2061
2062 /*save the state and return */
2063 args->source = source;
2064 args->target = (char*)target;
2065 }
2066
2067 /*************** to unicode *******************/
2068
2069 static void
2070 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2071 UErrorCode* err){
2072 char tempBuf[2];
2073 const char *mySource = (char *) args->source;
2074 UChar *myTarget = args->target;
2075 const char *mySourceLimit = args->sourceLimit;
2076 uint32_t targetUniChar = 0x0000;
2077 uint32_t mySourceChar = 0x0000;
2078 uint32_t tmpSourceChar = 0x0000;
2079 UConverterDataISO2022* myData;
2080 ISO2022State *pToU2022State;
2081 StateEnum cs;
2082
2083 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2084 pToU2022State = &myData->toU2022State;
2085
2086 if(myData->key != 0) {
2087 /* continue with a partial escape sequence */
2088 goto escape;
2089 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2090 /* continue with a partial double-byte character */
2091 mySourceChar = args->converter->toUBytes[0];
2092 args->converter->toULength = 0;
2093 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2094 targetUniChar = missingCharMarker;
2095 goto getTrailByte;
2096 }
2097
2098 while(mySource < mySourceLimit){
2099
2100 targetUniChar =missingCharMarker;
2101
2102 if(myTarget < args->targetLimit){
2103
2104 mySourceChar= (unsigned char) *mySource++;
2105
2106 switch(mySourceChar) {
2107 case UCNV_SI:
2108 if(myData->version==3) {
2109 pToU2022State->g=0;
2110 continue;
2111 } else {
2112 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2113 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2114 break;
2115 }
2116
2117 case UCNV_SO:
2118 if(myData->version==3) {
2119 /* JIS7: switch to G1 half-width Katakana */
2120 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2121 pToU2022State->g=1;
2122 continue;
2123 } else {
2124 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2125 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2126 break;
2127 }
2128
2129 case ESC_2022:
2130 mySource--;
2131 escape:
2132 {
2133 const char * mySourceBefore = mySource;
2134 int8_t toULengthBefore = args->converter->toULength;
2135
2136 changeState_2022(args->converter,&(mySource),
2137 mySourceLimit, ISO_2022_JP,err);
2138
2139 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2140 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2141 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2142 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2143 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2144 }
2145 }
2146
2147 /* invalid or illegal escape sequence */
2148 if(U_FAILURE(*err)){
2149 args->target = myTarget;
2150 args->source = mySource;
2151 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2152 return;
2153 }
2154 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2155 if(myData->key==0) {
2156 myData->isEmptySegment = TRUE;
2157 }
2158 continue;
2159
2160 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2161
2162 case CR:
2163 case LF:
2164 /* automatically reset to single-byte mode */
2165 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2166 pToU2022State->cs[0] = (int8_t)ASCII;
2167 }
2168 pToU2022State->cs[2] = 0;
2169 pToU2022State->g = 0;
2170 U_FALLTHROUGH;
2171 default:
2172 /* convert one or two bytes */
2173 myData->isEmptySegment = FALSE;
2174 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2175 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2176 !IS_JP_DBCS(cs)
2177 ) {
2178 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2179 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2180
2181 /* return from a single-shift state to the previous one */
2182 if(pToU2022State->g >= 2) {
2183 pToU2022State->g=pToU2022State->prevG;
2184 }
2185 } else switch(cs) {
2186 case ASCII:
2187 if(mySourceChar <= 0x7f) {
2188 targetUniChar = mySourceChar;
2189 }
2190 break;
2191 case ISO8859_1:
2192 if(mySourceChar <= 0x7f) {
2193 targetUniChar = mySourceChar + 0x80;
2194 }
2195 /* return from a single-shift state to the previous one */
2196 pToU2022State->g=pToU2022State->prevG;
2197 break;
2198 case ISO8859_7:
2199 if(mySourceChar <= 0x7f) {
2200 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2201 targetUniChar =
2202 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2203 myData->myConverterArray[cs],
2204 mySourceChar + 0x80);
2205 }
2206 /* return from a single-shift state to the previous one */
2207 pToU2022State->g=pToU2022State->prevG;
2208 break;
2209 case JISX201:
2210 if(mySourceChar <= 0x7f) {
2211 targetUniChar = jisx201ToU(mySourceChar);
2212 }
2213 break;
2214 case HWKANA_7BIT:
2215 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2216 /* 7-bit halfwidth Katakana */
2217 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2218 }
2219 break;
2220 default:
2221 /* G0 DBCS */
2222 if(mySource < mySourceLimit) {
2223 int leadIsOk, trailIsOk;
2224 uint8_t trailByte;
2225 getTrailByte:
2226 trailByte = (uint8_t)*mySource;
2227 /*
2228 * Ticket 5691: consistent illegal sequences:
2229 * - We include at least the first byte in the illegal sequence.
2230 * - If any of the non-initial bytes could be the start of a character,
2231 * we stop the illegal sequence before the first one of those.
2232 *
2233 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2234 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2235 * Otherwise we convert or report the pair of bytes.
2236 */
2237 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2238 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2239 if (leadIsOk && trailIsOk) {
2240 ++mySource;
2241 tmpSourceChar = (mySourceChar << 8) | trailByte;
2242 if(cs == JISX208) {
2243 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2244 mySourceChar = tmpSourceChar;
2245 } else {
2246 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2247 mySourceChar = tmpSourceChar;
2248 if (cs == KSC5601) {
2249 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2250 }
2251 tempBuf[0] = (char)(tmpSourceChar >> 8);
2252 tempBuf[1] = (char)(tmpSourceChar);
2253 }
2254 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2255 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2256 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2257 ++mySource;
2258 /* add another bit so that the code below writes 2 bytes in case of error */
2259 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2260 }
2261 } else {
2262 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2263 args->converter->toULength = 1;
2264 goto endloop;
2265 }
2266 } /* End of inner switch */
2267 break;
2268 } /* End of outer switch */
2269 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2270 if(args->offsets){
2271 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2272 }
2273 *(myTarget++)=(UChar)targetUniChar;
2274 }
2275 else if(targetUniChar > missingCharMarker){
2276 /* disassemble the surrogate pair and write to output*/
2277 targetUniChar-=0x0010000;
2278 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2279 if(args->offsets){
2280 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2281 }
2282 ++myTarget;
2283 if(myTarget< args->targetLimit){
2284 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2285 if(args->offsets){
2286 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2287 }
2288 ++myTarget;
2289 }else{
2290 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2291 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2292 }
2293
2294 }
2295 else{
2296 /* Call the callback function*/
2297 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2298 break;
2299 }
2300 }
2301 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2302 *err =U_BUFFER_OVERFLOW_ERROR;
2303 break;
2304 }
2305 }
2306 endloop:
2307 args->target = myTarget;
2308 args->source = mySource;
2309 }
2310
2311
2312 #if !UCONFIG_ONLY_HTML_CONVERSION
2313 /***************************************************************
2314 * Rules for ISO-2022-KR encoding
2315 * i) The KSC5601 designator sequence should appear only once in a file,
2316 * at the beginning of a line before any KSC5601 characters. This usually
2317 * means that it appears by itself on the first line of the file
2318 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2319 * and SI to shift into single byte mode
2320 */
2321 static void
2322 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2323
2324 UConverter* saveConv = args->converter;
2325 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2326 args->converter=myConverterData->currentConverter;
2327
2328 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2329 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2330 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2331
2332 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2333 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2334 uprv_memcpy(
2335 saveConv->charErrorBuffer,
2336 myConverterData->currentConverter->charErrorBuffer,
2337 myConverterData->currentConverter->charErrorBufferLength);
2338 }
2339 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2340 myConverterData->currentConverter->charErrorBufferLength = 0;
2341 }
2342 args->converter=saveConv;
2343 }
2344
2345 static void
2346 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2347
2348 const UChar *source = args->source;
2349 const UChar *sourceLimit = args->sourceLimit;
2350 unsigned char *target = (unsigned char *) args->target;
2351 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2352 int32_t* offsets = args->offsets;
2353 uint32_t targetByteUnit = 0x0000;
2354 UChar32 sourceChar = 0x0000;
2355 UBool isTargetByteDBCS;
2356 UBool oldIsTargetByteDBCS;
2357 UConverterDataISO2022 *converterData;
2358 UConverterSharedData* sharedData;
2359 UBool useFallback;
2360 int32_t length =0;
2361
2362 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2363 /* if the version is 1 then the user is requesting
2364 * conversion with ibm-25546 pass the arguments to
2365 * MBCS converter and return
2366 */
2367 if(converterData->version==1){
2368 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2369 return;
2370 }
2371
2372 /* initialize data */
2373 sharedData = converterData->currentConverter->sharedData;
2374 useFallback = args->converter->useFallback;
2375 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2376 oldIsTargetByteDBCS = isTargetByteDBCS;
2377
2378 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2379 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2380 goto getTrail;
2381 }
2382 while(source < sourceLimit){
2383
2384 targetByteUnit = missingCharMarker;
2385
2386 if(target < (unsigned char*) args->targetLimit){
2387 sourceChar = *source++;
2388
2389 /* do not convert SO/SI/ESC */
2390 if(IS_2022_CONTROL(sourceChar)) {
2391 /* callback(illegal) */
2392 *err=U_ILLEGAL_CHAR_FOUND;
2393 args->converter->fromUChar32=sourceChar;
2394 break;
2395 }
2396
2397 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2398 if(length < 0) {
2399 length = -length; /* fallback */
2400 }
2401 /* only DBCS or SBCS characters are expected*/
2402 /* DB characters with high bit set to 1 are expected */
2403 if( length > 2 || length==0 ||
2404 (length == 1 && targetByteUnit > 0x7f) ||
2405 (length == 2 &&
2406 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2407 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2408 ) {
2409 targetByteUnit=missingCharMarker;
2410 }
2411 if (targetByteUnit != missingCharMarker){
2412
2413 oldIsTargetByteDBCS = isTargetByteDBCS;
2414 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2415 /* append the shift sequence */
2416 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2417
2418 if (isTargetByteDBCS)
2419 *target++ = UCNV_SO;
2420 else
2421 *target++ = UCNV_SI;
2422 if(offsets)
2423 *(offsets++) = (int32_t)(source - args->source-1);
2424 }
2425 /* write the targetUniChar to target */
2426 if(targetByteUnit <= 0x00FF){
2427 if( target < targetLimit){
2428 *(target++) = (unsigned char) targetByteUnit;
2429 if(offsets){
2430 *(offsets++) = (int32_t)(source - args->source-1);
2431 }
2432
2433 }else{
2434 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2435 *err = U_BUFFER_OVERFLOW_ERROR;
2436 }
2437 }else{
2438 if(target < targetLimit){
2439 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2440 if(offsets){
2441 *(offsets++) = (int32_t)(source - args->source-1);
2442 }
2443 if(target < targetLimit){
2444 *(target++) =(unsigned char) (targetByteUnit -0x80);
2445 if(offsets){
2446 *(offsets++) = (int32_t)(source - args->source-1);
2447 }
2448 }else{
2449 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2450 *err = U_BUFFER_OVERFLOW_ERROR;
2451 }
2452 }else{
2453 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2454 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2455 *err = U_BUFFER_OVERFLOW_ERROR;
2456 }
2457 }
2458
2459 }
2460 else{
2461 /* oops.. the code point is unassingned
2462 * set the error and reason
2463 */
2464
2465 /*check if the char is a First surrogate*/
2466 if(U16_IS_SURROGATE(sourceChar)) {
2467 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2468 getTrail:
2469 /*look ahead to find the trail surrogate*/
2470 if(source < sourceLimit) {
2471 /* test the following code unit */
2472 UChar trail=(UChar) *source;
2473 if(U16_IS_TRAIL(trail)) {
2474 source++;
2475 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2476 *err = U_INVALID_CHAR_FOUND;
2477 /* convert this surrogate code point */
2478 /* exit this condition tree */
2479 } else {
2480 /* this is an unmatched lead code unit (1st surrogate) */
2481 /* callback(illegal) */
2482 *err=U_ILLEGAL_CHAR_FOUND;
2483 }
2484 } else {
2485 /* no more input */
2486 *err = U_ZERO_ERROR;
2487 }
2488 } else {
2489 /* this is an unmatched trail code unit (2nd surrogate) */
2490 /* callback(illegal) */
2491 *err=U_ILLEGAL_CHAR_FOUND;
2492 }
2493 } else {
2494 /* callback(unassigned) for a BMP code point */
2495 *err = U_INVALID_CHAR_FOUND;
2496 }
2497
2498 args->converter->fromUChar32=sourceChar;
2499 break;
2500 }
2501 } /* end if(myTargetIndex<myTargetLength) */
2502 else{
2503 *err =U_BUFFER_OVERFLOW_ERROR;
2504 break;
2505 }
2506
2507 }/* end while(mySourceIndex<mySourceLength) */
2508
2509 /*
2510 * the end of the input stream and detection of truncated input
2511 * are handled by the framework, but for ISO-2022-KR conversion
2512 * we need to be in ASCII mode at the very end
2513 *
2514 * conditions:
2515 * successful
2516 * not in ASCII mode
2517 * end of input and no truncated input
2518 */
2519 if( U_SUCCESS(*err) &&
2520 isTargetByteDBCS &&
2521 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2522 ) {
2523 int32_t sourceIndex;
2524
2525 /* we are switching to ASCII */
2526 isTargetByteDBCS=FALSE;
2527
2528 /* get the source index of the last input character */
2529 /*
2530 * TODO this would be simpler and more reliable if we used a pair
2531 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2532 * so that we could simply use the prevSourceIndex here;
2533 * this code gives an incorrect result for the rare case of an unmatched
2534 * trail surrogate that is alone in the last buffer of the text stream
2535 */
2536 sourceIndex=(int32_t)(source-args->source);
2537 if(sourceIndex>0) {
2538 --sourceIndex;
2539 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2540 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2541 ) {
2542 --sourceIndex;
2543 }
2544 } else {
2545 sourceIndex=-1;
2546 }
2547
2548 fromUWriteUInt8(
2549 args->converter,
2550 SHIFT_IN_STR, 1,
2551 &target, (const char *)targetLimit,
2552 &offsets, sourceIndex,
2553 err);
2554 }
2555
2556 /*save the state and return */
2557 args->source = source;
2558 args->target = (char*)target;
2559 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2560 }
2561
2562 /************************ To Unicode ***************************************/
2563
2564 static void
2565 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2566 UErrorCode* err){
2567 char const* sourceStart;
2568 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2569
2570 UConverterToUnicodeArgs subArgs;
2571 int32_t minArgsSize;
2572
2573 /* set up the subconverter arguments */
2574 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2575 minArgsSize = args->size;
2576 } else {
2577 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2578 }
2579
2580 uprv_memcpy(&subArgs, args, minArgsSize);
2581 subArgs.size = (uint16_t)minArgsSize;
2582 subArgs.converter = myData->currentConverter;
2583
2584 /* remember the original start of the input for offsets */
2585 sourceStart = args->source;
2586
2587 if(myData->key != 0) {
2588 /* continue with a partial escape sequence */
2589 goto escape;
2590 }
2591
2592 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2593 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2594 subArgs.source = args->source;
2595 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2596 if(subArgs.source != subArgs.sourceLimit) {
2597 /*
2598 * get the current partial byte sequence
2599 *
2600 * it needs to be moved between the public and the subconverter
2601 * so that the conversion framework, which only sees the public
2602 * converter, can handle truncated and illegal input etc.
2603 */
2604 if(args->converter->toULength > 0) {
2605 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2606 }
2607 subArgs.converter->toULength = args->converter->toULength;
2608
2609 /*
2610 * Convert up to the end of the input, or to before the next escape character.
2611 * Does not handle conversion extensions because the preToU[] state etc.
2612 * is not copied.
2613 */
2614 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2615
2616 if(args->offsets != NULL && sourceStart != args->source) {
2617 /* update offsets to base them on the actual start of the input */
2618 int32_t *offsets = args->offsets;
2619 UChar *target = args->target;
2620 int32_t delta = (int32_t)(args->source - sourceStart);
2621 while(target < subArgs.target) {
2622 if(*offsets >= 0) {
2623 *offsets += delta;
2624 }
2625 ++offsets;
2626 ++target;
2627 }
2628 }
2629 args->source = subArgs.source;
2630 args->target = subArgs.target;
2631 args->offsets = subArgs.offsets;
2632
2633 /* copy input/error/overflow buffers */
2634 if(subArgs.converter->toULength > 0) {
2635 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2636 }
2637 args->converter->toULength = subArgs.converter->toULength;
2638
2639 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2640 if(subArgs.converter->UCharErrorBufferLength > 0) {
2641 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2642 subArgs.converter->UCharErrorBufferLength);
2643 }
2644 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2645 subArgs.converter->UCharErrorBufferLength = 0;
2646 }
2647 }
2648
2649 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2650 return;
2651 }
2652
2653 escape:
2654 changeState_2022(args->converter,
2655 &(args->source),
2656 args->sourceLimit,
2657 ISO_2022_KR,
2658 err);
2659 }
2660 }
2661
2662 static void
2663 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2664 UErrorCode* err){
2665 char tempBuf[2];
2666 const char *mySource = ( char *) args->source;
2667 UChar *myTarget = args->target;
2668 const char *mySourceLimit = args->sourceLimit;
2669 UChar32 targetUniChar = 0x0000;
2670 UChar mySourceChar = 0x0000;
2671 UConverterDataISO2022* myData;
2672 UConverterSharedData* sharedData ;
2673 UBool useFallback;
2674
2675 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2676 if(myData->version==1){
2677 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2678 return;
2679 }
2680
2681 /* initialize state */
2682 sharedData = myData->currentConverter->sharedData;
2683 useFallback = args->converter->useFallback;
2684
2685 if(myData->key != 0) {
2686 /* continue with a partial escape sequence */
2687 goto escape;
2688 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2689 /* continue with a partial double-byte character */
2690 mySourceChar = args->converter->toUBytes[0];
2691 args->converter->toULength = 0;
2692 goto getTrailByte;
2693 }
2694
2695 while(mySource< mySourceLimit){
2696
2697 if(myTarget < args->targetLimit){
2698
2699 mySourceChar= (unsigned char) *mySource++;
2700
2701 if(mySourceChar==UCNV_SI){
2702 myData->toU2022State.g = 0;
2703 if (myData->isEmptySegment) {
2704 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2705 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2706 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2707 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2708 args->converter->toULength = 1;
2709 args->target = myTarget;
2710 args->source = mySource;
2711 return;
2712 }
2713 /*consume the source */
2714 continue;
2715 }else if(mySourceChar==UCNV_SO){
2716 myData->toU2022State.g = 1;
2717 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2718 /*consume the source */
2719 continue;
2720 }else if(mySourceChar==ESC_2022){
2721 mySource--;
2722 escape:
2723 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2724 changeState_2022(args->converter,&(mySource),
2725 mySourceLimit, ISO_2022_KR, err);
2726 if(U_FAILURE(*err)){
2727 args->target = myTarget;
2728 args->source = mySource;
2729 return;
2730 }
2731 continue;
2732 }
2733
2734 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2735 if(myData->toU2022State.g == 1) {
2736 if(mySource < mySourceLimit) {
2737 int leadIsOk, trailIsOk;
2738 uint8_t trailByte;
2739 getTrailByte:
2740 targetUniChar = missingCharMarker;
2741 trailByte = (uint8_t)*mySource;
2742 /*
2743 * Ticket 5691: consistent illegal sequences:
2744 * - We include at least the first byte in the illegal sequence.
2745 * - If any of the non-initial bytes could be the start of a character,
2746 * we stop the illegal sequence before the first one of those.
2747 *
2748 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2749 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2750 * Otherwise we convert or report the pair of bytes.
2751 */
2752 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2753 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2754 if (leadIsOk && trailIsOk) {
2755 ++mySource;
2756 tempBuf[0] = (char)(mySourceChar + 0x80);
2757 tempBuf[1] = (char)(trailByte + 0x80);
2758 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2759 mySourceChar = (mySourceChar << 8) | trailByte;
2760 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2761 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2762 ++mySource;
2763 /* add another bit so that the code below writes 2 bytes in case of error */
2764 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2765 }
2766 } else {
2767 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2768 args->converter->toULength = 1;
2769 break;
2770 }
2771 }
2772 else if(mySourceChar <= 0x7f) {
2773 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2774 } else {
2775 targetUniChar = 0xffff;
2776 }
2777 if(targetUniChar < 0xfffe){
2778 if(args->offsets) {
2779 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2780 }
2781 *(myTarget++)=(UChar)targetUniChar;
2782 }
2783 else {
2784 /* Call the callback function*/
2785 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2786 break;
2787 }
2788 }
2789 else{
2790 *err =U_BUFFER_OVERFLOW_ERROR;
2791 break;
2792 }
2793 }
2794 args->target = myTarget;
2795 args->source = mySource;
2796 }
2797
2798 /*************************** END ISO2022-KR *********************************/
2799
2800 /*************************** ISO-2022-CN *********************************
2801 *
2802 * Rules for ISO-2022-CN Encoding:
2803 * i) The designator sequence must appear once on a line before any instance
2804 * of character set it designates.
2805 * ii) If two lines contain characters from the same character set, both lines
2806 * must include the designator sequence.
2807 * iii) Once the designator sequence is known, a shifting sequence has to be found
2808 * to invoke the shifting
2809 * iv) All lines start in ASCII and end in ASCII.
2810 * v) Four shifting sequences are employed for this purpose:
2811 *
2812 * Sequence ASCII Eq Charsets
2813 * ---------- ------- ---------
2814 * SI <SI> US-ASCII
2815 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2816 * SS2 <ESC>N CNS-11643-1992 Plane 2
2817 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2818 *
2819 * vi)
2820 * SOdesignator : ESC "$" ")" finalchar_for_SO
2821 * SS2designator : ESC "$" "*" finalchar_for_SS2
2822 * SS3designator : ESC "$" "+" finalchar_for_SS3
2823 *
2824 * ESC $ ) A Indicates the bytes following SO are Chinese
2825 * characters as defined in GB 2312-80, until
2826 * another SOdesignation appears
2827 *
2828 *
2829 * ESC $ ) E Indicates the bytes following SO are as defined
2830 * in ISO-IR-165 (for details, see section 2.1),
2831 * until another SOdesignation appears
2832 *
2833 * ESC $ ) G Indicates the bytes following SO are as defined
2834 * in CNS 11643-plane-1, until another
2835 * SOdesignation appears
2836 *
2837 * ESC $ * H Indicates the two bytes immediately following
2838 * SS2 is a Chinese character as defined in CNS
2839 * 11643-plane-2, until another SS2designation
2840 * appears
2841 * (Meaning <ESC>N must precede every 2-byte
2842 * sequence.)
2843 *
2844 * ESC $ + I Indicates the immediate two bytes following SS3
2845 * is a Chinese character as defined in CNS
2846 * 11643-plane-3, until another SS3designation
2847 * appears
2848 * (Meaning <ESC>O must precede every 2-byte
2849 * sequence.)
2850 *
2851 * ESC $ + J Indicates the immediate two bytes following SS3
2852 * is a Chinese character as defined in CNS
2853 * 11643-plane-4, until another SS3designation
2854 * appears
2855 * (In English: <ESC>O must precede every 2-byte
2856 * sequence.)
2857 *
2858 * ESC $ + K Indicates the immediate two bytes following SS3
2859 * is a Chinese character as defined in CNS
2860 * 11643-plane-5, until another SS3designation
2861 * appears
2862 *
2863 * ESC $ + L Indicates the immediate two bytes following SS3
2864 * is a Chinese character as defined in CNS
2865 * 11643-plane-6, until another SS3designation
2866 * appears
2867 *
2868 * ESC $ + M Indicates the immediate two bytes following SS3
2869 * is a Chinese character as defined in CNS
2870 * 11643-plane-7, until another SS3designation
2871 * appears
2872 *
2873 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2874 * has its own designation information before any Chinese characters
2875 * appear
2876 *
2877 */
2878
/*
 * ISO-2022-CN designator sequences, one NUL-terminated array per charset.
 * Each is ESC (0x1B), 0x24, an intermediate byte (0x29 for SO, 0x2A for SS2,
 * 0x2B for SS3) and a final byte that names the charset.
 * They are arrays, not pointers to literals, so that the bytes are truly read-only,
 * and the bytes are given numerically rather than as character literals.
 */
static const char GB_2312_80_STR[]             = { 0x1B, 0x24, 0x29, 0x41, 0 };
static const char ISO_IR_165_STR[]             = { 0x1B, 0x24, 0x29, 0x45, 0 };
static const char CNS_11643_1992_Plane_1_STR[] = { 0x1B, 0x24, 0x29, 0x47, 0 };
static const char CNS_11643_1992_Plane_2_STR[] = { 0x1B, 0x24, 0x2A, 0x48, 0 };
static const char CNS_11643_1992_Plane_3_STR[] = { 0x1B, 0x24, 0x2B, 0x49, 0 };
static const char CNS_11643_1992_Plane_4_STR[] = { 0x1B, 0x24, 0x2B, 0x4A, 0 };
static const char CNS_11643_1992_Plane_5_STR[] = { 0x1B, 0x24, 0x2B, 0x4B, 0 };
static const char CNS_11643_1992_Plane_6_STR[] = { 0x1B, 0x24, 0x2B, 0x4C, 0 };
static const char CNS_11643_1992_Plane_7_STR[] = { 0x1B, 0x24, 0x2B, 0x4D, 0 };
2889
2890 /********************** ISO2022-CN Data **************************/
2891 static const char* const escSeqCharsCN[10] ={
2892 SHIFT_IN_STR, /* 0 ASCII */
2893 GB_2312_80_STR, /* 1 GB2312_1 */
2894 ISO_IR_165_STR, /* 2 ISO_IR_165 */
2895 CNS_11643_1992_Plane_1_STR,
2896 CNS_11643_1992_Plane_2_STR,
2897 CNS_11643_1992_Plane_3_STR,
2898 CNS_11643_1992_Plane_4_STR,
2899 CNS_11643_1992_Plane_5_STR,
2900 CNS_11643_1992_Plane_6_STR,
2901 CNS_11643_1992_Plane_7_STR
2902 };
2903
2904 static void
2905 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2906 UConverter *cnv = args->converter;
2907 UConverterDataISO2022 *converterData;
2908 ISO2022State *pFromU2022State;
2909 uint8_t *target = (uint8_t *) args->target;
2910 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2911 const UChar* source = args->source;
2912 const UChar* sourceLimit = args->sourceLimit;
2913 int32_t* offsets = args->offsets;
2914 UChar32 sourceChar;
2915 char buffer[8];
2916 int32_t len;
2917 int8_t choices[3];
2918 int32_t choiceCount;
2919 uint32_t targetValue = 0;
2920 UBool useFallback;
2921
2922 /* set up the state */
2923 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2924 pFromU2022State = &converterData->fromU2022State;
2925
2926 choiceCount = 0;
2927
2928 /* check if the last codepoint of previous buffer was a lead surrogate*/
2929 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2930 goto getTrail;
2931 }
2932
2933 while( source < sourceLimit){
2934 if(target < targetLimit){
2935
2936 sourceChar = *(source++);
2937 /*check if the char is a First surrogate*/
2938 if(U16_IS_SURROGATE(sourceChar)) {
2939 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2940 getTrail:
2941 /*look ahead to find the trail surrogate*/
2942 if(source < sourceLimit) {
2943 /* test the following code unit */
2944 UChar trail=(UChar) *source;
2945 if(U16_IS_TRAIL(trail)) {
2946 source++;
2947 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2948 cnv->fromUChar32=0x00;
2949 /* convert this supplementary code point */
2950 /* exit this condition tree */
2951 } else {
2952 /* this is an unmatched lead code unit (1st surrogate) */
2953 /* callback(illegal) */
2954 *err=U_ILLEGAL_CHAR_FOUND;
2955 cnv->fromUChar32=sourceChar;
2956 break;
2957 }
2958 } else {
2959 /* no more input */
2960 cnv->fromUChar32=sourceChar;
2961 break;
2962 }
2963 } else {
2964 /* this is an unmatched trail code unit (2nd surrogate) */
2965 /* callback(illegal) */
2966 *err=U_ILLEGAL_CHAR_FOUND;
2967 cnv->fromUChar32=sourceChar;
2968 break;
2969 }
2970 }
2971
2972 /* do the conversion */
2973 if(sourceChar <= 0x007f ){
2974 /* do not convert SO/SI/ESC */
2975 if(IS_2022_CONTROL(sourceChar)) {
2976 /* callback(illegal) */
2977 *err=U_ILLEGAL_CHAR_FOUND;
2978 cnv->fromUChar32=sourceChar;
2979 break;
2980 }
2981
2982 /* US-ASCII */
2983 if(pFromU2022State->g == 0) {
2984 buffer[0] = (char)sourceChar;
2985 len = 1;
2986 } else {
2987 buffer[0] = UCNV_SI;
2988 buffer[1] = (char)sourceChar;
2989 len = 2;
2990 pFromU2022State->g = 0;
2991 choiceCount = 0;
2992 }
2993 if(sourceChar == CR || sourceChar == LF) {
2994 /* reset the state at the end of a line */
2995 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2996 choiceCount = 0;
2997 }
2998 }
2999 else{
3000 /* convert U+0080..U+10ffff */
3001 int32_t i;
3002 int8_t cs, g;
3003
3004 if(choiceCount == 0) {
3005 /* try the current SO/G1 converter first */
3006 choices[0] = pFromU2022State->cs[1];
3007
3008 /* default to GB2312_1 if none is designated yet */
3009 if(choices[0] == 0) {
3010 choices[0] = GB2312_1;
3011 }
3012
3013 if(converterData->version == 0) {
3014 /* ISO-2022-CN */
3015
3016 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3017 if(choices[0] == GB2312_1) {
3018 choices[1] = (int8_t)CNS_11643_1;
3019 } else {
3020 choices[1] = (int8_t)GB2312_1;
3021 }
3022
3023 choiceCount = 2;
3024 } else if (converterData->version == 1) {
3025 /* ISO-2022-CN-EXT */
3026
3027 /* try one of the other converters */
3028 switch(choices[0]) {
3029 case GB2312_1:
3030 choices[1] = (int8_t)CNS_11643_1;
3031 choices[2] = (int8_t)ISO_IR_165;
3032 break;
3033 case ISO_IR_165:
3034 choices[1] = (int8_t)GB2312_1;
3035 choices[2] = (int8_t)CNS_11643_1;
3036 break;
3037 default: /* CNS_11643_x */
3038 choices[1] = (int8_t)GB2312_1;
3039 choices[2] = (int8_t)ISO_IR_165;
3040 break;
3041 }
3042
3043 choiceCount = 3;
3044 } else {
3045 choices[0] = (int8_t)CNS_11643_1;
3046 choices[1] = (int8_t)GB2312_1;
3047 }
3048 }
3049
3050 cs = g = 0;
3051 /*
3052 * len==0: no mapping found yet
3053 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3054 * len>0: found a roundtrip result, done
3055 */
3056 len = 0;
3057 /*
3058 * We will turn off useFallback after finding a fallback,
3059 * but we still get fallbacks from PUA code points as usual.
3060 * Therefore, we will also need to check that we don't overwrite
3061 * an early fallback with a later one.
3062 */
3063 useFallback = cnv->useFallback;
3064
3065 for(i = 0; i < choiceCount && len <= 0; ++i) {
3066 int8_t cs0 = choices[i];
3067 if(cs0 > 0) {
3068 uint32_t value;
3069 int32_t len2;
3070 if(cs0 >= CNS_11643_0) {
3071 len2 = MBCS_FROM_UCHAR32_ISO2022(
3072 converterData->myConverterArray[CNS_11643],
3073 sourceChar,
3074 &value,
3075 useFallback,
3076 MBCS_OUTPUT_3);
3077 if(len2 == 3 || (len2 == -3 && len == 0)) {
3078 targetValue = value;
3079 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3080 if(len2 >= 0) {
3081 len = 2;
3082 } else {
3083 len = -2;
3084 useFallback = FALSE;
3085 }
3086 if(cs == CNS_11643_1) {
3087 g = 1;
3088 } else if(cs == CNS_11643_2) {
3089 g = 2;
3090 } else /* plane 3..7 */ if(converterData->version == 1) {
3091 g = 3;
3092 } else {
3093 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3094 len = 0;
3095 }
3096 }
3097 } else {
3098 /* GB2312_1 or ISO-IR-165 */
3099 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3100 len2 = MBCS_FROM_UCHAR32_ISO2022(
3101 converterData->myConverterArray[cs0],
3102 sourceChar,
3103 &value,
3104 useFallback,
3105 MBCS_OUTPUT_2);
3106 if(len2 == 2 || (len2 == -2 && len == 0)) {
3107 targetValue = value;
3108 len = len2;
3109 cs = cs0;
3110 g = 1;
3111 useFallback = FALSE;
3112 }
3113 }
3114 }
3115 }
3116
3117 if(len != 0) {
3118 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3119
3120 /* write the designation sequence if necessary */
3121 if(cs != pFromU2022State->cs[g]) {
3122 if(cs < CNS_11643) {
3123 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3124 } else {
3125 U_ASSERT(cs >= CNS_11643_1);
3126 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3127 }
3128 len = 4;
3129 pFromU2022State->cs[g] = cs;
3130 if(g == 1) {
3131 /* changing the SO/G1 charset invalidates the choices[] */
3132 choiceCount = 0;
3133 }
3134 }
3135
3136 /* write the shift sequence if necessary */
3137 if(g != pFromU2022State->g) {
3138 switch(g) {
3139 case 1:
3140 buffer[len++] = UCNV_SO;
3141
3142 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3143 pFromU2022State->g = 1;
3144 break;
3145 case 2:
3146 buffer[len++] = 0x1b;
3147 buffer[len++] = 0x4e;
3148 break;
3149 default: /* case 3 */
3150 buffer[len++] = 0x1b;
3151 buffer[len++] = 0x4f;
3152 break;
3153 }
3154 }
3155
3156 /* write the two output bytes */
3157 buffer[len++] = (char)(targetValue >> 8);
3158 buffer[len++] = (char)targetValue;
3159 } else {
3160 /* if we cannot find the character after checking all codepages
3161 * then this is an error
3162 */
3163 *err = U_INVALID_CHAR_FOUND;
3164 cnv->fromUChar32=sourceChar;
3165 break;
3166 }
3167 }
3168
3169 /* output len>0 bytes in buffer[] */
3170 if(len == 1) {
3171 *target++ = buffer[0];
3172 if(offsets) {
3173 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3174 }
3175 } else if(len == 2 && (target + 2) <= targetLimit) {
3176 *target++ = buffer[0];
3177 *target++ = buffer[1];
3178 if(offsets) {
3179 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3180 *offsets++ = sourceIndex;
3181 *offsets++ = sourceIndex;
3182 }
3183 } else {
3184 fromUWriteUInt8(
3185 cnv,
3186 buffer, len,
3187 &target, (const char *)targetLimit,
3188 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3189 err);
3190 if(U_FAILURE(*err)) {
3191 break;
3192 }
3193 }
3194 } /* end if(myTargetIndex<myTargetLength) */
3195 else{
3196 *err =U_BUFFER_OVERFLOW_ERROR;
3197 break;
3198 }
3199
3200 }/* end while(mySourceIndex<mySourceLength) */
3201
3202 /*
3203 * the end of the input stream and detection of truncated input
3204 * are handled by the framework, but for ISO-2022-CN conversion
3205 * we need to be in ASCII mode at the very end
3206 *
3207 * conditions:
3208 * successful
3209 * not in ASCII mode
3210 * end of input and no truncated input
3211 */
3212 if( U_SUCCESS(*err) &&
3213 pFromU2022State->g!=0 &&
3214 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3215 ) {
3216 int32_t sourceIndex;
3217
3218 /* we are switching to ASCII */
3219 pFromU2022State->g=0;
3220
3221 /* get the source index of the last input character */
3222 /*
3223 * TODO this would be simpler and more reliable if we used a pair
3224 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3225 * so that we could simply use the prevSourceIndex here;
3226 * this code gives an incorrect result for the rare case of an unmatched
3227 * trail surrogate that is alone in the last buffer of the text stream
3228 */
3229 sourceIndex=(int32_t)(source-args->source);
3230 if(sourceIndex>0) {
3231 --sourceIndex;
3232 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3233 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3234 ) {
3235 --sourceIndex;
3236 }
3237 } else {
3238 sourceIndex=-1;
3239 }
3240
3241 fromUWriteUInt8(
3242 cnv,
3243 SHIFT_IN_STR, 1,
3244 &target, (const char *)targetLimit,
3245 &offsets, sourceIndex,
3246 err);
3247 }
3248
3249 /*save the state and return */
3250 args->source = source;
3251 args->target = (char*)target;
3252 }
3253
3254
3255 static void
3256 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3257 UErrorCode* err){
3258 char tempBuf[3];
3259 const char *mySource = (char *) args->source;
3260 UChar *myTarget = args->target;
3261 const char *mySourceLimit = args->sourceLimit;
3262 uint32_t targetUniChar = 0x0000;
3263 uint32_t mySourceChar = 0x0000;
3264 UConverterDataISO2022* myData;
3265 ISO2022State *pToU2022State;
3266
3267 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3268 pToU2022State = &myData->toU2022State;
3269
3270 if(myData->key != 0) {
3271 /* continue with a partial escape sequence */
3272 goto escape;
3273 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3274 /* continue with a partial double-byte character */
3275 mySourceChar = args->converter->toUBytes[0];
3276 args->converter->toULength = 0;
3277 targetUniChar = missingCharMarker;
3278 goto getTrailByte;
3279 }
3280
3281 while(mySource < mySourceLimit){
3282
3283 targetUniChar =missingCharMarker;
3284
3285 if(myTarget < args->targetLimit){
3286
3287 mySourceChar= (unsigned char) *mySource++;
3288
3289 switch(mySourceChar){
3290 case UCNV_SI:
3291 pToU2022State->g=0;
3292 if (myData->isEmptySegment) {
3293 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3294 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3295 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3296 args->converter->toUBytes[0] = mySourceChar;
3297 args->converter->toULength = 1;
3298 args->target = myTarget;
3299 args->source = mySource;
3300 return;
3301 }
3302 continue;
3303
3304 case UCNV_SO:
3305 if(pToU2022State->cs[1] != 0) {
3306 pToU2022State->g=1;
3307 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3308 continue;
3309 } else {
3310 /* illegal to have SO before a matching designator */
3311 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
3312 break;
3313 }
3314
3315 case ESC_2022:
3316 mySource--;
3317 escape:
3318 {
3319 const char * mySourceBefore = mySource;
3320 int8_t toULengthBefore = args->converter->toULength;
3321
3322 changeState_2022(args->converter,&(mySource),
3323 mySourceLimit, ISO_2022_CN,err);
3324
3325 /* After SO there must be at least one character before a designator (designator error handled separately) */
3326 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3327 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3328 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3329 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3330 }
3331 }
3332
3333 /* invalid or illegal escape sequence */
3334 if(U_FAILURE(*err)){
3335 args->target = myTarget;
3336 args->source = mySource;
3337 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3338 return;
3339 }
3340 continue;
3341
3342 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3343
3344 case CR:
3345 case LF:
3346 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3347 U_FALLTHROUGH;
3348 default:
3349 /* convert one or two bytes */
3350 myData->isEmptySegment = FALSE;
3351 if(pToU2022State->g != 0) {
3352 if(mySource < mySourceLimit) {
3353 UConverterSharedData *cnv;
3354 StateEnum tempState;
3355 int32_t tempBufLen;
3356 int leadIsOk, trailIsOk;
3357 uint8_t trailByte;
3358 getTrailByte:
3359 trailByte = (uint8_t)*mySource;
3360 /*
3361 * Ticket 5691: consistent illegal sequences:
3362 * - We include at least the first byte in the illegal sequence.
3363 * - If any of the non-initial bytes could be the start of a character,
3364 * we stop the illegal sequence before the first one of those.
3365 *
3366 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3367 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3368 * Otherwise we convert or report the pair of bytes.
3369 */
3370 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3371 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3372 if (leadIsOk && trailIsOk) {
3373 ++mySource;
3374 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3375 if(tempState >= CNS_11643_0) {
3376 cnv = myData->myConverterArray[CNS_11643];
3377 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3378 tempBuf[1] = (char) (mySourceChar);
3379 tempBuf[2] = (char) trailByte;
3380 tempBufLen = 3;
3381
3382 }else{
3383 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3384 cnv = myData->myConverterArray[tempState];
3385 tempBuf[0] = (char) (mySourceChar);
3386 tempBuf[1] = (char) trailByte;
3387 tempBufLen = 2;
3388 }
3389 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3390 mySourceChar = (mySourceChar << 8) | trailByte;
3391 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3392 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3393 ++mySource;
3394 /* add another bit so that the code below writes 2 bytes in case of error */
3395 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3396 }
3397 if(pToU2022State->g>=2) {
3398 /* return from a single-shift state to the previous one */
3399 pToU2022State->g=pToU2022State->prevG;
3400 }
3401 } else {
3402 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3403 args->converter->toULength = 1;
3404 goto endloop;
3405 }
3406 }
3407 else{
3408 if(mySourceChar <= 0x7f) {
3409 targetUniChar = (UChar) mySourceChar;
3410 }
3411 }
3412 break;
3413 }
3414 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3415 if(args->offsets){
3416 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3417 }
3418 *(myTarget++)=(UChar)targetUniChar;
3419 }
3420 else if(targetUniChar > missingCharMarker){
3421 /* disassemble the surrogate pair and write to output*/
3422 targetUniChar-=0x0010000;
3423 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3424 if(args->offsets){
3425 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3426 }
3427 ++myTarget;
3428 if(myTarget< args->targetLimit){
3429 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3430 if(args->offsets){
3431 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3432 }
3433 ++myTarget;
3434 }else{
3435 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3436 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3437 }
3438
3439 }
3440 else{
3441 /* Call the callback function*/
3442 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3443 break;
3444 }
3445 }
3446 else{
3447 *err =U_BUFFER_OVERFLOW_ERROR;
3448 break;
3449 }
3450 }
3451 endloop:
3452 args->target = myTarget;
3453 args->source = mySource;
3454 }
3455 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3456
3457 static void
3458 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3459 UConverter *cnv = args->converter;
3460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3461 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3462 char *p, *subchar;
3463 char buffer[8];
3464 int32_t length;
3465
3466 subchar=(char *)cnv->subChars;
3467 length=cnv->subCharLen; /* assume length==1 for most variants */
3468
3469 p = buffer;
3470 switch(myConverterData->locale[0]){
3471 case 'j':
3472 {
3473 int8_t cs;
3474
3475 if(pFromU2022State->g == 1) {
3476 /* JIS7: switch from G1 to G0 */
3477 pFromU2022State->g = 0;
3478 *p++ = UCNV_SI;
3479 }
3480
3481 cs = pFromU2022State->cs[0];
3482 if(cs != ASCII && cs != JISX201) {
3483 /* not in ASCII or JIS X 0201: switch to ASCII */
3484 pFromU2022State->cs[0] = (int8_t)ASCII;
3485 *p++ = '\x1b';
3486 *p++ = '\x28';
3487 *p++ = '\x42';
3488 }
3489
3490 *p++ = subchar[0];
3491 break;
3492 }
3493 case 'c':
3494 if(pFromU2022State->g != 0) {
3495 /* not in ASCII mode: switch to ASCII */
3496 pFromU2022State->g = 0;
3497 *p++ = UCNV_SI;
3498 }
3499 *p++ = subchar[0];
3500 break;
3501 case 'k':
3502 if(myConverterData->version == 0) {
3503 if(length == 1) {
3504 if((UBool)args->converter->fromUnicodeStatus) {
3505 /* in DBCS mode: switch to SBCS */
3506 args->converter->fromUnicodeStatus = 0;
3507 *p++ = UCNV_SI;
3508 }
3509 *p++ = subchar[0];
3510 } else /* length == 2*/ {
3511 if(!(UBool)args->converter->fromUnicodeStatus) {
3512 /* in SBCS mode: switch to DBCS */
3513 args->converter->fromUnicodeStatus = 1;
3514 *p++ = UCNV_SO;
3515 }
3516 *p++ = subchar[0];
3517 *p++ = subchar[1];
3518 }
3519 break;
3520 } else {
3521 /* save the subconverter's substitution string */
3522 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3523 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3524
3525 /* set our substitution string into the subconverter */
3526 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3527 myConverterData->currentConverter->subCharLen = (int8_t)length;
3528
3529 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3530 args->converter = myConverterData->currentConverter;
3531 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3532 ucnv_cbFromUWriteSub(args, 0, err);
3533 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3534 args->converter = cnv;
3535
3536 /* restore the subconverter's substitution string */
3537 myConverterData->currentConverter->subChars = currentSubChars;
3538 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3539
3540 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3541 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3542 uprv_memcpy(
3543 cnv->charErrorBuffer,
3544 myConverterData->currentConverter->charErrorBuffer,
3545 myConverterData->currentConverter->charErrorBufferLength);
3546 }
3547 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3548 myConverterData->currentConverter->charErrorBufferLength = 0;
3549 }
3550 return;
3551 }
3552 default:
3553 /* not expected */
3554 break;
3555 }
3556 ucnv_cbFromUWriteBytes(args,
3557 buffer, (int32_t)(p - buffer),
3558 offsetIndex, err);
3559 }
3560
3561 /*
3562 * Structure for cloning an ISO 2022 converter into a single memory block.
3563 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3564 * and then ucnv_safeClone() of the sub-converter may additionally align
3565 * currentConverter inside the cloneStruct, for which we need the deadSpace
3566 * after currentConverter.
3567 * This is because UAlignedMemory may be larger than the actually
3568 * necessary alignment size for the platform.
3569 * The other cloneStruct fields will not be moved around,
3570 * and are aligned properly with cloneStruct's alignment.
3571 */
3572 struct cloneStruct
3573 {
3574 UConverter cnv;
3575 UConverter currentConverter;
3576 UAlignedMemory deadSpace;
3577 UConverterDataISO2022 mydata;
3578 };
3579
3580
3581 static UConverter *
3582 _ISO_2022_SafeClone(
3583 const UConverter *cnv,
3584 void *stackBuffer,
3585 int32_t *pBufferSize,
3586 UErrorCode *status)
3587 {
3588 struct cloneStruct * localClone;
3589 UConverterDataISO2022 *cnvData;
3590 int32_t i, size;
3591
3592 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3593 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3594 return NULL;
3595 }
3596
3597 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3598 localClone = (struct cloneStruct *)stackBuffer;
3599
3600 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3601
3602 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3603 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3604 localClone->cnv.isExtraLocal = TRUE;
3605
3606 /* share the subconverters */
3607
3608 if(cnvData->currentConverter != NULL) {
3609 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3610 localClone->mydata.currentConverter =
3611 ucnv_safeClone(cnvData->currentConverter,
3612 &localClone->currentConverter,
3613 &size, status);
3614 if(U_FAILURE(*status)) {
3615 return NULL;
3616 }
3617 }
3618
3619 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3620 if(cnvData->myConverterArray[i] != NULL) {
3621 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3622 }
3623 }
3624
3625 return &localClone->cnv;
3626 }
3627
3628 static void
3629 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3630 const USetAdder *sa,
3631 UConverterUnicodeSet which,
3632 UErrorCode *pErrorCode)
3633 {
3634 int32_t i;
3635 UConverterDataISO2022* cnvData;
3636
3637 if (U_FAILURE(*pErrorCode)) {
3638 return;
3639 }
3640 #ifdef U_ENABLE_GENERIC_ISO_2022
3641 if (cnv->sharedData == &_ISO2022Data) {
3642 /* We use UTF-8 in this case */
3643 sa->addRange(sa->set, 0, 0xd7FF);
3644 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3645 return;
3646 }
3647 #endif
3648
3649 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3650
3651 /* open a set and initialize it with code points that are algorithmically round-tripped */
3652 switch(cnvData->locale[0]){
3653 case 'j':
3654 /* include JIS X 0201 which is hardcoded */
3655 sa->add(sa->set, 0xa5);
3656 sa->add(sa->set, 0x203e);
3657 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3658 /* include Latin-1 for some variants of JP */
3659 sa->addRange(sa->set, 0, 0xff);
3660 } else {
3661 /* include ASCII for JP */
3662 sa->addRange(sa->set, 0, 0x7f);
3663 }
3664 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3665 /*
3666 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3667 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3668 * use half-width Katakana.
3669 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3670 * half-width Katakana via the ESC ( I sequence.
3671 * However, we only emit (fromUnicode) half-width Katakana according to the
3672 * definition of each variant.
3673 *
3674 * When including fallbacks,
3675 * we need to include half-width Katakana Unicode code points for all JP variants because
3676 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3677 */
3678 /* include half-width Katakana for JP */
3679 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3680 }
3681 break;
3682 #if !UCONFIG_ONLY_HTML_CONVERSION
3683 case 'c':
3684 case 'z':
3685 /* include ASCII for CN */
3686 sa->addRange(sa->set, 0, 0x7f);
3687 break;
3688 case 'k':
3689 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3690 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3691 cnvData->currentConverter, sa, which, pErrorCode);
3692 /* the loop over myConverterArray[] will simply not find another converter */
3693 break;
3694 #endif
3695 default:
3696 break;
3697 }
3698
3699 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3700 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3701 cnvData->version==0 && i==CNS_11643
3702 ) {
3703 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3704 ucnv_MBCSGetUnicodeSetForBytes(
3705 cnvData->myConverterArray[i],
3706 sa, UCNV_ROUNDTRIP_SET,
3707 0, 0x81, 0x82,
3708 pErrorCode);
3709 }
3710 #endif
3711
3712 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3713 UConverterSetFilter filter;
3714 if(cnvData->myConverterArray[i]!=NULL) {
3715 if(cnvData->locale[0]=='j' && i==JISX208) {
3716 /*
3717 * Only add code points that map to Shift-JIS codes
3718 * corresponding to JIS X 0208.
3719 */
3720 filter=UCNV_SET_FILTER_SJIS;
3721 #if !UCONFIG_ONLY_HTML_CONVERSION
3722 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3723 cnvData->version==0 && i==CNS_11643) {
3724 /*
3725 * Version-specific for CN:
3726 * CN version 0 does not map CNS planes 3..7 although
3727 * they are all available in the CNS conversion table;
3728 * CN version 1 (-EXT) does map them all.
3729 * The two versions create different Unicode sets.
3730 */
3731 filter=UCNV_SET_FILTER_2022_CN;
3732 } else if(i==KSC5601) {
3733 /*
3734 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3735 * are broader than GR94.
3736 */
3737 filter=UCNV_SET_FILTER_GR94DBCS;
3738 #endif
3739 } else {
3740 filter=UCNV_SET_FILTER_NONE;
3741 }
3742 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3743 }
3744 }
3745
3746 /*
3747 * ISO 2022 converters must not convert SO/SI/ESC despite what
3748 * sub-converters do by themselves.
3749 * Remove these characters from the set.
3750 */
3751 sa->remove(sa->set, 0x0e);
3752 sa->remove(sa->set, 0x0f);
3753 sa->remove(sa->set, 0x1b);
3754
3755 /* ISO 2022 converters do not convert C1 controls either */
3756 sa->removeRange(sa->set, 0x80, 0x9f);
3757 }
3758
3759 static const UConverterImpl _ISO2022Impl={
3760 UCNV_ISO_2022,
3761
3762 NULL,
3763 NULL,
3764
3765 _ISO2022Open,
3766 _ISO2022Close,
3767 _ISO2022Reset,
3768
3769 #ifdef U_ENABLE_GENERIC_ISO_2022
3770 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3771 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3772 ucnv_fromUnicode_UTF8,
3773 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3774 #else
3775 NULL,
3776 NULL,
3777 NULL,
3778 NULL,
3779 #endif
3780 NULL,
3781
3782 NULL,
3783 _ISO2022getName,
3784 _ISO_2022_WriteSub,
3785 _ISO_2022_SafeClone,
3786 _ISO_2022_GetUnicodeSet,
3787
3788 NULL,
3789 NULL
3790 };
3791 static const UConverterStaticData _ISO2022StaticData={
3792 sizeof(UConverterStaticData),
3793 "ISO_2022",
3794 2022,
3795 UCNV_IBM,
3796 UCNV_ISO_2022,
3797 1,
3798 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3799 { 0x1a, 0, 0, 0 },
3800 1,
3801 FALSE,
3802 FALSE,
3803 0,
3804 0,
3805 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3806 };
3807 const UConverterSharedData _ISO2022Data=
3808 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3809
3810 /*************JP****************/
3811 static const UConverterImpl _ISO2022JPImpl={
3812 UCNV_ISO_2022,
3813
3814 NULL,
3815 NULL,
3816
3817 _ISO2022Open,
3818 _ISO2022Close,
3819 _ISO2022Reset,
3820
3821 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3822 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3823 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3824 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3825 NULL,
3826
3827 NULL,
3828 _ISO2022getName,
3829 _ISO_2022_WriteSub,
3830 _ISO_2022_SafeClone,
3831 _ISO_2022_GetUnicodeSet,
3832
3833 NULL,
3834 NULL
3835 };
3836 static const UConverterStaticData _ISO2022JPStaticData={
3837 sizeof(UConverterStaticData),
3838 "ISO_2022_JP",
3839 0,
3840 UCNV_IBM,
3841 UCNV_ISO_2022,
3842 1,
3843 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3844 { 0x1a, 0, 0, 0 },
3845 1,
3846 FALSE,
3847 FALSE,
3848 0,
3849 0,
3850 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3851 };
3852
/*
 * Shared-data record for ISO-2022-JP, produced by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the addresses of
 * _ISO2022JPStaticData and _ISO2022JPImpl.
 * It is declared inside an anonymous namespace.
 */
3853 namespace {
3854
3855 const UConverterSharedData _ISO2022JPData=
3856 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3857
3858 } // namespace
3859
3860 #if !UCONFIG_ONLY_HTML_CONVERSION
3861 /************* KR ***************/
/*
 * Function table for the ISO-2022-KR converter. It is compiled only when
 * UCONFIG_ONLY_HTML_CONVERSION is false (see the enclosing #if).
 * It uses the same _ISO2022Open/_ISO2022Close/_ISO2022Reset, getName,
 * WriteSub, SafeClone and GetUnicodeSet entries as the JP and CN tables in
 * this file; only the four conversion entries are KR-specific.
 * The same *_OFFSETS_LOGIC routine is listed twice for each direction.
 * This is a positional initializer: entries must stay in UConverterImpl
 * member order.
 * NOTE(review): the slot labels below follow the UConverterImpl declaration
 * in ucnv_cnv.h, which is not visible here -- confirm against that header.
 */
3862 static const UConverterImpl _ISO2022KRImpl={
3863 UCNV_ISO_2022, /* type */
3864
3865 NULL, /* load */
3866 NULL, /* unload */
3867
3868 _ISO2022Open, /* open */
3869 _ISO2022Close, /* close */
3870 _ISO2022Reset, /* reset */
3871
3872 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicode */
3873 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
3874 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode */
3875 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
3876 NULL, /* getNextUChar */
3877
3878 NULL, /* getStarters */
3879 _ISO2022getName, /* getName */
3880 _ISO_2022_WriteSub, /* writeSub */
3881 _ISO_2022_SafeClone, /* safeClone */
3882 _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
3883
3884 NULL, /* toUTF8 */
3885 NULL /* fromUTF8 */
3886 };
/*
 * Static descriptor for the "ISO_2022_KR" converter. It is compiled only
 * when UCONFIG_ONLY_HTML_CONVERSION is false (see the enclosing #if).
 * Apart from the name and the maximum byte count (8), the values match the
 * ISO_2022_JP descriptor in this file.
 * This is a positional initializer: the values must stay in the member order
 * of UConverterStaticData.
 * NOTE(review): the per-value labels below follow the UConverterStaticData
 * declaration in ucnv_bld.h, which is not visible in this part of the file --
 * confirm them against that header.
 */
3887 static const UConverterStaticData _ISO2022KRStaticData={
3888 sizeof(UConverterStaticData), /* structSize */
3889 "ISO_2022_KR", /* name */
3890 0, /* codepage */
3891 UCNV_IBM, /* platform */
3892 UCNV_ISO_2022, /* conversionType */
3893 1, /* minBytesPerChar */
3894 8, /* max 8 bytes per UChar */
3895 { 0x1a, 0, 0, 0 }, /* subChar */
3896 1, /* subCharLen */
3897 FALSE, /* hasToUnicodeFallback */
3898 FALSE, /* hasFromUnicodeFallback */
3899 0, /* unicodeMask */
3900 0, /* subChar1 */
3901 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3902 };
3903
/*
 * Shared-data record for ISO-2022-KR, produced by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the addresses of
 * _ISO2022KRStaticData and _ISO2022KRImpl.
 * It is declared inside an anonymous namespace and, like the two objects it
 * points at, sits inside the #if !UCONFIG_ONLY_HTML_CONVERSION region.
 */
3904 namespace {
3905
3906 const UConverterSharedData _ISO2022KRData=
3907 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3908
3909 } // namespace
3910
3911 /*************** CN ***************/
/*
 * Function table for the ISO-2022-CN converter. It is compiled only when
 * UCONFIG_ONLY_HTML_CONVERSION is false (see the enclosing #if).
 * It uses the same _ISO2022Open/_ISO2022Close/_ISO2022Reset, getName,
 * WriteSub, SafeClone and GetUnicodeSet entries as the JP and KR tables in
 * this file; only the four conversion entries are CN-specific.
 * The same *_OFFSETS_LOGIC routine is listed twice for each direction.
 * This is a positional initializer: entries must stay in UConverterImpl
 * member order.
 * NOTE(review): the slot labels below follow the UConverterImpl declaration
 * in ucnv_cnv.h, which is not visible here -- confirm against that header.
 */
3912 static const UConverterImpl _ISO2022CNImpl={
3913
3914 UCNV_ISO_2022, /* type */
3915
3916 NULL, /* load */
3917 NULL, /* unload */
3918
3919 _ISO2022Open, /* open */
3920 _ISO2022Close, /* close */
3921 _ISO2022Reset, /* reset */
3922
3923 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicode */
3924 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicodeWithOffsets */
3925 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode */
3926 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
3927 NULL, /* getNextUChar */
3928
3929 NULL, /* getStarters */
3930 _ISO2022getName, /* getName */
3931 _ISO_2022_WriteSub, /* writeSub */
3932 _ISO_2022_SafeClone, /* safeClone */
3933 _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
3934
3935 NULL, /* toUTF8 */
3936 NULL /* fromUTF8 */
3937 };
/*
 * Static descriptor for the "ISO_2022_CN" converter. It is compiled only
 * when UCONFIG_ONLY_HTML_CONVERSION is false (see the enclosing #if).
 * Apart from the name, the values match the ISO_2022_KR descriptor in this
 * file, including the maximum of 8 bytes.
 * This is a positional initializer: the values must stay in the member order
 * of UConverterStaticData.
 * NOTE(review): the per-value labels below follow the UConverterStaticData
 * declaration in ucnv_bld.h, which is not visible in this part of the file --
 * confirm them against that header.
 */
3938 static const UConverterStaticData _ISO2022CNStaticData={
3939 sizeof(UConverterStaticData), /* structSize */
3940 "ISO_2022_CN", /* name */
3941 0, /* codepage */
3942 UCNV_IBM, /* platform */
3943 UCNV_ISO_2022, /* conversionType */
3944 1, /* minBytesPerChar */
3945 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3946 { 0x1a, 0, 0, 0 }, /* subChar */
3947 1, /* subCharLen */
3948 FALSE, /* hasToUnicodeFallback */
3949 FALSE, /* hasFromUnicodeFallback */
3950 0, /* unicodeMask */
3951 0, /* subChar1 */
3952 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3953 };
3954
/*
 * Shared-data record for ISO-2022-CN, produced by
 * UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from the addresses of
 * _ISO2022CNStaticData and _ISO2022CNImpl.
 * It is declared inside an anonymous namespace and, like the two objects it
 * points at, sits inside the #if !UCONFIG_ONLY_HTML_CONVERSION region.
 */
3955 namespace {
3956
3957 const UConverterSharedData _ISO2022CNData=
3958 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3959
3960 } // namespace
3961 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3962
3963 #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */