icuSources/common/ucnv2022.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2015, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv2022.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000feb03
  12 *   created by: Markus W. Scherer
  13 *
  14 *   Change history:
  15 *
  16 *   06/29/2000  helena  Major rewrite of the callback APIs.
  17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
  18 *                       Changed implementation of toUnicode
  19 *                       function
  20 *   08/21/2000  Ram     Added support for ISO-2022-KR
  21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
  22 *                       ucnvebdc.c
  23 *   09/20/2000  Ram     Added support for ISO-2022-CN
  24 *                       Added implementations for getNextUChar()
  25 *                       for specific 2022 country variants.
  26 *   10/31/2000  Ram     Implemented offsets logic functions
  27 */
  28
  29 #include "unicode/utypes.h"
  30
  31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uset.h"
  35 #include "unicode/ucnv_err.h"
  36 #include "unicode/ucnv_cb.h"
  37 #include "unicode/utf16.h"
  38 #include "ucnv_imp.h"
  39 #include "ucnv_bld.h"
  40 #include "ucnv_cnv.h"
  41 #include "ucnvmbcs.h"
  42 #include "cstring.h"
  43 #include "cmemory.h"
  44 #include "uassert.h"
  45
  46 #ifdef U_ENABLE_GENERIC_ISO_2022
  47 /*
  48  * I am disabling the generic ISO-2022 converter after proposing to do so on
  49  * the icu mailing list two days ago.
  50  *
  51  * Reasons:
  52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  53  *    its designation sequences, single shifts with return to the previous state,
  54  *    switch-with-no-return to UTF-16BE or similar, etc.
  55  *    This is unlike the language-specific variants like ISO-2022-JP which
  56  *    require a much smaller repertoire of ISO-2022 features.
  57  *    These variants continue to be supported.
  58  * 2. I believe that no one is really using the generic ISO-2022 converter
  59  *    but rather always one of the language-specific variants.
  60  *    Note that ICU's generic ISO-2022 converter has always output one escape
  61  *    sequence followed by UTF-8 for the whole stream.
  62  * 3. Switching between subcharsets is extremely slow, because each time
  63  *    the previous converter is closed and a new one opened,
  64  *    without any kind of caching, least-recently-used list, etc.
  65  * 4. The code is currently buggy, and given the above it does not seem
  66  *    reasonable to spend the time on maintenance.
  67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  68  *    This means, for example, that when ISO-8859-7 is designated, the following
  69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  70  *    The ICU ISO-2022 converter does not handle this - and has no information
  71  *    about which subconverter would have to be shifted vs. which is designed
  72  *    for 7-bit ISO-2022.
  73  *
  74  * Markus Scherer 2003-dec-03
  75  */
  76 #endif
  77
#if !UCONFIG_ONLY_HTML_CONVERSION
/* The single byte 0x0F (SI, see IS_2022_CONTROL below) as a NUL-terminated string. */
static const char SHIFT_IN_STR[]  = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
#endif

/* Byte values of the ASCII control and whitespace characters named by each macro. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20

/*
 * Inclusive bounds U+FF61..U+FF9F of a Unicode code point range.
 * NOTE(review): presumably the halfwidth Katakana block handled by the
 * HWKANA_7BIT state -- confirm against the conversion code that uses it.
 */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};
  93
  94 /*
  95  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
  96  * as bytes 21..7E. (Subtract 0x80.)
  97  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
  98  * as bytes 20..7F. (Subtract 0x80.)
  99  * Do not encode C1 control codes with native bytes 80..9F
 100  * as bytes 00..1F (C0 control codes).
 101  */
/* Inclusive native byte ranges for 94-character and 96-character sets. */
enum {
    GR94_START=0xa1,    /* first native byte of a 94-character set */
    GR94_END=0xfe,      /* last native byte of a 94-character set */
    GR96_START=0xa0,    /* first native byte of a 96-character set */
    GR96_END=0xff       /* last native byte of a 96-character set */
};
 108
 109 /*
 110  * ISO 2022 control codes must not be converted from Unicode
 111  * because they would mess up the byte stream.
 112  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 113  * corresponding to SO, SI, and ESC.
 114  */
 115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
 116
 117 /* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state identifiers used by the ISO-2022-JP and ISO-2022-CN code.
 * The JP and CN groups reuse the same small integer values (for example
 * ISO8859_1 and GB2312_1 are both 1), so a value is only meaningful in the
 * context of the variant that uses it.
 */
typedef enum  {
        /* shared values */
        INVALID_STATE=-1,
        ASCII = 0,

        SS2_STATE=0x10,
        SS3_STATE,          /* implicitly SS2_STATE+1 = 0x11 */

        /* JP */
        ISO8859_1 = 1 ,
        ISO8859_7 = 2 ,
        JISX201  = 3,
        JISX208 = 4,
        JISX212 = 5,
        GB2312  =6,
        KSC5601 =7,
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */

        /* CN */
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
        GB2312_1=1,
        ISO_IR_165=2,
        CNS_11643=3,

        /*
         * these are used in StateEnum and ISO2022State variables,
         * but CNS_11643 must be used to index into myConverterArray[]
         */
        CNS_11643_0=0x20,
        CNS_11643_1,        /* 0x21; the following constants continue to count up by one */
        CNS_11643_2,
        CNS_11643_3,
        CNS_11643_4,
        CNS_11643_5,
        CNS_11643_6,
        CNS_11643_7         /* 0x27 */
} StateEnum;
 155
 156 /* is the StateEnum charset value for a DBCS charset? */
/*
 * With UCONFIG_ONLY_HTML_CONVERSION only JISX208 counts as DBCS; otherwise
 * every value in the contiguous range JISX208..KSC5601 (4..7) does.
 * Note that the non-HTML form evaluates its argument twice.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208==(cs))
#else
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#endif

/* Charset mask: a single bit at position cs (a StateEnum value), used to build jpCharsetMasks[]. */
#define CSM(cs) ((uint16_t)1<<(cs))
 164
 165 /*
 166  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 167  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 168  *
 169  * Note: The converter uses some leniency:
 170  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 171  *   all versions, not just JIS7 and JIS8.
 172  * - ICU does not distinguish between different versions of JIS X 0208.
 173  */
/* Highest supported ISO-2022-JP version number; _ISO2022Open rejects larger values. */
#if UCONFIG_ONLY_HTML_CONVERSION
enum { MAX_JA_VERSION=0 };
#else
enum { MAX_JA_VERSION=4 };
#endif
/*
 * Indexed by version (0..MAX_JA_VERSION); each entry is an OR of CSM() bits.
 * _ISO2022Open tests bits of jpCharsetMasks[version] to decide which
 * sub-charset converters to load. The entries for versions 2, 3 and 4 are identical.
 */
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
#if !UCONFIG_ONLY_HTML_CONVERSION
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
#endif
};
 188
/*
 * Type of UConverterDataISO2022::currentType.
 * _ISO2022Open initializes that field to ASCII1.
 */
typedef enum {
        ASCII1=0,
        LATIN1,
        SBCS,
        DBCS,
        MBCS,
        HWKANA
}Cnv2022Type;
 197
/*
 * Designation/shift state for one conversion direction.
 * UConverterDataISO2022 holds one instance per direction, and
 * _ISO2022Reset clears them with uprv_memset().
 */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;
 203
/* Mask applied to the converter options in _ISO2022Open to extract the version number. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022::myConverterArray. */
#define UCNV_2022_MAX_CONVERTERS 10

/*
 * Per-converter state, allocated in _ISO2022Open and stored in
 * UConverter::extraInfo; released in _ISO2022Close.
 */
typedef struct{
    /* shared data of the sub-charset converters, indexed by StateEnum values
       such as JISX208 or GB2312_1; unused slots stay NULL */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* converter opened with ucnv_open() in the Korean branch of _ISO2022Open; otherwise NULL */
    UConverter *currentConverter;
    Cnv2022Type currentType;        /* initialized to ASCII1 */
    ISO2022State toU2022State, fromU2022State;  /* one state per conversion direction */
    uint32_t key;                   /* reset to 0 together with toU2022State in _ISO2022Reset */
    uint32_t version;               /* options & UCNV_OPTIONS_VERSION_MASK */
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    UBool isEmptySegment;           /* reset to FALSE together with toU2022State in _ISO2022Reset */
    char name[30];                  /* e.g. "ISO_2022,locale=ko,version=1" */
    char locale[3];                 /* "ja", "ko" or "cn", NUL-terminated */
}UConverterDataISO2022;
 221
 222 /* Protos */
 223 /* ISO-2022 ----------------------------------------------------------------- */
 224
 225 /*Forward declaration */
/*
 * UTF-8 from-Unicode functions. They are only declared here; their
 * definitions are not in the visible part of this file.
 */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);
 232
#define ESC_2022 0x1B /*ESC*/

/*
 * Classification of a (partial) escape sequence.
 * These are the values stored in escSeqStateTable_Value_2022[].
 */
typedef enum
{
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;
 242
 243 /*
 244 * The way these state transition arrays work is:
 245 * ex : ESC$B is the sequence for JISX208
 246 *      a) First Iteration: char is ESC
 247 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 248 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 249 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 250 *             value of x is stored at escSeqStateTable_Key_2022[0]
 251 *        iii) Save this index as offset
 252 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 253 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 254 *     b) Switch on this state and continue to next char
 255 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 256 *             which is normalize_esq_chars_2022[36] == 4
 257 *         ii) x is currently 1(from above)
 258 *               x<<=5 -- x is now 32
 259 *               x+=normalize_esq_chars_2022[36]
 260 *               now x is 36
 261 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 262 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 263 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 264 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 265 *     c) Switch on this state and continue to next char
 266 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 267 *        ii) x is currently 36 (from above)
 268 *            x<<=5 -- x is now 1152
 269 *            x+=normalize_esq_chars_2022[66]
 270 *            now x is 1161
 271 *       iii) Search for this value in escSeqStateTable_Key_2022[]
 272 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 273 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 274 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 275 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
 276 */
 277
 278
 279 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value (the array index, 0..255) to a small code used to build
 * escape-sequence keys; bytes that take no part in any sequence map to 0.
 * All codes are below 32, so each fits in the 5 bits by which the key is
 * shifted per input byte. Ten entries per row; the column header gives the
 * last decimal digit of the index.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};
 310
 311 #ifdef U_ENABLE_GENERIC_ISO_2022
 312 /*
 313  * When the generic ISO-2022 converter is completely removed, not just disabled
 314  * per #ifdef, then the following state table and the associated tables that are
 315  * dimensioned with MAX_STATES_2022 should be trimmed.
 316  *
 317  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 318  * the associated escape sequences starting with ESC ( B should be removed.
 319  * This includes the ones with key values 1097 and all of the ones above 1000000.
 320  *
 321  * For the latter, the tables can simply be truncated.
 322  * For the former, since the tables must be kept parallel, it is probably best
 323  * to simply duplicate an adjacent table cell, parallel in all tables.
 324  *
 325  * It may make sense to restructure the tables, especially by using small search
 326  * tables for the variants instead of indexing them parallel to the table here.
 327  */
 328 #endif
 329
/* Number of entries in each of the parallel escSeqStateTable_*_2022 arrays. */
#define MAX_STATES_2022 74
/*
 * Keys of the known (partial) escape sequences, listed in ascending order.
 * A key is built from the normalize_esq_chars_2022[] codes of the sequence's
 * bytes, shifting the running value left by 5 bits before adding each code.
 * The index of a key is the offset into the parallel Value/Result tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};
 343
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each entry of escSeqStateTable_Key_2022[] (same index),
 * or NULL where no name is associated with the entry.
 * Compiled only for the generic ISO-2022 converter.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};

#endif
 360
/*
 * UCNV_TableStates_2022 classification for each entry of
 * escSeqStateTable_Key_2022[] (same index): whether the sequence matched so
 * far is complete, needs more bytes, or could be either.
 */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
};
 372
 373 /* Type def for refactoring changeState_2022 code*/
/*
 * Identifies the ISO-2022 variant. Which constants exist depends on the
 * build configuration: ISO_2022 only with U_ENABLE_GENERIC_ISO_2022, and
 * ISO_2022_KR / ISO_2022_CN only without UCONFIG_ONLY_HTML_CONVERSION.
 * The numeric values are fixed explicitly, so they do not shift when a
 * constant is compiled out.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR=2,
    ISO_2022_CN=3
#endif
} Variant2022;
 384
 385 /*********** ISO 2022 Converter Protos ***********/
/* Allocates and initializes the per-converter ISO-2022 data according to pArgs (locale and version). */
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

/* Releases everything that _ISO2022Open attached to the converter. */
static void
 _ISO2022Close(UConverter *converter);

/* Clears the to-Unicode and/or from-Unicode state, depending on choice. */
static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif
 408
/*
 * Forward declarations of the per-variant shared-data objects so that
 * _ISO2022Open can point cnv->sharedData at them; the definitions are not in
 * the visible part of this file. The KR and CN objects are compiled out
 * under UCONFIG_ONLY_HTML_CONVERSION.
 */
namespace {

/*const UConverterSharedData _ISO2022Data;*/
extern const UConverterSharedData _ISO2022JPData;

#if !UCONFIG_ONLY_HTML_CONVERSION
extern const UConverterSharedData _ISO2022KRData;
extern const UConverterSharedData _ISO2022CNData;
#endif

}  // namespace
 420
 421 /*************** Converter implementations ******************/
 422
 423 /* The purpose of this function is to get around gcc compiler warnings. */
 424 static inline void
 425 fromUWriteUInt8(UConverter *cnv,
 426                  const char *bytes, int32_t length,
 427                  uint8_t **target, const char *targetLimit,
 428                  int32_t **offsets,
 429                  int32_t sourceIndex,
 430                  UErrorCode *pErrorCode)
 431 {
 432     char *targetChars = (char *)*target;
 433     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
 434                          offsets, sourceIndex, pErrorCode);
 435     *target = (uint8_t*)targetChars;
 436
 437 }
 438
 439 static inline void
 440 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
 441     if(myConverterData->version == 1) {
 442         UConverter *cnv = myConverterData->currentConverter;
 443
 444         cnv->toUnicodeStatus=0;     /* offset */
 445         cnv->mode=0;                /* state */
 446         cnv->toULength=0;           /* byteIndex */
 447     }
 448 }
 449
 450 static inline void
 451 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
 452    /* in ISO-2022-KR the designator sequence appears only once
 453     * in a file so we append it only once
 454     */
 455     if( converter->charErrorBufferLength==0){
 456
 457         converter->charErrorBufferLength = 4;
 458         converter->charErrorBuffer[0] = 0x1b;
 459         converter->charErrorBuffer[1] = 0x24;
 460         converter->charErrorBuffer[2] = 0x29;
 461         converter->charErrorBuffer[3] = 0x43;
 462     }
 463     if(myConverterData->version == 1) {
 464         UConverter *cnv = myConverterData->currentConverter;
 465
 466         cnv->fromUChar32=0;
 467         cnv->fromUnicodeStatus=1;   /* prevLength */
 468     }
 469 }
 470
 471 static void
 472 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
 473
 474     char myLocale[6]={' ',' ',' ',' ',' ',' '};
 475
 476     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
 477     if(cnv->extraInfo != NULL) {
 478         UConverterNamePieces stackPieces;
 479         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
 480         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
 481         uint32_t version;
 482
 483         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
 484
 485         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
 486         myConverterData->currentType = ASCII1;
 487         cnv->fromUnicodeStatus =FALSE;
 488         if(pArgs->locale){
 489             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
 490         }
 491         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
 492         myConverterData->version = version;
 493         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
 494             (myLocale[2]=='_' || myLocale[2]=='\0'))
 495         {
 496             /* open the required converters and cache them */
 497             if(version>MAX_JA_VERSION) {
 498                 // ICU 55 fails to open a converter for an unsupported version.
 499                 // Previously, it fell back to version 0, but that would yield
 500                 // unexpected behavior.
 501                 *errorCode = U_MISSING_RESOURCE_ERROR;
 502                 return;
 503             }
 504             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
 505                 myConverterData->myConverterArray[ISO8859_7] =
 506                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
 507             }
 508             myConverterData->myConverterArray[JISX208] =
 509                 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
 510             if(jpCharsetMasks[version]&CSM(JISX212)) {
 511                 myConverterData->myConverterArray[JISX212] =
 512                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
 513             }
 514             if(jpCharsetMasks[version]&CSM(GB2312)) {
 515                 myConverterData->myConverterArray[GB2312] =
 516                     ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
 517             }
 518             if(jpCharsetMasks[version]&CSM(KSC5601)) {
 519                 myConverterData->myConverterArray[KSC5601] =
 520                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
 521             }
 522
 523             /* set the function pointers to appropriate funtions */
 524             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 525             uprv_strcpy(myConverterData->locale,"ja");
 526
 527             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
 528             size_t len = uprv_strlen(myConverterData->name);
 529             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
 530             myConverterData->name[len+1]='\0';
 531         }
 532 #if !UCONFIG_ONLY_HTML_CONVERSION
 533         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
 534             (myLocale[2]=='_' || myLocale[2]=='\0'))
 535         {
 536             if(version>1) {
 537                 // ICU 55 fails to open a converter for an unsupported version.
 538                 // Previously, it fell back to version 0, but that would yield
 539                 // unexpected behavior.
 540                 *errorCode = U_MISSING_RESOURCE_ERROR;
 541                 return;
 542             }
 543             const char *cnvName;
 544             if(version==1) {
 545                 cnvName="icu-internal-25546";
 546             } else {
 547                 cnvName="ibm-949";
 548                 myConverterData->version=version=0;
 549             }
 550             if(pArgs->onlyTestIsLoadable) {
 551                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
 552                 uprv_free(cnv->extraInfo);
 553                 cnv->extraInfo=NULL;
 554                 return;
 555             } else {
 556                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
 557                 if (U_FAILURE(*errorCode)) {
 558                     _ISO2022Close(cnv);
 559                     return;
 560                 }
 561
 562                 if(version==1) {
 563                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
 564                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
 565                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
 566                 }else{
 567                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
 568                 }
 569
 570                 /* initialize the state variables */
 571                 setInitialStateToUnicodeKR(cnv, myConverterData);
 572                 setInitialStateFromUnicodeKR(cnv, myConverterData);
 573
 574                 /* set the function pointers to appropriate funtions */
 575                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
 576                 uprv_strcpy(myConverterData->locale,"ko");
 577             }
 578         }
 579         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
 580             (myLocale[2]=='_' || myLocale[2]=='\0'))
 581         {
 582             if(version>2) {
 583                 // ICU 55 fails to open a converter for an unsupported version.
 584                 // Previously, it fell back to version 0, but that would yield
 585                 // unexpected behavior.
 586                 *errorCode = U_MISSING_RESOURCE_ERROR;
 587                 return;
 588             }
 589
 590             /* open the required converters and cache them */
 591             myConverterData->myConverterArray[GB2312_1] =
 592                 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
 593             if(version==1) {
 594                 myConverterData->myConverterArray[ISO_IR_165] =
 595                     ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
 596             }
 597             myConverterData->myConverterArray[CNS_11643] =
 598                 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
 599
 600
 601             /* set the function pointers to appropriate funtions */
 602             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
 603             uprv_strcpy(myConverterData->locale,"cn");
 604
 605             if (version==0){
 606                 myConverterData->version = 0;
 607                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
 608             }else if (version==1){
 609                 myConverterData->version = 1;
 610                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
 611             }else {
 612                 myConverterData->version = 2;
 613                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
 614             }
 615         }
 616 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
 617         else{
 618 #ifdef U_ENABLE_GENERIC_ISO_2022
 619             myConverterData->isFirstBuffer = TRUE;
 620
 621             /* append the UTF-8 escape sequence */
 622             cnv->charErrorBufferLength = 3;
 623             cnv->charErrorBuffer[0] = 0x1b;
 624             cnv->charErrorBuffer[1] = 0x25;
 625             cnv->charErrorBuffer[2] = 0x42;
 626
 627             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
 628             /* initialize the state variables */
 629             uprv_strcpy(myConverterData->name,"ISO_2022");
 630 #else
 631             *errorCode = U_MISSING_RESOURCE_ERROR;
 632             // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
 633             // data loading error code.
 634             return;
 635 #endif
 636         }
 637
 638         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
 639
 640         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
 641             _ISO2022Close(cnv);
 642         }
 643     } else {
 644         *errorCode = U_MEMORY_ALLOCATION_ERROR;
 645     }
 646 }
 647
 648
 649 static void
 650 _ISO2022Close(UConverter *converter) {
 651     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
 652     UConverterSharedData **array = myData->myConverterArray;
 653     int32_t i;
 654
 655     if (converter->extraInfo != NULL) {
 656         /*close the array of converter pointers and free the memory*/
 657         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
 658             if(array[i]!=NULL) {
 659                 ucnv_unloadSharedDataIfReady(array[i]);
 660             }
 661         }
 662
 663         ucnv_close(myData->currentConverter);
 664
 665         if(!converter->isExtraLocal){
 666             uprv_free (converter->extraInfo);
 667             converter->extraInfo = NULL;
 668         }
 669     }
 670 }
 671
 672 static void
 673 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
 674     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
 675     if(choice<=UCNV_RESET_TO_UNICODE) {
 676         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
 677         myConverterData->key = 0;
 678         myConverterData->isEmptySegment = FALSE;
 679     }
 680     if(choice!=UCNV_RESET_TO_UNICODE) {
 681         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
 682     }
 683 #ifdef U_ENABLE_GENERIC_ISO_2022
 684     if(myConverterData->locale[0] == 0){
 685         if(choice<=UCNV_RESET_TO_UNICODE) {
 686             myConverterData->isFirstBuffer = TRUE;
 687             myConverterData->key = 0;
 688             if (converter->mode == UCNV_SO){
 689                 ucnv_close (myConverterData->currentConverter);
 690                 myConverterData->currentConverter=NULL;
 691             }
 692             converter->mode = UCNV_SI;
 693         }
 694         if(choice!=UCNV_RESET_TO_UNICODE) {
 695             /* re-append UTF-8 escape sequence */
 696             converter->charErrorBufferLength = 3;
 697             converter->charErrorBuffer[0] = 0x1b;
 698             converter->charErrorBuffer[1] = 0x28;
 699             converter->charErrorBuffer[2] = 0x42;
 700         }
 701     }
 702     else
 703 #endif
 704     {
 705         /* reset the state variables */
 706         if(myConverterData->locale[0] == 'k'){
 707             if(choice<=UCNV_RESET_TO_UNICODE) {
 708                 setInitialStateToUnicodeKR(converter, myConverterData);
 709             }
 710             if(choice!=UCNV_RESET_TO_UNICODE) {
 711                 setInitialStateFromUnicodeKR(converter, myConverterData);
 712             }
 713         }
 714     }
 715 }
 716
 717 static const char*
 718 _ISO2022getName(const UConverter* cnv){
 719     if(cnv->extraInfo){
 720         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
 721         return myData->name;
 722     }
 723     return NULL;
 724 }
 725
 726
 727 /*************** to unicode *******************/
 728 /****************************************************************************
 729  * Recognized escape sequences are
 730  * <ESC>(B  ASCII
 731  * <ESC>.A  ISO-8859-1
 732  * <ESC>.F  ISO-8859-7
 733  * <ESC>(J  JISX-201
 734  * <ESC>(I  JISX-201
 735  * <ESC>$B  JISX-208
 736  * <ESC>$@  JISX-208
 737  * <ESC>$(D JISX-212
 738  * <ESC>$A  GB2312
 739  * <ESC>$(C KSC5601
 740  */
/*
 * ISO-2022-JP action table, indexed by the escape-sequence table offset that
 * getKey_2022() reports for a complete escape sequence.
 * changeState_2022() casts the entry to StateEnum: INVALID_STATE is reported
 * as U_UNSUPPORTED_ESCAPE_SEQUENCE, SS2_STATE switches to G2 for a single
 * shift, ISO8859_1/ISO8859_7 are stored as the G2 charset, and every other
 * value is stored as the G0 charset (subject to the jpCharsetMasks check).
 * The column header below counts entries within each row of ten.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 752
 753 #if !UCONFIG_ONLY_HTML_CONVERSION
 754 /*************** to unicode *******************/
/*
 * ISO-2022-CN action table, indexed by the escape-sequence table offset that
 * getKey_2022() reports for a complete escape sequence.
 * changeState_2022() casts the entry to StateEnum: INVALID_STATE is reported
 * as U_UNSUPPORTED_ESCAPE_SEQUENCE, SS2_STATE/SS3_STATE switch to G2/G3 for a
 * single shift, GB2312_1/ISO_IR_165/CNS_11643_1 are stored in cs[1],
 * CNS_11643_2 in cs[2], and the remaining CNS 11643 planes in cs[3].
 * The column header below counts entries within each row of ten.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 766 #endif
 767
 768
 769 static UCNV_TableStates_2022
 770 getKey_2022(char c,int32_t* key,int32_t* offset){
 771     int32_t togo;
 772     int32_t low = 0;
 773     int32_t hi = MAX_STATES_2022;
 774     int32_t oldmid=0;
 775
 776     togo = normalize_esq_chars_2022[(uint8_t)c];
 777     if(togo == 0) {
 778         /* not a valid character anywhere in an escape sequence */
 779         *key = 0;
 780         *offset = 0;
 781         return INVALID_2022;
 782     }
 783     togo = (*key << 5) + togo;
 784
 785     while (hi != low)  /*binary search*/{
 786
 787         int32_t mid = (hi+low) >> 1; /*Finds median*/
 788
 789         if (mid == oldmid)
 790             break;
 791
 792         if (escSeqStateTable_Key_2022[mid] > togo){
 793             hi = mid;
 794         }
 795         else if (escSeqStateTable_Key_2022[mid] < togo){
 796             low = mid;
 797         }
 798         else /*we found it*/{
 799             *key = togo;
 800             *offset = mid;
 801             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
 802         }
 803         oldmid = mid;
 804
 805     }
 806
 807     *key = 0;
 808     *offset = 0;
 809     return INVALID_2022;
 810 }
 811
 812 /*runs through a state machine to determine the escape sequence - codepage correspondance
 813  */
 814 static void
 815 changeState_2022(UConverter* _this,
 816                 const char** source,
 817                 const char* sourceLimit,
 818                 Variant2022 var,
 819                 UErrorCode* err){
 820     UCNV_TableStates_2022 value;
 821     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
 822     uint32_t key = myData2022->key;
 823     int32_t offset = 0;
 824     int8_t initialToULength = _this->toULength;
 825     char c;
 826
 827     value = VALID_NON_TERMINAL_2022;
 828     while (*source < sourceLimit) {
 829         c = *(*source)++;
 830         _this->toUBytes[_this->toULength++]=(uint8_t)c;
 831         value = getKey_2022(c,(int32_t *) &key, &offset);
 832
 833         switch (value){
 834
 835         case VALID_NON_TERMINAL_2022 :
 836             /* continue with the loop */
 837             break;
 838
 839         case VALID_TERMINAL_2022:
 840             key = 0;
 841             goto DONE;
 842
 843         case INVALID_2022:
 844             goto DONE;
 845
 846         case VALID_MAYBE_TERMINAL_2022:
 847 #ifdef U_ENABLE_GENERIC_ISO_2022
 848             /* ESC ( B is ambiguous only for ISO_2022 itself */
 849             if(var == ISO_2022) {
 850                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
 851                 _this->toULength = 0;
 852
 853                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
 854
 855                 /* continue with the loop */
 856                 value = VALID_NON_TERMINAL_2022;
 857                 break;
 858             } else
 859 #endif
 860             {
 861                 /* not ISO_2022 itself, finish here */
 862                 value = VALID_TERMINAL_2022;
 863                 key = 0;
 864                 goto DONE;
 865             }
 866         }
 867     }
 868
 869 DONE:
 870     myData2022->key = key;
 871
 872     if (value == VALID_NON_TERMINAL_2022) {
 873         /* indicate that the escape sequence is incomplete: key!=0 */
 874         return;
 875     } else if (value == INVALID_2022 ) {
 876         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 877     } else /* value == VALID_TERMINAL_2022 */ {
 878         switch(var){
 879 #ifdef U_ENABLE_GENERIC_ISO_2022
 880         case ISO_2022:
 881         {
 882             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
 883             if(chosenConverterName == NULL) {
 884                 /* SS2 or SS3 */
 885                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 886                 _this->toUCallbackReason = UCNV_UNASSIGNED;
 887                 return;
 888             }
 889
 890             _this->mode = UCNV_SI;
 891             ucnv_close(myData2022->currentConverter);
 892             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
 893             if(U_SUCCESS(*err)) {
 894                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
 895                 _this->mode = UCNV_SO;
 896             }
 897             break;
 898         }
 899 #endif
 900         case ISO_2022_JP:
 901             {
 902                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
 903                 switch(tempState) {
 904                 case INVALID_STATE:
 905                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 906                     break;
 907                 case SS2_STATE:
 908                     if(myData2022->toU2022State.cs[2]!=0) {
 909                         if(myData2022->toU2022State.g<2) {
 910                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 911                         }
 912                         myData2022->toU2022State.g=2;
 913                     } else {
 914                         /* illegal to have SS2 before a matching designator */
 915                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 916                     }
 917                     break;
 918                 /* case SS3_STATE: not used in ISO-2022-JP-x */
 919                 case ISO8859_1:
 920                 case ISO8859_7:
 921                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 922                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 923                     } else {
 924                         /* G2 charset for SS2 */
 925                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
 926                     }
 927                     break;
 928                 default:
 929                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 930                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 931                     } else {
 932                         /* G0 charset */
 933                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
 934                     }
 935                     break;
 936                 }
 937             }
 938             break;
 939 #if !UCONFIG_ONLY_HTML_CONVERSION
 940         case ISO_2022_CN:
 941             {
 942                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
 943                 switch(tempState) {
 944                 case INVALID_STATE:
 945                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 946                     break;
 947                 case SS2_STATE:
 948                     if(myData2022->toU2022State.cs[2]!=0) {
 949                         if(myData2022->toU2022State.g<2) {
 950                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 951                         }
 952                         myData2022->toU2022State.g=2;
 953                     } else {
 954                         /* illegal to have SS2 before a matching designator */
 955                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 956                     }
 957                     break;
 958                 case SS3_STATE:
 959                     if(myData2022->toU2022State.cs[3]!=0) {
 960                         if(myData2022->toU2022State.g<2) {
 961                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 962                         }
 963                         myData2022->toU2022State.g=3;
 964                     } else {
 965                         /* illegal to have SS3 before a matching designator */
 966                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 967                     }
 968                     break;
 969                 case ISO_IR_165:
 970                     if(myData2022->version==0) {
 971                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 972                         break;
 973                     }
 974                     /*fall through*/
 975                 case GB2312_1:
 976                     /*fall through*/
 977                 case CNS_11643_1:
 978                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
 979                     break;
 980                 case CNS_11643_2:
 981                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
 982                     break;
 983                 default:
 984                     /* other CNS 11643 planes */
 985                     if(myData2022->version==0) {
 986                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 987                     } else {
 988                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
 989                     }
 990                     break;
 991                 }
 992             }
 993             break;
 994         case ISO_2022_KR:
 995             if(offset==0x30){
 996                 /* nothing to be done, just accept this one escape sequence */
 997             } else {
 998                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 999             }
1000             break;
1001 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
1002
1003         default:
1004             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1005             break;
1006         }
1007     }
1008     if(U_SUCCESS(*err)) {
1009         _this->toULength = 0;
1010     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1011         if(_this->toULength>1) {
1012             /*
1013              * Ticket 5691: consistent illegal sequences:
1014              * - We include at least the first byte (ESC) in the illegal sequence.
1015              * - If any of the non-initial bytes could be the start of a character,
1016              *   we stop the illegal sequence before the first one of those.
1017              *   In escape sequences, all following bytes are "printable", that is,
1018              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1019              *   they are valid single/lead bytes.
1020              *   For simplicity, we always only report the initial ESC byte as the
1021              *   illegal sequence and back out all other bytes we looked at.
1022              */
1023             /* Back out some bytes. */
1024             int8_t backOutDistance=_this->toULength-1;
1025             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1026             if(backOutDistance<=bytesFromThisBuffer) {
1027                 /* same as initialToULength<=1 */
1028                 *source-=backOutDistance;
1029             } else {
1030                 /* Back out bytes from the previous buffer: Need to replay them. */
1031                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1032                 /* same as -(initialToULength-1) */
1033                 /* preToULength is negative! */
1034                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1035                 *source-=bytesFromThisBuffer;
1036             }
1037             _this->toULength=1;
1038         }
1039     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1040         _this->toUCallbackReason = UCNV_UNASSIGNED;
1041     }
1042 }
1043
1044 #if !UCONFIG_ONLY_HTML_CONVERSION
1045 /*Checks the characters of the buffer against valid 2022 escape sequences
1046 *if the match we return a pointer to the initial start of the sequence otherwise
1047 *we return sourceLimit
1048 */
1049 /*for 2022 looks ahead in the stream
1050  *to determine the longest possible convertible
1051  *data stream
1052  */
1053 static inline const char*
1054 getEndOfBuffer_2022(const char** source,
1055                    const char* sourceLimit,
1056                    UBool /*flush*/){
1057
1058     const char* mySource = *source;
1059
1060 #ifdef U_ENABLE_GENERIC_ISO_2022
1061     if (*source >= sourceLimit)
1062         return sourceLimit;
1063
1064     do{
1065
1066         if (*mySource == ESC_2022){
1067             int8_t i;
1068             int32_t key = 0;
1069             int32_t offset;
1070             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1071
1072             /* Kludge: I could not
1073             * figure out the reason for validating an escape sequence
1074             * twice - once here and once in changeState_2022().
1075             * is it possible to have an ESC character in a ISO2022
1076             * byte stream which is valid in a code page? Is it legal?
1077             */
1078             for (i=0;
1079             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1080             i++) {
1081                 value =  getKey_2022(*(mySource+i), &key, &offset);
1082             }
1083             if (value > 0 || *mySource==ESC_2022)
1084                 return mySource;
1085
1086             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1087                 return sourceLimit;
1088         }
1089     }while (++mySource < sourceLimit);
1090
1091     return sourceLimit;
1092 #else
1093     while(mySource < sourceLimit && *mySource != ESC_2022) {
1094         ++mySource;
1095     }
1096     return mySource;
1097 #endif
1098 }
1099 #endif
1100
1101 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1102  * any future change in _MBCSFromUChar32() function should be reflected here.
1103  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1104  */
1105 static inline int32_t
1106 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1107                                          UChar32 c,
1108                                          uint32_t* value,
1109                                          UBool useFallback,
1110                                          int outputType)
1111 {
1112     const int32_t *cx;
1113     const uint16_t *table;
1114     uint32_t stage2Entry;
1115     uint32_t myValue;
1116     int32_t length;
1117     const uint8_t *p;
1118     /*
1119      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1120      * Use internal version of ucnv_open() that verifies that the new structures are available,
1121      * else U_INTERNAL_PROGRAM_ERROR.
1122      */
1123     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1124     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1125         table=sharedData->mbcs.fromUnicodeTable;
1126         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1127         /* get the bytes and the length for the output */
1128         if(outputType==MBCS_OUTPUT_2){
1129             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1130             if(myValue<=0xff) {
1131                 length=1;
1132             } else {
1133                 length=2;
1134             }
1135         } else /* outputType==MBCS_OUTPUT_3 */ {
1136             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1137             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1138             if(myValue<=0xff) {
1139                 length=1;
1140             } else if(myValue<=0xffff) {
1141                 length=2;
1142             } else {
1143                 length=3;
1144             }
1145         }
1146         /* is this code point assigned, or do we use fallbacks? */
1147         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1148             /* assigned */
1149             *value=myValue;
1150             return length;
1151         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1152             /*
1153              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1154              * There is no way with this data structure for fallback output
1155              * to be a zero byte.
1156              */
1157             *value=myValue;
1158             return -length;
1159         }
1160     }
1161
1162     cx=sharedData->mbcs.extIndexes;
1163     if(cx!=NULL) {
1164         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1165     }
1166
1167     /* unassigned */
1168     return 0;
1169 }
1170
1171 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1172  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1173  * @param retval pointer to output byte
1174  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1175  */
1176 static inline int32_t
1177 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1178                                        UChar32 c,
1179                                        uint32_t* retval,
1180                                        UBool useFallback)
1181 {
1182     const uint16_t *table;
1183     int32_t value;
1184     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1185     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1186         return 0;
1187     }
1188     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1189     table=sharedData->mbcs.fromUnicodeTable;
1190     /* get the byte for the output */
1191     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1192     /* is this code point assigned, or do we use fallbacks? */
1193     *retval=(uint32_t)(value&0xff);
1194     if(value>=0xf00) {
1195         return 1;  /* roundtrip */
1196     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1197         return -1;  /* fallback taken */
1198     } else {
1199         return 0;  /* no mapping */
1200     }
1201 }
1202
/*
 * Accepts a 2-byte value only if each byte lies in A1..FE (strict EUC DBCS)
 * and then moves it into the ISO 2022 range 21..7E by subtracting 0x80 from
 * each byte.
 * Returns 0 for anything outside that range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* distance from A1A1, truncated to 16 bits, bounds the byte pair */
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    /* distance from A1, truncated to 8 bits, bounds the trail byte */
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1219
1220 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the 2022 code
 * point and returns the result if both bytes then lie in A1..FE.
 * Otherwise the 2022 code point is returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairOffset = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(shifted - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1236 #endif
1237
1238 #ifdef U_ENABLE_GENERIC_ISO_2022
1239
1240 /**********************************************************************************
1241 *  ISO-2022 Converter
1242 *
1243 *
1244 */
1245
/*
 * To-Unicode conversion loop for the generic ISO-2022 converter
 * (only compiled when U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * The input is processed in alternating steps:
 * - when no escape sequence is pending (key==0), the bytes up to the limit
 *   returned by getEndOfBuffer_2022() are converted with
 *   myData->currentConverter (opened as "ASCII" if none is set yet);
 * - then changeState_2022() consumes the escape sequence at that position.
 *
 * The function returns early on errors, on target overflow, when all input
 * has been consumed, and - if offsets are requested - whenever progress was
 * made.
 *
 * @param args conversion arguments (source/target pointers, offsets, flush)
 * @param err  receives the error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no converter selected yet: start out with ASCII */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily swap in the sub-converter; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* either an escape sequence was already pending, or we stopped at an ESC byte */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1349
1350 #endif
1351
1352 /*
1353  * To Unicode Callback helper function
1354  */
1355 static void
1356 toUnicodeCallback(UConverter *cnv,
1357                   const uint32_t sourceChar, const uint32_t targetUniChar,
1358                   UErrorCode* err){
1359     if(sourceChar>0xff){
1360         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1361         cnv->toUBytes[1] = (uint8_t)sourceChar;
1362         cnv->toULength = 2;
1363     }
1364     else{
1365         cnv->toUBytes[0] =(char) sourceChar;
1366         cnv->toULength = 1;
1367     }
1368
1369     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1370         *err = U_INVALID_CHAR_FOUND;
1371     }
1372     else{
1373         *err = U_ILLEGAL_CHAR_FOUND;
1374     }
1375 }
1376
1377 /**************************************ISO-2022-JP*************************************************/
1378
1379 /************************************** IMPORTANT **************************************************
1380 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1381 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1382 * The converter iterates over each Unicode codepoint
1383 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1384 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1385 * would do as far as possible.
1386 *
1387 * If the implementation of these macros or structure of sharedData struct change in the future, make
1388 * sure that ISO-2022 is also changed.
1389 ***************************************************************************************************
1390 */
1391
1392 /***************************************************************************************************
1393 * Rules for ISO-2022-jp encoding
1394 * (i)   Escape sequences must be fully contained within a line; they should not
1395 *       span new lines or CRs
1396 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1397 *       JIS-Roman character escape sequence should follow before the line terminates
1398 * (iii) If the first character on the line is represented by two bytes then a two
1399 *       byte character escape sequence should precede it
1400 * (iv)  If no escape sequence is encountered then the characters are ASCII
1401 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1402 *       and invoked with SS2 (ESC N).
1403 * (vi)  If there is any G0 designation in text, there must be a switch to
1404 *       ASCII or to JIS X 0201-Roman before a space character (but not
1405 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1406 *       characters such as tab or CRLF.
1407 * (vii) Supported encodings:
1408 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1409 *
1410 *  source : RFC-1554
1411 *
1412 *          JISX201, JISX208,JISX212 : new .cnv data files created
1413 *          KSC5601 : alias to ibm-949 mapping table
1414 *          GB2312 : alias to ibm-1386 mapping table
1415 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1416 *          ISO-8859-7 : alias to ibm-9409 mapping table
1417 */
1418
1419 /* preference order of JP charsets */
1420 static const StateEnum jpCharsetPref[]={
1421     ASCII,
1422     JISX201,
1423     ISO8859_1,
1424     ISO8859_7,
1425     JISX208,
1426     JISX212,
1427     GB2312,
1428     KSC5601,
1429     HWKANA_7BIT
1430 };
1431
/*
 * Designation escape sequences, one row per charset.
 * Rows are indexed by the numeric value of the charset enum constant
 * (for example JISX201 = 3), NOT by the position in jpCharsetPref[]!
 * Every row is NUL-padded to 6 bytes.
 */
static const char escSeqChars[][6] ={
    /* ASCII        <ESC>(B  */ "\x1B\x28\x42",
    /* ISO-8859-1   <ESC>.A  */ "\x1B\x2E\x41",
    /* ISO-8859-7   <ESC>.F  */ "\x1B\x2E\x46",
    /* JISX-201     <ESC>(J  */ "\x1B\x28\x4A",
    /* JISX-208     <ESC>$B  */ "\x1B\x24\x42",
    /* JISX-212     <ESC>$(D */ "\x1B\x24\x28\x44",
    /* GB2312       <ESC>$A  */ "\x1B\x24\x41",
    /* KSC5601      <ESC>$(C */ "\x1B\x24\x28\x43",
    /* HWKANA_7BIT  <ESC>(I  */ "\x1B\x28\x49"
};
/*
 * Byte length of each designation escape sequence,
 * in charset enum order: ASCII first, HWKANA_7BIT last.
 */
static  const int8_t escSeqCharsLen[] ={
    3, 3, 3,    /* <ESC>(B ASCII,    <ESC>.A ISO-8859-1, <ESC>.F  ISO-8859-7  */
    3, 3, 4,    /* <ESC>(J JISX-201, <ESC>$B JISX-208,   <ESC>$(D JISX-212    */
    3, 4, 3     /* <ESC>$A GB2312,   <ESC>$(C KSC5601,   <ESC>(I  HWKANA_7BIT */
};
1459
1460 /*
1461 * The iteration over various code pages works this way:
1462 * i)   Get the currentState from myConverterData->currentState
1463 * ii)  Check if the character is mapped to a valid character in the currentState
1464 *      Yes ->  a) set the initIterState to currentState
1465 *       b) remain in this state until an invalid character is found
1466 *      No  ->  a) go to the next code page and find the character
1467 * iii) Before changing the state increment the current state check if the current state
1468 *      is equal to the initIterState
1469 *      Yes ->  A character that cannot be represented in any of the supported encodings
1470 *       break and return a U_INVALID_CHARACTER error
1471 *      No  ->  Continue and find the character in next code page
1472 *
1473 *
1474 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1475 */
1476
/*
 * Map a JIS X 0201 byte 00..7F to Unicode.
 * Only two positions differ from the identity mapping:
 * 0x5c becomes U+00A5 and 0x7e becomes U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        /* every other value maps to itself */
        return value;
    }
}
1490
/*
 * Map a Unicode code point to a JIS X 0201 byte 00..7F.
 * U+00A5 and U+203E take the byte positions 0x5c and 0x7e, so U+005C and
 * U+007E themselves are unmappable.
 * Returns 0xfffe for every code point without a mapping.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these byte positions are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1505
/*
 * Convert a Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0) to the corresponding JIS X 0208 pair of 21..7E bytes.
 * The input is expected to be a valid Shift-JIS pair; anything above 0xEFFC
 * lies beyond JIS X 0208 and yields 0.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t row;
    uint8_t low;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    low = (uint8_t)value;

    /* lead byte: 81..9F and E0..EF collapse into one contiguous range, then double */
    row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000;
    row <<= 1;

    if(low > 0x9e) {
        /* trail 9F..FC: second row of the pair */
        return row | (low - 0x7e);
    }

    /* trail up to 9E: first row of the pair; 7F is skipped in Shift-JIS */
    row -= 0x100;
    return row | (low - ((low <= 0x7e) ? 0x1f : 0x20));
}
1541
/*
 * Convert a JIS X 0208 pair of 21..7E bytes (c1, c2) to Shift-JIS,
 * written to bytes[0] (lead) and bytes[1] (trail).
 * When either input byte is outside 21..7E the result must not be valid
 * Shift-JIS, so that the converter rejects it. Some invalid inputs already
 * produce equally invalid Shift-JIS bytes and are not tested explicitly;
 * the others are forced to 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead, trail;

    if((c1 & 1) == 0) {
        /* even first byte: trail lands in 9F..FC */
        lead = c1;
        trail = ((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) ? (uint8_t)(c2 + 0x7e) : 0;
    } else {
        /* odd first byte: trail lands in 40..7E or 80..9E */
        lead = (uint8_t)(c1 + 1);
        if(c2 > 0x7e) {
            trail = 0;  /* invalid */
        } else if(c2 > 0x5f) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = (uint8_t)(c2 + 0x1f);
        }
    }

    lead >>= 1;
    if(lead > 0x3f) {
        lead = 0;  /* invalid */
    } else if(lead > 0x2f) {
        lead += 0xb0;
    } else {
        lead += 0x70;
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1578
1579 /*
1580  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1581  * Katakana.
1582  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1583  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1584  * These were the only fallbacks in ICU's jisx-208.ucm file.
1585  */
1586 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1587     0x2123,  /* U+FF61 */
1588     0x2156,
1589     0x2157,
1590     0x2122,
1591     0x2126,
1592     0x2572,
1593     0x2521,
1594     0x2523,
1595     0x2525,
1596     0x2527,
1597     0x2529,
1598     0x2563,
1599     0x2565,
1600     0x2567,
1601     0x2543,
1602     0x213C,  /* U+FF70 */
1603     0x2522,
1604     0x2524,
1605     0x2526,
1606     0x2528,
1607     0x252A,
1608     0x252B,
1609     0x252D,
1610     0x252F,
1611     0x2531,
1612     0x2533,
1613     0x2535,
1614     0x2537,
1615     0x2539,
1616     0x253B,
1617     0x253D,
1618     0x253F,  /* U+FF80 */
1619     0x2541,
1620     0x2544,
1621     0x2546,
1622     0x2548,
1623     0x254A,
1624     0x254B,
1625     0x254C,
1626     0x254D,
1627     0x254E,
1628     0x254F,
1629     0x2552,
1630     0x2555,
1631     0x2558,
1632     0x255B,
1633     0x255E,
1634     0x255F,  /* U+FF90 */
1635     0x2560,
1636     0x2561,
1637     0x2562,
1638     0x2564,
1639     0x2566,
1640     0x2568,
1641     0x2569,
1642     0x256A,
1643     0x256B,
1644     0x256C,
1645     0x256D,
1646     0x256F,
1647     0x2573,
1648     0x212B,
1649     0x212C   /* U+FF9F */
1650 };
1651
/*
 * Converts UTF-16 input to ISO-2022-JP output (the variant is selected by
 * converterData->version) and records offsets when args->offsets is non-NULL.
 *
 * Everything is passed through args:
 *   source/sourceLimit - UTF-16 input; args->source is advanced on return
 *   target/targetLimit - byte output;  args->target is advanced on return
 *   offsets            - optional; gets a source index for each byte written here
 *   flush              - if set, and all input was consumed with no pending lead
 *                        surrogate, the output state is returned to G0/ASCII
 *   converter          - extraInfo holds the UConverterDataISO2022 state;
 *                        fromUChar32 carries a lead surrogate across buffers and
 *                        holds the offending code point when an error is set
 *
 * Errors set in *err by this function:
 *   U_ILLEGAL_CHAR_FOUND    - unpaired surrogate, or SO/SI/ESC in the input
 *   U_INVALID_CHAR_FOUND    - none of the candidate charsets maps the code point
 *   U_BUFFER_OVERFLOW_ERROR - input remains but the target has no room
 * fromUWriteUInt8() may set *err as well (presumably on target overflow;
 * it is defined elsewhere -- confirm there).
 *
 * For each code point: build or reuse the ordered choices[] list of charsets,
 * accept the first roundtrip mapping (or the first fallback if there is no
 * roundtrip), then write SI / designation escape / shift as needed, followed by
 * the one or two data bytes.
 */
1652 static void
1653 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1654     UConverter *cnv = args->converter;
1655     UConverterDataISO2022 *converterData;
1656     ISO2022State *pFromU2022State;
1657     uint8_t *target = (uint8_t *) args->target;
1658     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1659     const UChar* source = args->source;
1660     const UChar* sourceLimit = args->sourceLimit;
1661     int32_t* offsets = args->offsets;
1662     UChar32 sourceChar;
1663     char buffer[8];  /* output for one code point: [SI] [designation] [shift] data byte(s) */
1664     int32_t len, outLen;
1665     int8_t choices[10];  /* charsets to try, in order; rebuilt whenever choiceCount==0 */
1666     int32_t choiceCount;
1667     uint32_t targetValue = 0;
1668     UBool useFallback;
1669
1670     int32_t i;
1671     int8_t cs, g;  /* charset and G set (0, 1 or 2) selected for the current code point */
1672
1673     /* set up the state */
1674     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1675     pFromU2022State   = &converterData->fromU2022State;
1676
1677     choiceCount = 0;
1678
1679     /* check if the last codepoint of previous buffer was a lead surrogate*/
1680     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1681         goto getTrail;
1682     }
1683
1684     while(source < sourceLimit) {
1685         if(target < targetLimit) {
1686
1687             sourceChar  = *(source++);
1688             /*check if the char is a First surrogate*/
1689             if(U16_IS_SURROGATE(sourceChar)) {
1690                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1691 getTrail:
1692                     /*look ahead to find the trail surrogate*/
1693                     if(source < sourceLimit) {
1694                         /* test the following code unit */
1695                         UChar trail=(UChar) *source;
1696                         if(U16_IS_TRAIL(trail)) {
1697                             source++;
1698                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1699                             cnv->fromUChar32=0x00;
1700                             /* convert this supplementary code point */
1701                             /* exit this condition tree */
1702                         } else {
1703                             /* this is an unmatched lead code unit (1st surrogate) */
1704                             /* callback(illegal) */
1705                             *err=U_ILLEGAL_CHAR_FOUND;
1706                             cnv->fromUChar32=sourceChar;
1707                             break;
1708                         }
1709                     } else {
1710                         /* no more input */
1711                         cnv->fromUChar32=sourceChar;
1712                         break;
1713                     }
1714                 } else {
1715                     /* this is an unmatched trail code unit (2nd surrogate) */
1716                     /* callback(illegal) */
1717                     *err=U_ILLEGAL_CHAR_FOUND;
1718                     cnv->fromUChar32=sourceChar;
1719                     break;
1720                 }
1721             }
1722
1723             /* do not convert SO/SI/ESC */
1724             if(IS_2022_CONTROL(sourceChar)) {
1725                 /* callback(illegal) */
1726                 *err=U_ILLEGAL_CHAR_FOUND;
1727                 cnv->fromUChar32=sourceChar;
1728                 break;
1729             }
1730
1731             /* do the conversion */
1732
1733             if(choiceCount == 0) {
1734                 uint16_t csm;
1735
1736                 /*
1737                  * The csm variable keeps track of which charsets are allowed
1738                  * and not used yet while building the choices[].
1739                  */
1740                 csm = jpCharsetMasks[converterData->version];
1741                 choiceCount = 0;
1742
1743                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1744                 if(converterData->version == 3 || converterData->version == 4) {
1745                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1746                 }
1747                 /* Do not try single-byte half-width Katakana for other versions. */
1748                 csm &= ~CSM(HWKANA_7BIT);
1749
1750                 /* try the current G0 charset */
1751                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1752                 csm &= ~CSM(cs);
1753
1754                 /* try the current G2 charset */
1755                 if((cs = pFromU2022State->cs[2]) != 0) {
1756                     choices[choiceCount++] = cs;
1757                     csm &= ~CSM(cs);
1758                 }
1759
1760                 /* try all the other possible charsets */
1761                 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1762                     cs = (int8_t)jpCharsetPref[i];
1763                     if(CSM(cs) & csm) {
1764                         choices[choiceCount++] = cs;
1765                         csm &= ~CSM(cs);
1766                     }
1767                 }
1768             }
1769
1770             cs = g = 0;
1771             /*
1772              * len==0: no mapping found yet
1773              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1774              * len>0: found a roundtrip result, done
1775              */
1776             len = 0;
1777             /*
1778              * We will turn off useFallback after finding a fallback,
1779              * but we still get fallbacks from PUA code points as usual.
1780              * Therefore, we will also need to check that we don't overwrite
1781              * an early fallback with a later one.
1782              */
1783             useFallback = cnv->useFallback;
1784
1785             for(i = 0; i < choiceCount && len <= 0; ++i) {
1786                 uint32_t value;
1787                 int32_t len2;
1788                 int8_t cs0 = choices[i];
1789                 switch(cs0) {
1790                 case ASCII:
1791                     if(sourceChar <= 0x7f) {
1792                         targetValue = (uint32_t)sourceChar;
1793                         len = 1;
1794                         cs = cs0;
1795                         g = 0;
1796                     }
1797                     break;
1798                 case ISO8859_1:
1799                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1800                         targetValue = (uint32_t)sourceChar - 0x80;
1801                         len = 1;
1802                         cs = cs0;
1803                         g = 2;
1804                     }
1805                     break;
1806                 case HWKANA_7BIT:
1807                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1808                         if(converterData->version==3) {
1809                             /* JIS7: use G1 (SO) */
1810                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1811                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1812                             len = 1;
1813                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1814                             g = 1;
1815                         } else if(converterData->version==4) {
1816                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1817                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1818                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1819                             len = 1;
1820
1821                             cs = pFromU2022State->cs[0];
1822                             if(IS_JP_DBCS(cs)) {
1823                                 /* switch from a DBCS charset to JISX201 */
1824                                 cs = (int8_t)JISX201;
1825                             }
1826                             /* else stay in the current G0 charset */
1827                             g = 0;
1828                         }
1829                         /* else do not use HWKANA_7BIT with other versions */
1830                     }
1831                     break;
1832                 case JISX201:
1833                     /* G0 SBCS */
1834                     value = jisx201FromU(sourceChar);
1835                     if(value <= 0x7f) {
1836                         targetValue = value;
1837                         len = 1;
1838                         cs = cs0;
1839                         g = 0;
1840                         useFallback = FALSE;
1841                     }
1842                     break;
1843                 case JISX208:
1844                     /* G0 DBCS from Shift-JIS table */
1845                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1846                                 converterData->myConverterArray[cs0],
1847                                 sourceChar, &value,
1848                                 useFallback, MBCS_OUTPUT_2);
1849                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1850                         value = _2022FromSJIS(value);
1851                         if(value != 0) {
1852                             targetValue = value;
1853                             len = len2;
1854                             cs = cs0;
1855                             g = 0;
1856                             useFallback = FALSE;
1857                         }
1858                     } else if(len == 0 && useFallback &&
1859                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1860                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1861                         len = -2;
1862                         cs = cs0;
1863                         g = 0;
1864                         useFallback = FALSE;
1865                     }
1866                     break;
1867                 case ISO8859_7:
1868                     /* G0 SBCS forced to 7-bit output */
1869                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1870                                 converterData->myConverterArray[cs0],
1871                                 sourceChar, &value,
1872                                 useFallback);
1873                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1874                         targetValue = value - 0x80;
1875                         len = len2;
1876                         cs = cs0;
1877                         g = 2;
1878                         useFallback = FALSE;
1879                     }
1880                     break;
1881                 default:
1882                     /* G0 DBCS */
1883                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1884                                 converterData->myConverterArray[cs0],
1885                                 sourceChar, &value,
1886                                 useFallback, MBCS_OUTPUT_2);
1887                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1888                         if(cs0 == KSC5601) {
1889                             /*
1890                              * Check for valid bytes for the encoding scheme.
1891                              * This is necessary because the sub-converter (windows-949)
1892                              * has a broader encoding scheme than is valid for 2022.
1893                              */
1894                             value = _2022FromGR94DBCS(value);
1895                             if(value == 0) {
1896                                 break;
1897                             }
1898                         }
1899                         targetValue = value;
1900                         len = len2;
1901                         cs = cs0;
1902                         g = 0;
1903                         useFallback = FALSE;
1904                     }
1905                     break;
1906                 }
1907             }
1908
1909             if(len != 0) {
1910                 if(len < 0) {
1911                     len = -len;  /* fallback */
1912                 }
1913                 outLen = 0; /* count output bytes */
1914
1915                 /* write SI if necessary (only for JIS7) */
1916                 if(pFromU2022State->g == 1 && g == 0) {
1917                     buffer[outLen++] = UCNV_SI;
1918                     pFromU2022State->g = 0;
1919                 }
1920
1921                 /* write the designation sequence if necessary */
1922                 if(cs != pFromU2022State->cs[g]) {
1923                     int32_t escLen = escSeqCharsLen[cs];
1924                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1925                     outLen += escLen;
1926                     pFromU2022State->cs[g] = cs;
1927
1928                     /* invalidate the choices[] */
1929                     choiceCount = 0;
1930                 }
1931
1932                 /* write the shift sequence if necessary */
1933                 if(g != pFromU2022State->g) {
1934                     switch(g) {
1935                     /* case 0 handled before writing escapes */
1936                     case 1:
1937                         buffer[outLen++] = UCNV_SO;
1938                         pFromU2022State->g = 1;
1939                         break;
1940                     default: /* case 2: write ESC N; pFromU2022State->g is left unchanged */
1941                         buffer[outLen++] = 0x1b;
1942                         buffer[outLen++] = 0x4e;
1943                         break;
1944                     /* no case 3: no SS3 in ISO-2022-JP-x */
1945                     }
1946                 }
1947
1948                 /* write the output bytes */
1949                 if(len == 1) {
1950                     buffer[outLen++] = (char)targetValue;
1951                 } else /* len == 2 */ {
1952                     buffer[outLen++] = (char)(targetValue >> 8);
1953                     buffer[outLen++] = (char)targetValue;
1954                 }
1955             } else {
1956                 /*
1957                  * if we cannot find the character after checking all codepages
1958                  * then this is an error
1959                  */
1960                 *err = U_INVALID_CHAR_FOUND;
1961                 cnv->fromUChar32=sourceChar;
1962                 break;
1963             }
1964
1965             if(sourceChar == CR || sourceChar == LF) {
1966                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1967                 pFromU2022State->cs[2] = 0;
1968                 choiceCount = 0;
1969             }
1970
1971             /* output outLen>0 bytes in buffer[] */
1972             if(outLen == 1) {
1973                 *target++ = buffer[0];
1974                 if(offsets) {
1975                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1976                 }
1977             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1978                 *target++ = buffer[0];
1979                 *target++ = buffer[1];
1980                 if(offsets) {
1981                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1982                     *offsets++ = sourceIndex;
1983                     *offsets++ = sourceIndex;
1984                 }
1985             } else {
1986                 fromUWriteUInt8(
1987                     cnv,
1988                     buffer, outLen,
1989                     &target, (const char *)targetLimit,
1990                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1991                     err);
1992                 if(U_FAILURE(*err)) {
1993                     break;
1994                 }
1995             }
1996         } /* end if(target < targetLimit) */
1997         else{
1998             *err =U_BUFFER_OVERFLOW_ERROR;
1999             break;
2000         }
2001
2002     }/* end while(source < sourceLimit) */
2003
2004     /*
2005      * the end of the input stream and detection of truncated input
2006      * are handled by the framework, but for ISO-2022-JP conversion
2007      * we need to be in ASCII mode at the very end
2008      *
2009      * conditions:
2010      *   successful
2011      *   in SO mode or not in ASCII mode
2012      *   end of input and no truncated input
2013      */
2014     if( U_SUCCESS(*err) &&
2015         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2016         args->flush && source>=sourceLimit && cnv->fromUChar32==0
2017     ) {
2018         int32_t sourceIndex;
2019
2020         outLen = 0;
2021
2022         if(pFromU2022State->g != 0) {
2023             buffer[outLen++] = UCNV_SI;
2024             pFromU2022State->g = 0;
2025         }
2026
2027         if(pFromU2022State->cs[0] != ASCII) {
2028             int32_t escLen = escSeqCharsLen[ASCII];
2029             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2030             outLen += escLen;
2031             pFromU2022State->cs[0] = (int8_t)ASCII;
2032         }
2033
2034         /* get the source index of the last input character */
2035         /*
2036          * TODO this would be simpler and more reliable if we used a pair
2037          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2038          * so that we could simply use the prevSourceIndex here;
2039          * this code gives an incorrect result for the rare case of an unmatched
2040          * trail surrogate that is alone in the last buffer of the text stream
2041          */
2042         sourceIndex=(int32_t)(source-args->source);
2043         if(sourceIndex>0) {
2044             --sourceIndex;
2045             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2046                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2047             ) {
2048                 --sourceIndex;
2049             }
2050         } else {
2051             sourceIndex=-1;
2052         }
2053
2054         fromUWriteUInt8(
2055             cnv,
2056             buffer, outLen,
2057             &target, (const char *)targetLimit,
2058             &offsets, sourceIndex,
2059             err);
2060     }
2061
2062     /*save the state and return */
2063     args->source = source;
2064     args->target = (char*)target;
2065 }
2066
2067 /*************** to unicode *******************/
2068
2069 static void
2070 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2071                                                UErrorCode* err){
2072     char tempBuf[2];
2073     const char *mySource = (char *) args->source;
2074     UChar *myTarget = args->target;
2075     const char *mySourceLimit = args->sourceLimit;
2076     uint32_t targetUniChar = 0x0000;
2077     uint32_t mySourceChar = 0x0000;
2078     uint32_t tmpSourceChar = 0x0000;
2079     UConverterDataISO2022* myData;
2080     ISO2022State *pToU2022State;
2081     StateEnum cs;
2082
2083     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2084     pToU2022State = &myData->toU2022State;
2085
2086     if(myData->key != 0) {
2087         /* continue with a partial escape sequence */
2088         goto escape;
2089     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2090         /* continue with a partial double-byte character */
2091         mySourceChar = args->converter->toUBytes[0];
2092         args->converter->toULength = 0;
2093         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2094         targetUniChar = missingCharMarker;
2095         goto getTrailByte;
2096     }
2097
2098     while(mySource < mySourceLimit){
2099
2100         targetUniChar =missingCharMarker;
2101
2102         if(myTarget < args->targetLimit){
2103
2104             mySourceChar= (unsigned char) *mySource++;
2105
2106             switch(mySourceChar) {
2107             case UCNV_SI:
2108                 if(myData->version==3) {
2109                     pToU2022State->g=0;
2110                     continue;
2111                 } else {
2112                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2113                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2114                     break;
2115                 }
2116
2117             case UCNV_SO:
2118                 if(myData->version==3) {
2119                     /* JIS7: switch to G1 half-width Katakana */
2120                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2121                     pToU2022State->g=1;
2122                     continue;
2123                 } else {
2124                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2125                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2126                     break;
2127                 }
2128
2129             case ESC_2022:
2130                 mySource--;
2131 escape:
2132                 {
2133                     const char * mySourceBefore = mySource;
2134                     int8_t toULengthBefore = args->converter->toULength;
2135
2136                     changeState_2022(args->converter,&(mySource),
2137                         mySourceLimit, ISO_2022_JP,err);
2138
2139                     /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2140                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2141                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2142                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2143                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2144                     }
2145                 }
2146
2147                 /* invalid or illegal escape sequence */
2148                 if(U_FAILURE(*err)){
2149                     args->target = myTarget;
2150                     args->source = mySource;
2151                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2152                     return;
2153                 }
2154                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2155                 if(myData->key==0) {
2156                     myData->isEmptySegment = TRUE;
2157                 }
2158                 continue;
2159
2160             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2161
2162             case CR:
2163                 /*falls through*/
2164             case LF:
2165                 /* automatically reset to single-byte mode */
2166                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2167                     pToU2022State->cs[0] = (int8_t)ASCII;
2168                 }
2169                 pToU2022State->cs[2] = 0;
2170                 pToU2022State->g = 0;
2171                 /* falls through */
2172             default:
2173                 /* convert one or two bytes */
2174                 myData->isEmptySegment = FALSE;
2175                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2176                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2177                     !IS_JP_DBCS(cs)
2178                 ) {
2179                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2180                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2181
2182                     /* return from a single-shift state to the previous one */
2183                     if(pToU2022State->g >= 2) {
2184                         pToU2022State->g=pToU2022State->prevG;
2185                     }
2186                 } else switch(cs) {
2187                 case ASCII:
2188                     if(mySourceChar <= 0x7f) {
2189                         targetUniChar = mySourceChar;
2190                     }
2191                     break;
2192                 case ISO8859_1:
2193                     if(mySourceChar <= 0x7f) {
2194                         targetUniChar = mySourceChar + 0x80;
2195                     }
2196                     /* return from a single-shift state to the previous one */
2197                     pToU2022State->g=pToU2022State->prevG;
2198                     break;
2199                 case ISO8859_7:
2200                     if(mySourceChar <= 0x7f) {
2201                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2202                         targetUniChar =
2203                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2204                                 myData->myConverterArray[cs],
2205                                 mySourceChar + 0x80);
2206                     }
2207                     /* return from a single-shift state to the previous one */
2208                     pToU2022State->g=pToU2022State->prevG;
2209                     break;
2210                 case JISX201:
2211                     if(mySourceChar <= 0x7f) {
2212                         targetUniChar = jisx201ToU(mySourceChar);
2213                     }
2214                     break;
2215                 case HWKANA_7BIT:
2216                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2217                         /* 7-bit halfwidth Katakana */
2218                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2219                     }
2220                     break;
2221                 default:
2222                     /* G0 DBCS */
2223                     if(mySource < mySourceLimit) {
2224                         int leadIsOk, trailIsOk;
2225                         uint8_t trailByte;
2226 getTrailByte:
2227                         trailByte = (uint8_t)*mySource;
2228                         /*
2229                          * Ticket 5691: consistent illegal sequences:
2230                          * - We include at least the first byte in the illegal sequence.
2231                          * - If any of the non-initial bytes could be the start of a character,
2232                          *   we stop the illegal sequence before the first one of those.
2233                          *
2234                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2235                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2236                          * Otherwise we convert or report the pair of bytes.
2237                          */
2238                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2239                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2240                         if (leadIsOk && trailIsOk) {
2241                             ++mySource;
2242                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2243                             if(cs == JISX208) {
2244                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2245                                 mySourceChar = tmpSourceChar;
2246                             } else {
2247                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2248                                 mySourceChar = tmpSourceChar;
2249                                 if (cs == KSC5601) {
2250                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2251                                 }
2252                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2253                                 tempBuf[1] = (char)(tmpSourceChar);
2254                             }
2255                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2256                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2257                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2258                             ++mySource;
2259                             /* add another bit so that the code below writes 2 bytes in case of error */
2260                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2261                         }
2262                     } else {
2263                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2264                         args->converter->toULength = 1;
2265                         goto endloop;
2266                     }
2267                 }  /* End of inner switch */
2268                 break;
2269             }  /* End of outer switch */
2270             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2271                 if(args->offsets){
2272                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2273                 }
2274                 *(myTarget++)=(UChar)targetUniChar;
2275             }
2276             else if(targetUniChar > missingCharMarker){
2277                 /* disassemble the surrogate pair and write to output*/
2278                 targetUniChar-=0x0010000;
2279                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2280                 if(args->offsets){
2281                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2282                 }
2283                 ++myTarget;
2284                 if(myTarget< args->targetLimit){
2285                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2286                     if(args->offsets){
2287                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2288                     }
2289                     ++myTarget;
2290                 }else{
2291                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2292                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2293                 }
2294
2295             }
2296             else{
2297                 /* Call the callback function*/
2298                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2299                 break;
2300             }
2301         }
2302         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2303             *err =U_BUFFER_OVERFLOW_ERROR;
2304             break;
2305         }
2306     }
2307 endloop:
2308     args->target = myTarget;
2309     args->source = mySource;
2310 }
2311
2312
2313 #if !UCONFIG_ONLY_HTML_CONVERSION
2314 /***************************************************************
2315 *   Rules for ISO-2022-KR encoding
2316 *   i) The KSC5601 designator sequence should appear only once in a file,
2317 *      at the beginning of a line before any KSC5601 characters. This usually
2318 *      means that it appears by itself on the first line of the file.
2319 *  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
2320 *      and SI to shift into single-byte mode.
2321 */
2322 static void
2323 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2324
2325     UConverter* saveConv = args->converter;
2326     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2327     args->converter=myConverterData->currentConverter;
2328
2329     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2330     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2331     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2332
2333     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2334         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2335             uprv_memcpy(
2336                 saveConv->charErrorBuffer,
2337                 myConverterData->currentConverter->charErrorBuffer,
2338                 myConverterData->currentConverter->charErrorBufferLength);
2339         }
2340         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2341         myConverterData->currentConverter->charErrorBufferLength = 0;
2342     }
2343     args->converter=saveConv;
2344 }
2345
2346 static void
2347 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2348
2349     const UChar *source = args->source;
2350     const UChar *sourceLimit = args->sourceLimit;
2351     unsigned char *target = (unsigned char *) args->target;
2352     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2353     int32_t* offsets = args->offsets;
2354     uint32_t targetByteUnit = 0x0000;
2355     UChar32 sourceChar = 0x0000;
2356     UBool isTargetByteDBCS;
2357     UBool oldIsTargetByteDBCS;
2358     UConverterDataISO2022 *converterData;
2359     UConverterSharedData* sharedData;
2360     UBool useFallback;
2361     int32_t length =0;
2362
2363     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2364     /* if the version is 1 then the user is requesting
2365      * conversion with ibm-25546 pass the arguments to
2366      * MBCS converter and return
2367      */
2368     if(converterData->version==1){
2369         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2370         return;
2371     }
2372
2373     /* initialize data */
2374     sharedData = converterData->currentConverter->sharedData;
2375     useFallback = args->converter->useFallback;
2376     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2377     oldIsTargetByteDBCS = isTargetByteDBCS;
2378
2379     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2380     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2381         goto getTrail;
2382     }
2383     while(source < sourceLimit){
2384
2385         targetByteUnit = missingCharMarker;
2386
2387         if(target < (unsigned char*) args->targetLimit){
2388             sourceChar = *source++;
2389
2390             /* do not convert SO/SI/ESC */
2391             if(IS_2022_CONTROL(sourceChar)) {
2392                 /* callback(illegal) */
2393                 *err=U_ILLEGAL_CHAR_FOUND;
2394                 args->converter->fromUChar32=sourceChar;
2395                 break;
2396             }
2397
2398             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2399             if(length < 0) {
2400                 length = -length;  /* fallback */
2401             }
2402             /* only DBCS or SBCS characters are expected*/
2403             /* DB characters with high bit set to 1 are expected */
2404             if( length > 2 || length==0 ||
2405                 (length == 1 && targetByteUnit > 0x7f) ||
2406                 (length == 2 &&
2407                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2408                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2409             ) {
2410                 targetByteUnit=missingCharMarker;
2411             }
2412             if (targetByteUnit != missingCharMarker){
2413
2414                 oldIsTargetByteDBCS = isTargetByteDBCS;
2415                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2416                   /* append the shift sequence */
2417                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2418
2419                     if (isTargetByteDBCS)
2420                         *target++ = UCNV_SO;
2421                     else
2422                         *target++ = UCNV_SI;
2423                     if(offsets)
2424                         *(offsets++) = (int32_t)(source - args->source-1);
2425                 }
2426                 /* write the targetUniChar  to target */
2427                 if(targetByteUnit <= 0x00FF){
2428                     if( target < targetLimit){
2429                         *(target++) = (unsigned char) targetByteUnit;
2430                         if(offsets){
2431                             *(offsets++) = (int32_t)(source - args->source-1);
2432                         }
2433
2434                     }else{
2435                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2436                         *err = U_BUFFER_OVERFLOW_ERROR;
2437                     }
2438                 }else{
2439                     if(target < targetLimit){
2440                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2441                         if(offsets){
2442                             *(offsets++) = (int32_t)(source - args->source-1);
2443                         }
2444                         if(target < targetLimit){
2445                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2446                             if(offsets){
2447                                 *(offsets++) = (int32_t)(source - args->source-1);
2448                             }
2449                         }else{
2450                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2451                             *err = U_BUFFER_OVERFLOW_ERROR;
2452                         }
2453                     }else{
2454                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2455                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2456                         *err = U_BUFFER_OVERFLOW_ERROR;
2457                     }
2458                 }
2459
2460             }
2461             else{
2462                 /* oops.. the code point is unassingned
2463                  * set the error and reason
2464                  */
2465
2466                 /*check if the char is a First surrogate*/
2467                 if(U16_IS_SURROGATE(sourceChar)) {
2468                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2469 getTrail:
2470                         /*look ahead to find the trail surrogate*/
2471                         if(source <  sourceLimit) {
2472                             /* test the following code unit */
2473                             UChar trail=(UChar) *source;
2474                             if(U16_IS_TRAIL(trail)) {
2475                                 source++;
2476                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2477                                 *err = U_INVALID_CHAR_FOUND;
2478                                 /* convert this surrogate code point */
2479                                 /* exit this condition tree */
2480                             } else {
2481                                 /* this is an unmatched lead code unit (1st surrogate) */
2482                                 /* callback(illegal) */
2483                                 *err=U_ILLEGAL_CHAR_FOUND;
2484                             }
2485                         } else {
2486                             /* no more input */
2487                             *err = U_ZERO_ERROR;
2488                         }
2489                     } else {
2490                         /* this is an unmatched trail code unit (2nd surrogate) */
2491                         /* callback(illegal) */
2492                         *err=U_ILLEGAL_CHAR_FOUND;
2493                     }
2494                 } else {
2495                     /* callback(unassigned) for a BMP code point */
2496                     *err = U_INVALID_CHAR_FOUND;
2497                 }
2498
2499                 args->converter->fromUChar32=sourceChar;
2500                 break;
2501             }
2502         } /* end if(myTargetIndex<myTargetLength) */
2503         else{
2504             *err =U_BUFFER_OVERFLOW_ERROR;
2505             break;
2506         }
2507
2508     }/* end while(mySourceIndex<mySourceLength) */
2509
2510     /*
2511      * the end of the input stream and detection of truncated input
2512      * are handled by the framework, but for ISO-2022-KR conversion
2513      * we need to be in ASCII mode at the very end
2514      *
2515      * conditions:
2516      *   successful
2517      *   not in ASCII mode
2518      *   end of input and no truncated input
2519      */
2520     if( U_SUCCESS(*err) &&
2521         isTargetByteDBCS &&
2522         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2523     ) {
2524         int32_t sourceIndex;
2525
2526         /* we are switching to ASCII */
2527         isTargetByteDBCS=FALSE;
2528
2529         /* get the source index of the last input character */
2530         /*
2531          * TODO this would be simpler and more reliable if we used a pair
2532          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2533          * so that we could simply use the prevSourceIndex here;
2534          * this code gives an incorrect result for the rare case of an unmatched
2535          * trail surrogate that is alone in the last buffer of the text stream
2536          */
2537         sourceIndex=(int32_t)(source-args->source);
2538         if(sourceIndex>0) {
2539             --sourceIndex;
2540             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2541                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2542             ) {
2543                 --sourceIndex;
2544             }
2545         } else {
2546             sourceIndex=-1;
2547         }
2548
2549         fromUWriteUInt8(
2550             args->converter,
2551             SHIFT_IN_STR, 1,
2552             &target, (const char *)targetLimit,
2553             &offsets, sourceIndex,
2554             err);
2555     }
2556
2557     /*save the state and return */
2558     args->source = source;
2559     args->target = (char*)target;
2560     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2561 }
2562
2563 /************************ To Unicode ***************************************/
2564
2565 static void
2566 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2567                                                             UErrorCode* err){
2568     char const* sourceStart;
2569     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2570
2571     UConverterToUnicodeArgs subArgs;
2572     int32_t minArgsSize;
2573
2574     /* set up the subconverter arguments */
2575     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2576         minArgsSize = args->size;
2577     } else {
2578         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2579     }
2580
2581     uprv_memcpy(&subArgs, args, minArgsSize);
2582     subArgs.size = (uint16_t)minArgsSize;
2583     subArgs.converter = myData->currentConverter;
2584
2585     /* remember the original start of the input for offsets */
2586     sourceStart = args->source;
2587
2588     if(myData->key != 0) {
2589         /* continue with a partial escape sequence */
2590         goto escape;
2591     }
2592
2593     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2594         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2595         subArgs.source = args->source;
2596         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2597         if(subArgs.source != subArgs.sourceLimit) {
2598             /*
2599              * get the current partial byte sequence
2600              *
2601              * it needs to be moved between the public and the subconverter
2602              * so that the conversion framework, which only sees the public
2603              * converter, can handle truncated and illegal input etc.
2604              */
2605             if(args->converter->toULength > 0) {
2606                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2607             }
2608             subArgs.converter->toULength = args->converter->toULength;
2609
2610             /*
2611              * Convert up to the end of the input, or to before the next escape character.
2612              * Does not handle conversion extensions because the preToU[] state etc.
2613              * is not copied.
2614              */
2615             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2616
2617             if(args->offsets != NULL && sourceStart != args->source) {
2618                 /* update offsets to base them on the actual start of the input */
2619                 int32_t *offsets = args->offsets;
2620                 UChar *target = args->target;
2621                 int32_t delta = (int32_t)(args->source - sourceStart);
2622                 while(target < subArgs.target) {
2623                     if(*offsets >= 0) {
2624                         *offsets += delta;
2625                     }
2626                     ++offsets;
2627                     ++target;
2628                 }
2629             }
2630             args->source = subArgs.source;
2631             args->target = subArgs.target;
2632             args->offsets = subArgs.offsets;
2633
2634             /* copy input/error/overflow buffers */
2635             if(subArgs.converter->toULength > 0) {
2636                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2637             }
2638             args->converter->toULength = subArgs.converter->toULength;
2639
2640             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2641                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2642                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2643                                 subArgs.converter->UCharErrorBufferLength);
2644                 }
2645                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2646                 subArgs.converter->UCharErrorBufferLength = 0;
2647             }
2648         }
2649
2650         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2651             return;
2652         }
2653
2654 escape:
2655         changeState_2022(args->converter,
2656                &(args->source),
2657                args->sourceLimit,
2658                ISO_2022_KR,
2659                err);
2660     }
2661 }
2662
2663 static void
2664 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2665                                                             UErrorCode* err){
2666     char tempBuf[2];
2667     const char *mySource = ( char *) args->source;
2668     UChar *myTarget = args->target;
2669     const char *mySourceLimit = args->sourceLimit;
2670     UChar32 targetUniChar = 0x0000;
2671     UChar mySourceChar = 0x0000;
2672     UConverterDataISO2022* myData;
2673     UConverterSharedData* sharedData ;
2674     UBool useFallback;
2675
2676     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2677     if(myData->version==1){
2678         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2679         return;
2680     }
2681
2682     /* initialize state */
2683     sharedData = myData->currentConverter->sharedData;
2684     useFallback = args->converter->useFallback;
2685
2686     if(myData->key != 0) {
2687         /* continue with a partial escape sequence */
2688         goto escape;
2689     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2690         /* continue with a partial double-byte character */
2691         mySourceChar = args->converter->toUBytes[0];
2692         args->converter->toULength = 0;
2693         goto getTrailByte;
2694     }
2695
2696     while(mySource< mySourceLimit){
2697
2698         if(myTarget < args->targetLimit){
2699
2700             mySourceChar= (unsigned char) *mySource++;
2701
2702             if(mySourceChar==UCNV_SI){
2703                 myData->toU2022State.g = 0;
2704                 if (myData->isEmptySegment) {
2705                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2706                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2707                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2708                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2709                     args->converter->toULength = 1;
2710                     args->target = myTarget;
2711                     args->source = mySource;
2712                     return;
2713                 }
2714                 /*consume the source */
2715                 continue;
2716             }else if(mySourceChar==UCNV_SO){
2717                 myData->toU2022State.g = 1;
2718                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2719                 /*consume the source */
2720                 continue;
2721             }else if(mySourceChar==ESC_2022){
2722                 mySource--;
2723 escape:
2724                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2725                 changeState_2022(args->converter,&(mySource),
2726                                 mySourceLimit, ISO_2022_KR, err);
2727                 if(U_FAILURE(*err)){
2728                     args->target = myTarget;
2729                     args->source = mySource;
2730                     return;
2731                 }
2732                 continue;
2733             }
2734
2735             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2736             if(myData->toU2022State.g == 1) {
2737                 if(mySource < mySourceLimit) {
2738                     int leadIsOk, trailIsOk;
2739                     uint8_t trailByte;
2740 getTrailByte:
2741                     targetUniChar = missingCharMarker;
2742                     trailByte = (uint8_t)*mySource;
2743                     /*
2744                      * Ticket 5691: consistent illegal sequences:
2745                      * - We include at least the first byte in the illegal sequence.
2746                      * - If any of the non-initial bytes could be the start of a character,
2747                      *   we stop the illegal sequence before the first one of those.
2748                      *
2749                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2750                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2751                      * Otherwise we convert or report the pair of bytes.
2752                      */
2753                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2754                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2755                     if (leadIsOk && trailIsOk) {
2756                         ++mySource;
2757                         tempBuf[0] = (char)(mySourceChar + 0x80);
2758                         tempBuf[1] = (char)(trailByte + 0x80);
2759                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2760                         mySourceChar = (mySourceChar << 8) | trailByte;
2761                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2762                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2763                         ++mySource;
2764                         /* add another bit so that the code below writes 2 bytes in case of error */
2765                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2766                     }
2767                 } else {
2768                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2769                     args->converter->toULength = 1;
2770                     break;
2771                 }
2772             }
2773             else if(mySourceChar <= 0x7f) {
2774                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2775             } else {
2776                 targetUniChar = 0xffff;
2777             }
2778             if(targetUniChar < 0xfffe){
2779                 if(args->offsets) {
2780                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2781                 }
2782                 *(myTarget++)=(UChar)targetUniChar;
2783             }
2784             else {
2785                 /* Call the callback function*/
2786                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2787                 break;
2788             }
2789         }
2790         else{
2791             *err =U_BUFFER_OVERFLOW_ERROR;
2792             break;
2793         }
2794     }
2795     args->target = myTarget;
2796     args->source = mySource;
2797 }
2798
2799 /*************************** END ISO2022-KR *********************************/
2800
2801 /*************************** ISO-2022-CN *********************************
2802 *
2803 * Rules for ISO-2022-CN Encoding:
2804 * i)   The designator sequence must appear once on a line before any instance
2805 *      of character set it designates.
2806 * ii)  If two lines contain characters from the same character set, both lines
2807 *      must include the designator sequence.
2808 * iii) Once the designator sequence is known, a shifting sequence has to be found
2809 *      to invoke the designated character set
2810 * iv)  All lines start in ASCII and end in ASCII.
2811 * v)   Four shifting sequences are employed for this purpose:
2812 *
2813 *      Sequence    ASCII Eq    Charsets
2814 *      ----------  -------    ---------
2815 *      SI           <SI>        US-ASCII
2816 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2817 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2818 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2819 *
2820 * vi)
2821 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2822 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2823 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2824 *
2825 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2826 *       characters as defined in GB 2312-80, until
2827 *       another SOdesignation appears
2828 *
2829 *
2830 *      ESC $ ) E       Indicates the bytes following SO are as defined
2831 *       in ISO-IR-165 (for details, see section 2.1),
2832 *       until another SOdesignation appears
2833 *
2834 *      ESC $ ) G       Indicates the bytes following SO are as defined
2835 *       in CNS 11643-plane-1, until another
2836 *       SOdesignation appears
2837 *
2838 *      ESC $ * H       Indicates the two bytes immediately following
2839 *       SS2 is a Chinese character as defined in CNS
2840 *       11643-plane-2, until another SS2designation
2841 *       appears
2842 *       (Meaning <ESC>N must precede every 2 byte
2843 *        sequence.)
2844 *
2845 *      ESC $ + I       Indicates the immediate two bytes following SS3
2846 *       is a Chinese character as defined in CNS
2847 *       11643-plane-3, until another SS3designation
2848 *       appears
2849 *       (Meaning <ESC>O must precede every 2 byte
2850 *        sequence.)
2851 *
2852 *      ESC $ + J       Indicates the immediate two bytes following SS3
2853 *       is a Chinese character as defined in CNS
2854 *       11643-plane-4, until another SS3designation
2855 *       appears
2856 *       (In English: <ESC>O must precede every 2 byte
2857 *        sequence.)
2858 *
2859 *      ESC $ + K       Indicates the immediate two bytes following SS3
2860 *       is a Chinese character as defined in CNS
2861 *       11643-plane-5, until another SS3designation
2862 *       appears
2863 *
2864 *      ESC $ + L       Indicates the immediate two bytes following SS3
2865 *       is a Chinese character as defined in CNS
2866 *       11643-plane-6, until another SS3designation
2867 *       appears
2868 *
2869 *      ESC $ + M       Indicates the immediate two bytes following SS3
2870 *       is a Chinese character as defined in CNS
2871 *       11643-plane-7, until another SS3designation
2872 *       appears
2873 *
2874 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2875 *       has its own designation information before any Chinese characters
2876 *       appear
2877 *
2878 */
2879
2880 /* Designation escape sequences. The following are defined this way to make the strings truly readonly */
     /* NOTE(review): the bytes are spelled as hex escapes, presumably so that the values do not depend on the compiler's execution charset - confirm */
2881 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A : SO designator, GB 2312-80 */
2882 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E : SO designator, ISO-IR-165 */
2883 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G : SO designator, CNS 11643 plane 1 */
2884 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H : SS2 designator, CNS 11643 plane 2 */
2885 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I : SS3 designator, CNS 11643 plane 3 */
2886 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J : SS3 designator, CNS 11643 plane 4 */
2887 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K : SS3 designator, CNS 11643 plane 5 */
2888 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L : SS3 designator, CNS 11643 plane 6 */
2889 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M : SS3 designator, CNS 11643 plane 7 */
2890
2891 /********************** ISO2022-CN Data **************************/
2892 static const char* const escSeqCharsCN[10] ={
             /*
              * Sequences written by the ISO-2022-CN fromUnicode code.
              * Entries 1..2 are read as escSeqCharsCN[cs]; the CNS 11643 planes are
              * read as escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], so planes 1..7
              * must stay consecutive. Entry 0 is the shift-in string, not a designator.
              */
2893         SHIFT_IN_STR,                   /* 0 ASCII */
2894         GB_2312_80_STR,                 /* 1 GB2312_1 */
2895         ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2896         CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
2897         CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
2898         CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
2899         CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
2900         CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
2901         CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
2902         CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
2903 };
2904
2905 static void
2906 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
         /*
          * Converts the UTF-16 text in [args->source, args->sourceLimit) to
          * ISO-2022-CN bytes in [args->target, args->targetLimit) and, when
          * args->offsets is not NULL, records source indexes for the output bytes.
          *
          * Per code point:
          * - ASCII is written as-is, preceded by SI when the current state is
          *   not G0; CR and LF additionally clear the whole fromUnicode state.
          * - SO/SI/ESC are rejected with U_ILLEGAL_CHAR_FOUND.
          * - Anything else is looked up in up to three candidate charsets
          *   (choices[]); the output is an optional 4-byte designation, an
          *   optional shift (SO, ESC N or ESC O) and the two character bytes.
          *
          * When flushing at the end of the input while not in G0, a final SI is
          * written so that the output ends in ASCII mode.
          *
          * On return args->source and args->target are advanced. Errors are
          * reported through *err: U_ILLEGAL_CHAR_FOUND (unpaired surrogate or
          * 2022 control), U_INVALID_CHAR_FOUND (no mapping found) and
          * U_BUFFER_OVERFLOW_ERROR (target full); for the first two the
          * offending code point is left in cnv->fromUChar32.
          */
2907     UConverter *cnv = args->converter;
2908     UConverterDataISO2022 *converterData;
2909     ISO2022State *pFromU2022State;
2910     uint8_t *target = (uint8_t *) args->target;
2911     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2912     const UChar* source = args->source;
2913     const UChar* sourceLimit = args->sourceLimit;
2914     int32_t* offsets = args->offsets;
2915     UChar32 sourceChar;
2916     char buffer[8]; /* worst case: 4 (designation) + 2 (ESC N or ESC O) + 2 (character bytes) */
2917     int32_t len;
2918     int8_t choices[3]; /* candidate charsets, in lookup order; valid entries: choiceCount */
2919     int32_t choiceCount;
2920     uint32_t targetValue = 0;
2921     UBool useFallback;
2922
2923     /* set up the state */
2924     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2925     pFromU2022State   = &converterData->fromU2022State;
2926
2927     choiceCount = 0;
2928
2929     /* resume with a code unit left in fromUChar32 by a previous call (a lead surrogate at the end of the last buffer) */
2930     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2931         goto getTrail;
2932     }
2933
2934     while( source < sourceLimit){
2935         if(target < targetLimit){
2936
2937             sourceChar  = *(source++);
2938             /* handle surrogate code units: pair a lead with its trail, reject unpaired ones */
2939              if(U16_IS_SURROGATE(sourceChar)) {
2940                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2941 getTrail:
2942                     /*look ahead to find the trail surrogate*/
2943                     if(source < sourceLimit) {
2944                         /* test the following code unit */
2945                         UChar trail=(UChar) *source;
2946                         if(U16_IS_TRAIL(trail)) {
2947                             source++;
2948                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2949                             cnv->fromUChar32=0x00;
2950                             /* convert this supplementary code point */
2951                             /* exit this condition tree */
2952                         } else {
2953                             /* this is an unmatched lead code unit (1st surrogate) */
2954                             /* callback(illegal) */
2955                             *err=U_ILLEGAL_CHAR_FOUND;
2956                             cnv->fromUChar32=sourceChar;
2957                             break;
2958                         }
2959                     } else {
2960                         /* no more input: keep the lead surrogate for the next call */
2961                         cnv->fromUChar32=sourceChar;
2962                         break;
2963                     }
2964                 } else {
2965                     /* this is an unmatched trail code unit (2nd surrogate) */
2966                     /* callback(illegal) */
2967                     *err=U_ILLEGAL_CHAR_FOUND;
2968                     cnv->fromUChar32=sourceChar;
2969                     break;
2970                 }
2971             }
2972
2973             /* do the conversion */
2974             if(sourceChar <= 0x007f ){
2975                 /* do not convert SO/SI/ESC */
2976                 if(IS_2022_CONTROL(sourceChar)) {
2977                     /* callback(illegal) */
2978                     *err=U_ILLEGAL_CHAR_FOUND;
2979                     cnv->fromUChar32=sourceChar;
2980                     break;
2981                 }
2982
2983                 /* US-ASCII */
2984                 if(pFromU2022State->g == 0) {
2985                     buffer[0] = (char)sourceChar;
2986                     len = 1;
2987                 } else {
                         /* not in G0: shift in first, and rebuild choices[] for the next non-ASCII character */
2988                     buffer[0] = UCNV_SI;
2989                     buffer[1] = (char)sourceChar;
2990                     len = 2;
2991                     pFromU2022State->g = 0;
2992                     choiceCount = 0;
2993                 }
2994                 if(sourceChar == CR || sourceChar == LF) {
2995                     /* reset the state at the end of a line */
2996                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2997                     choiceCount = 0;
2998                 }
2999             }
3000             else{
3001                 /* convert U+0080..U+10ffff */
3002                 int32_t i;
3003                 int8_t cs, g;
3004
3005                 if(choiceCount == 0) {
3006                     /* try the current SO/G1 converter first */
3007                     choices[0] = pFromU2022State->cs[1];
3008
3009                     /* default to GB2312_1 if none is designated yet */
3010                     if(choices[0] == 0) {
3011                         choices[0] = GB2312_1;
3012                     }
3013
3014                     if(converterData->version == 0) {
3015                         /* ISO-2022-CN */
3016
3017                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3018                         if(choices[0] == GB2312_1) {
3019                             choices[1] = (int8_t)CNS_11643_1;
3020                         } else {
3021                             choices[1] = (int8_t)GB2312_1;
3022                         }
3023
3024                         choiceCount = 2;
3025                     } else if (converterData->version == 1) {
3026                         /* ISO-2022-CN-EXT */
3027
3028                         /* try one of the other converters */
3029                         switch(choices[0]) {
3030                         case GB2312_1:
3031                             choices[1] = (int8_t)CNS_11643_1;
3032                             choices[2] = (int8_t)ISO_IR_165;
3033                             break;
3034                         case ISO_IR_165:
3035                             choices[1] = (int8_t)GB2312_1;
3036                             choices[2] = (int8_t)CNS_11643_1;
3037                             break;
3038                         default: /* CNS_11643_x */
3039                             choices[1] = (int8_t)GB2312_1;
3040                             choices[2] = (int8_t)ISO_IR_165;
3041                             break;
3042                         }
3043
3044                         choiceCount = 3;
3045                     } else {
                             /*
                              * NOTE(review): choiceCount is not set in this branch, so it
                              * stays 0 and the lookup loop below is skipped; every
                              * non-ASCII code point then ends in U_INVALID_CHAR_FOUND when
                              * version is neither 0 nor 1. Confirm whether
                              * choiceCount = 2 was intended here.
                              */
3046                         choices[0] = (int8_t)CNS_11643_1;
3047                         choices[1] = (int8_t)GB2312_1;
3048                     }
3049                 }
3050
3051                 cs = g = 0;
3052                 /*
3053                  * len==0: no mapping found yet
3054                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3055                  * len>0: found a roundtrip result, done
3056                  */
3057                 len = 0;
3058                 /*
3059                  * We will turn off useFallback after finding a fallback,
3060                  * but we still get fallbacks from PUA code points as usual.
3061                  * Therefore, we will also need to check that we don't overwrite
3062                  * an early fallback with a later one.
3063                  */
3064                 useFallback = cnv->useFallback;
3065
3066                 for(i = 0; i < choiceCount && len <= 0; ++i) {
3067                     int8_t cs0 = choices[i];
3068                     if(cs0 > 0) {
3069                         uint32_t value;
3070                         int32_t len2;
3071                         if(cs0 >= CNS_11643_0) {
                                 /* all CNS planes share one converter that returns 3 bytes */
3072                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3073                                         converterData->myConverterArray[CNS_11643],
3074                                         sourceChar,
3075                                         &value,
3076                                         useFallback,
3077                                         MBCS_OUTPUT_3);
3078                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3079                                 targetValue = value;
                                     /* the first of the three bytes is 0x80+plane: turn it into the charset index */
3080                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3081                                 if(len2 >= 0) {
3082                                     len = 2;
3083                                 } else {
3084                                     len = -2;
3085                                     useFallback = FALSE;
3086                                 }
3087                                 if(cs == CNS_11643_1) {
3088                                     g = 1;
3089                                 } else if(cs == CNS_11643_2) {
3090                                     g = 2;
3091                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3092                                     g = 3;
3093                                 } else {
3094                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3095                                     len = 0;
3096                                 }
3097                             }
3098                         } else {
3099                             /* GB2312_1 or ISO-IR-165 */
3100                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3101                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3102                                         converterData->myConverterArray[cs0],
3103                                         sourceChar,
3104                                         &value,
3105                                         useFallback,
3106                                         MBCS_OUTPUT_2);
3107                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3108                                 targetValue = value;
3109                                 len = len2;
3110                                 cs = cs0;
3111                                 g = 1;
3112                                 useFallback = FALSE;
3113                             }
3114                         }
3115                     }
3116                 }
3117
3118                 if(len != 0) {
3119                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3120
3121                     /* write the designation sequence if necessary */
3122                     if(cs != pFromU2022State->cs[g]) {
3123                         if(cs < CNS_11643) {
3124                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3125                         } else {
                                 /* the CNS plane sequences are stored consecutively starting at index CNS_11643 */
3126                             U_ASSERT(cs >= CNS_11643_1);
3127                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3128                         }
3129                         len = 4;
3130                         pFromU2022State->cs[g] = cs;
3131                         if(g == 1) {
3132                             /* changing the SO/G1 charset invalidates the choices[] */
3133                             choiceCount = 0;
3134                         }
3135                     }
3136
3137                     /* write the shift sequence if necessary */
3138                     if(g != pFromU2022State->g) {
3139                         switch(g) {
3140                         case 1:
3141                             buffer[len++] = UCNV_SO;
3142
3143                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3144                             pFromU2022State->g = 1;
3145                             break;
3146                         case 2:
                                 /* SS2: ESC N */
3147                             buffer[len++] = 0x1b;
3148                             buffer[len++] = 0x4e;
3149                             break;
3150                         default: /* case 3 */
                                 /* SS3: ESC O */
3151                             buffer[len++] = 0x1b;
3152                             buffer[len++] = 0x4f;
3153                             break;
3154                         }
3155                     }
3156
3157                     /* write the two output bytes */
3158                     buffer[len++] = (char)(targetValue >> 8);
3159                     buffer[len++] = (char)targetValue;
3160                 } else {
3161                     /* if we cannot find the character after checking all codepages
3162                      * then this is an error
3163                      */
3164                     *err = U_INVALID_CHAR_FOUND;
3165                     cnv->fromUChar32=sourceChar;
3166                     break;
3167                 }
3168             }
3169
3170             /* output len>0 bytes in buffer[] */
3171             if(len == 1) {
                     /* target < targetLimit was checked at the top of the loop, so one byte always fits */
3172                 *target++ = buffer[0];
3173                 if(offsets) {
3174                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3175                 }
3176             } else if(len == 2 && (target + 2) <= targetLimit) {
3177                 *target++ = buffer[0];
3178                 *target++ = buffer[1];
3179                 if(offsets) {
3180                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3181                     *offsets++ = sourceIndex;
3182                     *offsets++ = sourceIndex;
3183                 }
3184             } else {
                     /* len > 2, or not enough room for two bytes: leave the writing to fromUWriteUInt8() */
3185                 fromUWriteUInt8(
3186                     cnv,
3187                     buffer, len,
3188                     &target, (const char *)targetLimit,
3189                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3190                     err);
3191                 if(U_FAILURE(*err)) {
3192                     break;
3193                 }
3194             }
3195         } /* end if(target < targetLimit) */
3196         else{
3197             *err =U_BUFFER_OVERFLOW_ERROR;
3198             break;
3199         }
3200
3201     }/* end while(source < sourceLimit) */
3202
3203     /*
3204      * the end of the input stream and detection of truncated input
3205      * are handled by the framework, but for ISO-2022-CN conversion
3206      * we need to be in ASCII mode at the very end
3207      *
3208      * conditions:
3209      *   successful
3210      *   not in ASCII mode
3211      *   end of input and no truncated input
3212      */
3213     if( U_SUCCESS(*err) &&
3214         pFromU2022State->g!=0 &&
3215         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3216     ) {
3217         int32_t sourceIndex;
3218
3219         /* we are switching to ASCII */
3220         pFromU2022State->g=0;
3221
3222         /* get the source index of the last input character */
3223         /*
3224          * TODO this would be simpler and more reliable if we used a pair
3225          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3226          * so that we could simply use the prevSourceIndex here;
3227          * this code gives an incorrect result for the rare case of an unmatched
3228          * trail surrogate that is alone in the last buffer of the text stream
3229          */
3230         sourceIndex=(int32_t)(source-args->source);
3231         if(sourceIndex>0) {
3232             --sourceIndex;
3233             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3234                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3235             ) {
3236                 --sourceIndex;
3237             }
3238         } else {
                 /* nothing was consumed from this buffer: there is no input character to point at */
3239             sourceIndex=-1;
3240         }
3241
3242         fromUWriteUInt8(
3243             cnv,
3244             SHIFT_IN_STR, 1,
3245             &target, (const char *)targetLimit,
3246             &offsets, sourceIndex,
3247             err);
3248     }
3249
3250     /*save the state and return */
3251     args->source = source;
3252     args->target = (char*)target;
3253 }
3254
3255
3256 static void
3257 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3258                                                UErrorCode* err){
         /*
          * Converts the ISO-2022-CN bytes in [args->source, args->sourceLimit) to
          * UTF-16 in [args->target, args->targetLimit) and, when args->offsets is
          * not NULL, stores a source index for every UChar written to the target.
          *
          * State carried across calls: a pending escape sequence
          * (myData->key != 0), a pending lead byte (toUBytes[0] with
          * toULength == 1), the designated charsets and the current shift state
          * in myData->toU2022State, and myData->isEmptySegment.
          *
          * On return args->source and args->target are advanced. Errors are
          * reported through *err: U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO
          * segment, whatever changeState_2022() or toUnicodeCallback() set, and
          * U_BUFFER_OVERFLOW_ERROR when the target is full.
          */
3259     char tempBuf[3];
3260     const char *mySource = (char *) args->source;
3261     UChar *myTarget = args->target;
3262     const char *mySourceLimit = args->sourceLimit;
3263     uint32_t targetUniChar = 0x0000;
3264     uint32_t mySourceChar = 0x0000;
3265     UConverterDataISO2022* myData;
3266     ISO2022State *pToU2022State;
3267
3268     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3269     pToU2022State = &myData->toU2022State;
3270
3271     if(myData->key != 0) {
3272         /* continue with a partial escape sequence */
3273         goto escape;
3274     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3275         /* continue with a partial double-byte character */
3276         mySourceChar = args->converter->toUBytes[0];
3277         args->converter->toULength = 0;
3278         targetUniChar = missingCharMarker;
3279         goto getTrailByte;
3280     }
3281
3282     while(mySource < mySourceLimit){
3283
3284         targetUniChar =missingCharMarker;
3285
3286         if(myTarget < args->targetLimit){
3287
3288             mySourceChar= (unsigned char) *mySource++;
3289
3290             switch(mySourceChar){
3291             case UCNV_SI:
3292                 pToU2022State->g=0;
3293                 if (myData->isEmptySegment) {
3294                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
3295                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3296                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3297                     args->converter->toUBytes[0] = mySourceChar;
3298                     args->converter->toULength = 1;
3299                     args->target = myTarget;
3300                     args->source = mySource;
3301                     return;
3302                 }
3303                 continue;
3304
3305             case UCNV_SO:
3306                 if(pToU2022State->cs[1] != 0) {
3307                     pToU2022State->g=1;
3308                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
3309                     continue;
3310                 } else {
3311                     /* illegal to have SO before a matching designator */
3312                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
                         /* targetUniChar is still missingCharMarker, so the code after the switch calls toUnicodeCallback() */
3313                     break;
3314                 }
3315
3316             case ESC_2022:
                     /* step back so that mySource points at the ESC byte again */
3317                 mySource--;
3318 escape:
3319                 {
3320                     const char * mySourceBefore = mySource;
3321                     int8_t toULengthBefore = args->converter->toULength;
3322
3323                     changeState_2022(args->converter,&(mySource),
3324                         mySourceLimit, ISO_2022_CN,err);
3325
3326                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3327                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3328                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3329                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
                             /* length before this call plus the bytes consumed by changeState_2022() just now */
3330                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3331                     }
3332                 }
3333
3334                 /* invalid or illegal escape sequence */
3335                 if(U_FAILURE(*err)){
3336                     args->target = myTarget;
3337                     args->source = mySource;
3338                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
3339                     return;
3340                 }
3341                 continue;
3342
3343             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3344
3345             case CR:
3346                 /*falls through*/
3347             case LF:
                     /* end of line: forget all designations and shift states, then convert the CR/LF itself below */
3348                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3349                 /* falls through */
3350             default:
3351                 /* convert one or two bytes */
3352                 myData->isEmptySegment = FALSE;
3353                 if(pToU2022State->g != 0) {
3354                     if(mySource < mySourceLimit) {
3355                         UConverterSharedData *cnv;
3356                         StateEnum tempState;
3357                         int32_t tempBufLen;
3358                         int leadIsOk, trailIsOk;
3359                         uint8_t trailByte;
3360 getTrailByte:
3361                         trailByte = (uint8_t)*mySource;
3362                         /*
3363                          * Ticket 5691: consistent illegal sequences:
3364                          * - We include at least the first byte in the illegal sequence.
3365                          * - If any of the non-initial bytes could be the start of a character,
3366                          *   we stop the illegal sequence before the first one of those.
3367                          *
3368                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3369                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3370                          * Otherwise we convert or report the pair of bytes.
3371                          */
3372                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3373                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3374                         if (leadIsOk && trailIsOk) {
3375                             ++mySource;
3376                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3377                             if(tempState >= CNS_11643_0) {
                                     /* the shared CNS converter takes 3 bytes: 0x80+plane, then the two input bytes */
3378                                 cnv = myData->myConverterArray[CNS_11643];
3379                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3380                                 tempBuf[1] = (char) (mySourceChar);
3381                                 tempBuf[2] = (char) trailByte;
3382                                 tempBufLen = 3;
3383
3384                             }else{
3385                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3386                                 cnv = myData->myConverterArray[tempState];
3387                                 tempBuf[0] = (char) (mySourceChar);
3388                                 tempBuf[1] = (char) trailByte;
3389                                 tempBufLen = 2;
3390                             }
3391                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                                 /* keep both bytes; a value above 0xff makes the offset code below subtract 2 */
3392                             mySourceChar = (mySourceChar << 8) | trailByte;
3393                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3394                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3395                             ++mySource;
3396                             /* add another bit so that the code below writes 2 bytes in case of error */
3397                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3398                         }
3399                         if(pToU2022State->g>=2) {
3400                             /* return from a single-shift state to the previous one */
3401                             pToU2022State->g=pToU2022State->prevG;
3402                         }
3403                     } else {
                             /* the trail byte is not in this buffer: save the lead byte for the next call */
3404                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3405                         args->converter->toULength = 1;
3406                         goto endloop;
3407                     }
3408                 }
3409                 else{
                         /* ASCII mode: bytes above 0x7f keep targetUniChar == missingCharMarker and go to the callback */
3410                     if(mySourceChar <= 0x7f) {
3411                         targetUniChar = (UChar) mySourceChar;
3412                     }
3413                 }
3414                 break;
3415             }
3416             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3417                 if(args->offsets){
3418                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3419                 }
3420                 *(myTarget++)=(UChar)targetUniChar;
3421             }
3422             else if(targetUniChar > missingCharMarker){
3423                 /* disassemble the surrogate pair and write to output*/
3424                 targetUniChar-=0x0010000;
3425                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3426                 if(args->offsets){
3427                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3428                 }
3429                 ++myTarget;
3430                 if(myTarget< args->targetLimit){
3431                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3432                     if(args->offsets){
3433                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3434                     }
3435                     ++myTarget;
3436                 }else{
                         /*
                          * no room for the trail surrogate: store it in UCharErrorBuffer.
                          * NOTE(review): *err is not set here; overflow is only flagged by the
                          * targetLimit check on the next loop iteration - confirm that the caller
                          * handles a non-empty UCharErrorBuffer when the input ends here.
                          */
3437                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3438                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3439                 }
3440
3441             }
3442             else{
3443                 /* Call the callback function*/
3444                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3445                 break;
3446             }
3447         }
3448         else{
3449             *err =U_BUFFER_OVERFLOW_ERROR;
3450             break;
3451         }
3452     }
3453 endloop:
3454     args->target = myTarget;
3455     args->source = mySource;
3456 }
3457 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3458
     /*
      * Substitution-character writer installed as the writeSub entry of the
      * ISO-2022 implementation tables in this file.
      *
      * Builds, in a small local buffer, any shift/escape bytes needed to get the
      * output stream into a state where the substitution bytes can be written,
      * updates the converter's tracked fromUnicode state to match, appends the
      * substitution byte(s), and hands the result to ucnv_cbFromUWriteBytes().
      *
      * Behavior is selected by the first letter of myConverterData->locale:
      *   'j'  emits SI when g==1, emits ESC ( B unless cs[0] is ASCII or JISX201,
      *        then writes subchar[0].
      *   'c'  emits SI when g!=0, then writes subchar[0].
      *   'k'  version 0: writes a 1-byte subchar after SI (if in DBCS mode) or a
      *        2-byte subchar after SO (if in SBCS mode);
      *        other versions: delegates to currentConverter and returns early.
      *   any other letter: nothing is buffered, so a zero-length write is issued.
      *
      * args         conversion arguments; args->converter is redirected to the
      *              subconverter only for the duration of the 'k' delegation
      * offsetIndex  forwarded to ucnv_cbFromUWriteBytes()
      * err          forwarded to the write helpers; checked for
      *              U_BUFFER_OVERFLOW_ERROR after the 'k' delegation
      */
3459 static void
3460 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3461     UConverter *cnv = args->converter;
3462     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3463     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3464     char *p, *subchar;
3465     char buffer[8]; /* longest sequence built below is 5 bytes: SI + ESC ( B + subchar[0] */
3466     int32_t length;
3467
3468     subchar=(char *)cnv->subChars;
3469     length=cnv->subCharLen; /* assume length==1 for most variants */
3470
3471     p = buffer; /* p is the write cursor; (p - buffer) is the byte count at the end */
3472     switch(myConverterData->locale[0]){
3473     case 'j':
3474         {
3475             int8_t cs;
3476
3477             if(pFromU2022State->g == 1) {
3478                 /* JIS7: switch from G1 to G0 */
3479                 pFromU2022State->g = 0;
3480                 *p++ = UCNV_SI;
3481             }
3482
3483             cs = pFromU2022State->cs[0];
3484             if(cs != ASCII && cs != JISX201) {
3485                 /* not in ASCII or JIS X 0201: switch to ASCII */
3486                 pFromU2022State->cs[0] = (int8_t)ASCII;
3487                 *p++ = '\x1b'; /* ESC */
3488                 *p++ = '\x28'; /* '(' */
3489                 *p++ = '\x42'; /* 'B' */
3490             }
3491
3492             *p++ = subchar[0]; /* only the first substitution byte is written here, whatever length is */
3493             break;
3494         }
3495     case 'c':
3496         if(pFromU2022State->g != 0) {
3497             /* not in ASCII mode: switch to ASCII */
3498             pFromU2022State->g = 0;
3499             *p++ = UCNV_SI;
3500         }
3501         *p++ = subchar[0]; /* only the first substitution byte is written here, whatever length is */
3502         break;
3503     case 'k':
3504         if(myConverterData->version == 0) {
                 /* version 0 tracks SBCS/DBCS mode in fromUnicodeStatus (nonzero = DBCS) */
3505             if(length == 1) {
3506                 if((UBool)args->converter->fromUnicodeStatus) {
3507                     /* in DBCS mode: switch to SBCS */
3508                     args->converter->fromUnicodeStatus = 0;
3509                     *p++ = UCNV_SI;
3510                 }
3511                 *p++ = subchar[0];
3512             } else /* length == 2*/ {
3513                 if(!(UBool)args->converter->fromUnicodeStatus) {
3514                     /* in SBCS mode: switch to DBCS */
3515                     args->converter->fromUnicodeStatus = 1;
3516                     *p++ = UCNV_SO;
3517                 }
3518                 *p++ = subchar[0];
3519                 *p++ = subchar[1];
3520             }
3521             break;
3522         } else {
3523             /* save the subconverter's substitution string */
3524             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3525             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3526
3527             /* set our substitution string into the subconverter */
3528             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3529             myConverterData->currentConverter->subCharLen = (int8_t)length;
3530
3531             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3532             args->converter = myConverterData->currentConverter;
3533             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
                 /* NOTE(review): a literal 0 is passed here rather than offsetIndex - confirm this is intended */
3534             ucnv_cbFromUWriteSub(args, 0, err);
3535             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3536             args->converter = cnv;
3537
3538             /* restore the subconverter's substitution string */
3539             myConverterData->currentConverter->subChars = currentSubChars;
3540             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3541
                 /*
                  * On overflow, move whatever the subconverter left in its own
                  * charErrorBuffer over to the outer converter and clear the
                  * subconverter's count.
                  */
3542             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3543                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3544                     uprv_memcpy(
3545                         cnv->charErrorBuffer,
3546                         myConverterData->currentConverter->charErrorBuffer,
3547                         myConverterData->currentConverter->charErrorBufferLength);
3548                 }
3549                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3550                 myConverterData->currentConverter->charErrorBufferLength = 0;
3551             }
                 /* the subconverter already produced the output; skip the final write below */
3552             return;
3553         }
3554     default:
3555         /* not expected */
3556         break;
3557     }
         /* flush the bytes accumulated in buffer (possibly none for an unexpected locale) */
3558     ucnv_cbFromUWriteBytes(args,
3559                            buffer, (int32_t)(p - buffer),
3560                            offsetIndex, err);
3561 }
3562
3563 /*
3564  * Structure for cloning an ISO 2022 converter into a single memory block.
3565  * ucnv_safeClone() of the converter will align the entire cloneStruct,
3566  * and then ucnv_safeClone() of the sub-converter may additionally align
3567  * currentConverter inside the cloneStruct, for which we need the deadSpace
3568  * after currentConverter.
3569  * This is because UAlignedMemory may be larger than the actually
3570  * necessary alignment size for the platform.
3571  * The other cloneStruct fields will not be moved around,
3572  * and are aligned properly with cloneStruct's alignment.
      *
      * _ISO_2022_SafeClone() casts the caller-supplied buffer to this struct,
      * reports sizeof(struct cloneStruct) when preflighting, and returns the
      * address of the cnv member.
3573  */
3574 struct cloneStruct
3575 {
3576     UConverter cnv; /* the clone of the main converter; its address is what SafeClone returns */
3577     UConverter currentConverter; /* storage passed to ucnv_safeClone() for the sub-converter clone */
3578     UAlignedMemory deadSpace; /* slack so the sub-converter clone may be re-aligned within the struct */
3579     UConverterDataISO2022 mydata; /* copy of the source converter's extraInfo; cnv.extraInfo points here */
3580 };
3581
3582
     /*
      * safeClone entry of the ISO-2022 implementation tables: completes a clone
      * of an ISO-2022 converter inside a caller-supplied memory block laid out
      * as struct cloneStruct.
      *
      * cnv          the converter being cloned; its extraInfo is read as
      *              UConverterDataISO2022
      * stackBuffer  destination block, interpreted as struct cloneStruct
      * pBufferSize  in/out; a value of 0 is a preflight request: the required
      *              size is stored and NULL is returned without touching
      *              stackBuffer
      * status       receives any failure from cloning the sub-converter
      *
      * Returns the address of the cloned main converter inside stackBuffer, or
      * NULL for a preflight request or when cloning the sub-converter failed.
      *
      * NOTE(review): a non-zero *pBufferSize is not compared against
      * sizeof(struct cloneStruct) here - presumably the caller guarantees a
      * large enough buffer; confirm in ucnv_safeClone().
      */
3583 static UConverter *
3584 _ISO_2022_SafeClone(
3585             const UConverter *cnv,
3586             void *stackBuffer,
3587             int32_t *pBufferSize,
3588             UErrorCode *status)
3589 {
3590     struct cloneStruct * localClone;
3591     UConverterDataISO2022 *cnvData;
3592     int32_t i, size;
3593
3594     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3595         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3596         return NULL;
3597     }
3598
3599     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3600     localClone = (struct cloneStruct *)stackBuffer;
3601
3602     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3603
         /* shallow-copy the ISO-2022 state, then repoint the clone at its own copy */
3604     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3605     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3606     localClone->cnv.isExtraLocal = TRUE;
3607
3608     /* share the subconverters */
3609
         /* currentConverter is cloned into the struct's embedded storage rather than shared */
3610     if(cnvData->currentConverter != NULL) {
3611         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3612         localClone->mydata.currentConverter =
3613             ucnv_safeClone(cnvData->currentConverter,
3614                             &localClone->currentConverter,
3615                             &size, status);
3616         if(U_FAILURE(*status)) {
3617             return NULL;
3618         }
3619     }
3620
         /*
          * The myConverterArray[] pointers were copied verbatim by the memcpy
          * above, so each non-NULL entry gains one more reference.
          */
3621     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3622         if(cnvData->myConverterArray[i] != NULL) {
3623             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3624         }
3625     }
3626
3627     return &localClone->cnv;
3628 }
3629
     /*
      * getUnicodeSet entry of the ISO-2022 implementation tables: adds to the
      * caller's set (through the USetAdder callbacks) the code points this
      * converter handles, then removes the ones ISO 2022 must never convert.
      *
      * Steps:
      *   1. Return immediately if *pErrorCode already indicates failure.
      *   2. (Only with U_ENABLE_GENERIC_ISO_2022) for the generic converter,
      *      add all code points except surrogates and return.
      *   3. Add the hardcoded ranges for the locale letter in cnvData->locale[0].
      *   4. For every non-NULL entry in myConverterArray[], add its set through
      *      ucnv_MBCSGetFilteredUnicodeSetForUnicode() with a per-charset filter.
      *   5. Remove SO, SI, ESC and the C1 controls U+0080..U+009F.
      *
      * cnv         converter whose extraInfo is read as UConverterDataISO2022
      * sa          set plus add/addRange/remove/removeRange callbacks
      * which       requested set type; forwarded to the sub-converters and also
      *             checked against UCNV_ROUNDTRIP_AND_FALLBACK_SET for JP
      * pErrorCode  in/out error code, forwarded to the sub-converter calls
      */
3630 static void
3631 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3632                     const USetAdder *sa,
3633                     UConverterUnicodeSet which,
3634                     UErrorCode *pErrorCode)
3635 {
3636     int32_t i;
3637     UConverterDataISO2022* cnvData;
3638
3639     if (U_FAILURE(*pErrorCode)) {
3640         return;
3641     }
3642 #ifdef U_ENABLE_GENERIC_ISO_2022
3643     if (cnv->sharedData == &_ISO2022Data) {
3644         /* We use UTF-8 in this case */
3645         sa->addRange(sa->set, 0, 0xd7FF);
3646         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3647         return;
3648     }
3649 #endif
3650
3651     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3652
3653     /* seed the caller's set with code points that are algorithmically round-tripped */
3654     switch(cnvData->locale[0]){
3655     case 'j':
3656         /* include JIS X 0201 which is hardcoded */
3657         sa->add(sa->set, 0xa5);
3658         sa->add(sa->set, 0x203e);
3659         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3660             /* include Latin-1 for some variants of JP */
3661             sa->addRange(sa->set, 0, 0xff);
3662         } else {
3663             /* include ASCII for JP */
3664             sa->addRange(sa->set, 0, 0x7f);
3665         }
3666         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3667             /*
3668              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3669              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3670              * use half-width Katakana.
3671              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3672              * half-width Katakana via the ESC ( I sequence.
3673              * However, we only emit (fromUnicode) half-width Katakana according to the
3674              * definition of each variant.
3675              *
3676              * When including fallbacks,
3677              * we need to include half-width Katakana Unicode code points for all JP variants because
3678              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3679              */
3680             /* include half-width Katakana for JP */
3681             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3682         }
3683         break;
3684 #if !UCONFIG_ONLY_HTML_CONVERSION
3685     case 'c':
3686     case 'z':
3687         /* include ASCII for CN */
3688         sa->addRange(sa->set, 0, 0x7f);
3689         break;
3690     case 'k':
3691         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3692         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3693                 cnvData->currentConverter, sa, which, pErrorCode);
3694         /* the loop over myConverterArray[] will simply not find another converter */
3695         break;
3696 #endif
3697     default:
3698         break;
3699     }
3700
3701 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3702             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3703                 cnvData->version==0 && i==CNS_11643
3704             ) {
3705                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3706                 ucnv_MBCSGetUnicodeSetForBytes(
3707                         cnvData->myConverterArray[i],
3708                         sa, UCNV_ROUNDTRIP_SET,
3709                         0, 0x81, 0x82,
3710                         pErrorCode);
3711             }
3712 #endif
3713
         /* add each loaded sub-converter's set, restricted by a charset-specific filter */
3714     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3715         UConverterSetFilter filter;
3716         if(cnvData->myConverterArray[i]!=NULL) {
3717             if(cnvData->locale[0]=='j' && i==JISX208) {
3718                 /*
3719                  * Only add code points that map to Shift-JIS codes
3720                  * corresponding to JIS X 0208.
3721                  */
3722                 filter=UCNV_SET_FILTER_SJIS;
3723 #if !UCONFIG_ONLY_HTML_CONVERSION
3724             } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3725                        cnvData->version==0 && i==CNS_11643) {
3726                 /*
3727                  * Version-specific for CN:
3728                  * CN version 0 does not map CNS planes 3..7 although
3729                  * they are all available in the CNS conversion table;
3730                  * CN version 1 (-EXT) does map them all.
3731                  * The two versions create different Unicode sets.
3732                  */
3733                 filter=UCNV_SET_FILTER_2022_CN;
3734             } else if(i==KSC5601) {
3735                 /*
3736                  * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
3737                  * are broader than GR94.
3738                  */
3739                 filter=UCNV_SET_FILTER_GR94DBCS;
3740 #endif
3741             } else {
3742                 filter=UCNV_SET_FILTER_NONE;
3743             }
3744             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3745         }
3746     }
3747
3748     /*
3749      * ISO 2022 converters must not convert SO/SI/ESC despite what
3750      * sub-converters do by themselves.
3751      * Remove these characters from the set.
3752      */
3753     sa->remove(sa->set, 0x0e); /* SO */
3754     sa->remove(sa->set, 0x0f); /* SI */
3755     sa->remove(sa->set, 0x1b); /* ESC */
3756
3757     /* ISO 2022 converters do not convert C1 controls either */
3758     sa->removeRange(sa->set, 0x80, 0x9f);
3759 }
3760
3761 static const UConverterImpl _ISO2022Impl={
         /*
          * Implementation table for the generic ISO-2022 converter.
          * The four conversion slots are only populated when
          * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
          * NOTE(review): the slot labels below assume the member order of
          * UConverterImpl as declared in ucnv_cnv.h - confirm against the header.
          */
3762     UCNV_ISO_2022, /* type */
3763
3764     NULL, /* load */
3765     NULL, /* unload */
3766
3767     _ISO2022Open, /* open */
3768     _ISO2022Close, /* close */
3769     _ISO2022Reset, /* reset */
3770
3771 #ifdef U_ENABLE_GENERIC_ISO_2022
3772     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicode */
3773     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicode with offsets: same function */
3774     ucnv_fromUnicode_UTF8, /* fromUnicode */
3775     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* fromUnicode with offsets */
3776 #else
3777     NULL,
3778     NULL,
3779     NULL,
3780     NULL,
3781 #endif
3782     NULL, /* getNextUChar */
3783
3784     NULL, /* getStarters */
3785     _ISO2022getName, /* getName */
3786     _ISO_2022_WriteSub, /* writeSub */
3787     _ISO_2022_SafeClone, /* safeClone */
3788     _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
3789
3790     NULL,
3791     NULL
3792 };
3793 static const UConverterStaticData _ISO2022StaticData={
         /*
          * Static description of the generic ISO-2022 converter.
          * NOTE(review): the field labels below assume the member order of
          * UConverterStaticData as declared in ucnv_bld.h - confirm against the header.
          */
3794     sizeof(UConverterStaticData), /* structure size */
3795     "ISO_2022", /* converter name */
3796     2022, /* codepage */
3797     UCNV_IBM, /* platform */
3798     UCNV_ISO_2022, /* conversion type */
3799     1, /* min bytes per UChar */
3800     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3801     { 0x1a, 0, 0, 0 }, /* substitution character bytes */
3802     1, /* substitution character length */
3803     FALSE,
3804     FALSE,
3805     0,
3806     0,
3807     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3808 };
3809 const UConverterSharedData _ISO2022Data={
         /*
          * Shared-data record tying the generic ISO-2022 static data to its
          * implementation table. Unlike the JP/KR/CN records later in this file,
          * this one is not wrapped in an anonymous namespace.
          * NOTE(review): the meaning of the unlabeled fields depends on the
          * UConverterSharedData declaration in ucnv_bld.h, which is not visible here.
          */
3810     sizeof(UConverterSharedData), /* structure size */
3811     ~((uint32_t) 0), /* all bits set; presumably the reference counter - confirm */
3812     NULL,
3813     NULL,
3814     &_ISO2022StaticData, /* static data defined above */
3815     FALSE,
3816     &_ISO2022Impl, /* implementation table defined above */
3817     0, UCNV_MBCS_TABLE_INITIALIZER
3818 };
3819
3820 /*************JP****************/
3821 static const UConverterImpl _ISO2022JPImpl={
         /*
          * Implementation table for the ISO-2022-JP converter.
          * The same *_OFFSETS_LOGIC function is installed in both slots of each
          * conversion direction.
          * NOTE(review): the slot labels below assume the member order of
          * UConverterImpl as declared in ucnv_cnv.h - confirm against the header.
          */
3822     UCNV_ISO_2022, /* type */
3823
3824     NULL, /* load */
3825     NULL, /* unload */
3826
3827     _ISO2022Open, /* open */
3828     _ISO2022Close, /* close */
3829     _ISO2022Reset, /* reset */
3830
3831     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicode */
3832     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicode with offsets */
3833     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicode */
3834     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicode with offsets */
3835     NULL, /* getNextUChar */
3836
3837     NULL, /* getStarters */
3838     _ISO2022getName, /* getName */
3839     _ISO_2022_WriteSub, /* writeSub */
3840     _ISO_2022_SafeClone, /* safeClone */
3841     _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
3842
3843     NULL,
3844     NULL
3845 };
3846 static const UConverterStaticData _ISO2022JPStaticData={
         /*
          * Static description of the ISO-2022-JP converter.
          * NOTE(review): the field labels below assume the member order of
          * UConverterStaticData as declared in ucnv_bld.h - confirm against the header.
          */
3847     sizeof(UConverterStaticData), /* structure size */
3848     "ISO_2022_JP", /* converter name */
3849     0, /* codepage */
3850     UCNV_IBM, /* platform */
3851     UCNV_ISO_2022, /* conversion type */
3852     1, /* min bytes per UChar */
3853     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3854     { 0x1a, 0, 0, 0 }, /* substitution character bytes */
3855     1, /* substitution character length */
3856     FALSE,
3857     FALSE,
3858     0,
3859     0,
3860     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3861 };
3862
3863 namespace {
     /*
      * Shared-data record tying the ISO-2022-JP static data to its
      * implementation table; the anonymous namespace keeps it file-local.
      * NOTE(review): the meaning of the unlabeled fields depends on the
      * UConverterSharedData declaration in ucnv_bld.h, which is not visible here.
      */
3864
3865 const UConverterSharedData _ISO2022JPData={
3866     sizeof(UConverterSharedData), /* structure size */
3867     ~((uint32_t) 0), /* all bits set; presumably the reference counter - confirm */
3868     NULL,
3869     NULL,
3870     &_ISO2022JPStaticData, /* static data defined above */
3871     FALSE,
3872     &_ISO2022JPImpl, /* implementation table defined above */
3873     0, UCNV_MBCS_TABLE_INITIALIZER
3874 };
3875
3876 }  // namespace
3877
3878 #if !UCONFIG_ONLY_HTML_CONVERSION
3879 /************* KR ***************/
3880 static const UConverterImpl _ISO2022KRImpl={
         /*
          * Implementation table for the ISO-2022-KR converter.
          * The same *_OFFSETS_LOGIC function is installed in both slots of each
          * conversion direction.
          * NOTE(review): the slot labels below assume the member order of
          * UConverterImpl as declared in ucnv_cnv.h - confirm against the header.
          */
3881     UCNV_ISO_2022, /* type */
3882
3883     NULL, /* load */
3884     NULL, /* unload */
3885
3886     _ISO2022Open, /* open */
3887     _ISO2022Close, /* close */
3888     _ISO2022Reset, /* reset */
3889
3890     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicode */
3891     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicode with offsets */
3892     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode */
3893     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode with offsets */
3894     NULL, /* getNextUChar */
3895
3896     NULL, /* getStarters */
3897     _ISO2022getName, /* getName */
3898     _ISO_2022_WriteSub, /* writeSub */
3899     _ISO_2022_SafeClone, /* safeClone */
3900     _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
3901
3902     NULL,
3903     NULL
3904 };
3905 static const UConverterStaticData _ISO2022KRStaticData={
         /*
          * Static description of the ISO-2022-KR converter.
          * NOTE(review): the field labels below assume the member order of
          * UConverterStaticData as declared in ucnv_bld.h - confirm against the header.
          */
3906     sizeof(UConverterStaticData), /* structure size */
3907     "ISO_2022_KR", /* converter name */
3908     0, /* codepage */
3909     UCNV_IBM, /* platform */
3910     UCNV_ISO_2022, /* conversion type */
3911     1, /* min bytes per UChar */
3912     3, /* max 3 bytes per UChar: SO+DBCS */
3913     { 0x1a, 0, 0, 0 }, /* substitution character bytes */
3914     1, /* substitution character length */
3915     FALSE,
3916     FALSE,
3917     0,
3918     0,
3919     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3920 };
3921
3922 namespace {
     /*
      * Shared-data record tying the ISO-2022-KR static data to its
      * implementation table; the anonymous namespace keeps it file-local.
      * NOTE(review): the meaning of the unlabeled fields depends on the
      * UConverterSharedData declaration in ucnv_bld.h, which is not visible here.
      */
3923
3924 const UConverterSharedData _ISO2022KRData={
3925     sizeof(UConverterSharedData), /* structure size */
3926     ~((uint32_t) 0), /* all bits set; presumably the reference counter - confirm */
3927     NULL,
3928     NULL,
3929     &_ISO2022KRStaticData, /* static data defined above */
3930     FALSE,
3931     &_ISO2022KRImpl, /* implementation table defined above */
3932     0, UCNV_MBCS_TABLE_INITIALIZER
3933 };
3934
3935 }  // namespace
3936
3937 /*************** CN ***************/
3938 static const UConverterImpl _ISO2022CNImpl={
         /*
          * Implementation table for the ISO-2022-CN converter.
          * The same *_OFFSETS_LOGIC function is installed in both slots of each
          * conversion direction.
          * NOTE(review): the slot labels below assume the member order of
          * UConverterImpl as declared in ucnv_cnv.h - confirm against the header.
          */
3939
3940     UCNV_ISO_2022, /* type */
3941
3942     NULL, /* load */
3943     NULL, /* unload */
3944
3945     _ISO2022Open, /* open */
3946     _ISO2022Close, /* close */
3947     _ISO2022Reset, /* reset */
3948
3949     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicode */
3950     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicode with offsets */
3951     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode */
3952     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode with offsets */
3953     NULL, /* getNextUChar */
3954
3955     NULL, /* getStarters */
3956     _ISO2022getName, /* getName */
3957     _ISO_2022_WriteSub, /* writeSub */
3958     _ISO_2022_SafeClone, /* safeClone */
3959     _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
3960
3961     NULL,
3962     NULL
3963 };
3964 static const UConverterStaticData _ISO2022CNStaticData={
         /*
          * Static description of the ISO-2022-CN converter.
          * NOTE(review): the field labels below assume the member order of
          * UConverterStaticData as declared in ucnv_bld.h - confirm against the header.
          */
3965     sizeof(UConverterStaticData), /* structure size */
3966     "ISO_2022_CN", /* converter name */
3967     0, /* codepage */
3968     UCNV_IBM, /* platform */
3969     UCNV_ISO_2022, /* conversion type */
3970     1, /* min bytes per UChar */
3971     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3972     { 0x1a, 0, 0, 0 }, /* substitution character bytes */
3973     1, /* substitution character length */
3974     FALSE,
3975     FALSE,
3976     0,
3977     0,
3978     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3979 };
3980
3981 namespace {
     /*
      * Shared-data record tying the ISO-2022-CN static data to its
      * implementation table; the anonymous namespace keeps it file-local.
      * NOTE(review): the meaning of the unlabeled fields depends on the
      * UConverterSharedData declaration in ucnv_bld.h, which is not visible here.
      */
3982
3983 const UConverterSharedData _ISO2022CNData={
3984     sizeof(UConverterSharedData), /* structure size */
3985     ~((uint32_t) 0), /* all bits set; presumably the reference counter - confirm */
3986     NULL,
3987     NULL,
3988     &_ISO2022CNStaticData, /* static data defined above */
3989     FALSE,
3990     &_ISO2022CNImpl, /* implementation table defined above */
3991     0, UCNV_MBCS_TABLE_INITIALIZER
3992 };
3993
3994 }  // namespace
3995 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3996
3997 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */