icuSources/common/ucnv2022.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2010, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv2022.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000feb03
  12 *   created by: Markus W. Scherer
  13 *
  14 *   Change history:
  15 *
  16 *   06/29/2000  helena  Major rewrite of the callback APIs.
  17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
  18 *                       Changed implementation of toUnicode
  19 *                       function
  20 *   08/21/2000  Ram     Added support for ISO-2022-KR
   21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
  22 *                       ucnvebdc.c
  23 *   09/20/2000  Ram     Added support for ISO-2022-CN
  24 *                       Added implementations for getNextUChar()
  25 *                       for specific 2022 country variants.
  26 *   10/31/2000  Ram     Implemented offsets logic functions
  27 */
  28
  29 #include "unicode/utypes.h"
  30
  31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uset.h"
  35 #include "unicode/ucnv_err.h"
  36 #include "unicode/ucnv_cb.h"
  37 #include "ucnv_imp.h"
  38 #include "ucnv_bld.h"
  39 #include "ucnv_cnv.h"
  40 #include "ucnvmbcs.h"
  41 #include "cstring.h"
  42 #include "cmemory.h"
  43
  44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  45
  46 #ifdef U_ENABLE_GENERIC_ISO_2022
  47 /*
  48  * I am disabling the generic ISO-2022 converter after proposing to do so on
  49  * the icu mailing list two days ago.
  50  *
  51  * Reasons:
  52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  53  *    its designation sequences, single shifts with return to the previous state,
  54  *    switch-with-no-return to UTF-16BE or similar, etc.
  55  *    This is unlike the language-specific variants like ISO-2022-JP which
  56  *    require a much smaller repertoire of ISO-2022 features.
  57  *    These variants continue to be supported.
  58  * 2. I believe that no one is really using the generic ISO-2022 converter
  59  *    but rather always one of the language-specific variants.
  60  *    Note that ICU's generic ISO-2022 converter has always output one escape
  61  *    sequence followed by UTF-8 for the whole stream.
  62  * 3. Switching between subcharsets is extremely slow, because each time
  63  *    the previous converter is closed and a new one opened,
  64  *    without any kind of caching, least-recently-used list, etc.
  65  * 4. The code is currently buggy, and given the above it does not seem
  66  *    reasonable to spend the time on maintenance.
  67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  68  *    This means, for example, that when ISO-8859-7 is designated, the following
  69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  70  *    The ICU ISO-2022 converter does not handle this - and has no information
  71  *    about which subconverter would have to be shifted vs. which is designed
  72  *    for 7-bit ISO-2022.
  73  *
  74  * Markus Scherer 2003-dec-03
  75  */
  76 #endif
  77
  78 static const char SHIFT_IN_STR[]  = "\x0F";
  79 static const char SHIFT_OUT_STR[] = "\x0E";
  80
  81 #define CR      0x0D
  82 #define LF      0x0A
  83 #define H_TAB   0x09
  84 #define V_TAB   0x0B
  85 #define SPACE   0x20
  86
  87 enum {
  88     HWKANA_START=0xff61,
  89     HWKANA_END=0xff9f
  90 };
  91
  92 /*
  93  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
  94  * as bytes 21..7E. (Subtract 0x80.)
  95  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
  96  * as bytes 20..7F. (Subtract 0x80.)
  97  * Do not encode C1 control codes with native bytes 80..9F
  98  * as bytes 00..1F (C0 control codes).
  99  */
 100 enum {
 101     GR94_START=0xa1,
 102     GR94_END=0xfe,
 103     GR96_START=0xa0,
 104     GR96_END=0xff
 105 };
 106
 107 /*
 108  * ISO 2022 control codes must not be converted from Unicode
 109  * because they would mess up the byte stream.
 110  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 111  * corresponding to SO, SI, and ESC.
 112  */
 113 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
 114
 115 /* for ISO-2022-JP and -CN implementations */
 116 typedef enum  {
 117         /* shared values */
 118         INVALID_STATE=-1,
 119         ASCII = 0,
 120
 121         SS2_STATE=0x10,
 122         SS3_STATE,
 123
 124         /* JP */
 125         ISO8859_1 = 1 ,
 126         ISO8859_7 = 2 ,
 127         JISX201  = 3,
 128         JISX208 = 4,
 129         JISX212 = 5,
 130         GB2312  =6,
 131         KSC5601 =7,
 132         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
 133
 134         /* CN */
 135         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
 136         GB2312_1=1,
 137         ISO_IR_165=2,
 138         CNS_11643=3,
 139
 140         /*
 141          * these are used in StateEnum and ISO2022State variables,
 142          * but CNS_11643 must be used to index into myConverterArray[]
 143          */
 144         CNS_11643_0=0x20,
 145         CNS_11643_1,
 146         CNS_11643_2,
 147         CNS_11643_3,
 148         CNS_11643_4,
 149         CNS_11643_5,
 150         CNS_11643_6,
 151         CNS_11643_7
 152 } StateEnum;
 153
 154 /* is the StateEnum charset value for a DBCS charset? */
 155 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
 156
 157 #define CSM(cs) ((uint16_t)1<<(cs))
 158
 159 /*
 160  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 161  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 162  *
 163  * Note: The converter uses some leniency:
 164  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 165  *   all versions, not just JIS7 and JIS8.
 166  * - ICU does not distinguish between different versions of JIS X 0208.
 167  */
 168 enum { MAX_JA_VERSION=4 };
 169 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
 170     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
 171     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
 172     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 173     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 174     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
 175 };
 176
 177 typedef enum {
 178         ASCII1=0,
 179         LATIN1,
 180         SBCS,
 181         DBCS,
 182         MBCS,
 183         HWKANA
 184 }Cnv2022Type;
 185
 186 typedef struct ISO2022State {
 187     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
 188     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
 189     int8_t prevG;       /* g before single shift (SS2 or SS3) */
 190 } ISO2022State;
 191
 192 #define UCNV_OPTIONS_VERSION_MASK 0xf
 193 #define UCNV_2022_MAX_CONVERTERS 10
 194
 195 typedef struct{
 196     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
 197     UConverter *currentConverter;
 198     Cnv2022Type currentType;
 199     ISO2022State toU2022State, fromU2022State;
 200     uint32_t key;
 201     uint32_t version;
 202 #ifdef U_ENABLE_GENERIC_ISO_2022
 203     UBool isFirstBuffer;
 204 #endif
 205     UBool isEmptySegment;
 206     char name[30];
 207     char locale[3];
 208 }UConverterDataISO2022;
 209
 210 /* Protos */
 211 /* ISO-2022 ----------------------------------------------------------------- */
 212
 213 /*Forward declaration */
 214 U_CFUNC void
 215 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
 216                       UErrorCode * err);
 217 U_CFUNC void
 218 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
 219                                     UErrorCode * err);
 220
 221 #define ESC_2022 0x1B /*ESC*/
 222
 223 typedef enum
 224 {
 225         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
 226         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
 227         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
 228         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
 229 } UCNV_TableStates_2022;
 230
 231 /*
 232 * The way these state transition arrays work is:
 233 * ex : ESC$B is the sequence for JISX208
 234 *      a) First Iteration: char is ESC
 235 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 236 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 237 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 238 *             value of x is stored at escSeqStateTable_Key_2022[0]
 239 *        iii) Save this index as offset
 240 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 241 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 242 *     b) Switch on this state and continue to next char
 243 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 244 *             which is normalize_esq_chars_2022[36] == 4
 245 *         ii) x is currently 1(from above)
 246 *               x<<=5 -- x is now 32
 247 *               x+=normalize_esq_chars_2022[36]
 248 *               now x is 36
 249 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 250 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 251 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 252 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 253 *     c) Switch on this state and continue to next char
 254 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 255 *        ii) x is currently 36 (from above)
 256 *            x<<=5 -- x is now 1152
 257 *            x+=normalize_esq_chars_2022[66]
 258 *            now x is 1161
 259 *       iii) Search for this value in escSeqStateTable_Key_2022[]
  260 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
  261 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
  262 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
  263 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
 264 */
 265
 266
 267 /*Below are the 3 arrays depicting a state transition table*/
 268 static const int8_t normalize_esq_chars_2022[256] = {
 269 /*       0      1       2       3       4      5       6        7       8       9           */
 270
 271          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 272         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 273         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
 274         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
 275         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
 276         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 277         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
 278         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
 279         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
 280         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 281         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 282         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 283         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 284         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 285         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 286         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 287         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 288         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 289         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 290         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 291         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 292         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 293         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 294         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 295         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 296         ,0     ,0      ,0      ,0      ,0      ,0
 297 };
 298
 299 #ifdef U_ENABLE_GENERIC_ISO_2022
 300 /*
 301  * When the generic ISO-2022 converter is completely removed, not just disabled
 302  * per #ifdef, then the following state table and the associated tables that are
 303  * dimensioned with MAX_STATES_2022 should be trimmed.
 304  *
 305  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 306  * the associated escape sequences starting with ESC ( B should be removed.
 307  * This includes the ones with key values 1097 and all of the ones above 1000000.
 308  *
 309  * For the latter, the tables can simply be truncated.
 310  * For the former, since the tables must be kept parallel, it is probably best
 311  * to simply duplicate an adjacent table cell, parallel in all tables.
 312  *
 313  * It may make sense to restructure the tables, especially by using small search
 314  * tables for the variants instead of indexing them parallel to the table here.
 315  */
 316 #endif
 317
 318 #define MAX_STATES_2022 74
 319 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
 320 /*   0           1           2           3           4           5           6           7           8           9           */
 321
 322      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
 323     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
 324     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
 325     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
 326     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
 327     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
 328     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
 329     ,35947631   ,35947635   ,35947636   ,35947638
 330 };
 331
 332 #ifdef U_ENABLE_GENERIC_ISO_2022
 333
 334 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 335  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
 336
 337      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
 338     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
 339     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
 340     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
 341     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
 342     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
 343     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
 344     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
 345 };
 346
 347 #endif
 348
 349 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
 350 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
 351      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 352     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 353     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
 354     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 355     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 356     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 357     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 358     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 359 };
 360
 361
 362 /* Type def for refactoring changeState_2022 code*/
 363 typedef enum{
 364 #ifdef U_ENABLE_GENERIC_ISO_2022
 365     ISO_2022=0,
 366 #endif
 367     ISO_2022_JP=1,
 368     ISO_2022_KR=2,
 369     ISO_2022_CN=3
 370 } Variant2022;
 371
 372 /*********** ISO 2022 Converter Protos ***********/
 373 static void
 374 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
 375
 376 static void
 377  _ISO2022Close(UConverter *converter);
 378
 379 static void
 380 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
 381
 382 static const char*
 383 _ISO2022getName(const UConverter* cnv);
 384
 385 static void
 386 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
 387
 388 static UConverter *
 389 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
 390
 391 #ifdef U_ENABLE_GENERIC_ISO_2022
 392 static void
 393 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
 394 #endif
 395
 396 /*const UConverterSharedData _ISO2022Data;*/
 397 static const UConverterSharedData _ISO2022JPData;
 398 static const UConverterSharedData _ISO2022KRData;
 399 static const UConverterSharedData _ISO2022CNData;
 400
 401 /*************** Converter implementations ******************/
 402
 403 /* The purpose of this function is to get around gcc compiler warnings. */
 404 static U_INLINE void
 405 fromUWriteUInt8(UConverter *cnv,
 406                  const char *bytes, int32_t length,
 407                  uint8_t **target, const char *targetLimit,
 408                  int32_t **offsets,
 409                  int32_t sourceIndex,
 410                  UErrorCode *pErrorCode)
 411 {
 412     char *targetChars = (char *)*target;
 413     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
 414                          offsets, sourceIndex, pErrorCode);
 415     *target = (uint8_t*)targetChars;
 416
 417 }
 418
 419 static U_INLINE void
 420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
 421     if(myConverterData->version == 1) {
 422         UConverter *cnv = myConverterData->currentConverter;
 423
 424         cnv->toUnicodeStatus=0;     /* offset */
 425         cnv->mode=0;                /* state */
 426         cnv->toULength=0;           /* byteIndex */
 427     }
 428 }
 429
 430 static U_INLINE void
 431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
 432    /* in ISO-2022-KR the designator sequence appears only once
 433     * in a file so we append it only once
 434     */
 435     if( converter->charErrorBufferLength==0){
 436
 437         converter->charErrorBufferLength = 4;
 438         converter->charErrorBuffer[0] = 0x1b;
 439         converter->charErrorBuffer[1] = 0x24;
 440         converter->charErrorBuffer[2] = 0x29;
 441         converter->charErrorBuffer[3] = 0x43;
 442     }
 443     if(myConverterData->version == 1) {
 444         UConverter *cnv = myConverterData->currentConverter;
 445
 446         cnv->fromUChar32=0;
 447         cnv->fromUnicodeStatus=1;   /* prevLength */
 448     }
 449 }
 450
 451 static void
 452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
 453
 454     char myLocale[6]={' ',' ',' ',' ',' ',' '};
 455
 456     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
 457     if(cnv->extraInfo != NULL) {
 458         UConverterNamePieces stackPieces;
 459         UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
 460         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
 461         uint32_t version;
 462
 463         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
 464
 465         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
 466         myConverterData->currentType = ASCII1;
 467         cnv->fromUnicodeStatus =FALSE;
 468         if(pArgs->locale){
 469             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
 470         }
 471         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
 472         myConverterData->version = version;
 473         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
 474             (myLocale[2]=='_' || myLocale[2]=='\0'))
 475         {
 476             size_t len=0;
 477             /* open the required converters and cache them */
 478             if(version>MAX_JA_VERSION) {
 479                 /* prevent indexing beyond jpCharsetMasks[] */
 480                 myConverterData->version = version = 0;
 481             }
 482             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
 483                 myConverterData->myConverterArray[ISO8859_7] =
 484                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
 485             }
 486             myConverterData->myConverterArray[JISX208] =
 487                 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
 488             if(jpCharsetMasks[version]&CSM(JISX212)) {
 489                 myConverterData->myConverterArray[JISX212] =
 490                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
 491             }
 492             if(jpCharsetMasks[version]&CSM(GB2312)) {
 493                 myConverterData->myConverterArray[GB2312] =
 494                     ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
 495             }
 496             if(jpCharsetMasks[version]&CSM(KSC5601)) {
 497                 myConverterData->myConverterArray[KSC5601] =
 498                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
 499             }
 500
 501             /* set the function pointers to appropriate funtions */
 502             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 503             uprv_strcpy(myConverterData->locale,"ja");
 504
 505             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
 506             len = uprv_strlen(myConverterData->name);
 507             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
 508             myConverterData->name[len+1]='\0';
 509         }
 510         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
 511             (myLocale[2]=='_' || myLocale[2]=='\0'))
 512         {
 513             const char *cnvName;
 514             if(version==1) {
 515                 cnvName="icu-internal-25546";
 516             } else {
 517                 cnvName="ibm-949";
 518                 myConverterData->version=version=0;
 519             }
 520             if(pArgs->onlyTestIsLoadable) {
 521                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
 522                 uprv_free(cnv->extraInfo);
 523                 cnv->extraInfo=NULL;
 524                 return;
 525             } else {
 526                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
 527                 if (U_FAILURE(*errorCode)) {
 528                     _ISO2022Close(cnv);
 529                     return;
 530                 }
 531
 532                 if(version==1) {
 533                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
 534                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
 535                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
 536                 }else{
 537                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
 538                 }
 539
 540                 /* initialize the state variables */
 541                 setInitialStateToUnicodeKR(cnv, myConverterData);
 542                 setInitialStateFromUnicodeKR(cnv, myConverterData);
 543
 544                 /* set the function pointers to appropriate funtions */
 545                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
 546                 uprv_strcpy(myConverterData->locale,"ko");
 547             }
 548         }
 549         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
 550             (myLocale[2]=='_' || myLocale[2]=='\0'))
 551         {
 552
 553             /* open the required converters and cache them */
 554             myConverterData->myConverterArray[GB2312_1] =
 555                 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
 556             if(version==1) {
 557                 myConverterData->myConverterArray[ISO_IR_165] =
 558                     ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
 559             }
 560             myConverterData->myConverterArray[CNS_11643] =
 561                 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
 562
 563
 564             /* set the function pointers to appropriate funtions */
 565             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
 566             uprv_strcpy(myConverterData->locale,"cn");
 567
 568             if (version==0){
 569                 myConverterData->version = 0;
 570                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
 571             }else if (version==1){
 572                 myConverterData->version = 1;
 573                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
 574             }else {
 575                 myConverterData->version = 2;
 576                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
 577             }
 578         }
 579         else{
 580 #ifdef U_ENABLE_GENERIC_ISO_2022
 581             myConverterData->isFirstBuffer = TRUE;
 582
 583             /* append the UTF-8 escape sequence */
 584             cnv->charErrorBufferLength = 3;
 585             cnv->charErrorBuffer[0] = 0x1b;
 586             cnv->charErrorBuffer[1] = 0x25;
 587             cnv->charErrorBuffer[2] = 0x42;
 588
 589             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
 590             /* initialize the state variables */
 591             uprv_strcpy(myConverterData->name,"ISO_2022");
 592 #else
 593             *errorCode = U_UNSUPPORTED_ERROR;
 594             return;
 595 #endif
 596         }
 597
 598         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
 599
 600         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
 601             _ISO2022Close(cnv);
 602         }
 603     } else {
 604         *errorCode = U_MEMORY_ALLOCATION_ERROR;
 605     }
 606 }
 607
 608
 609 static void
 610 _ISO2022Close(UConverter *converter) {
 611     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
 612     UConverterSharedData **array = myData->myConverterArray;
 613     int32_t i;
 614
 615     if (converter->extraInfo != NULL) {
 616         /*close the array of converter pointers and free the memory*/
 617         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
 618             if(array[i]!=NULL) {
 619                 ucnv_unloadSharedDataIfReady(array[i]);
 620             }
 621         }
 622
 623         ucnv_close(myData->currentConverter);
 624
 625         if(!converter->isExtraLocal){
 626             uprv_free (converter->extraInfo);
 627             converter->extraInfo = NULL;
 628         }
 629     }
 630 }
 631
 632 static void
 633 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
 634     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
 635     if(choice<=UCNV_RESET_TO_UNICODE) {
 636         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
 637         myConverterData->key = 0;
 638         myConverterData->isEmptySegment = FALSE;
 639     }
 640     if(choice!=UCNV_RESET_TO_UNICODE) {
 641         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
 642     }
 643 #ifdef U_ENABLE_GENERIC_ISO_2022
 644     if(myConverterData->locale[0] == 0){
 645         if(choice<=UCNV_RESET_TO_UNICODE) {
 646             myConverterData->isFirstBuffer = TRUE;
 647             myConverterData->key = 0;
 648             if (converter->mode == UCNV_SO){
 649                 ucnv_close (myConverterData->currentConverter);
 650                 myConverterData->currentConverter=NULL;
 651             }
 652             converter->mode = UCNV_SI;
 653         }
 654         if(choice!=UCNV_RESET_TO_UNICODE) {
 655             /* re-append UTF-8 escape sequence */
 656             converter->charErrorBufferLength = 3;
 657             converter->charErrorBuffer[0] = 0x1b;
 658             converter->charErrorBuffer[1] = 0x28;
 659             converter->charErrorBuffer[2] = 0x42;
 660         }
 661     }
 662     else
 663 #endif
 664     {
 665         /* reset the state variables */
 666         if(myConverterData->locale[0] == 'k'){
 667             if(choice<=UCNV_RESET_TO_UNICODE) {
 668                 setInitialStateToUnicodeKR(converter, myConverterData);
 669             }
 670             if(choice!=UCNV_RESET_TO_UNICODE) {
 671                 setInitialStateFromUnicodeKR(converter, myConverterData);
 672             }
 673         }
 674     }
 675 }
 676
 677 static const char*
 678 _ISO2022getName(const UConverter* cnv){
 679     if(cnv->extraInfo){
 680         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
 681         return myData->name;
 682     }
 683     return NULL;
 684 }
 685
 686
 687 /*************** to unicode *******************/
 688 /****************************************************************************
 689  * Recognized escape sequences are
 690  * <ESC>(B  ASCII
 691  * <ESC>.A  ISO-8859-1
 692  * <ESC>.F  ISO-8859-7
 693  * <ESC>(J  JISX-201
 694  * <ESC>(I  JISX-201
 695  * <ESC>$B  JISX-208
 696  * <ESC>$@  JISX-208
 697  * <ESC>$(D JISX-212
 698  * <ESC>$A  GB2312
 699  * <ESC>$(C KSC5601
 700  */
/*
 * ISO-2022-JP: indexed by the table offset that getKey_2022() reports for a
 * complete escape sequence; each entry is the StateEnum value that
 * changeState_2022() acts on. An INVALID_STATE entry makes changeState_2022()
 * report U_UNSUPPORTED_ESCAPE_SEQUENCE.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 712
 713 /*************** to unicode *******************/
/*
 * ISO-2022-CN: indexed by the table offset that getKey_2022() reports for a
 * complete escape sequence; each entry is the StateEnum value that
 * changeState_2022() acts on (SS2/SS3 single shifts, or the charset to
 * designate). An INVALID_STATE entry makes changeState_2022() report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 725
 726
 727 static UCNV_TableStates_2022
 728 getKey_2022(char c,int32_t* key,int32_t* offset){
 729     int32_t togo;
 730     int32_t low = 0;
 731     int32_t hi = MAX_STATES_2022;
 732     int32_t oldmid=0;
 733
 734     togo = normalize_esq_chars_2022[(uint8_t)c];
 735     if(togo == 0) {
 736         /* not a valid character anywhere in an escape sequence */
 737         *key = 0;
 738         *offset = 0;
 739         return INVALID_2022;
 740     }
 741     togo = (*key << 5) + togo;
 742
 743     while (hi != low)  /*binary search*/{
 744
 745         register int32_t mid = (hi+low) >> 1; /*Finds median*/
 746
 747         if (mid == oldmid)
 748             break;
 749
 750         if (escSeqStateTable_Key_2022[mid] > togo){
 751             hi = mid;
 752         }
 753         else if (escSeqStateTable_Key_2022[mid] < togo){
 754             low = mid;
 755         }
 756         else /*we found it*/{
 757             *key = togo;
 758             *offset = mid;
 759             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
 760         }
 761         oldmid = mid;
 762
 763     }
 764
 765     *key = 0;
 766     *offset = 0;
 767     return INVALID_2022;
 768 }
 769
 770 /*runs through a state machine to determine the escape sequence - codepage correspondance
 771  */
 772 static void
 773 changeState_2022(UConverter* _this,
 774                 const char** source,
 775                 const char* sourceLimit,
 776                 Variant2022 var,
 777                 UErrorCode* err){
 778     UCNV_TableStates_2022 value;
 779     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
 780     uint32_t key = myData2022->key;
 781     int32_t offset = 0;
 782     int8_t initialToULength = _this->toULength;
 783     char c;
 784
 785     value = VALID_NON_TERMINAL_2022;
 786     while (*source < sourceLimit) {
 787         c = *(*source)++;
 788         _this->toUBytes[_this->toULength++]=(uint8_t)c;
 789         value = getKey_2022(c,(int32_t *) &key, &offset);
 790
 791         switch (value){
 792
 793         case VALID_NON_TERMINAL_2022 :
 794             /* continue with the loop */
 795             break;
 796
 797         case VALID_TERMINAL_2022:
 798             key = 0;
 799             goto DONE;
 800
 801         case INVALID_2022:
 802             goto DONE;
 803
 804         case VALID_MAYBE_TERMINAL_2022:
 805 #ifdef U_ENABLE_GENERIC_ISO_2022
 806             /* ESC ( B is ambiguous only for ISO_2022 itself */
 807             if(var == ISO_2022) {
 808                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
 809                 _this->toULength = 0;
 810
 811                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
 812
 813                 /* continue with the loop */
 814                 value = VALID_NON_TERMINAL_2022;
 815                 break;
 816             } else
 817 #endif
 818             {
 819                 /* not ISO_2022 itself, finish here */
 820                 value = VALID_TERMINAL_2022;
 821                 key = 0;
 822                 goto DONE;
 823             }
 824         }
 825     }
 826
 827 DONE:
 828     myData2022->key = key;
 829
 830     if (value == VALID_NON_TERMINAL_2022) {
 831         /* indicate that the escape sequence is incomplete: key!=0 */
 832         return;
 833     } else if (value == INVALID_2022 ) {
 834         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 835     } else /* value == VALID_TERMINAL_2022 */ {
 836         switch(var){
 837 #ifdef U_ENABLE_GENERIC_ISO_2022
 838         case ISO_2022:
 839         {
 840             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
 841             if(chosenConverterName == NULL) {
 842                 /* SS2 or SS3 */
 843                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 844                 _this->toUCallbackReason = UCNV_UNASSIGNED;
 845                 return;
 846             }
 847
 848             _this->mode = UCNV_SI;
 849             ucnv_close(myData2022->currentConverter);
 850             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
 851             if(U_SUCCESS(*err)) {
 852                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
 853                 _this->mode = UCNV_SO;
 854             }
 855             break;
 856         }
 857 #endif
 858         case ISO_2022_JP:
 859             {
 860                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
 861                 switch(tempState) {
 862                 case INVALID_STATE:
 863                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 864                     break;
 865                 case SS2_STATE:
 866                     if(myData2022->toU2022State.cs[2]!=0) {
 867                         if(myData2022->toU2022State.g<2) {
 868                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 869                         }
 870                         myData2022->toU2022State.g=2;
 871                     } else {
 872                         /* illegal to have SS2 before a matching designator */
 873                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 874                     }
 875                     break;
 876                 /* case SS3_STATE: not used in ISO-2022-JP-x */
 877                 case ISO8859_1:
 878                 case ISO8859_7:
 879                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 880                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 881                     } else {
 882                         /* G2 charset for SS2 */
 883                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
 884                     }
 885                     break;
 886                 default:
 887                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 888                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 889                     } else {
 890                         /* G0 charset */
 891                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
 892                     }
 893                     break;
 894                 }
 895             }
 896             break;
 897         case ISO_2022_CN:
 898             {
 899                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
 900                 switch(tempState) {
 901                 case INVALID_STATE:
 902                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 903                     break;
 904                 case SS2_STATE:
 905                     if(myData2022->toU2022State.cs[2]!=0) {
 906                         if(myData2022->toU2022State.g<2) {
 907                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 908                         }
 909                         myData2022->toU2022State.g=2;
 910                     } else {
 911                         /* illegal to have SS2 before a matching designator */
 912                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 913                     }
 914                     break;
 915                 case SS3_STATE:
 916                     if(myData2022->toU2022State.cs[3]!=0) {
 917                         if(myData2022->toU2022State.g<2) {
 918                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 919                         }
 920                         myData2022->toU2022State.g=3;
 921                     } else {
 922                         /* illegal to have SS3 before a matching designator */
 923                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 924                     }
 925                     break;
 926                 case ISO_IR_165:
 927                     if(myData2022->version==0) {
 928                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 929                         break;
 930                     }
 931                     /*fall through*/
 932                 case GB2312_1:
 933                     /*fall through*/
 934                 case CNS_11643_1:
 935                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
 936                     break;
 937                 case CNS_11643_2:
 938                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
 939                     break;
 940                 default:
 941                     /* other CNS 11643 planes */
 942                     if(myData2022->version==0) {
 943                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 944                     } else {
 945                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
 946                     }
 947                     break;
 948                 }
 949             }
 950             break;
 951         case ISO_2022_KR:
 952             if(offset==0x30){
 953                 /* nothing to be done, just accept this one escape sequence */
 954             } else {
 955                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 956             }
 957             break;
 958
 959         default:
 960             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 961             break;
 962         }
 963     }
 964     if(U_SUCCESS(*err)) {
 965         _this->toULength = 0;
 966     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
 967         if(_this->toULength>1) {
 968             /*
 969              * Ticket 5691: consistent illegal sequences:
 970              * - We include at least the first byte (ESC) in the illegal sequence.
 971              * - If any of the non-initial bytes could be the start of a character,
 972              *   we stop the illegal sequence before the first one of those.
 973              *   In escape sequences, all following bytes are "printable", that is,
 974              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
 975              *   they are valid single/lead bytes.
 976              *   For simplicity, we always only report the initial ESC byte as the
 977              *   illegal sequence and back out all other bytes we looked at.
 978              */
 979             /* Back out some bytes. */
 980             int8_t backOutDistance=_this->toULength-1;
 981             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
 982             if(backOutDistance<=bytesFromThisBuffer) {
 983                 /* same as initialToULength<=1 */
 984                 *source-=backOutDistance;
 985             } else {
 986                 /* Back out bytes from the previous buffer: Need to replay them. */
 987                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
 988                 /* same as -(initialToULength-1) */
 989                 /* preToULength is negative! */
 990                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
 991                 *source-=bytesFromThisBuffer;
 992             }
 993             _this->toULength=1;
 994         }
 995     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
 996         _this->toUCallbackReason = UCNV_UNASSIGNED;
 997     }
 998 }
 999
1000 /*Checks the characters of the buffer against valid 2022 escape sequences
1001 *if the match we return a pointer to the initial start of the sequence otherwise
1002 *we return sourceLimit
1003 */
1004 /*for 2022 looks ahead in the stream
1005  *to determine the longest possible convertible
1006  *data stream
1007  */
1008 static U_INLINE const char*
1009 getEndOfBuffer_2022(const char** source,
1010                    const char* sourceLimit,
1011                    UBool flush){
1012
1013     const char* mySource = *source;
1014
1015 #ifdef U_ENABLE_GENERIC_ISO_2022
1016     if (*source >= sourceLimit)
1017         return sourceLimit;
1018
1019     do{
1020
1021         if (*mySource == ESC_2022){
1022             int8_t i;
1023             int32_t key = 0;
1024             int32_t offset;
1025             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1026
1027             /* Kludge: I could not
1028             * figure out the reason for validating an escape sequence
1029             * twice - once here and once in changeState_2022().
1030             * is it possible to have an ESC character in a ISO2022
1031             * byte stream which is valid in a code page? Is it legal?
1032             */
1033             for (i=0;
1034             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1035             i++) {
1036                 value =  getKey_2022(*(mySource+i), &key, &offset);
1037             }
1038             if (value > 0 || *mySource==ESC_2022)
1039                 return mySource;
1040
1041             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1042                 return sourceLimit;
1043         }
1044     }while (++mySource < sourceLimit);
1045
1046     return sourceLimit;
1047 #else
1048     while(mySource < sourceLimit && *mySource != ESC_2022) {
1049         ++mySource;
1050     }
1051     return mySource;
1052 #endif
1053 }
1054
1055
1056 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1057  * any future change in _MBCSFromUChar32() function should be reflected here.
1058  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1059  */
1060 static U_INLINE int32_t
1061 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1062                                          UChar32 c,
1063                                          uint32_t* value,
1064                                          UBool useFallback,
1065                                          int outputType)
1066 {
1067     const int32_t *cx;
1068     const uint16_t *table;
1069     uint32_t stage2Entry;
1070     uint32_t myValue;
1071     int32_t length;
1072     const uint8_t *p;
1073     /*
1074      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1075      * Use internal version of ucnv_open() that verifies that the new structures are available,
1076      * else U_INTERNAL_PROGRAM_ERROR.
1077      */
1078     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1079     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1080         table=sharedData->mbcs.fromUnicodeTable;
1081         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1082         /* get the bytes and the length for the output */
1083         if(outputType==MBCS_OUTPUT_2){
1084             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1085             if(myValue<=0xff) {
1086                 length=1;
1087             } else {
1088                 length=2;
1089             }
1090         } else /* outputType==MBCS_OUTPUT_3 */ {
1091             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1092             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1093             if(myValue<=0xff) {
1094                 length=1;
1095             } else if(myValue<=0xffff) {
1096                 length=2;
1097             } else {
1098                 length=3;
1099             }
1100         }
1101         /* is this code point assigned, or do we use fallbacks? */
1102         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1103             /* assigned */
1104             *value=myValue;
1105             return length;
1106         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1107             /*
1108              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1109              * There is no way with this data structure for fallback output
1110              * to be a zero byte.
1111              */
1112             *value=myValue;
1113             return -length;
1114         }
1115     }
1116
1117     cx=sharedData->mbcs.extIndexes;
1118     if(cx!=NULL) {
1119         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1120     }
1121
1122     /* unassigned */
1123     return 0;
1124 }
1125
1126 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1127  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1128  * @param retval pointer to output byte
1129  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1130  */
1131 static U_INLINE int32_t
1132 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1133                                        UChar32 c,
1134                                        uint32_t* retval,
1135                                        UBool useFallback)
1136 {
1137     const uint16_t *table;
1138     int32_t value;
1139     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1140     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1141         return 0;
1142     }
1143     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1144     table=sharedData->mbcs.fromUnicodeTable;
1145     /* get the byte for the output */
1146     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1147     /* is this code point assigned, or do we use fallbacks? */
1148     *retval=(uint32_t)(value&0xff);
1149     if(value>=0xf00) {
1150         return 1;  /* roundtrip */
1151     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1152         return -1;  /* fallback taken */
1153     } else {
1154         return 0;  /* no mapping */
1155     }
1156 }
1157
/*
 * Moves a strict EUC DBCS value into the ISO 2022 byte range.
 *
 * The low 16 bits of value must lie in A1A1..FEFE and the low byte in A1..FE;
 * then 0x80 is subtracted from each of the two bytes (result range 21..7E).
 *
 * @param value the codepage value to check and shift
 * @return value - 0x8080, or 0 if the value is out of range
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairDistance = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDistance = (uint8_t)(value - 0xa1);

    if(pairDistance > (0xfefe - 0xa1a1) || trailDistance > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1174
1175 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of a 2022 code point.
 * The shifted value is returned only if its low 16 bits lie in A1A1..FEFE and
 * its low byte in A1..FE; otherwise the 2022 code point is returned unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;

    if( (uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1) ||
        (uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1191 #endif
1192
1193 #ifdef U_ENABLE_GENERIC_ISO_2022
1194
1195 /**********************************************************************************
1196 *  ISO-2022 Converter
1197 *
1198 *
1199 */
1200
/*
 * To-Unicode conversion for the generic ISO-2022 converter
 * (only compiled with U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two steps until the input is used up or the function
 * has to return to its caller:
 * 1. Unless an escape sequence is already in progress (myData->key!=0), the
 *    bytes up to the next ESC (or the end of the buffer) are converted with
 *    the current sub-converter, which is opened as "ASCII" on first use.
 * 2. changeState_2022() is called to consume the escape sequence.
 *
 * @param args conversion arguments; source/target are advanced, and
 *             args->converter is temporarily replaced by the sub-converter
 *             around the ucnv_toUnicode() call
 * @param err  in/out error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no sub-converter yet: open "ASCII" as the initial one */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                args->converter = myData->currentConverter;
                /* flush the sub-converter only if this segment reaches the real end of the input */
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /* note: && binds tighter than ||, so the third operand is (offsets-related) || (partial char before ESC) */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing an escape sequence from a previous buffer */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1304
1305 #endif
1306
1307 /*
1308  * To Unicode Callback helper function
1309  */
1310 static void
1311 toUnicodeCallback(UConverter *cnv,
1312                   const uint32_t sourceChar, const uint32_t targetUniChar,
1313                   UErrorCode* err){
1314     if(sourceChar>0xff){
1315         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1316         cnv->toUBytes[1] = (uint8_t)sourceChar;
1317         cnv->toULength = 2;
1318     }
1319     else{
1320         cnv->toUBytes[0] =(char) sourceChar;
1321         cnv->toULength = 1;
1322     }
1323
1324     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1325         *err = U_INVALID_CHAR_FOUND;
1326     }
1327     else{
1328         *err = U_ILLEGAL_CHAR_FOUND;
1329     }
1330 }
1331
1332 /**************************************ISO-2022-JP*************************************************/
1333
1334 /************************************** IMPORTANT **************************************************
1335 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1336 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1337 * The converter iterates over each Unicode codepoint
1338 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1339 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1340 * would do as far as possible.
1341 *
1342 * If the implementation of these macros or structure of sharedData struct change in the future, make
1343 * sure that ISO-2022 is also changed.
1344 ***************************************************************************************************
1345 */
1346
1347 /***************************************************************************************************
1348 * Rules for ISO-2022-jp encoding
1349 * (i)   Escape sequences must be fully contained within a line they should not
1350 *       span new lines or CRs
1351 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1352 *       JIS-Roman character escape sequence should follow before the line terminates
1353 * (iii) If the first character on the line is represented by two bytes then a two
1354 *       byte character escape sequence should precede it
1355 * (iv)  If no escape sequence is encountered then the characters are ASCII
1356 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1357 *       and invoked with SS2 (ESC N).
1358 * (vi)  If there is any G0 designation in text, there must be a switch to
1359 *       ASCII or to JIS X 0201-Roman before a space character (but not
1360 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1361 *       characters such as tab or CRLF.
1362 * (vii) Supported encodings:
1363 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1364 *
1365 *  source : RFC-1554
1366 *
1367 *          JISX201, JISX208,JISX212 : new .cnv data files created
1368 *          KSC5601 : alias to ibm-949 mapping table
1369 *          GB2312 : alias to ibm-1386 mapping table
1370 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1371 *          ISO-8859-7 : alias to ibm-9409 mapping table
1372 */
1373
1374 /* preference order of JP charsets */
1375 static const StateEnum jpCharsetPref[]={
1376     ASCII,
1377     JISX201,
1378     ISO8859_1,
1379     ISO8859_7,
1380     JISX208,
1381     JISX212,
1382     GB2312,
1383     KSC5601,
1384     HWKANA_7BIT
1385 };
1386
/*
 * Designation escape sequences, one row per charset.
 * Rows are indexed by the charset enum constant (e.g. JISX201 = 3),
 * NOT by the position of the charset in jpCharsetPref[]!
 * Bytes are spelled numerically so that they do not depend on the
 * execution character set; unused trailing bytes of each row are zero.
 */
static const char escSeqChars[][6] ={
    { 0x1B, 0x28, 0x42 },           /* <ESC>(B  ASCII       */
    { 0x1B, 0x2E, 0x41 },           /* <ESC>.A  ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },           /* <ESC>.F  ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },           /* <ESC>(J  JISX-201    */
    { 0x1B, 0x24, 0x42 },           /* <ESC>$B  JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 },     /* <ESC>$(D JISX-212    */
    { 0x1B, 0x24, 0x41 },           /* <ESC>$A  GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 },     /* <ESC>$(C KSC5601     */
    { 0x1B, 0x28, 0x49 }            /* <ESC>(I  HWKANA_7BIT */
};
/*
 * Byte length of each designation escape sequence,
 * in the same charset-enum order as the sequences themselves.
 */
static const int8_t escSeqCharsLen[] ={
    /* ASCII       <ESC>(B  */ 3,
    /* ISO-8859-1  <ESC>.A  */ 3,
    /* ISO-8859-7  <ESC>.F  */ 3,
    /* JISX-201    <ESC>(J  */ 3,
    /* JISX-208    <ESC>$B  */ 3,
    /* JISX-212    <ESC>$(D */ 4,
    /* GB2312      <ESC>$A  */ 3,
    /* KSC5601     <ESC>$(C */ 4,
    /* HWKANA_7BIT <ESC>(I  */ 3
};
1414
1415 /*
1416 * The iteration over various code pages works this way:
1417 * i)   Get the currentState from myConverterData->currentState
1418 * ii)  Check if the character is mapped to a valid character in the currentState
1419 *      Yes ->  a) set the initIterState to currentState
1420 *       b) remain in this state until an invalid character is found
1421 *      No  ->  a) go to the next code page and find the character
1422 * iii) Before changing the state (incrementing the current state), check if the current state
1423 *      is equal to the initIterState
1424 *      Yes ->  A character that cannot be represented in any of the supported encodings
1425 *       break and return a U_INVALID_CHARACTER error
1426 *      No  ->  Continue and find the character in next code page
1427 *
1428 *
1429 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1430 */
1431
/*
 * Map a JIS X 0201 byte (00..7F) to Unicode.
 * Only two positions differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        /* every other value maps to itself */
        return value;
    }
}
1445
/*
 * Map a Unicode code point to a JIS X 0201 byte (00..7F).
 * Returns U+FFFE if the code point is unmappable; this includes U+005C and
 * U+007E, whose byte positions are taken by U+00A5 and U+203E.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1460
/*
 * Convert a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0) to the corresponding pair of JIS X 0208 bytes 21..7E.
 * Returns 0 if the pair lies beyond the JIS X 0208 range (value > 0xEFFC).
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* rebase the lead byte: 81..9F and E0..EF use different offsets */
    lead = value & 0xff00;
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    lead <<= 1;

    if(trail > 0x9e) {
        /* trail 9F..FC */
        return lead | (trail - 0x7e);
    }

    /* trail <= 9E: one row lower; the trail offset changes above 7E */
    lead -= 0x100;
    return lead | (trail <= 0x7e ? trail - 0x1f : trail - 0x20);
}
1496
/*
 * Convert a pair of JIS X 0208 bytes (each 21..7E) to a Shift-JIS byte pair,
 * written to bytes[0] (lead) and bytes[1] (trail).
 * If either input byte is outside 21..7E the result is made invalid for
 * Shift-JIS so that the converter catches it. Some invalid inputs already
 * produce equally invalid Shift-JIS bytes and are not tested explicitly.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead, trail;

    if((c1 & 1) == 0) {
        /* even first byte: the trail lands in the upper range 9F..FC */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* odd first byte: round up, the trail lands in 40..7E or 80..9E */
        ++c1;
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    }

    lead = (uint8_t)(c1 >> 1);
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1533
1534 /*
1535  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1536  * Katakana.
1537  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1538  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1539  * These were the only fallbacks in ICU's jisx-208.ucm file.
1540  */
1541 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1542     0x2123,  /* U+FF61 */
1543     0x2156,
1544     0x2157,
1545     0x2122,
1546     0x2126,
1547     0x2572,
1548     0x2521,
1549     0x2523,
1550     0x2525,
1551     0x2527,
1552     0x2529,
1553     0x2563,
1554     0x2565,
1555     0x2567,
1556     0x2543,
1557     0x213C,  /* U+FF70 */
1558     0x2522,
1559     0x2524,
1560     0x2526,
1561     0x2528,
1562     0x252A,
1563     0x252B,
1564     0x252D,
1565     0x252F,
1566     0x2531,
1567     0x2533,
1568     0x2535,
1569     0x2537,
1570     0x2539,
1571     0x253B,
1572     0x253D,
1573     0x253F,  /* U+FF80 */
1574     0x2541,
1575     0x2544,
1576     0x2546,
1577     0x2548,
1578     0x254A,
1579     0x254B,
1580     0x254C,
1581     0x254D,
1582     0x254E,
1583     0x254F,
1584     0x2552,
1585     0x2555,
1586     0x2558,
1587     0x255B,
1588     0x255E,
1589     0x255F,  /* U+FF90 */
1590     0x2560,
1591     0x2561,
1592     0x2562,
1593     0x2564,
1594     0x2566,
1595     0x2568,
1596     0x2569,
1597     0x256A,
1598     0x256B,
1599     0x256C,
1600     0x256D,
1601     0x256F,
1602     0x2573,
1603     0x212B,
1604     0x212C   /* U+FF9F */
1605 };
1606
1607 static void
1608 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1609     UConverter *cnv = args->converter;
1610     UConverterDataISO2022 *converterData;
1611     ISO2022State *pFromU2022State;
1612     uint8_t *target = (uint8_t *) args->target;
1613     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1614     const UChar* source = args->source;
1615     const UChar* sourceLimit = args->sourceLimit;
1616     int32_t* offsets = args->offsets;
1617     UChar32 sourceChar;
1618     char buffer[8];
1619     int32_t len, outLen;
1620     int8_t choices[10];
1621     int32_t choiceCount;
1622     uint32_t targetValue = 0;
1623     UBool useFallback;
1624
1625     int32_t i;
1626     int8_t cs, g;
1627
1628     /* set up the state */
1629     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1630     pFromU2022State   = &converterData->fromU2022State;
1631
1632     choiceCount = 0;
1633
1634     /* check if the last codepoint of previous buffer was a lead surrogate*/
1635     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1636         goto getTrail;
1637     }
1638
1639     while(source < sourceLimit) {
1640         if(target < targetLimit) {
1641
1642             sourceChar  = *(source++);
1643             /*check if the char is a First surrogate*/
1644             if(UTF_IS_SURROGATE(sourceChar)) {
1645                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1646 getTrail:
1647                     /*look ahead to find the trail surrogate*/
1648                     if(source < sourceLimit) {
1649                         /* test the following code unit */
1650                         UChar trail=(UChar) *source;
1651                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1652                             source++;
1653                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1654                             cnv->fromUChar32=0x00;
1655                             /* convert this supplementary code point */
1656                             /* exit this condition tree */
1657                         } else {
1658                             /* this is an unmatched lead code unit (1st surrogate) */
1659                             /* callback(illegal) */
1660                             *err=U_ILLEGAL_CHAR_FOUND;
1661                             cnv->fromUChar32=sourceChar;
1662                             break;
1663                         }
1664                     } else {
1665                         /* no more input */
1666                         cnv->fromUChar32=sourceChar;
1667                         break;
1668                     }
1669                 } else {
1670                     /* this is an unmatched trail code unit (2nd surrogate) */
1671                     /* callback(illegal) */
1672                     *err=U_ILLEGAL_CHAR_FOUND;
1673                     cnv->fromUChar32=sourceChar;
1674                     break;
1675                 }
1676             }
1677
1678             /* do not convert SO/SI/ESC */
1679             if(IS_2022_CONTROL(sourceChar)) {
1680                 /* callback(illegal) */
1681                 *err=U_ILLEGAL_CHAR_FOUND;
1682                 cnv->fromUChar32=sourceChar;
1683                 break;
1684             }
1685
1686             /* do the conversion */
1687
1688             if(choiceCount == 0) {
1689                 uint16_t csm;
1690
1691                 /*
1692                  * The csm variable keeps track of which charsets are allowed
1693                  * and not used yet while building the choices[].
1694                  */
1695                 csm = jpCharsetMasks[converterData->version];
1696                 choiceCount = 0;
1697
1698                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1699                 if(converterData->version == 3 || converterData->version == 4) {
1700                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1701                 }
1702                 /* Do not try single-byte half-width Katakana for other versions. */
1703                 csm &= ~CSM(HWKANA_7BIT);
1704
1705                 /* try the current G0 charset */
1706                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1707                 csm &= ~CSM(cs);
1708
1709                 /* try the current G2 charset */
1710                 if((cs = pFromU2022State->cs[2]) != 0) {
1711                     choices[choiceCount++] = cs;
1712                     csm &= ~CSM(cs);
1713                 }
1714
1715                 /* try all the other possible charsets */
1716                 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1717                     cs = (int8_t)jpCharsetPref[i];
1718                     if(CSM(cs) & csm) {
1719                         choices[choiceCount++] = cs;
1720                         csm &= ~CSM(cs);
1721                     }
1722                 }
1723             }
1724
1725             cs = g = 0;
1726             /*
1727              * len==0: no mapping found yet
1728              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1729              * len>0: found a roundtrip result, done
1730              */
1731             len = 0;
1732             /*
1733              * We will turn off useFallback after finding a fallback,
1734              * but we still get fallbacks from PUA code points as usual.
1735              * Therefore, we will also need to check that we don't overwrite
1736              * an early fallback with a later one.
1737              */
1738             useFallback = cnv->useFallback;
1739
1740             for(i = 0; i < choiceCount && len <= 0; ++i) {
1741                 uint32_t value;
1742                 int32_t len2;
1743                 int8_t cs0 = choices[i];
1744                 switch(cs0) {
1745                 case ASCII:
1746                     if(sourceChar <= 0x7f) {
1747                         targetValue = (uint32_t)sourceChar;
1748                         len = 1;
1749                         cs = cs0;
1750                         g = 0;
1751                     }
1752                     break;
1753                 case ISO8859_1:
1754                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1755                         targetValue = (uint32_t)sourceChar - 0x80;
1756                         len = 1;
1757                         cs = cs0;
1758                         g = 2;
1759                     }
1760                     break;
1761                 case HWKANA_7BIT:
1762                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1763                         if(converterData->version==3) {
1764                             /* JIS7: use G1 (SO) */
1765                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1766                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1767                             len = 1;
1768                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1769                             g = 1;
1770                         } else if(converterData->version==4) {
1771                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1772                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1773                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1774                             len = 1;
1775
1776                             cs = pFromU2022State->cs[0];
1777                             if(IS_JP_DBCS(cs)) {
1778                                 /* switch from a DBCS charset to JISX201 */
1779                                 cs = (int8_t)JISX201;
1780                             }
1781                             /* else stay in the current G0 charset */
1782                             g = 0;
1783                         }
1784                         /* else do not use HWKANA_7BIT with other versions */
1785                     }
1786                     break;
1787                 case JISX201:
1788                     /* G0 SBCS */
1789                     value = jisx201FromU(sourceChar);
1790                     if(value <= 0x7f) {
1791                         targetValue = value;
1792                         len = 1;
1793                         cs = cs0;
1794                         g = 0;
1795                         useFallback = FALSE;
1796                     }
1797                     break;
1798                 case JISX208:
1799                     /* G0 DBCS from Shift-JIS table */
1800                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1801                                 converterData->myConverterArray[cs0],
1802                                 sourceChar, &value,
1803                                 useFallback, MBCS_OUTPUT_2);
1804                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1805                         value = _2022FromSJIS(value);
1806                         if(value != 0) {
1807                             targetValue = value;
1808                             len = len2;
1809                             cs = cs0;
1810                             g = 0;
1811                             useFallback = FALSE;
1812                         }
1813                     } else if(len == 0 && useFallback &&
1814                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1815                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1816                         len = -2;
1817                         cs = cs0;
1818                         g = 0;
1819                         useFallback = FALSE;
1820                     }
1821                     break;
1822                 case ISO8859_7:
1823                     /* G0 SBCS forced to 7-bit output */
1824                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1825                                 converterData->myConverterArray[cs0],
1826                                 sourceChar, &value,
1827                                 useFallback);
1828                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1829                         targetValue = value - 0x80;
1830                         len = len2;
1831                         cs = cs0;
1832                         g = 2;
1833                         useFallback = FALSE;
1834                     }
1835                     break;
1836                 default:
1837                     /* G0 DBCS */
1838                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1839                                 converterData->myConverterArray[cs0],
1840                                 sourceChar, &value,
1841                                 useFallback, MBCS_OUTPUT_2);
1842                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1843                         if(cs0 == KSC5601) {
1844                             /*
1845                              * Check for valid bytes for the encoding scheme.
1846                              * This is necessary because the sub-converter (windows-949)
1847                              * has a broader encoding scheme than is valid for 2022.
1848                              */
1849                             value = _2022FromGR94DBCS(value);
1850                             if(value == 0) {
1851                                 break;
1852                             }
1853                         }
1854                         targetValue = value;
1855                         len = len2;
1856                         cs = cs0;
1857                         g = 0;
1858                         useFallback = FALSE;
1859                     }
1860                     break;
1861                 }
1862             }
1863
1864             if(len != 0) {
1865                 if(len < 0) {
1866                     len = -len;  /* fallback */
1867                 }
1868                 outLen = 0; /* count output bytes */
1869
1870                 /* write SI if necessary (only for JIS7) */
1871                 if(pFromU2022State->g == 1 && g == 0) {
1872                     buffer[outLen++] = UCNV_SI;
1873                     pFromU2022State->g = 0;
1874                 }
1875
1876                 /* write the designation sequence if necessary */
1877                 if(cs != pFromU2022State->cs[g]) {
1878                     int32_t escLen = escSeqCharsLen[cs];
1879                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1880                     outLen += escLen;
1881                     pFromU2022State->cs[g] = cs;
1882
1883                     /* invalidate the choices[] */
1884                     choiceCount = 0;
1885                 }
1886
1887                 /* write the shift sequence if necessary */
1888                 if(g != pFromU2022State->g) {
1889                     switch(g) {
1890                     /* case 0 handled before writing escapes */
1891                     case 1:
1892                         buffer[outLen++] = UCNV_SO;
1893                         pFromU2022State->g = 1;
1894                         break;
1895                     default: /* case 2 */
1896                         buffer[outLen++] = 0x1b;
1897                         buffer[outLen++] = 0x4e;
1898                         break;
1899                     /* no case 3: no SS3 in ISO-2022-JP-x */
1900                     }
1901                 }
1902
1903                 /* write the output bytes */
1904                 if(len == 1) {
1905                     buffer[outLen++] = (char)targetValue;
1906                 } else /* len == 2 */ {
1907                     buffer[outLen++] = (char)(targetValue >> 8);
1908                     buffer[outLen++] = (char)targetValue;
1909                 }
1910             } else {
1911                 /*
1912                  * if we cannot find the character after checking all codepages
1913                  * then this is an error
1914                  */
1915                 *err = U_INVALID_CHAR_FOUND;
1916                 cnv->fromUChar32=sourceChar;
1917                 break;
1918             }
1919
1920             if(sourceChar == CR || sourceChar == LF) {
1921                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1922                 pFromU2022State->cs[2] = 0;
1923                 choiceCount = 0;
1924             }
1925
1926             /* output outLen>0 bytes in buffer[] */
1927             if(outLen == 1) {
1928                 *target++ = buffer[0];
1929                 if(offsets) {
1930                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1931                 }
1932             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1933                 *target++ = buffer[0];
1934                 *target++ = buffer[1];
1935                 if(offsets) {
1936                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1937                     *offsets++ = sourceIndex;
1938                     *offsets++ = sourceIndex;
1939                 }
1940             } else {
1941                 fromUWriteUInt8(
1942                     cnv,
1943                     buffer, outLen,
1944                     &target, (const char *)targetLimit,
1945                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1946                     err);
1947                 if(U_FAILURE(*err)) {
1948                     break;
1949                 }
1950             }
1951         } /* end if(myTargetIndex<myTargetLength) */
1952         else{
1953             *err =U_BUFFER_OVERFLOW_ERROR;
1954             break;
1955         }
1956
1957     }/* end while(mySourceIndex<mySourceLength) */
1958
1959     /*
1960      * the end of the input stream and detection of truncated input
1961      * are handled by the framework, but for ISO-2022-JP conversion
1962      * we need to be in ASCII mode at the very end
1963      *
1964      * conditions:
1965      *   successful
1966      *   in SO mode or not in ASCII mode
1967      *   end of input and no truncated input
1968      */
1969     if( U_SUCCESS(*err) &&
1970         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1971         args->flush && source>=sourceLimit && cnv->fromUChar32==0
1972     ) {
1973         int32_t sourceIndex;
1974
1975         outLen = 0;
1976
1977         if(pFromU2022State->g != 0) {
1978             buffer[outLen++] = UCNV_SI;
1979             pFromU2022State->g = 0;
1980         }
1981
1982         if(pFromU2022State->cs[0] != ASCII) {
1983             int32_t escLen = escSeqCharsLen[ASCII];
1984             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1985             outLen += escLen;
1986             pFromU2022State->cs[0] = (int8_t)ASCII;
1987         }
1988
1989         /* get the source index of the last input character */
1990         /*
1991          * TODO this would be simpler and more reliable if we used a pair
1992          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1993          * so that we could simply use the prevSourceIndex here;
1994          * this code gives an incorrect result for the rare case of an unmatched
1995          * trail surrogate that is alone in the last buffer of the text stream
1996          */
1997         sourceIndex=(int32_t)(source-args->source);
1998         if(sourceIndex>0) {
1999             --sourceIndex;
2000             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2001                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2002             ) {
2003                 --sourceIndex;
2004             }
2005         } else {
2006             sourceIndex=-1;
2007         }
2008
2009         fromUWriteUInt8(
2010             cnv,
2011             buffer, outLen,
2012             &target, (const char *)targetLimit,
2013             &offsets, sourceIndex,
2014             err);
2015     }
2016
2017     /*save the state and return */
2018     args->source = source;
2019     args->target = (char*)target;
2020 }
2021
2022 /*************** to unicode *******************/
2023
2024 static void
2025 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2026                                                UErrorCode* err){
2027     char tempBuf[2];
2028     const char *mySource = (char *) args->source;
2029     UChar *myTarget = args->target;
2030     const char *mySourceLimit = args->sourceLimit;
2031     uint32_t targetUniChar = 0x0000;
2032     uint32_t mySourceChar = 0x0000;
2033     uint32_t tmpSourceChar = 0x0000;
2034     UConverterDataISO2022* myData;
2035     ISO2022State *pToU2022State;
2036     StateEnum cs;
2037
2038     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2039     pToU2022State = &myData->toU2022State;
2040
2041     if(myData->key != 0) {
2042         /* continue with a partial escape sequence */
2043         goto escape;
2044     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2045         /* continue with a partial double-byte character */
2046         mySourceChar = args->converter->toUBytes[0];
2047         args->converter->toULength = 0;
2048         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2049         targetUniChar = missingCharMarker;
2050         goto getTrailByte;
2051     }
2052
2053     while(mySource < mySourceLimit){
2054
2055         targetUniChar =missingCharMarker;
2056
2057         if(myTarget < args->targetLimit){
2058
2059             mySourceChar= (unsigned char) *mySource++;
2060
2061             switch(mySourceChar) {
2062             case UCNV_SI:
2063                 if(myData->version==3) {
2064                     pToU2022State->g=0;
2065                     continue;
2066                 } else {
2067                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2068                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2069                     break;
2070                 }
2071
2072             case UCNV_SO:
2073                 if(myData->version==3) {
2074                     /* JIS7: switch to G1 half-width Katakana */
2075                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2076                     pToU2022State->g=1;
2077                     continue;
2078                 } else {
2079                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2080                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2081                     break;
2082                 }
2083
2084             case ESC_2022:
2085                 mySource--;
2086 escape:
2087                 {
2088                     const char * mySourceBefore = mySource;
2089                     int8_t toULengthBefore = args->converter->toULength;
2090
2091                     changeState_2022(args->converter,&(mySource),
2092                         mySourceLimit, ISO_2022_JP,err);
2093
2094                     /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2095                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2096                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2097                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2098                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2099                     }
2100                 }
2101
2102                 /* invalid or illegal escape sequence */
2103                 if(U_FAILURE(*err)){
2104                     args->target = myTarget;
2105                     args->source = mySource;
2106                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2107                     return;
2108                 }
2109                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2110                 if(myData->key==0) {
2111                     myData->isEmptySegment = TRUE;
2112                 }
2113                 continue;
2114
2115             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2116
2117             case CR:
2118                 /*falls through*/
2119             case LF:
2120                 /* automatically reset to single-byte mode */
2121                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2122                     pToU2022State->cs[0] = (int8_t)ASCII;
2123                 }
2124                 pToU2022State->cs[2] = 0;
2125                 pToU2022State->g = 0;
2126                 /* falls through */
2127             default:
2128                 /* convert one or two bytes */
2129                 myData->isEmptySegment = FALSE;
2130                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2131                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2132                     !IS_JP_DBCS(cs)
2133                 ) {
2134                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2135                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2136
2137                     /* return from a single-shift state to the previous one */
2138                     if(pToU2022State->g >= 2) {
2139                         pToU2022State->g=pToU2022State->prevG;
2140                     }
2141                 } else switch(cs) {
2142                 case ASCII:
2143                     if(mySourceChar <= 0x7f) {
2144                         targetUniChar = mySourceChar;
2145                     }
2146                     break;
2147                 case ISO8859_1:
2148                     if(mySourceChar <= 0x7f) {
2149                         targetUniChar = mySourceChar + 0x80;
2150                     }
2151                     /* return from a single-shift state to the previous one */
2152                     pToU2022State->g=pToU2022State->prevG;
2153                     break;
2154                 case ISO8859_7:
2155                     if(mySourceChar <= 0x7f) {
2156                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2157                         targetUniChar =
2158                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2159                                 myData->myConverterArray[cs],
2160                                 mySourceChar + 0x80);
2161                     }
2162                     /* return from a single-shift state to the previous one */
2163                     pToU2022State->g=pToU2022State->prevG;
2164                     break;
2165                 case JISX201:
2166                     if(mySourceChar <= 0x7f) {
2167                         targetUniChar = jisx201ToU(mySourceChar);
2168                     }
2169                     break;
2170                 case HWKANA_7BIT:
2171                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2172                         /* 7-bit halfwidth Katakana */
2173                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2174                     }
2175                     break;
2176                 default:
2177                     /* G0 DBCS */
2178                     if(mySource < mySourceLimit) {
2179                         int leadIsOk, trailIsOk;
2180                         uint8_t trailByte;
2181 getTrailByte:
2182                         trailByte = (uint8_t)*mySource;
2183                         /*
2184                          * Ticket 5691: consistent illegal sequences:
2185                          * - We include at least the first byte in the illegal sequence.
2186                          * - If any of the non-initial bytes could be the start of a character,
2187                          *   we stop the illegal sequence before the first one of those.
2188                          *
2189                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2190                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2191                          * Otherwise we convert or report the pair of bytes.
2192                          */
2193                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2194                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2195                         if (leadIsOk && trailIsOk) {
2196                             ++mySource;
2197                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2198                             if(cs == JISX208) {
2199                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2200                                 mySourceChar = tmpSourceChar;
2201                             } else {
2202                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2203                                 mySourceChar = tmpSourceChar;
2204                                 if (cs == KSC5601) {
2205                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2206                                 }
2207                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2208                                 tempBuf[1] = (char)(tmpSourceChar);
2209                             }
2210                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2211                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2212                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2213                             ++mySource;
2214                             /* add another bit so that the code below writes 2 bytes in case of error */
2215                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2216                         }
2217                     } else {
2218                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2219                         args->converter->toULength = 1;
2220                         goto endloop;
2221                     }
2222                 }  /* End of inner switch */
2223                 break;
2224             }  /* End of outer switch */
2225             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2226                 if(args->offsets){
2227                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2228                 }
2229                 *(myTarget++)=(UChar)targetUniChar;
2230             }
2231             else if(targetUniChar > missingCharMarker){
2232                 /* disassemble the surrogate pair and write to output*/
2233                 targetUniChar-=0x0010000;
2234                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2235                 if(args->offsets){
2236                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2237                 }
2238                 ++myTarget;
2239                 if(myTarget< args->targetLimit){
2240                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2241                     if(args->offsets){
2242                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2243                     }
2244                     ++myTarget;
2245                 }else{
2246                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2247                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2248                 }
2249
2250             }
2251             else{
2252                 /* Call the callback function*/
2253                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2254                 break;
2255             }
2256         }
2257         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2258             *err =U_BUFFER_OVERFLOW_ERROR;
2259             break;
2260         }
2261     }
2262 endloop:
2263     args->target = myTarget;
2264     args->source = mySource;
2265 }
2266
2267
2268 /***************************************************************
2269 *   Rules for ISO-2022-KR encoding
2270 *   i) The KSC5601 designator sequence should appear only once in a file,
2271 *      at the beginning of a line before any KSC5601 characters. This usually
2272 *      means that it appears by itself on the first line of the file
2273 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
2274 *      and SI to shift into single byte mode
2275 */
2276 static void
2277 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
         /*
          * fromUnicode path used when the ISO-2022-KR converter data has version==1
          * (see the caller below): the whole conversion is delegated to the
          * subconverter stored in extraInfo->currentConverter.
          *
          * args - conversion arguments; args->converter is temporarily replaced by
          *        the subconverter and restored before returning.
          * err  - in/out error code, set by ucnv_MBCSFromUnicodeWithOffsets().
          *
          * State that the caller only looks for on the public converter
          * (fromUChar32 and, on overflow, charErrorBuffer) is copied across
          * around the delegated call.
          */
2278
2279     UConverter* saveConv = args->converter;
2280     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
         /* let the MBCS implementation see the subconverter instead of the public one */
2281     args->converter=myConverterData->currentConverter;
2282
         /* hand the pending code point to the subconverter, convert, then take it back */
2283     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2284     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2285     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2286
2287     if(*err == U_BUFFER_OVERFLOW_ERROR) {
             /*
              * Move the bytes that did not fit into the target from the
              * subconverter's overflow buffer to the public converter's, and
              * empty the subconverter's buffer so they are not kept twice.
              */
2288         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2289             uprv_memcpy(
2290                 saveConv->charErrorBuffer,
2291                 myConverterData->currentConverter->charErrorBuffer,
2292                 myConverterData->currentConverter->charErrorBufferLength);
2293         }
2294         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2295         myConverterData->currentConverter->charErrorBufferLength = 0;
2296     }
         /* restore the public converter for the caller */
2297     args->converter=saveConv;
2298 }
2299
2300 static void
2301 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
         /*
          * Converts UTF-16 text in [args->source, args->sourceLimit) to ISO-2022-KR
          * bytes at args->target, optionally recording one source offset per
          * output byte in args->offsets.
          *
          * - version==1: the work is delegated to the _IBM variant.
          * - otherwise each code unit is looked up in the current subconverter's
          *   shared data; single bytes (<=0x7f) are written as-is, double bytes
          *   are written with 0x80 subtracted from each byte, and an SO/SI byte is
          *   written whenever the output switches between the two modes.
          *
          * Persistent state on args->converter:
          *   fromUnicodeStatus - nonzero while the output is in double-byte mode
          *   fromUChar32       - code point to report to the callback, or a lead
          *                       surrogate waiting for its trail in the next buffer
          *
          * Errors set in *err: U_ILLEGAL_CHAR_FOUND (SO/SI/ESC in the input or an
          * unpaired surrogate), U_INVALID_CHAR_FOUND (no acceptable mapping),
          * U_BUFFER_OVERFLOW_ERROR (target full; excess bytes go to
          * charErrorBuffer).
          */
2302
2303     const UChar *source = args->source;
2304     const UChar *sourceLimit = args->sourceLimit;
2305     unsigned char *target = (unsigned char *) args->target;
2306     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2307     int32_t* offsets = args->offsets;
2308     uint32_t targetByteUnit = 0x0000;
2309     UChar32 sourceChar = 0x0000;
2310     UBool isTargetByteDBCS;
2311     UBool oldIsTargetByteDBCS;
2312     UConverterDataISO2022 *converterData;
2313     UConverterSharedData* sharedData;
2314     UBool useFallback;
2315     int32_t length =0;
2316
2317     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2318     /* If the version is 1 then the user is requesting
2319      * conversion with ibm-25546: pass the arguments to the
2320      * MBCS converter and return.
2321      */
2322     if(converterData->version==1){
2323         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2324         return;
2325     }
2326
2327     /* initialize data */
2328     sharedData = converterData->currentConverter->sharedData;
2329     useFallback = args->converter->useFallback;
2330     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2331     oldIsTargetByteDBCS = isTargetByteDBCS;
2332
         /* NOTE(review): repeats the assignment made two statements above; redundant */
2333     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
         /* resume with the code unit left pending by a previous call, if there is room to write */
2334     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2335         goto getTrail;
2336     }
2337     while(source < sourceLimit){
2338
2339         targetByteUnit = missingCharMarker;
2340
2341         if(target < (unsigned char*) args->targetLimit){
2342             sourceChar = *source++;
2343
2344             /* do not convert SO/SI/ESC */
2345             if(IS_2022_CONTROL(sourceChar)) {
2346                 /* callback(illegal) */
2347                 *err=U_ILLEGAL_CHAR_FOUND;
2348                 args->converter->fromUChar32=sourceChar;
2349                 break;
2350             }
2351
2352             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2353             if(length < 0) {
2354                 length = -length;  /* fallback */
2355             }
2356             /* only DBCS or SBCS characters are expected */
2357             /* DB characters with high bit set to 1 are expected */
                 /*
                  * Reject anything else: no result, more than 2 bytes, a single
                  * byte above 0x7f, or a 2-byte value outside 0xa1a1..0xfefe or
                  * whose low byte is outside 0xa1..0xfe.
                  */
2358             if( length > 2 || length==0 ||
2359                 (length == 1 && targetByteUnit > 0x7f) ||
2360                 (length == 2 &&
2361                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2362                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2363             ) {
2364                 targetByteUnit=missingCharMarker;
2365             }
2366             if (targetByteUnit != missingCharMarker){
2367
2368                 oldIsTargetByteDBCS = isTargetByteDBCS;
2369                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2370                 /* append the shift sequence when the single/double-byte mode changes */
                     /* room for this one byte was established by the target check at the top of the loop */
2371                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2372
2373                     if (isTargetByteDBCS)
2374                         *target++ = UCNV_SO;
2375                     else
2376                         *target++ = UCNV_SI;
2377                     if(offsets)
2378                         *(offsets++) = (int32_t)(source - args->source-1);
2379                 }
2380                 /* write the target byte(s) to the target */
2381                 if(targetByteUnit <= 0x00FF){
                         /* single byte: write it, or park it in the overflow buffer */
2382                     if( target < targetLimit){
2383                         *(target++) = (unsigned char) targetByteUnit;
2384                         if(offsets){
2385                             *(offsets++) = (int32_t)(source - args->source-1);
2386                         }
2387
2388                     }else{
2389                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2390                         *err = U_BUFFER_OVERFLOW_ERROR;
2391                     }
2392                 }else{
                         /*
                          * double byte: each byte is written with 0x80 subtracted;
                          * whatever does not fit goes to the overflow buffer
                          */
2393                     if(target < targetLimit){
2394                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2395                         if(offsets){
2396                             *(offsets++) = (int32_t)(source - args->source-1);
2397                         }
2398                         if(target < targetLimit){
2399                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2400                             if(offsets){
2401                                 *(offsets++) = (int32_t)(source - args->source-1);
2402                             }
2403                         }else{
2404                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2405                             *err = U_BUFFER_OVERFLOW_ERROR;
2406                         }
2407                     }else{
2408                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2409                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2410                         *err = U_BUFFER_OVERFLOW_ERROR;
2411                     }
2412                 }
2413
2414             }
2415             else{
2416                 /* oops.. the code point is unassigned:
2417                  * set the error and reason
2418                  */
2419
2420                 /* check if the char is a first (lead) surrogate */
2421                 if(UTF_IS_SURROGATE(sourceChar)) {
2422                     if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2423 getTrail:
2424                         /*look ahead to find the trail surrogate*/
2425                         if(source <  sourceLimit) {
2426                             /* test the following code unit */
2427                             UChar trail=(UChar) *source;
2428                             if(UTF_IS_SECOND_SURROGATE(trail)) {
2429                                 source++;
2430                                 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2431                                 *err = U_INVALID_CHAR_FOUND;
2432                                 /* the combined supplementary code point is reported as unassigned */
2433                                 /* exit this condition tree */
2434                             } else {
2435                                 /* this is an unmatched lead code unit (1st surrogate) */
2436                                 /* callback(illegal) */
2437                                 *err=U_ILLEGAL_CHAR_FOUND;
2438                             }
2439                         } else {
2440                             /* no more input: the lead surrogate is kept in fromUChar32 below */
2441                             *err = U_ZERO_ERROR;
2442                         }
2443                     } else {
2444                         /* this is an unmatched trail code unit (2nd surrogate) */
2445                         /* callback(illegal) */
2446                         *err=U_ILLEGAL_CHAR_FOUND;
2447                     }
2448                 } else {
2449                     /* callback(unassigned) for a BMP code point */
2450                     *err = U_INVALID_CHAR_FOUND;
2451                 }
2452
                     /* remember the offending (or pending) code point and stop converting */
2453                 args->converter->fromUChar32=sourceChar;
2454                 break;
2455             }
2456         } /* end if(target < targetLimit) */
2457         else{
2458             *err =U_BUFFER_OVERFLOW_ERROR;
2459             break;
2460         }
2461
2462     }/* end while(source < sourceLimit) */
2463
2464     /*
2465      * the end of the input stream and detection of truncated input
2466      * are handled by the framework, but for ISO-2022-KR conversion
2467      * we need to be in ASCII mode at the very end
2468      *
2469      * conditions:
2470      *   successful
2471      *   not in ASCII mode
2472      *   end of input and no truncated input
2473      */
2474     if( U_SUCCESS(*err) &&
2475         isTargetByteDBCS &&
2476         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2477     ) {
2478         int32_t sourceIndex;
2479
2480         /* we are switching to ASCII */
2481         isTargetByteDBCS=FALSE;
2482
2483         /* get the source index of the last input character */
2484         /*
2485          * TODO this would be simpler and more reliable if we used a pair
2486          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2487          * so that we could simply use the prevSourceIndex here;
2488          * this code gives an incorrect result for the rare case of an unmatched
2489          * trail surrogate that is alone in the last buffer of the text stream
2490          */
2491         sourceIndex=(int32_t)(source-args->source);
2492         if(sourceIndex>0) {
2493             --sourceIndex;
                 /* step back once more if the last unit is a trail surrogate (see TODO above) */
2494             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2495                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2496             ) {
2497                 --sourceIndex;
2498             }
2499         } else {
2500             sourceIndex=-1;
2501         }
2502
             /* write the final one-byte shift-in string, attributed to sourceIndex */
2503         fromUWriteUInt8(
2504             args->converter,
2505             SHIFT_IN_STR, 1,
2506             &target, (const char *)targetLimit,
2507             &offsets, sourceIndex,
2508             err);
2509     }
2510
2511     /*save the state and return */
2512     args->source = source;
2513     args->target = (char*)target;
2514     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2515 }
2516
2517 /************************ To Unicode ***************************************/
2518
2519 static void
2520 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2521                                                             UErrorCode* err){
         /*
          * toUnicode path used when the ISO-2022-KR converter data has version==1
          * (see the caller below).
          *
          * The input is processed in segments: each run of bytes up to the limit
          * returned by getEndOfBuffer_2022() is converted by the subconverter
          * (myData->currentConverter) via ucnv_MBCSToUnicodeWithOffsets(), and
          * what follows is passed to changeState_2022().
          *
          * args - public conversion arguments; source/target/offsets are advanced.
          * err  - in/out error code; the function returns as soon as it is a failure.
          *
          * Partial input (toUBytes/toULength) and, on overflow, UCharErrorBuffer
          * are copied between the public converter and the subconverter around
          * each delegated call.
          */
2522     char const* sourceStart;
2523     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2524
2525     UConverterToUnicodeArgs subArgs;
2526     int32_t minArgsSize;
2527
2528     /* set up the subconverter arguments */
         /* copy no more than the smaller of the caller's struct size and ours */
2529     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2530         minArgsSize = args->size;
2531     } else {
2532         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2533     }
2534
2535     uprv_memcpy(&subArgs, args, minArgsSize);
2536     subArgs.size = (uint16_t)minArgsSize;
2537     subArgs.converter = myData->currentConverter;
2538
2539     /* remember the original start of the input for offsets */
2540     sourceStart = args->source;
2541
2542     if(myData->key != 0) {
2543         /* continue with a partial escape sequence */
2544         goto escape;
2545     }
2546
2547     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2548         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2549         subArgs.source = args->source;
2550         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2551         if(subArgs.source != subArgs.sourceLimit) {
2552             /*
2553              * get the current partial byte sequence
2554              *
2555              * it needs to be moved between the public and the subconverter
2556              * so that the conversion framework, which only sees the public
2557              * converter, can handle truncated and illegal input etc.
2558              */
2559             if(args->converter->toULength > 0) {
2560                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2561             }
2562             subArgs.converter->toULength = args->converter->toULength;
2563
2564             /*
2565              * Convert up to the end of the input, or to before the next escape character.
2566              * Does not handle conversion extensions because the preToU[] state etc.
2567              * is not copied.
2568              */
2569             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2570
2571             if(args->offsets != NULL && sourceStart != args->source) {
2572                 /* update offsets to base them on the actual start of the input */
                     /* only the entries for UChars written by this call are touched; negative offsets are left as they are */
2573                 int32_t *offsets = args->offsets;
2574                 UChar *target = args->target;
2575                 int32_t delta = (int32_t)(args->source - sourceStart);
2576                 while(target < subArgs.target) {
2577                     if(*offsets >= 0) {
2578                         *offsets += delta;
2579                     }
2580                     ++offsets;
2581                     ++target;
2582                 }
2583             }
                 /* adopt the positions reached by the subconverter */
2584             args->source = subArgs.source;
2585             args->target = subArgs.target;
2586             args->offsets = subArgs.offsets;
2587
2588             /* copy input/error/overflow buffers */
2589             if(subArgs.converter->toULength > 0) {
2590                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2591             }
2592             args->converter->toULength = subArgs.converter->toULength;
2593
2594             if(*err == U_BUFFER_OVERFLOW_ERROR) {
                     /* move overflowed UChars to the public converter and empty the subconverter's buffer */
2595                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2596                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2597                                 subArgs.converter->UCharErrorBufferLength);
2598                 }
2599                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2600                 subArgs.converter->UCharErrorBufferLength = 0;
2601             }
2602         }
2603
             /* stop on any failure or when all input has been consumed */
2604         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2605             return;
2606         }
2607
2608 escape:
             /* handle the escape sequence at args->source; also entered directly when resuming a partial one */
2609         changeState_2022(args->converter,
2610                &(args->source),
2611                args->sourceLimit,
2612                ISO_2022_KR,
2613                err);
2614     }
2615 }
2616
2617 static void
2618 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2619                                                             UErrorCode* err){
         /*
          * Converts ISO-2022-KR bytes in [args->source, args->sourceLimit) to UTF-16
          * at args->target, optionally recording the source offset of each UChar
          * in args->offsets.
          *
          * - version==1: the work is delegated to the _IBM variant.
          * - SI selects single-byte mode (g=0), SO selects double-byte mode (g=1),
          *   ESC is handed to changeState_2022().
          * - An SI that follows SO with nothing in between (empty segment) is
          *   reported as U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR.
          * - In double-byte mode two bytes in 0x21..0x7e are looked up with 0x80
          *   added to each; in single-byte mode only bytes <=0x7f are looked up.
          *
          * State kept across calls: myData->key (partial escape sequence),
          * toUBytes[0]/toULength==1 (lead byte waiting for its trail byte),
          * toU2022State.g and isEmptySegment.
          *
          * Errors: unmappable or illegal input goes through toUnicodeCallback();
          * a full target sets U_BUFFER_OVERFLOW_ERROR.
          */
2620     char tempBuf[2];
2621     const char *mySource = ( char *) args->source;
2622     UChar *myTarget = args->target;
2623     const char *mySourceLimit = args->sourceLimit;
2624     UChar32 targetUniChar = 0x0000;
         /*
          * Must be wider than 16 bits: the illegal-pair path below sets bit 16
          * (0x10000) so that the error path reports two bytes even when the
          * lead byte is 0x00; a 16-bit variable would silently drop that bit.
          */
2625     uint32_t mySourceChar = 0x0000;
2626     UConverterDataISO2022* myData;
2627     UConverterSharedData* sharedData ;
2628     UBool useFallback;
2629
2630     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2631     if(myData->version==1){
2632         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2633         return;
2634     }
2635
2636     /* initialize state */
2637     sharedData = myData->currentConverter->sharedData;
2638     useFallback = args->converter->useFallback;
2639
2640     if(myData->key != 0) {
2641         /* continue with a partial escape sequence */
2642         goto escape;
2643     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2644         /* continue with a partial double-byte character */
2645         mySourceChar = args->converter->toUBytes[0];
2646         args->converter->toULength = 0;
2647         goto getTrailByte;
2648     }
2649
2650     while(mySource< mySourceLimit){
2651
2652         if(myTarget < args->targetLimit){
2653
2654             mySourceChar= (unsigned char) *mySource++;
2655
2656             if(mySourceChar==UCNV_SI){
2657                 myData->toU2022State.g = 0;
2658                 if (myData->isEmptySegment) {
2659                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2660                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2661                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2662                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2663                     args->converter->toULength = 1;
2664                     args->target = myTarget;
2665                     args->source = mySource;
2666                     return;
2667                 }
2668                 /*consume the source */
2669                 continue;
2670             }else if(mySourceChar==UCNV_SO){
2671                 myData->toU2022State.g = 1;
2672                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2673                 /*consume the source */
2674                 continue;
2675             }else if(mySourceChar==ESC_2022){
                     /* un-read the ESC so that changeState_2022() sees the whole sequence */
2676                 mySource--;
2677 escape:
2678                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2679                 changeState_2022(args->converter,&(mySource),
2680                                 mySourceLimit, ISO_2022_KR, err);
2681                 if(U_FAILURE(*err)){
2682                     args->target = myTarget;
2683                     args->source = mySource;
2684                     return;
2685                 }
2686                 continue;
2687             }
2688
2689             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2690             if(myData->toU2022State.g == 1) {
2691                 if(mySource < mySourceLimit) {
2692                     int leadIsOk, trailIsOk;
2693                     uint8_t trailByte;
2694 getTrailByte:
2695                     targetUniChar = missingCharMarker;
2696                     trailByte = (uint8_t)*mySource;
2697                     /*
2698                      * Ticket 5691: consistent illegal sequences:
2699                      * - We include at least the first byte in the illegal sequence.
2700                      * - If any of the non-initial bytes could be the start of a character,
2701                      *   we stop the illegal sequence before the first one of those.
2702                      *
2703                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2704                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2705                      * Otherwise we convert or report the pair of bytes.
2706                      */
2707                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2708                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2709                     if (leadIsOk && trailIsOk) {
2710                         ++mySource;
                             /* the lookup uses the pair with 0x80 added to each byte */
2711                         tempBuf[0] = (char)(mySourceChar + 0x80);
2712                         tempBuf[1] = (char)(trailByte + 0x80);
2713                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2714                         mySourceChar = (mySourceChar << 8) | trailByte;
2715                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2716                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2717                         ++mySource;
2718                         /* add another bit so that the code below writes 2 bytes in case of error */
2719                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2720                     }
2721                 } else {
                         /* input ends after the lead byte: save it and wait for more input */
2722                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2723                     args->converter->toULength = 1;
2724                     break;
2725                 }
2726             }
2727             else if(mySourceChar <= 0x7f) {
2728                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2729             } else {
                     /* bytes above 0x7f are not looked up in single-byte mode */
2730                 targetUniChar = 0xffff;
2731             }
2732             if(targetUniChar < 0xfffe){
2733                 if(args->offsets) {
                         /* offset of the first byte of the character just converted */
2734                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2735                 }
2736                 *(myTarget++)=(UChar)targetUniChar;
2737             }
2738             else {
2739                 /* Call the callback function*/
2740                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2741                 break;
2742             }
2743         }
2744         else{
2745             *err =U_BUFFER_OVERFLOW_ERROR;
2746             break;
2747         }
2748     }
2749     args->target = myTarget;
2750     args->source = mySource;
2751 }
2752
2753 /*************************** END ISO2022-KR *********************************/
2754
2755 /*************************** ISO-2022-CN *********************************
2756 *
2757 * Rules for ISO-2022-CN Encoding:
2758 * i)   The designator sequence must appear once on a line before any instance
2759 *      of character set it designates.
2760 * ii)  If two lines contain characters from the same character set, both lines
2761 *      must include the designator sequence.
2762 * iii) Once the designator sequence is known, a shifting sequence has to be found
2763 *      to invoke the  shifting
2764 * iv)  All lines start in ASCII and end in ASCII.
2765 * v)   Four shifting sequences are employed for this purpose:
2766 *
2767 *      Sequence    ASCII Eq    Charsets
2768 *      ----------  -------    ---------
2769 *      SI           <SI>        US-ASCII
2770 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2771 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2772 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2773 *
2774 * vi)
2775 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2776 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2777 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2778 *
2779 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2780 *       characters as defined in GB 2312-80, until
2781 *       another SOdesignation appears
2782 *
2783 *
2784 *      ESC $ ) E       Indicates the bytes following SO are as defined
2785 *       in ISO-IR-165 (for details, see section 2.1),
2786 *       until another SOdesignation appears
2787 *
2788 *      ESC $ ) G       Indicates the bytes following SO are as defined
2789 *       in CNS 11643-plane-1, until another
2790 *       SOdesignation appears
2791 *
2792 *      ESC $ * H       Indicates the two bytes immediately following
2793 *       SS2 is a Chinese character as defined in CNS
2794 *       11643-plane-2, until another SS2designation
2795 *       appears
2796 *       (Meaning <ESC>N must precede every 2-byte
2797 *        sequence.)
2798 *
2799 *      ESC $ + I       Indicates the immediate two bytes following SS3
2800 *       is a Chinese character as defined in CNS
2801 *       11643-plane-3, until another SS3designation
2802 *       appears
*       (Meaning <ESC>O must precede every 2 byte
*        sequence.)
2805 *
2806 *      ESC $ + J       Indicates the immediate two bytes following SS3
2807 *       is a Chinese character as defined in CNS
2808 *       11643-plane-4, until another SS3designation
2809 *       appears
*       (In English: <ESC>O must precede every 2 byte
*        sequence.)
2812 *
2813 *      ESC $ + K       Indicates the immediate two bytes following SS3
2814 *       is a Chinese character as defined in CNS
2815 *       11643-plane-5, until another SS3designation
2816 *       appears
2817 *
2818 *      ESC $ + L       Indicates the immediate two bytes following SS3
2819 *       is a Chinese character as defined in CNS
2820 *       11643-plane-6, until another SS3designation
2821 *       appears
2822 *
2823 *      ESC $ + M       Indicates the immediate two bytes following SS3
2824 *       is a Chinese character as defined in CNS
2825 *       11643-plane-7, until another SS3designation
2826 *       appears
2827 *
2828 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2829 *       has its own designation information before any Chinese characters
2830 *       appear
2831 *
2832 */
2833
/*
 * ISO-2022-CN designator escape sequences.
 * Each one is ESC '$' followed by an intermediate byte and a final byte:
 *   ')' (0x29) designates the SO set, '*' (0x2A) the SS2 set,
 *   '+' (0x2B) the SS3 set.
 * They are defined as const arrays so that the strings are truly read-only.
 */
static const char GB_2312_80_STR[]             = "\x1B\x24" "\x29" "\x41"; /* ESC $ ) A */
static const char ISO_IR_165_STR[]             = "\x1B\x24" "\x29" "\x45"; /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24" "\x29" "\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24" "\x2A" "\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24" "\x2B" "\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24" "\x2B" "\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24" "\x2B" "\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24" "\x2B" "\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24" "\x2B" "\x4D"; /* ESC $ + M */
2844
2845 /********************** ISO2022-CN Data **************************/
2846 static const char* const escSeqCharsCN[10] ={
2847         SHIFT_IN_STR,           /* ASCII */
2848         GB_2312_80_STR,
2849         ISO_IR_165_STR,
2850         CNS_11643_1992_Plane_1_STR,
2851         CNS_11643_1992_Plane_2_STR,
2852         CNS_11643_1992_Plane_3_STR,
2853         CNS_11643_1992_Plane_4_STR,
2854         CNS_11643_1992_Plane_5_STR,
2855         CNS_11643_1992_Plane_6_STR,
2856         CNS_11643_1992_Plane_7_STR
2857 };
2858
2859 static void
2860 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2861     UConverter *cnv = args->converter;
2862     UConverterDataISO2022 *converterData;
2863     ISO2022State *pFromU2022State;
2864     uint8_t *target = (uint8_t *) args->target;
2865     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2866     const UChar* source = args->source;
2867     const UChar* sourceLimit = args->sourceLimit;
2868     int32_t* offsets = args->offsets;
2869     UChar32 sourceChar;
2870     char buffer[8];
2871     int32_t len;
2872     int8_t choices[3];
2873     int32_t choiceCount;
2874     uint32_t targetValue = 0;
2875     UBool useFallback;
2876
2877     /* set up the state */
2878     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2879     pFromU2022State   = &converterData->fromU2022State;
2880
2881     choiceCount = 0;
2882
2883     /* check if the last codepoint of previous buffer was a lead surrogate*/
2884     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2885         goto getTrail;
2886     }
2887
2888     while( source < sourceLimit){
2889         if(target < targetLimit){
2890
2891             sourceChar  = *(source++);
2892             /*check if the char is a First surrogate*/
2893              if(UTF_IS_SURROGATE(sourceChar)) {
2894                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2895 getTrail:
2896                     /*look ahead to find the trail surrogate*/
2897                     if(source < sourceLimit) {
2898                         /* test the following code unit */
2899                         UChar trail=(UChar) *source;
2900                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2901                             source++;
2902                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2903                             cnv->fromUChar32=0x00;
2904                             /* convert this supplementary code point */
2905                             /* exit this condition tree */
2906                         } else {
2907                             /* this is an unmatched lead code unit (1st surrogate) */
2908                             /* callback(illegal) */
2909                             *err=U_ILLEGAL_CHAR_FOUND;
2910                             cnv->fromUChar32=sourceChar;
2911                             break;
2912                         }
2913                     } else {
2914                         /* no more input */
2915                         cnv->fromUChar32=sourceChar;
2916                         break;
2917                     }
2918                 } else {
2919                     /* this is an unmatched trail code unit (2nd surrogate) */
2920                     /* callback(illegal) */
2921                     *err=U_ILLEGAL_CHAR_FOUND;
2922                     cnv->fromUChar32=sourceChar;
2923                     break;
2924                 }
2925             }
2926
2927             /* do the conversion */
2928             if(sourceChar <= 0x007f ){
2929                 /* do not convert SO/SI/ESC */
2930                 if(IS_2022_CONTROL(sourceChar)) {
2931                     /* callback(illegal) */
2932                     *err=U_ILLEGAL_CHAR_FOUND;
2933                     cnv->fromUChar32=sourceChar;
2934                     break;
2935                 }
2936
2937                 /* US-ASCII */
2938                 if(pFromU2022State->g == 0) {
2939                     buffer[0] = (char)sourceChar;
2940                     len = 1;
2941                 } else {
2942                     buffer[0] = UCNV_SI;
2943                     buffer[1] = (char)sourceChar;
2944                     len = 2;
2945                     pFromU2022State->g = 0;
2946                     choiceCount = 0;
2947                 }
2948                 if(sourceChar == CR || sourceChar == LF) {
2949                     /* reset the state at the end of a line */
2950                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2951                     choiceCount = 0;
2952                 }
2953             }
2954             else{
2955                 /* convert U+0080..U+10ffff */
2956                 int32_t i;
2957                 int8_t cs, g;
2958
2959                 if(choiceCount == 0) {
2960                     /* try the current SO/G1 converter first */
2961                     choices[0] = pFromU2022State->cs[1];
2962
2963                     /* default to GB2312_1 if none is designated yet */
2964                     if(choices[0] == 0) {
2965                         choices[0] = GB2312_1;
2966                     }
2967
2968                     if(converterData->version == 0) {
2969                         /* ISO-2022-CN */
2970
2971                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2972                         if(choices[0] == GB2312_1) {
2973                             choices[1] = (int8_t)CNS_11643_1;
2974                         } else {
2975                             choices[1] = (int8_t)GB2312_1;
2976                         }
2977
2978                         choiceCount = 2;
2979                     } else if (converterData->version == 1) {
2980                         /* ISO-2022-CN-EXT */
2981
2982                         /* try one of the other converters */
2983                         switch(choices[0]) {
2984                         case GB2312_1:
2985                             choices[1] = (int8_t)CNS_11643_1;
2986                             choices[2] = (int8_t)ISO_IR_165;
2987                             break;
2988                         case ISO_IR_165:
2989                             choices[1] = (int8_t)GB2312_1;
2990                             choices[2] = (int8_t)CNS_11643_1;
2991                             break;
2992                         default: /* CNS_11643_x */
2993                             choices[1] = (int8_t)GB2312_1;
2994                             choices[2] = (int8_t)ISO_IR_165;
2995                             break;
2996                         }
2997
2998                         choiceCount = 3;
2999                     } else {
3000                         choices[0] = (int8_t)CNS_11643_1;
3001                         choices[1] = (int8_t)GB2312_1;
3002                     }
3003                 }
3004
3005                 cs = g = 0;
3006                 /*
3007                  * len==0: no mapping found yet
3008                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3009                  * len>0: found a roundtrip result, done
3010                  */
3011                 len = 0;
3012                 /*
3013                  * We will turn off useFallback after finding a fallback,
3014                  * but we still get fallbacks from PUA code points as usual.
3015                  * Therefore, we will also need to check that we don't overwrite
3016                  * an early fallback with a later one.
3017                  */
3018                 useFallback = cnv->useFallback;
3019
3020                 for(i = 0; i < choiceCount && len <= 0; ++i) {
3021                     int8_t cs0 = choices[i];
3022                     if(cs0 > 0) {
3023                         uint32_t value;
3024                         int32_t len2;
3025                         if(cs0 >= CNS_11643_0) {
3026                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3027                                         converterData->myConverterArray[CNS_11643],
3028                                         sourceChar,
3029                                         &value,
3030                                         useFallback,
3031                                         MBCS_OUTPUT_3);
3032                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3033                                 targetValue = value;
3034                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3035                                 if(len2 >= 0) {
3036                                     len = 2;
3037                                 } else {
3038                                     len = -2;
3039                                     useFallback = FALSE;
3040                                 }
3041                                 if(cs == CNS_11643_1) {
3042                                     g = 1;
3043                                 } else if(cs == CNS_11643_2) {
3044                                     g = 2;
3045                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3046                                     g = 3;
3047                                 } else {
3048                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3049                                     len = 0;
3050                                 }
3051                             }
3052                         } else {
3053                             /* GB2312_1 or ISO-IR-165 */
3054                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3055                                         converterData->myConverterArray[cs0],
3056                                         sourceChar,
3057                                         &value,
3058                                         useFallback,
3059                                         MBCS_OUTPUT_2);
3060                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3061                                 targetValue = value;
3062                                 len = len2;
3063                                 cs = cs0;
3064                                 g = 1;
3065                                 useFallback = FALSE;
3066                             }
3067                         }
3068                     }
3069                 }
3070
3071                 if(len != 0) {
3072                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3073
3074                     /* write the designation sequence if necessary */
3075                     if(cs != pFromU2022State->cs[g]) {
3076                         if(cs < CNS_11643) {
3077                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3078                         } else {
3079                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3080                         }
3081                         len = 4;
3082                         pFromU2022State->cs[g] = cs;
3083                         if(g == 1) {
3084                             /* changing the SO/G1 charset invalidates the choices[] */
3085                             choiceCount = 0;
3086                         }
3087                     }
3088
3089                     /* write the shift sequence if necessary */
3090                     if(g != pFromU2022State->g) {
3091                         switch(g) {
3092                         case 1:
3093                             buffer[len++] = UCNV_SO;
3094
3095                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3096                             pFromU2022State->g = 1;
3097                             break;
3098                         case 2:
3099                             buffer[len++] = 0x1b;
3100                             buffer[len++] = 0x4e;
3101                             break;
3102                         default: /* case 3 */
3103                             buffer[len++] = 0x1b;
3104                             buffer[len++] = 0x4f;
3105                             break;
3106                         }
3107                     }
3108
3109                     /* write the two output bytes */
3110                     buffer[len++] = (char)(targetValue >> 8);
3111                     buffer[len++] = (char)targetValue;
3112                 } else {
3113                     /* if we cannot find the character after checking all codepages
3114                      * then this is an error
3115                      */
3116                     *err = U_INVALID_CHAR_FOUND;
3117                     cnv->fromUChar32=sourceChar;
3118                     break;
3119                 }
3120             }
3121
3122             /* output len>0 bytes in buffer[] */
3123             if(len == 1) {
3124                 *target++ = buffer[0];
3125                 if(offsets) {
3126                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3127                 }
3128             } else if(len == 2 && (target + 2) <= targetLimit) {
3129                 *target++ = buffer[0];
3130                 *target++ = buffer[1];
3131                 if(offsets) {
3132                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3133                     *offsets++ = sourceIndex;
3134                     *offsets++ = sourceIndex;
3135                 }
3136             } else {
3137                 fromUWriteUInt8(
3138                     cnv,
3139                     buffer, len,
3140                     &target, (const char *)targetLimit,
3141                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3142                     err);
3143                 if(U_FAILURE(*err)) {
3144                     break;
3145                 }
3146             }
3147         } /* end if(myTargetIndex<myTargetLength) */
3148         else{
3149             *err =U_BUFFER_OVERFLOW_ERROR;
3150             break;
3151         }
3152
3153     }/* end while(mySourceIndex<mySourceLength) */
3154
3155     /*
3156      * the end of the input stream and detection of truncated input
3157      * are handled by the framework, but for ISO-2022-CN conversion
3158      * we need to be in ASCII mode at the very end
3159      *
3160      * conditions:
3161      *   successful
3162      *   not in ASCII mode
3163      *   end of input and no truncated input
3164      */
3165     if( U_SUCCESS(*err) &&
3166         pFromU2022State->g!=0 &&
3167         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3168     ) {
3169         int32_t sourceIndex;
3170
3171         /* we are switching to ASCII */
3172         pFromU2022State->g=0;
3173
3174         /* get the source index of the last input character */
3175         /*
3176          * TODO this would be simpler and more reliable if we used a pair
3177          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3178          * so that we could simply use the prevSourceIndex here;
3179          * this code gives an incorrect result for the rare case of an unmatched
3180          * trail surrogate that is alone in the last buffer of the text stream
3181          */
3182         sourceIndex=(int32_t)(source-args->source);
3183         if(sourceIndex>0) {
3184             --sourceIndex;
3185             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3186                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3187             ) {
3188                 --sourceIndex;
3189             }
3190         } else {
3191             sourceIndex=-1;
3192         }
3193
3194         fromUWriteUInt8(
3195             cnv,
3196             SHIFT_IN_STR, 1,
3197             &target, (const char *)targetLimit,
3198             &offsets, sourceIndex,
3199             err);
3200     }
3201
3202     /*save the state and return */
3203     args->source = source;
3204     args->target = (char*)target;
3205 }
3206
3207
3208 static void
3209 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3210                                                UErrorCode* err){
3211     char tempBuf[3];
3212     const char *mySource = (char *) args->source;
3213     UChar *myTarget = args->target;
3214     const char *mySourceLimit = args->sourceLimit;
3215     uint32_t targetUniChar = 0x0000;
3216     uint32_t mySourceChar = 0x0000;
3217     UConverterDataISO2022* myData;
3218     ISO2022State *pToU2022State;
3219
3220     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3221     pToU2022State = &myData->toU2022State;
3222
3223     if(myData->key != 0) {
3224         /* continue with a partial escape sequence */
3225         goto escape;
3226     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3227         /* continue with a partial double-byte character */
3228         mySourceChar = args->converter->toUBytes[0];
3229         args->converter->toULength = 0;
3230         targetUniChar = missingCharMarker;
3231         goto getTrailByte;
3232     }
3233
3234     while(mySource < mySourceLimit){
3235
3236         targetUniChar =missingCharMarker;
3237
3238         if(myTarget < args->targetLimit){
3239
3240             mySourceChar= (unsigned char) *mySource++;
3241
3242             switch(mySourceChar){
3243             case UCNV_SI:
3244                 pToU2022State->g=0;
3245                 if (myData->isEmptySegment) {
3246                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
3247                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3248                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3249                     args->converter->toUBytes[0] = mySourceChar;
3250                     args->converter->toULength = 1;
3251                     args->target = myTarget;
3252                     args->source = mySource;
3253                     return;
3254                 }
3255                 continue;
3256
3257             case UCNV_SO:
3258                 if(pToU2022State->cs[1] != 0) {
3259                     pToU2022State->g=1;
3260                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
3261                     continue;
3262                 } else {
3263                     /* illegal to have SO before a matching designator */
3264                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
3265                     break;
3266                 }
3267
3268             case ESC_2022:
3269                 mySource--;
3270 escape:
3271                 {
3272                     const char * mySourceBefore = mySource;
3273                     int8_t toULengthBefore = args->converter->toULength;
3274
3275                     changeState_2022(args->converter,&(mySource),
3276                         mySourceLimit, ISO_2022_CN,err);
3277
3278                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3279                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3280                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3281                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
3282                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3283                     }
3284                 }
3285
3286                 /* invalid or illegal escape sequence */
3287                 if(U_FAILURE(*err)){
3288                     args->target = myTarget;
3289                     args->source = mySource;
3290                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
3291                     return;
3292                 }
3293                 continue;
3294
3295             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3296
3297             case CR:
3298                 /*falls through*/
3299             case LF:
3300                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3301                 /* falls through */
3302             default:
3303                 /* convert one or two bytes */
3304                 myData->isEmptySegment = FALSE;
3305                 if(pToU2022State->g != 0) {
3306                     if(mySource < mySourceLimit) {
3307                         UConverterSharedData *cnv;
3308                         StateEnum tempState;
3309                         int32_t tempBufLen;
3310                         int leadIsOk, trailIsOk;
3311                         uint8_t trailByte;
3312 getTrailByte:
3313                         trailByte = (uint8_t)*mySource;
3314                         /*
3315                          * Ticket 5691: consistent illegal sequences:
3316                          * - We include at least the first byte in the illegal sequence.
3317                          * - If any of the non-initial bytes could be the start of a character,
3318                          *   we stop the illegal sequence before the first one of those.
3319                          *
3320                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3321                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3322                          * Otherwise we convert or report the pair of bytes.
3323                          */
3324                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3325                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3326                         if (leadIsOk && trailIsOk) {
3327                             ++mySource;
3328                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3329                             if(tempState >= CNS_11643_0) {
3330                                 cnv = myData->myConverterArray[CNS_11643];
3331                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3332                                 tempBuf[1] = (char) (mySourceChar);
3333                                 tempBuf[2] = (char) trailByte;
3334                                 tempBufLen = 3;
3335
3336                             }else{
3337                                 cnv = myData->myConverterArray[tempState];
3338                                 tempBuf[0] = (char) (mySourceChar);
3339                                 tempBuf[1] = (char) trailByte;
3340                                 tempBufLen = 2;
3341                             }
3342                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3343                             mySourceChar = (mySourceChar << 8) | trailByte;
3344                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3345                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3346                             ++mySource;
3347                             /* add another bit so that the code below writes 2 bytes in case of error */
3348                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3349                         }
3350                         if(pToU2022State->g>=2) {
3351                             /* return from a single-shift state to the previous one */
3352                             pToU2022State->g=pToU2022State->prevG;
3353                         }
3354                     } else {
3355                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3356                         args->converter->toULength = 1;
3357                         goto endloop;
3358                     }
3359                 }
3360                 else{
3361                     if(mySourceChar <= 0x7f) {
3362                         targetUniChar = (UChar) mySourceChar;
3363                     }
3364                 }
3365                 break;
3366             }
3367             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3368                 if(args->offsets){
3369                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3370                 }
3371                 *(myTarget++)=(UChar)targetUniChar;
3372             }
3373             else if(targetUniChar > missingCharMarker){
3374                 /* disassemble the surrogate pair and write to output*/
3375                 targetUniChar-=0x0010000;
3376                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3377                 if(args->offsets){
3378                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3379                 }
3380                 ++myTarget;
3381                 if(myTarget< args->targetLimit){
3382                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3383                     if(args->offsets){
3384                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3385                     }
3386                     ++myTarget;
3387                 }else{
3388                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3389                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3390                 }
3391
3392             }
3393             else{
3394                 /* Call the callback function*/
3395                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3396                 break;
3397             }
3398         }
3399         else{
3400             *err =U_BUFFER_OVERFLOW_ERROR;
3401             break;
3402         }
3403     }
3404 endloop:
3405     args->target = myTarget;
3406     args->source = mySource;
3407 }
3408
3409 static void
3410 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3411     UConverter *cnv = args->converter;
3412     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3413     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3414     char *p, *subchar;
3415     char buffer[8];
3416     int32_t length;
3417
3418     subchar=(char *)cnv->subChars;
3419     length=cnv->subCharLen; /* assume length==1 for most variants */
3420
3421     p = buffer;
3422     switch(myConverterData->locale[0]){
3423     case 'j':
3424         {
3425             int8_t cs;
3426
3427             if(pFromU2022State->g == 1) {
3428                 /* JIS7: switch from G1 to G0 */
3429                 pFromU2022State->g = 0;
3430                 *p++ = UCNV_SI;
3431             }
3432
3433             cs = pFromU2022State->cs[0];
3434             if(cs != ASCII && cs != JISX201) {
3435                 /* not in ASCII or JIS X 0201: switch to ASCII */
3436                 pFromU2022State->cs[0] = (int8_t)ASCII;
3437                 *p++ = '\x1b';
3438                 *p++ = '\x28';
3439                 *p++ = '\x42';
3440             }
3441
3442             *p++ = subchar[0];
3443             break;
3444         }
3445     case 'c':
3446         if(pFromU2022State->g != 0) {
3447             /* not in ASCII mode: switch to ASCII */
3448             pFromU2022State->g = 0;
3449             *p++ = UCNV_SI;
3450         }
3451         *p++ = subchar[0];
3452         break;
3453     case 'k':
3454         if(myConverterData->version == 0) {
3455             if(length == 1) {
3456                 if((UBool)args->converter->fromUnicodeStatus) {
3457                     /* in DBCS mode: switch to SBCS */
3458                     args->converter->fromUnicodeStatus = 0;
3459                     *p++ = UCNV_SI;
3460                 }
3461                 *p++ = subchar[0];
3462             } else /* length == 2*/ {
3463                 if(!(UBool)args->converter->fromUnicodeStatus) {
3464                     /* in SBCS mode: switch to DBCS */
3465                     args->converter->fromUnicodeStatus = 1;
3466                     *p++ = UCNV_SO;
3467                 }
3468                 *p++ = subchar[0];
3469                 *p++ = subchar[1];
3470             }
3471             break;
3472         } else {
3473             /* save the subconverter's substitution string */
3474             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3475             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3476
3477             /* set our substitution string into the subconverter */
3478             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3479             myConverterData->currentConverter->subCharLen = (int8_t)length;
3480
3481             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3482             args->converter = myConverterData->currentConverter;
3483             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3484             ucnv_cbFromUWriteSub(args, 0, err);
3485             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3486             args->converter = cnv;
3487
3488             /* restore the subconverter's substitution string */
3489             myConverterData->currentConverter->subChars = currentSubChars;
3490             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3491
3492             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3493                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3494                     uprv_memcpy(
3495                         cnv->charErrorBuffer,
3496                         myConverterData->currentConverter->charErrorBuffer,
3497                         myConverterData->currentConverter->charErrorBufferLength);
3498                 }
3499                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3500                 myConverterData->currentConverter->charErrorBufferLength = 0;
3501             }
3502             return;
3503         }
3504     default:
3505         /* not expected */
3506         break;
3507     }
3508     ucnv_cbFromUWriteBytes(args,
3509                            buffer, (int32_t)(p - buffer),
3510                            offsetIndex, err);
3511 }
3512
3513 /*
3514  * Structure for cloning an ISO 2022 converter into a single memory block.
3515  * ucnv_safeClone() of the converter will align the entire cloneStruct,
3516  * and then ucnv_safeClone() of the sub-converter may additionally align
3517  * currentConverter inside the cloneStruct, for which we need the deadSpace
3518  * after currentConverter.
3519  * This is because UAlignedMemory may be larger than the actually
3520  * necessary alignment size for the platform.
3521  * The other cloneStruct fields will not be moved around,
3522  * and are aligned properly with cloneStruct's alignment.
3523  */
struct cloneStruct
{
    /* Clone of the main ISO 2022 converter; must be first so that the
     * buffer handed to the safe-clone function can be cast to cloneStruct. */
    UConverter cnv;
    /* Storage handed to ucnv_safeClone() for the clone of the current sub-converter. */
    UConverter currentConverter;
    /* Padding after currentConverter; its size is included in the buffer
     * size offered to ucnv_safeClone() for the sub-converter clone. */
    UAlignedMemory deadSpace;
    /* Copy of the converter's private ISO 2022 data; cnv.extraInfo points here. */
    UConverterDataISO2022 mydata;
};
3531
3532
3533 static UConverter *
3534 _ISO_2022_SafeClone(
3535             const UConverter *cnv,
3536             void *stackBuffer,
3537             int32_t *pBufferSize,
3538             UErrorCode *status)
3539 {
3540     struct cloneStruct * localClone;
3541     UConverterDataISO2022 *cnvData;
3542     int32_t i, size;
3543
3544     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3545         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3546         return NULL;
3547     }
3548
3549     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3550     localClone = (struct cloneStruct *)stackBuffer;
3551
3552     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3553
3554     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3555     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3556     localClone->cnv.isExtraLocal = TRUE;
3557
3558     /* share the subconverters */
3559
3560     if(cnvData->currentConverter != NULL) {
3561         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3562         localClone->mydata.currentConverter =
3563             ucnv_safeClone(cnvData->currentConverter,
3564                             &localClone->currentConverter,
3565                             &size, status);
3566         if(U_FAILURE(*status)) {
3567             return NULL;
3568         }
3569     }
3570
3571     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3572         if(cnvData->myConverterArray[i] != NULL) {
3573             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3574         }
3575     }
3576
3577     return &localClone->cnv;
3578 }
3579
3580 static void
3581 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3582                     const USetAdder *sa,
3583                     UConverterUnicodeSet which,
3584                     UErrorCode *pErrorCode)
3585 {
3586     int32_t i;
3587     UConverterDataISO2022* cnvData;
3588
3589     if (U_FAILURE(*pErrorCode)) {
3590         return;
3591     }
3592 #ifdef U_ENABLE_GENERIC_ISO_2022
3593     if (cnv->sharedData == &_ISO2022Data) {
3594         /* We use UTF-8 in this case */
3595         sa->addRange(sa->set, 0, 0xd7FF);
3596         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3597         return;
3598     }
3599 #endif
3600
3601     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3602
3603     /* open a set and initialize it with code points that are algorithmically round-tripped */
3604     switch(cnvData->locale[0]){
3605     case 'j':
3606         /* include JIS X 0201 which is hardcoded */
3607         sa->add(sa->set, 0xa5);
3608         sa->add(sa->set, 0x203e);
3609         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3610             /* include Latin-1 for some variants of JP */
3611             sa->addRange(sa->set, 0, 0xff);
3612         } else {
3613             /* include ASCII for JP */
3614             sa->addRange(sa->set, 0, 0x7f);
3615         }
3616         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3617             /*
3618              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3619              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3620              * use half-width Katakana.
3621              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3622              * half-width Katakana via the ESC ( I sequence.
3623              * However, we only emit (fromUnicode) half-width Katakana according to the
3624              * definition of each variant.
3625              *
3626              * When including fallbacks,
3627              * we need to include half-width Katakana Unicode code points for all JP variants because
3628              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3629              */
3630             /* include half-width Katakana for JP */
3631             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3632         }
3633         break;
3634     case 'c':
3635     case 'z':
3636         /* include ASCII for CN */
3637         sa->addRange(sa->set, 0, 0x7f);
3638         break;
3639     case 'k':
3640         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3641         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3642                 cnvData->currentConverter, sa, which, pErrorCode);
3643         /* the loop over myConverterArray[] will simply not find another converter */
3644         break;
3645     default:
3646         break;
3647     }
3648
3649 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3650             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3651                 cnvData->version==0 && i==CNS_11643
3652             ) {
3653                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3654                 ucnv_MBCSGetUnicodeSetForBytes(
3655                         cnvData->myConverterArray[i],
3656                         sa, UCNV_ROUNDTRIP_SET,
3657                         0, 0x81, 0x82,
3658                         pErrorCode);
3659             }
3660 #endif
3661
3662     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3663         UConverterSetFilter filter;
3664         if(cnvData->myConverterArray[i]!=NULL) {
3665             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3666                 cnvData->version==0 && i==CNS_11643
3667             ) {
3668                 /*
3669                  * Version-specific for CN:
3670                  * CN version 0 does not map CNS planes 3..7 although
3671                  * they are all available in the CNS conversion table;
3672                  * CN version 1 (-EXT) does map them all.
3673                  * The two versions create different Unicode sets.
3674                  */
3675                 filter=UCNV_SET_FILTER_2022_CN;
3676             } else if(cnvData->locale[0]=='j' && i==JISX208) {
3677                 /*
3678                  * Only add code points that map to Shift-JIS codes
3679                  * corresponding to JIS X 0208.
3680                  */
3681                 filter=UCNV_SET_FILTER_SJIS;
3682             } else if(i==KSC5601) {
3683                 /*
3684                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3685                  * are broader than GR94.
3686                  */
3687                 filter=UCNV_SET_FILTER_GR94DBCS;
3688             } else {
3689                 filter=UCNV_SET_FILTER_NONE;
3690             }
3691             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3692         }
3693     }
3694
3695     /*
3696      * ISO 2022 converters must not convert SO/SI/ESC despite what
3697      * sub-converters do by themselves.
3698      * Remove these characters from the set.
3699      */
3700     sa->remove(sa->set, 0x0e);
3701     sa->remove(sa->set, 0x0f);
3702     sa->remove(sa->set, 0x1b);
3703
3704     /* ISO 2022 converters do not convert C1 controls either */
3705     sa->removeRange(sa->set, 0x80, 0x9f);
3706 }
3707
/*
 * Function table for the generic ISO 2022 converter.
 * The four conversion function slots are only filled in when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 * NOTE(review): the slot labels below follow the UConverterImpl declaration
 * in ucnv_cnv.h - confirm against the header of this ICU version.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022, /* converter type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,  /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicode */
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicode with offsets: same function */
    ucnv_fromUnicode_UTF8,                         /* fromUnicode */
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,           /* fromUnicode with offsets */
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL, /* getNextUChar */

    NULL,                   /* getStarters */
    _ISO2022getName,        /* getName */
    _ISO_2022_WriteSub,     /* writeSub */
    _ISO_2022_SafeClone,    /* safeClone */
    _ISO_2022_GetUnicodeSet /* getUnicodeSet */
};
/*
 * Static (constant) description of the generic ISO 2022 converter.
 * NOTE(review): the field labels below follow the UConverterStaticData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData), /* structure size */
    "ISO_2022",                   /* converter name */
    2022,                         /* codepage */
    UCNV_IBM,                     /* platform */
    UCNV_ISO_2022,                /* conversion type */
    1,                            /* min bytes per char */
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 },            /* substitution character bytes */
    1,                            /* substitution character length */
    FALSE,                        /* has toUnicode fallback */
    FALSE,                        /* has fromUnicode fallback */
    0,                            /* unicode mask */
    0,                            /* single-byte substitution character (subChar1) */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data for the generic ISO 2022 converter; ties the static data to
 * the implementation table. Unlike the JP/KR/CN shared data in this file,
 * this object is not declared static.
 * NOTE(review): the field labels below follow the UConverterSharedData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData), /* structure size */
    ~((uint32_t) 0),              /* reference counter: all bits set; presumably marks static, never-released data - TODO confirm */
    NULL,                         /* data memory */
    NULL,                         /* table */
    &_ISO2022StaticData,          /* static data */
    FALSE,                        /* shared data cached */
    &_ISO2022Impl,                /* implementation table */
    0                             /* toUnicode status */
};
3763
3764 /*************JP****************/
/*
 * Function table for the ISO-2022-JP converters.
 * The same *_OFFSETS_LOGIC function is used for the plain slot and for the
 * with-offsets slot in each direction.
 * NOTE(review): the slot labels below follow the UConverterImpl declaration
 * in ucnv_cnv.h - confirm against the header of this ICU version.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022, /* converter type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,  /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,   /* toUnicode */
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,   /* toUnicode with offsets */
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicode with offsets */
    NULL, /* getNextUChar */

    NULL,                   /* getStarters */
    _ISO2022getName,        /* getName */
    _ISO_2022_WriteSub,     /* writeSub */
    _ISO_2022_SafeClone,    /* safeClone */
    _ISO_2022_GetUnicodeSet /* getUnicodeSet */
};
/*
 * Static (constant) description of the ISO-2022-JP converter.
 * NOTE(review): the field labels below follow the UConverterStaticData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData), /* structure size */
    "ISO_2022_JP",                /* converter name */
    0,                            /* codepage */
    UCNV_IBM,                     /* platform */
    UCNV_ISO_2022,                /* conversion type */
    1,                            /* min bytes per char */
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 },            /* substitution character bytes */
    1,                            /* substitution character length */
    FALSE,                        /* has toUnicode fallback */
    FALSE,                        /* has fromUnicode fallback */
    0,                            /* unicode mask */
    0,                            /* single-byte substitution character (subChar1) */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data for the ISO-2022-JP converter; ties the JP static data to the
 * JP implementation table.
 * NOTE(review): the field labels below follow the UConverterSharedData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
static const UConverterSharedData _ISO2022JPData={
    sizeof(UConverterSharedData), /* structure size */
    ~((uint32_t) 0),              /* reference counter: all bits set; presumably marks static, never-released data - TODO confirm */
    NULL,                         /* data memory */
    NULL,                         /* table */
    &_ISO2022JPStaticData,        /* static data */
    FALSE,                        /* shared data cached */
    &_ISO2022JPImpl,              /* implementation table */
    0                             /* toUnicode status */
};
3813
3814 /************* KR ***************/
/*
 * Function table for the ISO-2022-KR converter.
 * The same *_OFFSETS_LOGIC function is used for the plain slot and for the
 * with-offsets slot in each direction.
 * NOTE(review): the slot labels below follow the UConverterImpl declaration
 * in ucnv_cnv.h - confirm against the header of this ICU version.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022, /* converter type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,  /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,   /* toUnicode */
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,   /* toUnicode with offsets */
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode with offsets */
    NULL, /* getNextUChar */

    NULL,                   /* getStarters */
    _ISO2022getName,        /* getName */
    _ISO_2022_WriteSub,     /* writeSub */
    _ISO_2022_SafeClone,    /* safeClone */
    _ISO_2022_GetUnicodeSet /* getUnicodeSet */
};
/*
 * Static (constant) description of the ISO-2022-KR converter.
 * NOTE(review): the field labels below follow the UConverterStaticData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData), /* structure size */
    "ISO_2022_KR",                /* converter name */
    0,                            /* codepage */
    UCNV_IBM,                     /* platform */
    UCNV_ISO_2022,                /* conversion type */
    1,                            /* min bytes per char */
    3, /* max 3 bytes per UChar: SO+DBCS */
    { 0x1a, 0, 0, 0 },            /* substitution character bytes */
    1,                            /* substitution character length */
    FALSE,                        /* has toUnicode fallback */
    FALSE,                        /* has fromUnicode fallback */
    0,                            /* unicode mask */
    0,                            /* single-byte substitution character (subChar1) */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data for the ISO-2022-KR converter; ties the KR static data to the
 * KR implementation table.
 * NOTE(review): the field labels below follow the UConverterSharedData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
static const UConverterSharedData _ISO2022KRData={
    sizeof(UConverterSharedData), /* structure size */
    ~((uint32_t) 0),              /* reference counter: all bits set; presumably marks static, never-released data - TODO confirm */
    NULL,                         /* data memory */
    NULL,                         /* table */
    &_ISO2022KRStaticData,        /* static data */
    FALSE,                        /* shared data cached */
    &_ISO2022KRImpl,              /* implementation table */
    0                             /* toUnicode status */
};
3863
3864 /*************** CN ***************/
/*
 * Function table for the ISO-2022-CN converters.
 * The same *_OFFSETS_LOGIC function is used for the plain slot and for the
 * with-offsets slot in each direction.
 * NOTE(review): the slot labels below follow the UConverterImpl declaration
 * in ucnv_cnv.h - confirm against the header of this ICU version.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022, /* converter type */

    NULL, /* load */
    NULL, /* unload */

    _ISO2022Open,  /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,   /* toUnicode */
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,   /* toUnicode with offsets */
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode */
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode with offsets */
    NULL, /* getNextUChar */

    NULL,                   /* getStarters */
    _ISO2022getName,        /* getName */
    _ISO_2022_WriteSub,     /* writeSub */
    _ISO_2022_SafeClone,    /* safeClone */
    _ISO_2022_GetUnicodeSet /* getUnicodeSet */
};
/*
 * Static (constant) description of the ISO-2022-CN converter.
 * NOTE(review): the field labels below follow the UConverterStaticData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData), /* structure size */
    "ISO_2022_CN",                /* converter name */
    0,                            /* codepage */
    UCNV_IBM,                     /* platform */
    UCNV_ISO_2022,                /* conversion type */
    1,                            /* min bytes per char */
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 },            /* substitution character bytes */
    1,                            /* substitution character length */
    FALSE,                        /* has toUnicode fallback */
    FALSE,                        /* has fromUnicode fallback */
    0,                            /* unicode mask */
    0,                            /* single-byte substitution character (subChar1) */
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data for the ISO-2022-CN converter; ties the CN static data to the
 * CN implementation table.
 * NOTE(review): the field labels below follow the UConverterSharedData
 * declaration in ucnv_bld.h - confirm against the header of this ICU version.
 */
static const UConverterSharedData _ISO2022CNData={
    sizeof(UConverterSharedData), /* structure size */
    ~((uint32_t) 0),              /* reference counter: all bits set; presumably marks static, never-released data - TODO confirm */
    NULL,                         /* data memory */
    NULL,                         /* table */
    &_ISO2022CNStaticData,        /* static data */
    FALSE,                        /* shared data cached */
    &_ISO2022CNImpl,              /* implementation table */
    0                             /* toUnicode status */
};
3914
3915
3916
3917 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */