icuSources/common/ucnv2022.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2012, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv2022.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000feb03
  12 *   created by: Markus W. Scherer
  13 *
  14 *   Change history:
  15 *
  16 *   06/29/2000  helena  Major rewrite of the callback APIs.
  17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
  18 *                       Changed implementation of toUnicode
  19 *                       function
  20 *   08/21/2000  Ram     Added support for ISO-2022-KR
  21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
  22 *                       ucnvebdc.c
  23 *   09/20/2000  Ram     Added support for ISO-2022-CN
  24 *                       Added implementations for getNextUChar()
  25 *                       for specific 2022 country variants.
  26 *   10/31/2000  Ram     Implemented offsets logic functions
  27 */
  28
  29 #include "unicode/utypes.h"
  30
  31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uset.h"
  35 #include "unicode/ucnv_err.h"
  36 #include "unicode/ucnv_cb.h"
  37 #include "unicode/utf16.h"
  38 #include "ucnv_imp.h"
  39 #include "ucnv_bld.h"
  40 #include "ucnv_cnv.h"
  41 #include "ucnvmbcs.h"
  42 #include "cstring.h"
  43 #include "cmemory.h"
  44 #include "uassert.h"
  45
  46 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  47
  48 #ifdef U_ENABLE_GENERIC_ISO_2022
  49 /*
  50  * I am disabling the generic ISO-2022 converter after proposing to do so on
  51  * the icu mailing list two days ago.
  52  *
  53  * Reasons:
  54  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  55  *    its designation sequences, single shifts with return to the previous state,
  56  *    switch-with-no-return to UTF-16BE or similar, etc.
  57  *    This is unlike the language-specific variants like ISO-2022-JP which
  58  *    require a much smaller repertoire of ISO-2022 features.
  59  *    These variants continue to be supported.
  60  * 2. I believe that no one is really using the generic ISO-2022 converter
  61  *    but rather always one of the language-specific variants.
  62  *    Note that ICU's generic ISO-2022 converter has always output one escape
  63  *    sequence followed by UTF-8 for the whole stream.
  64  * 3. Switching between subcharsets is extremely slow, because each time
  65  *    the previous converter is closed and a new one opened,
  66  *    without any kind of caching, least-recently-used list, etc.
  67  * 4. The code is currently buggy, and given the above it does not seem
  68  *    reasonable to spend the time on maintenance.
  69  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  70  *    This means, for example, that when ISO-8859-7 is designated, the following
  71  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  72  *    The ICU ISO-2022 converter does not handle this - and has no information
  73  *    about which subconverter would have to be shifted vs. which is designed
  74  *    for 7-bit ISO-2022.
  75  *
  76  * Markus Scherer 2003-dec-03
  77  */
  78 #endif
  79
  80 static const char SHIFT_IN_STR[]  = "\x0F";
  81 // static const char SHIFT_OUT_STR[] = "\x0E";
  82
  83 #define CR      0x0D
  84 #define LF      0x0A
  85 #define H_TAB   0x09
  86 #define V_TAB   0x0B
  87 #define SPACE   0x20
  88
  89 enum {
  90     HWKANA_START=0xff61,
  91     HWKANA_END=0xff9f
  92 };
  93
  94 /*
  95  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
  96  * as bytes 21..7E. (Subtract 0x80.)
  97  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
  98  * as bytes 20..7F. (Subtract 0x80.)
  99  * Do not encode C1 control codes with native bytes 80..9F
 100  * as bytes 00..1F (C0 control codes).
 101  */
 102 enum {
 103     GR94_START=0xa1,
 104     GR94_END=0xfe,
 105     GR96_START=0xa0,
 106     GR96_END=0xff
 107 };
 108
 109 /*
 110  * ISO 2022 control codes must not be converted from Unicode
 111  * because they would mess up the byte stream.
 112  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 113  * corresponding to SO, SI, and ESC.
 114  */
 115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
 116
 117 /* for ISO-2022-JP and -CN implementations */
 118 typedef enum  {
 119         /* shared values */
 120         INVALID_STATE=-1,
 121         ASCII = 0,
 122
 123         SS2_STATE=0x10,
 124         SS3_STATE,
 125
 126         /* JP */
 127         ISO8859_1 = 1 ,
 128         ISO8859_7 = 2 ,
 129         JISX201  = 3,
 130         JISX208 = 4,
 131         JISX212 = 5,
 132         GB2312  =6,
 133         KSC5601 =7,
 134         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
 135
 136         /* CN */
 137         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
 138         GB2312_1=1,
 139         ISO_IR_165=2,
 140         CNS_11643=3,
 141
 142         /*
 143          * these are used in StateEnum and ISO2022State variables,
 144          * but CNS_11643 must be used to index into myConverterArray[]
 145          */
 146         CNS_11643_0=0x20,
 147         CNS_11643_1,
 148         CNS_11643_2,
 149         CNS_11643_3,
 150         CNS_11643_4,
 151         CNS_11643_5,
 152         CNS_11643_6,
 153         CNS_11643_7
 154 } StateEnum;
 155
 156 /* is the StateEnum charset value for a DBCS charset? */
 157 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
 158
 159 #define CSM(cs) ((uint16_t)1<<(cs))
 160
 161 /*
 162  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 163  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 164  *
 165  * Note: The converter uses some leniency:
 166  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 167  *   all versions, not just JIS7 and JIS8.
 168  * - ICU does not distinguish between different versions of JIS X 0208.
 169  */
 170 enum { MAX_JA_VERSION=4 };
 171 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
 172     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
 173     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
 174     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 175     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 176     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
 177 };
 178
 179 typedef enum {
 180         ASCII1=0,
 181         LATIN1,
 182         SBCS,
 183         DBCS,
 184         MBCS,
 185         HWKANA
 186 }Cnv2022Type;
 187
 188 typedef struct ISO2022State {
 189     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
 190     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
 191     int8_t prevG;       /* g before single shift (SS2 or SS3) */
 192 } ISO2022State;
 193
 194 #define UCNV_OPTIONS_VERSION_MASK 0xf
 195 #define UCNV_2022_MAX_CONVERTERS 10
 196
 197 typedef struct{
 198     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
 199     UConverter *currentConverter;
 200     Cnv2022Type currentType;
 201     ISO2022State toU2022State, fromU2022State;
 202     uint32_t key;
 203     uint32_t version;
 204 #ifdef U_ENABLE_GENERIC_ISO_2022
 205     UBool isFirstBuffer;
 206 #endif
 207     UBool isEmptySegment;
 208     char name[30];
 209     char locale[3];
 210 }UConverterDataISO2022;
 211
 212 /* Protos */
 213 /* ISO-2022 ----------------------------------------------------------------- */
 214
 215 /*Forward declaration */
 216 U_CFUNC void
 217 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
 218                       UErrorCode * err);
 219 U_CFUNC void
 220 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
 221                                     UErrorCode * err);
 222
 223 #define ESC_2022 0x1B /*ESC*/
 224
 225 typedef enum
 226 {
 227         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
 228         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
 229         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
 230         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
 231 } UCNV_TableStates_2022;
 232
 233 /*
 234 * The way these state transition arrays work is:
 235 * ex : ESC$B is the sequence for JISX208
 236 *      a) First Iteration: char is ESC
 237 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 238 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 239 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 240 *             value of x is stored at escSeqStateTable_Key_2022[0]
 241 *        iii) Save this index as offset
 242 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 243 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 244 *     b) Switch on this state and continue to next char
 245 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 246 *             which is normalize_esq_chars_2022[36] == 4
 247 *         ii) x is currently 1(from above)
 248 *               x<<=5 -- x is now 32
 249 *               x+=normalize_esq_chars_2022[36]
 250 *               now x is 36
 251 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 252 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 253 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 254 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 255 *     c) Switch on this state and continue to next char
 256 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 257 *        ii) x is currently 36 (from above)
 258 *            x<<=5 -- x is now 1152
 259 *            x+=normalize_esq_chars_2022[66]
 260 *            now x is 1161
 261 *       iii) Search for this value in escSeqStateTable_Key_2022[]
 262 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 263 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 264 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 265 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
 266 */
 267
 268
 269 /*Below are the 3 arrays depicting a state transition table*/
 270 static const int8_t normalize_esq_chars_2022[256] = {
 271 /*       0      1       2       3       4      5       6        7       8       9           */
 272
 273          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 274         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 275         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
 276         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
 277         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
 278         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 279         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
 280         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
 281         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
 282         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 283         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 284         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 285         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 286         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 287         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 288         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 289         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 290         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 291         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 292         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 293         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 294         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 295         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 296         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 297         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 298         ,0     ,0      ,0      ,0      ,0      ,0
 299 };
 300
 301 #ifdef U_ENABLE_GENERIC_ISO_2022
 302 /*
 303  * When the generic ISO-2022 converter is completely removed, not just disabled
 304  * per #ifdef, then the following state table and the associated tables that are
 305  * dimensioned with MAX_STATES_2022 should be trimmed.
 306  *
 307  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 308  * the associated escape sequences starting with ESC ( B should be removed.
 309  * This includes the ones with key values 1097 and all of the ones above 1000000.
 310  *
 311  * For the latter, the tables can simply be truncated.
 312  * For the former, since the tables must be kept parallel, it is probably best
 313  * to simply duplicate an adjacent table cell, parallel in all tables.
 314  *
 315  * It may make sense to restructure the tables, especially by using small search
 316  * tables for the variants instead of indexing them parallel to the table here.
 317  */
 318 #endif
 319
 320 #define MAX_STATES_2022 74
 321 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
 322 /*   0           1           2           3           4           5           6           7           8           9           */
 323
 324      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
 325     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
 326     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
 327     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
 328     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
 329     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
 330     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
 331     ,35947631   ,35947635   ,35947636   ,35947638
 332 };
 333
 334 #ifdef U_ENABLE_GENERIC_ISO_2022
 335
 336 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 337  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
 338
 339      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
 340     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
 341     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
 342     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
 343     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
 344     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
 345     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
 346     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
 347 };
 348
 349 #endif
 350
 351 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
 352 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
 353      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 354     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 355     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
 356     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 357     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 358     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 359     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 360     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 361 };
 362
 363
 364 /* Type def for refactoring changeState_2022 code*/
 365 typedef enum{
 366 #ifdef U_ENABLE_GENERIC_ISO_2022
 367     ISO_2022=0,
 368 #endif
 369     ISO_2022_JP=1,
 370     ISO_2022_KR=2,
 371     ISO_2022_CN=3
 372 } Variant2022;
 373
 374 /*********** ISO 2022 Converter Protos ***********/
 375 static void
 376 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
 377
 378 static void
 379  _ISO2022Close(UConverter *converter);
 380
 381 static void
 382 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
 383
 384 static const char*
 385 _ISO2022getName(const UConverter* cnv);
 386
 387 static void
 388 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
 389
 390 static UConverter *
 391 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
 392
 393 #ifdef U_ENABLE_GENERIC_ISO_2022
 394 static void
 395 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
 396 #endif
 397
 398 namespace {
 399
 400 /*const UConverterSharedData _ISO2022Data;*/
 401 extern const UConverterSharedData _ISO2022JPData;
 402 extern const UConverterSharedData _ISO2022KRData;
 403 extern const UConverterSharedData _ISO2022CNData;
 404
 405 }  // namespace
 406
 407 /*************** Converter implementations ******************/
 408
 409 /* The purpose of this function is to get around gcc compiler warnings. */
 410 static inline void
 411 fromUWriteUInt8(UConverter *cnv,
 412                  const char *bytes, int32_t length,
 413                  uint8_t **target, const char *targetLimit,
 414                  int32_t **offsets,
 415                  int32_t sourceIndex,
 416                  UErrorCode *pErrorCode)
 417 {
 418     char *targetChars = (char *)*target;
 419     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
 420                          offsets, sourceIndex, pErrorCode);
 421     *target = (uint8_t*)targetChars;
 422
 423 }
 424
 425 static inline void
 426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
 427     if(myConverterData->version == 1) {
 428         UConverter *cnv = myConverterData->currentConverter;
 429
 430         cnv->toUnicodeStatus=0;     /* offset */
 431         cnv->mode=0;                /* state */
 432         cnv->toULength=0;           /* byteIndex */
 433     }
 434 }
 435
 436 static inline void
 437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
 438    /* in ISO-2022-KR the designator sequence appears only once
 439     * in a file so we append it only once
 440     */
 441     if( converter->charErrorBufferLength==0){
 442
 443         converter->charErrorBufferLength = 4;
 444         converter->charErrorBuffer[0] = 0x1b;
 445         converter->charErrorBuffer[1] = 0x24;
 446         converter->charErrorBuffer[2] = 0x29;
 447         converter->charErrorBuffer[3] = 0x43;
 448     }
 449     if(myConverterData->version == 1) {
 450         UConverter *cnv = myConverterData->currentConverter;
 451
 452         cnv->fromUChar32=0;
 453         cnv->fromUnicodeStatus=1;   /* prevLength */
 454     }
 455 }
 456
 457 static void
 458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
 459
 460     char myLocale[6]={' ',' ',' ',' ',' ',' '};
 461
 462     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
 463     if(cnv->extraInfo != NULL) {
 464         UConverterNamePieces stackPieces;
 465         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
 466         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
 467         uint32_t version;
 468
 469         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
 470
 471         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
 472         myConverterData->currentType = ASCII1;
 473         cnv->fromUnicodeStatus =FALSE;
 474         if(pArgs->locale){
 475             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
 476         }
 477         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
 478         myConverterData->version = version;
 479         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
 480             (myLocale[2]=='_' || myLocale[2]=='\0'))
 481         {
 482             size_t len=0;
 483             /* open the required converters and cache them */
 484             if(version>MAX_JA_VERSION) {
 485                 /* prevent indexing beyond jpCharsetMasks[] */
 486                 myConverterData->version = version = 0;
 487             }
 488             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
 489                 myConverterData->myConverterArray[ISO8859_7] =
 490                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
 491             }
 492             myConverterData->myConverterArray[JISX208] =
 493                 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
 494             if(jpCharsetMasks[version]&CSM(JISX212)) {
 495                 myConverterData->myConverterArray[JISX212] =
 496                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
 497             }
 498             if(jpCharsetMasks[version]&CSM(GB2312)) {
 499                 myConverterData->myConverterArray[GB2312] =
 500                     ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
 501             }
 502             if(jpCharsetMasks[version]&CSM(KSC5601)) {
 503                 myConverterData->myConverterArray[KSC5601] =
 504                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
 505             }
 506
 507             /* set the function pointers to appropriate funtions */
 508             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 509             uprv_strcpy(myConverterData->locale,"ja");
 510
 511             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
 512             len = uprv_strlen(myConverterData->name);
 513             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
 514             myConverterData->name[len+1]='\0';
 515         }
 516         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
 517             (myLocale[2]=='_' || myLocale[2]=='\0'))
 518         {
 519             const char *cnvName;
 520             if(version==1) {
 521                 cnvName="icu-internal-25546";
 522             } else {
 523                 cnvName="ibm-949";
 524                 myConverterData->version=version=0;
 525             }
 526             if(pArgs->onlyTestIsLoadable) {
 527                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
 528                 uprv_free(cnv->extraInfo);
 529                 cnv->extraInfo=NULL;
 530                 return;
 531             } else {
 532                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
 533                 if (U_FAILURE(*errorCode)) {
 534                     _ISO2022Close(cnv);
 535                     return;
 536                 }
 537
 538                 if(version==1) {
 539                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
 540                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
 541                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
 542                 }else{
 543                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
 544                 }
 545
 546                 /* initialize the state variables */
 547                 setInitialStateToUnicodeKR(cnv, myConverterData);
 548                 setInitialStateFromUnicodeKR(cnv, myConverterData);
 549
 550                 /* set the function pointers to appropriate funtions */
 551                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
 552                 uprv_strcpy(myConverterData->locale,"ko");
 553             }
 554         }
 555         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
 556             (myLocale[2]=='_' || myLocale[2]=='\0'))
 557         {
 558
 559             /* open the required converters and cache them */
 560             myConverterData->myConverterArray[GB2312_1] =
 561                 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
 562             if(version==1) {
 563                 myConverterData->myConverterArray[ISO_IR_165] =
 564                     ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
 565             }
 566             myConverterData->myConverterArray[CNS_11643] =
 567                 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
 568
 569
 570             /* set the function pointers to appropriate funtions */
 571             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
 572             uprv_strcpy(myConverterData->locale,"cn");
 573
 574             if (version==0){
 575                 myConverterData->version = 0;
 576                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
 577             }else if (version==1){
 578                 myConverterData->version = 1;
 579                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
 580             }else {
 581                 myConverterData->version = 2;
 582                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
 583             }
 584         }
 585         else{
 586 #ifdef U_ENABLE_GENERIC_ISO_2022
 587             myConverterData->isFirstBuffer = TRUE;
 588
 589             /* append the UTF-8 escape sequence */
 590             cnv->charErrorBufferLength = 3;
 591             cnv->charErrorBuffer[0] = 0x1b;
 592             cnv->charErrorBuffer[1] = 0x25;
 593             cnv->charErrorBuffer[2] = 0x42;
 594
 595             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
 596             /* initialize the state variables */
 597             uprv_strcpy(myConverterData->name,"ISO_2022");
 598 #else
 599             *errorCode = U_UNSUPPORTED_ERROR;
 600             return;
 601 #endif
 602         }
 603
 604         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
 605
 606         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
 607             _ISO2022Close(cnv);
 608         }
 609     } else {
 610         *errorCode = U_MEMORY_ALLOCATION_ERROR;
 611     }
 612 }
 613
 614
 615 static void
 616 _ISO2022Close(UConverter *converter) {
 617     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
 618     UConverterSharedData **array = myData->myConverterArray;
 619     int32_t i;
 620
 621     if (converter->extraInfo != NULL) {
 622         /*close the array of converter pointers and free the memory*/
 623         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
 624             if(array[i]!=NULL) {
 625                 ucnv_unloadSharedDataIfReady(array[i]);
 626             }
 627         }
 628
 629         ucnv_close(myData->currentConverter);
 630
 631         if(!converter->isExtraLocal){
 632             uprv_free (converter->extraInfo);
 633             converter->extraInfo = NULL;
 634         }
 635     }
 636 }
 637
 638 static void
 639 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
 640     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
 641     if(choice<=UCNV_RESET_TO_UNICODE) {
 642         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
 643         myConverterData->key = 0;
 644         myConverterData->isEmptySegment = FALSE;
 645     }
 646     if(choice!=UCNV_RESET_TO_UNICODE) {
 647         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
 648     }
 649 #ifdef U_ENABLE_GENERIC_ISO_2022
 650     if(myConverterData->locale[0] == 0){
 651         if(choice<=UCNV_RESET_TO_UNICODE) {
 652             myConverterData->isFirstBuffer = TRUE;
 653             myConverterData->key = 0;
 654             if (converter->mode == UCNV_SO){
 655                 ucnv_close (myConverterData->currentConverter);
 656                 myConverterData->currentConverter=NULL;
 657             }
 658             converter->mode = UCNV_SI;
 659         }
 660         if(choice!=UCNV_RESET_TO_UNICODE) {
 661             /* re-append UTF-8 escape sequence */
 662             converter->charErrorBufferLength = 3;
 663             converter->charErrorBuffer[0] = 0x1b;
 664             converter->charErrorBuffer[1] = 0x28;
 665             converter->charErrorBuffer[2] = 0x42;
 666         }
 667     }
 668     else
 669 #endif
 670     {
 671         /* reset the state variables */
 672         if(myConverterData->locale[0] == 'k'){
 673             if(choice<=UCNV_RESET_TO_UNICODE) {
 674                 setInitialStateToUnicodeKR(converter, myConverterData);
 675             }
 676             if(choice!=UCNV_RESET_TO_UNICODE) {
 677                 setInitialStateFromUnicodeKR(converter, myConverterData);
 678             }
 679         }
 680     }
 681 }
 682
 683 static const char*
 684 _ISO2022getName(const UConverter* cnv){
 685     if(cnv->extraInfo){
 686         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
 687         return myData->name;
 688     }
 689     return NULL;
 690 }
 691
 692
 693 /*************** to unicode *******************/
 694 /****************************************************************************
 695  * Recognized escape sequences are
 696  * <ESC>(B  ASCII
 697  * <ESC>.A  ISO-8859-1
 698  * <ESC>.F  ISO-8859-7
 699  * <ESC>(J  JISX-201
 700  * <ESC>(I  JISX-201
 701  * <ESC>$B  JISX-208
 702  * <ESC>$@  JISX-208
 703  * <ESC>$(D JISX-212
 704  * <ESC>$A  GB2312
 705  * <ESC>$(C KSC5601
 706  */
 707 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
 708 /*      0                1               2               3               4               5               6               7               8               9    */
 709     INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 710     ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
 711     ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 712     ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
 713     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 714     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 715     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 716     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 717 };
 718
 719 /*************** to unicode *******************/
 720 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
 721 /*      0                1               2               3               4               5               6               7               8               9    */
 722      INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 723     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 724     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 725     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 726     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
 727     ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 728     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 729     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 730 };
 731
 732
 733 static UCNV_TableStates_2022
 734 getKey_2022(char c,int32_t* key,int32_t* offset){
 735     int32_t togo;
 736     int32_t low = 0;
 737     int32_t hi = MAX_STATES_2022;
 738     int32_t oldmid=0;
 739
 740     togo = normalize_esq_chars_2022[(uint8_t)c];
 741     if(togo == 0) {
 742         /* not a valid character anywhere in an escape sequence */
 743         *key = 0;
 744         *offset = 0;
 745         return INVALID_2022;
 746     }
 747     togo = (*key << 5) + togo;
 748
 749     while (hi != low)  /*binary search*/{
 750
 751         register int32_t mid = (hi+low) >> 1; /*Finds median*/
 752
 753         if (mid == oldmid)
 754             break;
 755
 756         if (escSeqStateTable_Key_2022[mid] > togo){
 757             hi = mid;
 758         }
 759         else if (escSeqStateTable_Key_2022[mid] < togo){
 760             low = mid;
 761         }
 762         else /*we found it*/{
 763             *key = togo;
 764             *offset = mid;
 765             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
 766         }
 767         oldmid = mid;
 768
 769     }
 770
 771     *key = 0;
 772     *offset = 0;
 773     return INVALID_2022;
 774 }
 775
 776 /*runs through a state machine to determine the escape sequence - codepage correspondance
 777  */
 778 static void
 779 changeState_2022(UConverter* _this,
 780                 const char** source,
 781                 const char* sourceLimit,
 782                 Variant2022 var,
 783                 UErrorCode* err){
 784     UCNV_TableStates_2022 value;
 785     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
 786     uint32_t key = myData2022->key;
 787     int32_t offset = 0;
 788     int8_t initialToULength = _this->toULength;
 789     char c;
 790
 791     value = VALID_NON_TERMINAL_2022;
 792     while (*source < sourceLimit) {
 793         c = *(*source)++;
 794         _this->toUBytes[_this->toULength++]=(uint8_t)c;
 795         value = getKey_2022(c,(int32_t *) &key, &offset);
 796
 797         switch (value){
 798
 799         case VALID_NON_TERMINAL_2022 :
 800             /* continue with the loop */
 801             break;
 802
 803         case VALID_TERMINAL_2022:
 804             key = 0;
 805             goto DONE;
 806
 807         case INVALID_2022:
 808             goto DONE;
 809
 810         case VALID_MAYBE_TERMINAL_2022:
 811 #ifdef U_ENABLE_GENERIC_ISO_2022
 812             /* ESC ( B is ambiguous only for ISO_2022 itself */
 813             if(var == ISO_2022) {
 814                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
 815                 _this->toULength = 0;
 816
 817                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
 818
 819                 /* continue with the loop */
 820                 value = VALID_NON_TERMINAL_2022;
 821                 break;
 822             } else
 823 #endif
 824             {
 825                 /* not ISO_2022 itself, finish here */
 826                 value = VALID_TERMINAL_2022;
 827                 key = 0;
 828                 goto DONE;
 829             }
 830         }
 831     }
 832
 833 DONE:
 834     myData2022->key = key;
 835
 836     if (value == VALID_NON_TERMINAL_2022) {
 837         /* indicate that the escape sequence is incomplete: key!=0 */
 838         return;
 839     } else if (value == INVALID_2022 ) {
 840         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 841     } else /* value == VALID_TERMINAL_2022 */ {
 842         switch(var){
 843 #ifdef U_ENABLE_GENERIC_ISO_2022
 844         case ISO_2022:
 845         {
 846             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
 847             if(chosenConverterName == NULL) {
 848                 /* SS2 or SS3 */
 849                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 850                 _this->toUCallbackReason = UCNV_UNASSIGNED;
 851                 return;
 852             }
 853
 854             _this->mode = UCNV_SI;
 855             ucnv_close(myData2022->currentConverter);
 856             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
 857             if(U_SUCCESS(*err)) {
 858                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
 859                 _this->mode = UCNV_SO;
 860             }
 861             break;
 862         }
 863 #endif
 864         case ISO_2022_JP:
 865             {
 866                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
 867                 switch(tempState) {
 868                 case INVALID_STATE:
 869                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 870                     break;
 871                 case SS2_STATE:
 872                     if(myData2022->toU2022State.cs[2]!=0) {
 873                         if(myData2022->toU2022State.g<2) {
 874                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 875                         }
 876                         myData2022->toU2022State.g=2;
 877                     } else {
 878                         /* illegal to have SS2 before a matching designator */
 879                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 880                     }
 881                     break;
 882                 /* case SS3_STATE: not used in ISO-2022-JP-x */
 883                 case ISO8859_1:
 884                 case ISO8859_7:
 885                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 886                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 887                     } else {
 888                         /* G2 charset for SS2 */
 889                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
 890                     }
 891                     break;
 892                 default:
 893                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 894                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 895                     } else {
 896                         /* G0 charset */
 897                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
 898                     }
 899                     break;
 900                 }
 901             }
 902             break;
 903         case ISO_2022_CN:
 904             {
 905                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
 906                 switch(tempState) {
 907                 case INVALID_STATE:
 908                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 909                     break;
 910                 case SS2_STATE:
 911                     if(myData2022->toU2022State.cs[2]!=0) {
 912                         if(myData2022->toU2022State.g<2) {
 913                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 914                         }
 915                         myData2022->toU2022State.g=2;
 916                     } else {
 917                         /* illegal to have SS2 before a matching designator */
 918                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 919                     }
 920                     break;
 921                 case SS3_STATE:
 922                     if(myData2022->toU2022State.cs[3]!=0) {
 923                         if(myData2022->toU2022State.g<2) {
 924                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 925                         }
 926                         myData2022->toU2022State.g=3;
 927                     } else {
 928                         /* illegal to have SS3 before a matching designator */
 929                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 930                     }
 931                     break;
 932                 case ISO_IR_165:
 933                     if(myData2022->version==0) {
 934                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 935                         break;
 936                     }
 937                     /*fall through*/
 938                 case GB2312_1:
 939                     /*fall through*/
 940                 case CNS_11643_1:
 941                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
 942                     break;
 943                 case CNS_11643_2:
 944                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
 945                     break;
 946                 default:
 947                     /* other CNS 11643 planes */
 948                     if(myData2022->version==0) {
 949                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 950                     } else {
 951                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
 952                     }
 953                     break;
 954                 }
 955             }
 956             break;
 957         case ISO_2022_KR:
 958             if(offset==0x30){
 959                 /* nothing to be done, just accept this one escape sequence */
 960             } else {
 961                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 962             }
 963             break;
 964
 965         default:
 966             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 967             break;
 968         }
 969     }
 970     if(U_SUCCESS(*err)) {
 971         _this->toULength = 0;
 972     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
 973         if(_this->toULength>1) {
 974             /*
 975              * Ticket 5691: consistent illegal sequences:
 976              * - We include at least the first byte (ESC) in the illegal sequence.
 977              * - If any of the non-initial bytes could be the start of a character,
 978              *   we stop the illegal sequence before the first one of those.
 979              *   In escape sequences, all following bytes are "printable", that is,
 980              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
 981              *   they are valid single/lead bytes.
 982              *   For simplicity, we always only report the initial ESC byte as the
 983              *   illegal sequence and back out all other bytes we looked at.
 984              */
 985             /* Back out some bytes. */
 986             int8_t backOutDistance=_this->toULength-1;
 987             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
 988             if(backOutDistance<=bytesFromThisBuffer) {
 989                 /* same as initialToULength<=1 */
 990                 *source-=backOutDistance;
 991             } else {
 992                 /* Back out bytes from the previous buffer: Need to replay them. */
 993                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
 994                 /* same as -(initialToULength-1) */
 995                 /* preToULength is negative! */
 996                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
 997                 *source-=bytesFromThisBuffer;
 998             }
 999             _this->toULength=1;
1000         }
1001     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1002         _this->toUCallbackReason = UCNV_UNASSIGNED;
1003     }
1004 }
1005
1006 /*Checks the characters of the buffer against valid 2022 escape sequences
1007 *if the match we return a pointer to the initial start of the sequence otherwise
1008 *we return sourceLimit
1009 */
1010 /*for 2022 looks ahead in the stream
1011  *to determine the longest possible convertible
1012  *data stream
1013  */
1014 static inline const char*
1015 getEndOfBuffer_2022(const char** source,
1016                    const char* sourceLimit,
1017                    UBool /*flush*/){
1018
1019     const char* mySource = *source;
1020
1021 #ifdef U_ENABLE_GENERIC_ISO_2022
1022     if (*source >= sourceLimit)
1023         return sourceLimit;
1024
1025     do{
1026
1027         if (*mySource == ESC_2022){
1028             int8_t i;
1029             int32_t key = 0;
1030             int32_t offset;
1031             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1032
1033             /* Kludge: I could not
1034             * figure out the reason for validating an escape sequence
1035             * twice - once here and once in changeState_2022().
1036             * is it possible to have an ESC character in a ISO2022
1037             * byte stream which is valid in a code page? Is it legal?
1038             */
1039             for (i=0;
1040             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1041             i++) {
1042                 value =  getKey_2022(*(mySource+i), &key, &offset);
1043             }
1044             if (value > 0 || *mySource==ESC_2022)
1045                 return mySource;
1046
1047             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1048                 return sourceLimit;
1049         }
1050     }while (++mySource < sourceLimit);
1051
1052     return sourceLimit;
1053 #else
1054     while(mySource < sourceLimit && *mySource != ESC_2022) {
1055         ++mySource;
1056     }
1057     return mySource;
1058 #endif
1059 }
1060
1061
1062 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1063  * any future change in _MBCSFromUChar32() function should be reflected here.
1064  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1065  */
1066 static inline int32_t
1067 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1068                                          UChar32 c,
1069                                          uint32_t* value,
1070                                          UBool useFallback,
1071                                          int outputType)
1072 {
1073     const int32_t *cx;
1074     const uint16_t *table;
1075     uint32_t stage2Entry;
1076     uint32_t myValue;
1077     int32_t length;
1078     const uint8_t *p;
1079     /*
1080      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1081      * Use internal version of ucnv_open() that verifies that the new structures are available,
1082      * else U_INTERNAL_PROGRAM_ERROR.
1083      */
1084     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1085     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1086         table=sharedData->mbcs.fromUnicodeTable;
1087         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1088         /* get the bytes and the length for the output */
1089         if(outputType==MBCS_OUTPUT_2){
1090             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1091             if(myValue<=0xff) {
1092                 length=1;
1093             } else {
1094                 length=2;
1095             }
1096         } else /* outputType==MBCS_OUTPUT_3 */ {
1097             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1098             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1099             if(myValue<=0xff) {
1100                 length=1;
1101             } else if(myValue<=0xffff) {
1102                 length=2;
1103             } else {
1104                 length=3;
1105             }
1106         }
1107         /* is this code point assigned, or do we use fallbacks? */
1108         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1109             /* assigned */
1110             *value=myValue;
1111             return length;
1112         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1113             /*
1114              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1115              * There is no way with this data structure for fallback output
1116              * to be a zero byte.
1117              */
1118             *value=myValue;
1119             return -length;
1120         }
1121     }
1122
1123     cx=sharedData->mbcs.extIndexes;
1124     if(cx!=NULL) {
1125         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1126     }
1127
1128     /* unassigned */
1129     return 0;
1130 }
1131
1132 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1133  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1134  * @param retval pointer to output byte
1135  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1136  */
1137 static inline int32_t
1138 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1139                                        UChar32 c,
1140                                        uint32_t* retval,
1141                                        UBool useFallback)
1142 {
1143     const uint16_t *table;
1144     int32_t value;
1145     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1146     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1147         return 0;
1148     }
1149     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1150     table=sharedData->mbcs.fromUnicodeTable;
1151     /* get the byte for the output */
1152     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1153     /* is this code point assigned, or do we use fallbacks? */
1154     *retval=(uint32_t)(value&0xff);
1155     if(value>=0xf00) {
1156         return 1;  /* roundtrip */
1157     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1158         return -1;  /* fallback taken */
1159     } else {
1160         return 0;  /* no mapping */
1161     }
1162 }
1163
/*
 * Converts a strict EUC DBCS value (two bytes, each in A1..FE) to the
 * ISO 2022 byte range 21..7E by subtracting 0x80 from each byte.
 * The range checks look at the low 16 bits (both bytes) and the low 8 bits
 * (trail byte) of the value.
 * Returns 0 if the value is out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairDelta = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(value - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1180
1181 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1182 /*
1183  * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1184  * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1185  * unchanged.
1186  */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* candidate result: both bytes moved up by 0x80 */
    const uint32_t shifted = value + 0x8080;
    const int pairInRange = (uint16_t)(shifted - 0xa1a1) <= (0xfefe - 0xa1a1);
    const int trailInRange = (uint8_t)(shifted - 0xa1) <= (0xfe - 0xa1);

    /* hand back the input unchanged unless both bytes land in A1..FE */
    return (pairInRange && trailInRange) ? shifted : value;
}
1197 #endif
1198
1199 #ifdef U_ENABLE_GENERIC_ISO_2022
1200
1201 /**********************************************************************************
1202 *  ISO-2022 Converter
1203 *
1204 *
1205 */
1206
1207 static void
1208 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1209                                                            UErrorCode* err){
1210     const char* mySourceLimit, *realSourceLimit;
1211     const char* sourceStart;
1212     const UChar* myTargetStart;
1213     UConverter* saveThis;
1214     UConverterDataISO2022* myData;
1215     int8_t length;
1216
1217     saveThis = args->converter;
1218     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1219
1220     realSourceLimit = args->sourceLimit;
1221     while (args->source < realSourceLimit) {
1222         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1223             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1224             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1225
1226             if(args->source < mySourceLimit) {
1227                 if(myData->currentConverter==NULL) {
1228                     myData->currentConverter = ucnv_open("ASCII",err);
1229                     if(U_FAILURE(*err)){
1230                         return;
1231                     }
1232
1233                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1234                     saveThis->mode = UCNV_SO;
1235                 }
1236
1237                 /* convert to before the ESC or until the end of the buffer */
1238                 myData->isFirstBuffer=FALSE;
1239                 sourceStart = args->source;
1240                 myTargetStart = args->target;
1241                 args->converter = myData->currentConverter;
1242                 ucnv_toUnicode(args->converter,
1243                     &args->target,
1244                     args->targetLimit,
1245                     &args->source,
1246                     mySourceLimit,
1247                     args->offsets,
1248                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
1249                     err);
1250                 args->converter = saveThis;
1251
1252                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1253                     /* move the overflow buffer */
1254                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1255                     myData->currentConverter->UCharErrorBufferLength = 0;
1256                     if(length > 0) {
1257                         uprv_memcpy(saveThis->UCharErrorBuffer,
1258                                     myData->currentConverter->UCharErrorBuffer,
1259                                     length*U_SIZEOF_UCHAR);
1260                     }
1261                     return;
1262                 }
1263
1264                 /*
1265                  * At least one of:
1266                  * -Error while converting
1267                  * -Done with entire buffer
1268                  * -Need to write offsets or update the current offset
1269                  *  (leave that up to the code in ucnv.c)
1270                  *
1271                  * or else we just stopped at an ESC byte and continue with changeState_2022()
1272                  */
1273                 if (U_FAILURE(*err) ||
1274                     (args->source == realSourceLimit) ||
1275                     (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1276                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1277                 ) {
1278                     /* copy partial or error input for truncated detection and error handling */
1279                     if(U_FAILURE(*err)) {
1280                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1281                         if(length > 0) {
1282                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1283                         }
1284                     } else {
1285                         length = saveThis->toULength = myData->currentConverter->toULength;
1286                         if(length > 0) {
1287                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1288                             if(args->source < mySourceLimit) {
1289                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1290                             }
1291                         }
1292                     }
1293                     return;
1294                 }
1295             }
1296         }
1297
1298         sourceStart = args->source;
1299         changeState_2022(args->converter,
1300                &(args->source),
1301                realSourceLimit,
1302                ISO_2022,
1303                err);
1304         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1305             /* let the ucnv.c code update its current offset */
1306             return;
1307         }
1308     }
1309 }
1310
1311 #endif
1312
1313 /*
1314  * To Unicode Callback helper function
1315  */
1316 static void
1317 toUnicodeCallback(UConverter *cnv,
1318                   const uint32_t sourceChar, const uint32_t targetUniChar,
1319                   UErrorCode* err){
1320     if(sourceChar>0xff){
1321         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1322         cnv->toUBytes[1] = (uint8_t)sourceChar;
1323         cnv->toULength = 2;
1324     }
1325     else{
1326         cnv->toUBytes[0] =(char) sourceChar;
1327         cnv->toULength = 1;
1328     }
1329
1330     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1331         *err = U_INVALID_CHAR_FOUND;
1332     }
1333     else{
1334         *err = U_ILLEGAL_CHAR_FOUND;
1335     }
1336 }
1337
1338 /**************************************ISO-2022-JP*************************************************/
1339
1340 /************************************** IMPORTANT **************************************************
1341 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1342 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1343 * The converter iterates over each Unicode codepoint
1344 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1345 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1346 * would do as far as possible.
1347 *
1348 * If the implementation of these macros or structure of sharedData struct change in the future, make
1349 * sure that ISO-2022 is also changed.
1350 ***************************************************************************************************
1351 */
1352
1353 /***************************************************************************************************
1354 * Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
1357 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1358 *       JIS-Roman character escape sequence should follow before the line terminates
1359 * (iii) If the first character on the line is represented by two bytes then a two
1360 *       byte character escape sequence should precede it
1361 * (iv)  If no escape sequence is encountered then the characters are ASCII
1362 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1363 *       and invoked with SS2 (ESC N).
1364 * (vi)  If there is any G0 designation in text, there must be a switch to
1365 *       ASCII or to JIS X 0201-Roman before a space character (but not
1366 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1367 *       characters such as tab or CRLF.
1368 * (vii) Supported encodings:
1369 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1370 *
1371 *  source : RFC-1554
1372 *
1373 *          JISX201, JISX208,JISX212 : new .cnv data files created
1374 *          KSC5601 : alias to ibm-949 mapping table
1375 *          GB2312 : alias to ibm-1386 mapping table
1376 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1377 *          ISO-8859-7 : alias to ibm-9409 mapping table
1378 */
1379
1380 /*
 * Preference order of JP charsets.
 *
 * The fromUnicode code walks this array in index order when it appends the
 * charsets that are still permitted by the version mask (and not already
 * listed) to its choices[] array, so earlier entries are tried first.
 */
1381 static const StateEnum jpCharsetPref[]={
1382     ASCII,
1383     JISX201,
1384     ISO8859_1,
1385     ISO8859_7,
1386     JISX208,
1387     JISX212,
1388     GB2312,
1389     KSC5601,
1390     HWKANA_7BIT
1391 };
1392
1393 /*
1394  * The escape sequences must be in order of the enum constants like JISX201  = 3,
1395  * not in order of jpCharsetPref[]!
 *
 * Each row holds the designation escape sequence for one charset and is
 * indexed directly with the charset value (escSeqChars[cs]) when the
 * fromUnicode code has to switch charsets. The number of bytes to copy for
 * a row comes from the parallel escSeqCharsLen[] table.
1396  */
1397 static const char escSeqChars[][6] ={
1398     "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
1399     "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
1400     "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
1401     "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
1402     "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
1403     "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
1404     "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
1405     "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
1406     "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
1407
1408 };
/*
 * Byte lengths of the escape sequences in escSeqChars[], row for row in the
 * same order. The fromUnicode code reads escSeqCharsLen[cs] to know how many
 * bytes of escSeqChars[cs] to copy into its output buffer.
 */
1409 static  const int8_t escSeqCharsLen[] ={
1410     3, /* length of <ESC>(B  ASCII       */
1411     3, /* length of <ESC>.A  ISO-8859-1  */
1412     3, /* length of <ESC>.F  ISO-8859-7  */
1413     3, /* length of <ESC>(J  JISX-201    */
1414     3, /* length of <ESC>$B  JISX-208    */
1415     4, /* length of <ESC>$(D JISX-212    */
1416     3, /* length of <ESC>$A  GB2312      */
1417     4, /* length of <ESC>$(C KSC5601     */
1418     3  /* length of <ESC>(I  HWKANA_7BIT */
1419 };
1420
1421 /*
1422 * The iteration over various code pages works this way:
1423 * i)   Get the currentState from myConverterData->currentState
1424 * ii)  Check if the character is mapped to a valid character in the currentState
1425 *      Yes ->  a) set the initIterState to currentState
1426 *       b) remain in this state until an invalid character is found
1427 *      No  ->  a) go to the next code page and find the character
1428 * iii) Before changing the state, increment the current state and check if the current state
1429 *      is equal to the initIterState
1430 *      Yes ->  A character that cannot be represented in any of the supported encodings:
1431 *       break and return a U_INVALID_CHAR_FOUND error
1432 *      No  ->  Continue and find the character in next code page
1433 *
1434 *
1435 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1436 */
1437
/*
 * JIS X 0201 (Roman half) to Unicode for byte values 00..7F.
 * The set matches ASCII except for two positions: 5C becomes U+00A5 and
 * 7E becomes U+203E. Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1451
/*
 * Unicode to JIS X 0201 (Roman half), result in 00..7F.
 * U+00A5 and U+203E take the 5C and 7E positions; U+005C and U+007E
 * themselves therefore have no mapping. Returns 0xfffe (U+FFFE) for
 * anything that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two ASCII positions are occupied by other characters */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1466
/*
 * Map a valid Shift-JIS double-byte value onto the corresponding pair of
 * JIS X 0208 bytes (each in 21..7E), returned as (lead << 8) | trail.
 * Values above 0xEFFC lie outside JIS X 0208 and yield 0.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    const uint8_t trail = (uint8_t)value;
    const uint32_t lead = value & 0xff00;

    /* lead bytes up to 9F and lead bytes E0..EF use different offsets */
    uint32_t result = (lead - (lead <= 0x9f00 ? 0x7000 : 0xb000)) << 1;

    if(trail > 0x9e) {
        /* trail 9F..FC: second row of the pair */
        return result | (trail - 0x7e);
    }

    /* trail up to 9E: first row of the pair; Shift-JIS skips 7F */
    result -= 0x100;
    return result | (trail - (trail <= 0x7e ? 0x1f : 0x20));
}
1502
/*
 * Turn a pair of JIS X 0208 bytes (expected range 21..7E each) into the
 * equivalent Shift-JIS byte pair, written to bytes[0] and bytes[1].
 * Out-of-range input is deliberately mapped to bytes that are not valid
 * Shift-JIS (explicitly 0 where needed) so that the table lookup that
 * follows rejects it; some invalid inputs already produce invalid
 * Shift-JIS bytes without an explicit check.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t sjisLead;
    uint8_t sjisTrail;

    if((c1 & 1) != 0) {
        /* odd row: round the lead up, trail goes to 40..7E / 80..9E */
        c1 = (uint8_t)(c1 + 1);
        if(c2 > 0x7e) {
            sjisTrail = 0;  /* invalid */
        } else {
            sjisTrail = (uint8_t)(c2 + (c2 <= 0x5f ? 0x1f : 0x20));
        }
    } else {
        /* even row: trail goes to 9F..FC */
        sjisTrail = ((uint8_t)(c2 - 0x21) <= (0x7e - 0x21)) ? (uint8_t)(c2 + 0x7e) : 0;
    }

    const uint8_t half = (uint8_t)(c1 >> 1);
    if(half > 0x3f) {
        sjisLead = 0;  /* invalid */
    } else {
        sjisLead = (uint8_t)(half + (half <= 0x2f ? 0x70 : 0xb0));
    }

    bytes[0] = (char)sjisLead;
    bytes[1] = (char)sjisTrail;
}
1539
1540 /*
1541  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1542  * Katakana.
1543  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1544  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1545  * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed with (code point - HWKANA_START); the first and last
 * entries are marked U+FF61 and U+FF9F below. Each entry is a two-byte
 * JIS X 0208 value that the fromUnicode code uses as a fallback result
 * (len = -2) when the JISX208 lookup yields no double-byte mapping and
 * fallbacks are still enabled.
1546  */
1547 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1548     0x2123,  /* U+FF61 */
1549     0x2156,
1550     0x2157,
1551     0x2122,
1552     0x2126,
1553     0x2572,
1554     0x2521,
1555     0x2523,
1556     0x2525,
1557     0x2527,
1558     0x2529,
1559     0x2563,
1560     0x2565,
1561     0x2567,
1562     0x2543,
1563     0x213C,  /* U+FF70 */
1564     0x2522,
1565     0x2524,
1566     0x2526,
1567     0x2528,
1568     0x252A,
1569     0x252B,
1570     0x252D,
1571     0x252F,
1572     0x2531,
1573     0x2533,
1574     0x2535,
1575     0x2537,
1576     0x2539,
1577     0x253B,
1578     0x253D,
1579     0x253F,  /* U+FF80 */
1580     0x2541,
1581     0x2544,
1582     0x2546,
1583     0x2548,
1584     0x254A,
1585     0x254B,
1586     0x254C,
1587     0x254D,
1588     0x254E,
1589     0x254F,
1590     0x2552,
1591     0x2555,
1592     0x2558,
1593     0x255B,
1594     0x255E,
1595     0x255F,  /* U+FF90 */
1596     0x2560,
1597     0x2561,
1598     0x2562,
1599     0x2564,
1600     0x2566,
1601     0x2568,
1602     0x2569,
1603     0x256A,
1604     0x256B,
1605     0x256C,
1606     0x256D,
1607     0x256F,
1608     0x2573,
1609     0x212B,
1610     0x212C   /* U+FF9F */
1611 };
1612
/*
 * fromUnicode implementation for the ISO-2022-JP variants, with optional offsets.
 *
 * Reads UTF-16 code units from args->source up to args->sourceLimit and writes
 * bytes to args->target up to args->targetLimit. For every code point it
 *   1. assembles surrogate pairs (a lead surrogate at the end of the input is
 *      kept in cnv->fromUChar32 and resumed on the next call),
 *   2. rejects code points matched by IS_2022_CONTROL (SO/SI/ESC),
 *   3. tries the candidate charsets in choices[]: HWKANA_7BIT first for
 *      versions 3 and 4, then the current G0 charset, the current G2 charset
 *      if any, then the remaining charsets allowed by
 *      jpCharsetMasks[version] in jpCharsetPref[] order,
 *   4. writes SI, a designation escape sequence, and SO or ESC N as required,
 *      followed by one or two character bytes.
 *
 * Errors reported through *err:
 *   U_ILLEGAL_CHAR_FOUND     unpaired surrogate or SO/SI/ESC in the input
 *   U_INVALID_CHAR_FOUND     no candidate charset can represent the code point
 *   U_BUFFER_OVERFLOW_ERROR  the target is full while input remains
 * For the first two, the offending code point is left in cnv->fromUChar32.
 *
 * When args->flush is set, all input is consumed and no code point is
 * pending, the output is returned to ASCII (SI and/or <ESC>(B as needed).
 *
 * args->source and args->target are updated before returning.
 */
1613 static void
1614 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1615     UConverter *cnv = args->converter;
1616     UConverterDataISO2022 *converterData;
1617     ISO2022State *pFromU2022State;
1618     uint8_t *target = (uint8_t *) args->target;
1619     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1620     const UChar* source = args->source;
1621     const UChar* sourceLimit = args->sourceLimit;
1622     int32_t* offsets = args->offsets;
1623     UChar32 sourceChar;
1624     char buffer[8];
1625     int32_t len, outLen;
1626     int8_t choices[10];
1627     int32_t choiceCount;
1628     uint32_t targetValue = 0;
1629     UBool useFallback;
1630
1631     int32_t i;
1632     int8_t cs, g;
1633
1634     /* set up the state */
1635     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1636     pFromU2022State   = &converterData->fromU2022State;
1637
         /* choices[] starts out empty and is built lazily inside the loop */
1638     choiceCount = 0;
1639
1640     /* check if the last codepoint of previous buffer was a lead surrogate*/
1641     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1642         goto getTrail;
1643     }
1644
1645     while(source < sourceLimit) {
1646         if(target < targetLimit) {
1647
1648             sourceChar  = *(source++);
1649             /*check if the char is a First surrogate*/
1650             if(U16_IS_SURROGATE(sourceChar)) {
1651                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1652 getTrail:
1653                     /*look ahead to find the trail surrogate*/
1654                     if(source < sourceLimit) {
1655                         /* test the following code unit */
1656                         UChar trail=(UChar) *source;
1657                         if(U16_IS_TRAIL(trail)) {
1658                             source++;
1659                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1660                             cnv->fromUChar32=0x00;
1661                             /* convert this supplementary code point */
1662                             /* exit this condition tree */
1663                         } else {
1664                             /* this is an unmatched lead code unit (1st surrogate) */
1665                             /* callback(illegal) */
1666                             *err=U_ILLEGAL_CHAR_FOUND;
1667                             cnv->fromUChar32=sourceChar;
1668                             break;
1669                         }
1670                     } else {
1671                         /* no more input */
1672                         cnv->fromUChar32=sourceChar;
1673                         break;
1674                     }
1675                 } else {
1676                     /* this is an unmatched trail code unit (2nd surrogate) */
1677                     /* callback(illegal) */
1678                     *err=U_ILLEGAL_CHAR_FOUND;
1679                     cnv->fromUChar32=sourceChar;
1680                     break;
1681                 }
1682             }
1683
1684             /* do not convert SO/SI/ESC */
1685             if(IS_2022_CONTROL(sourceChar)) {
1686                 /* callback(illegal) */
1687                 *err=U_ILLEGAL_CHAR_FOUND;
1688                 cnv->fromUChar32=sourceChar;
1689                 break;
1690             }
1691
1692             /* do the conversion */
1693
                 /*
                  * choices[] is (re)built only when it is empty: on the first
                  * character, after a designation change, and after CR/LF
                  * (those places reset choiceCount to 0 further below).
                  */
1694             if(choiceCount == 0) {
1695                 uint16_t csm;
1696
1697                 /*
1698                  * The csm variable keeps track of which charsets are allowed
1699                  * and not used yet while building the choices[].
1700                  */
1701                 csm = jpCharsetMasks[converterData->version];
1702                 choiceCount = 0;
1703
1704                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1705                 if(converterData->version == 3 || converterData->version == 4) {
1706                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1707                 }
1708                 /* Do not try single-byte half-width Katakana for other versions. */
1709                 csm &= ~CSM(HWKANA_7BIT);
1710
1711                 /* try the current G0 charset */
1712                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1713                 csm &= ~CSM(cs);
1714
1715                 /* try the current G2 charset */
1716                 if((cs = pFromU2022State->cs[2]) != 0) {
1717                     choices[choiceCount++] = cs;
1718                     csm &= ~CSM(cs);
1719                 }
1720
1721                 /* try all the other possible charsets */
1722                 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1723                     cs = (int8_t)jpCharsetPref[i];
1724                     if(CSM(cs) & csm) {
1725                         choices[choiceCount++] = cs;
1726                         csm &= ~CSM(cs);
1727                     }
1728                 }
1729             }
1730
1731             cs = g = 0;
1732             /*
1733              * len==0: no mapping found yet
1734              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1735              * len>0: found a roundtrip result, done
1736              */
1737             len = 0;
1738             /*
1739              * We will turn off useFallback after finding a fallback,
1740              * but we still get fallbacks from PUA code points as usual.
1741              * Therefore, we will also need to check that we don't overwrite
1742              * an early fallback with a later one.
1743              */
1744             useFallback = cnv->useFallback;
1745
                 /* walk the candidates in order; stop at the first roundtrip result (len > 0) */
1746             for(i = 0; i < choiceCount && len <= 0; ++i) {
1747                 uint32_t value;
1748                 int32_t len2;
1749                 int8_t cs0 = choices[i];
1750                 switch(cs0) {
1751                 case ASCII:
1752                     if(sourceChar <= 0x7f) {
1753                         targetValue = (uint32_t)sourceChar;
1754                         len = 1;
1755                         cs = cs0;
1756                         g = 0;
1757                     }
1758                     break;
1759                 case ISO8859_1:
1760                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1761                         targetValue = (uint32_t)sourceChar - 0x80;
1762                         len = 1;
1763                         cs = cs0;
1764                         g = 2;
1765                     }
1766                     break;
1767                 case HWKANA_7BIT:
1768                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1769                         if(converterData->version==3) {
1770                             /* JIS7: use G1 (SO) */
1771                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1772                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1773                             len = 1;
1774                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1775                             g = 1;
1776                         } else if(converterData->version==4) {
1777                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1778                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1779                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1780                             len = 1;
1781
1782                             cs = pFromU2022State->cs[0];
1783                             if(IS_JP_DBCS(cs)) {
1784                                 /* switch from a DBCS charset to JISX201 */
1785                                 cs = (int8_t)JISX201;
1786                             }
1787                             /* else stay in the current G0 charset */
1788                             g = 0;
1789                         }
1790                         /* else do not use HWKANA_7BIT with other versions */
1791                     }
1792                     break;
1793                 case JISX201:
1794                     /* G0 SBCS */
1795                     value = jisx201FromU(sourceChar);
1796                     if(value <= 0x7f) {
1797                         targetValue = value;
1798                         len = 1;
1799                         cs = cs0;
1800                         g = 0;
1801                         useFallback = FALSE;
1802                     }
1803                     break;
1804                 case JISX208:
1805                     /* G0 DBCS from Shift-JIS table */
1806                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1807                                 converterData->myConverterArray[cs0],
1808                                 sourceChar, &value,
1809                                 useFallback, MBCS_OUTPUT_2);
1810                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1811                         value = _2022FromSJIS(value);
1812                         if(value != 0) {
1813                             targetValue = value;
1814                             len = len2;
1815                             cs = cs0;
1816                             g = 0;
1817                             useFallback = FALSE;
1818                         }
1819                     } else if(len == 0 && useFallback &&
1820                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                             /* hardcoded half-width to full-width Katakana fallback */
1821                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1822                         len = -2;
1823                         cs = cs0;
1824                         g = 0;
1825                         useFallback = FALSE;
1826                     }
1827                     break;
1828                 case ISO8859_7:
1829                     /* G0 SBCS forced to 7-bit output */
1830                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1831                                 converterData->myConverterArray[cs0],
1832                                 sourceChar, &value,
1833                                 useFallback);
1834                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1835                         targetValue = value - 0x80;
1836                         len = len2;
1837                         cs = cs0;
1838                         g = 2;
1839                         useFallback = FALSE;
1840                     }
1841                     break;
1842                 default:
1843                     /* G0 DBCS */
1844                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1845                                 converterData->myConverterArray[cs0],
1846                                 sourceChar, &value,
1847                                 useFallback, MBCS_OUTPUT_2);
1848                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1849                         if(cs0 == KSC5601) {
1850                             /*
1851                              * Check for valid bytes for the encoding scheme.
1852                              * This is necessary because the sub-converter (windows-949)
1853                              * has a broader encoding scheme than is valid for 2022.
1854                              */
1855                             value = _2022FromGR94DBCS(value);
1856                             if(value == 0) {
1857                                 break;
1858                             }
1859                         }
1860                         targetValue = value;
1861                         len = len2;
1862                         cs = cs0;
1863                         g = 0;
1864                         useFallback = FALSE;
1865                     }
1866                     break;
1867                 }
1868             }
1869
1870             if(len != 0) {
1871                 if(len < 0) {
1872                     len = -len;  /* fallback */
1873                 }
1874                 outLen = 0; /* count output bytes */
1875
1876                 /* write SI if necessary (only for JIS7) */
1877                 if(pFromU2022State->g == 1 && g == 0) {
1878                     buffer[outLen++] = UCNV_SI;
1879                     pFromU2022State->g = 0;
1880                 }
1881
1882                 /* write the designation sequence if necessary */
1883                 if(cs != pFromU2022State->cs[g]) {
1884                     int32_t escLen = escSeqCharsLen[cs];
1885                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1886                     outLen += escLen;
1887                     pFromU2022State->cs[g] = cs;
1888
1889                     /* invalidate the choices[] */
1890                     choiceCount = 0;
1891                 }
1892
1893                 /* write the shift sequence if necessary */
1894                 if(g != pFromU2022State->g) {
1895                     switch(g) {
1896                     /* case 0 handled before writing escapes */
1897                     case 1:
1898                         buffer[outLen++] = UCNV_SO;
1899                         pFromU2022State->g = 1;
1900                         break;
1901                     default: /* case 2 */
                             /* ESC N; the stored pFromU2022State->g is not changed on this path */
1902                         buffer[outLen++] = 0x1b;
1903                         buffer[outLen++] = 0x4e;
1904                         break;
1905                     /* no case 3: no SS3 in ISO-2022-JP-x */
1906                     }
1907                 }
1908
1909                 /* write the output bytes */
1910                 if(len == 1) {
1911                     buffer[outLen++] = (char)targetValue;
1912                 } else /* len == 2 */ {
1913                     buffer[outLen++] = (char)(targetValue >> 8);
1914                     buffer[outLen++] = (char)targetValue;
1915                 }
1916             } else {
1917                 /*
1918                  * if we cannot find the character after checking all codepages
1919                  * then this is an error
1920                  */
1921                 *err = U_INVALID_CHAR_FOUND;
1922                 cnv->fromUChar32=sourceChar;
1923                 break;
1924             }
1925
1926             if(sourceChar == CR || sourceChar == LF) {
1927                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1928                 pFromU2022State->cs[2] = 0;
1929                 choiceCount = 0;
1930             }
1931
1932             /* output outLen>0 bytes in buffer[] */
                 /* one byte always fits: target < targetLimit was checked at the top of this iteration */
1933             if(outLen == 1) {
1934                 *target++ = buffer[0];
1935                 if(offsets) {
1936                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1937                 }
1938             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1939                 *target++ = buffer[0];
1940                 *target++ = buffer[1];
1941                 if(offsets) {
1942                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1943                     *offsets++ = sourceIndex;
1944                     *offsets++ = sourceIndex;
1945                 }
1946             } else {
                     /*
                      * Longer output, or two bytes that do not fit: hand the whole buffer to
                      * fromUWriteUInt8(). NOTE(review): presumably it keeps bytes that do not
                      * fit in the converter and sets *err accordingly - confirm in its definition.
                      */
1947                 fromUWriteUInt8(
1948                     cnv,
1949                     buffer, outLen,
1950                     &target, (const char *)targetLimit,
1951                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1952                     err);
1953                 if(U_FAILURE(*err)) {
1954                     break;
1955                 }
1956             }
1957         } /* end if(myTargetIndex<myTargetLength) */
1958         else{
1959             *err =U_BUFFER_OVERFLOW_ERROR;
1960             break;
1961         }
1962
1963     }/* end while(mySourceIndex<mySourceLength) */
1964
1965     /*
1966      * the end of the input stream and detection of truncated input
1967      * are handled by the framework, but for ISO-2022-JP conversion
1968      * we need to be in ASCII mode at the very end
1969      *
1970      * conditions:
1971      *   successful
1972      *   in SO mode or not in ASCII mode
1973      *   end of input and no truncated input
1974      */
1975     if( U_SUCCESS(*err) &&
1976         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1977         args->flush && source>=sourceLimit && cnv->fromUChar32==0
1978     ) {
1979         int32_t sourceIndex;
1980
1981         outLen = 0;
1982
1983         if(pFromU2022State->g != 0) {
1984             buffer[outLen++] = UCNV_SI;
1985             pFromU2022State->g = 0;
1986         }
1987
1988         if(pFromU2022State->cs[0] != ASCII) {
1989             int32_t escLen = escSeqCharsLen[ASCII];
1990             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1991             outLen += escLen;
1992             pFromU2022State->cs[0] = (int8_t)ASCII;
1993         }
1994
1995         /* get the source index of the last input character */
1996         /*
1997          * TODO this would be simpler and more reliable if we used a pair
1998          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1999          * so that we could simply use the prevSourceIndex here;
2000          * this code gives an incorrect result for the rare case of an unmatched
2001          * trail surrogate that is alone in the last buffer of the text stream
2002          */
2003         sourceIndex=(int32_t)(source-args->source);
2004         if(sourceIndex>0) {
2005             --sourceIndex;
2006             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2007                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2008             ) {
2009                 --sourceIndex;
2010             }
2011         } else {
                 /* nothing was consumed from this buffer */
2012             sourceIndex=-1;
2013         }
2014
2015         fromUWriteUInt8(
2016             cnv,
2017             buffer, outLen,
2018             &target, (const char *)targetLimit,
2019             &offsets, sourceIndex,
2020             err);
2021     }
2022
2023     /*save the state and return */
2024     args->source = source;
2025     args->target = (char*)target;
2026 }
2027
2028 /*************** to unicode *******************/
2029
2030 static void
2031 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2032                                                UErrorCode* err){
2033     char tempBuf[2];
2034     const char *mySource = (char *) args->source;
2035     UChar *myTarget = args->target;
2036     const char *mySourceLimit = args->sourceLimit;
2037     uint32_t targetUniChar = 0x0000;
2038     uint32_t mySourceChar = 0x0000;
2039     uint32_t tmpSourceChar = 0x0000;
2040     UConverterDataISO2022* myData;
2041     ISO2022State *pToU2022State;
2042     StateEnum cs;
2043
2044     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2045     pToU2022State = &myData->toU2022State;
2046
2047     if(myData->key != 0) {
2048         /* continue with a partial escape sequence */
2049         goto escape;
2050     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2051         /* continue with a partial double-byte character */
2052         mySourceChar = args->converter->toUBytes[0];
2053         args->converter->toULength = 0;
2054         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2055         targetUniChar = missingCharMarker;
2056         goto getTrailByte;
2057     }
2058
2059     while(mySource < mySourceLimit){
2060
2061         targetUniChar =missingCharMarker;
2062
2063         if(myTarget < args->targetLimit){
2064
2065             mySourceChar= (unsigned char) *mySource++;
2066
2067             switch(mySourceChar) {
2068             case UCNV_SI:
2069                 if(myData->version==3) {
2070                     pToU2022State->g=0;
2071                     continue;
2072                 } else {
2073                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2074                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2075                     break;
2076                 }
2077
2078             case UCNV_SO:
2079                 if(myData->version==3) {
2080                     /* JIS7: switch to G1 half-width Katakana */
2081                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2082                     pToU2022State->g=1;
2083                     continue;
2084                 } else {
2085                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2086                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2087                     break;
2088                 }
2089
2090             case ESC_2022:
2091                 mySource--;
2092 escape:
2093                 {
2094                     const char * mySourceBefore = mySource;
2095                     int8_t toULengthBefore = args->converter->toULength;
2096
2097                     changeState_2022(args->converter,&(mySource),
2098                         mySourceLimit, ISO_2022_JP,err);
2099
2100                     /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2101                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2102                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2103                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2104                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2105                     }
2106                 }
2107
2108                 /* invalid or illegal escape sequence */
2109                 if(U_FAILURE(*err)){
2110                     args->target = myTarget;
2111                     args->source = mySource;
2112                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2113                     return;
2114                 }
2115                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2116                 if(myData->key==0) {
2117                     myData->isEmptySegment = TRUE;
2118                 }
2119                 continue;
2120
2121             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2122
2123             case CR:
2124                 /*falls through*/
2125             case LF:
2126                 /* automatically reset to single-byte mode */
2127                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2128                     pToU2022State->cs[0] = (int8_t)ASCII;
2129                 }
2130                 pToU2022State->cs[2] = 0;
2131                 pToU2022State->g = 0;
2132                 /* falls through */
2133             default:
2134                 /* convert one or two bytes */
2135                 myData->isEmptySegment = FALSE;
2136                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2137                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2138                     !IS_JP_DBCS(cs)
2139                 ) {
2140                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2141                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2142
2143                     /* return from a single-shift state to the previous one */
2144                     if(pToU2022State->g >= 2) {
2145                         pToU2022State->g=pToU2022State->prevG;
2146                     }
2147                 } else switch(cs) {
2148                 case ASCII:
2149                     if(mySourceChar <= 0x7f) {
2150                         targetUniChar = mySourceChar;
2151                     }
2152                     break;
2153                 case ISO8859_1:
2154                     if(mySourceChar <= 0x7f) {
2155                         targetUniChar = mySourceChar + 0x80;
2156                     }
2157                     /* return from a single-shift state to the previous one */
2158                     pToU2022State->g=pToU2022State->prevG;
2159                     break;
2160                 case ISO8859_7:
2161                     if(mySourceChar <= 0x7f) {
2162                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2163                         targetUniChar =
2164                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2165                                 myData->myConverterArray[cs],
2166                                 mySourceChar + 0x80);
2167                     }
2168                     /* return from a single-shift state to the previous one */
2169                     pToU2022State->g=pToU2022State->prevG;
2170                     break;
2171                 case JISX201:
2172                     if(mySourceChar <= 0x7f) {
2173                         targetUniChar = jisx201ToU(mySourceChar);
2174                     }
2175                     break;
2176                 case HWKANA_7BIT:
2177                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2178                         /* 7-bit halfwidth Katakana */
2179                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2180                     }
2181                     break;
2182                 default:
2183                     /* G0 DBCS */
2184                     if(mySource < mySourceLimit) {
2185                         int leadIsOk, trailIsOk;
2186                         uint8_t trailByte;
2187 getTrailByte:
2188                         trailByte = (uint8_t)*mySource;
2189                         /*
2190                          * Ticket 5691: consistent illegal sequences:
2191                          * - We include at least the first byte in the illegal sequence.
2192                          * - If any of the non-initial bytes could be the start of a character,
2193                          *   we stop the illegal sequence before the first one of those.
2194                          *
2195                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2196                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2197                          * Otherwise we convert or report the pair of bytes.
2198                          */
2199                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2200                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2201                         if (leadIsOk && trailIsOk) {
2202                             ++mySource;
2203                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2204                             if(cs == JISX208) {
2205                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2206                                 mySourceChar = tmpSourceChar;
2207                             } else {
2208                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2209                                 mySourceChar = tmpSourceChar;
2210                                 if (cs == KSC5601) {
2211                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2212                                 }
2213                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2214                                 tempBuf[1] = (char)(tmpSourceChar);
2215                             }
2216                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2217                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2218                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2219                             ++mySource;
2220                             /* add another bit so that the code below writes 2 bytes in case of error */
2221                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2222                         }
2223                     } else {
2224                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2225                         args->converter->toULength = 1;
2226                         goto endloop;
2227                     }
2228                 }  /* End of inner switch */
2229                 break;
2230             }  /* End of outer switch */
2231             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2232                 if(args->offsets){
2233                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2234                 }
2235                 *(myTarget++)=(UChar)targetUniChar;
2236             }
2237             else if(targetUniChar > missingCharMarker){
2238                 /* disassemble the surrogate pair and write to output*/
2239                 targetUniChar-=0x0010000;
2240                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2241                 if(args->offsets){
2242                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2243                 }
2244                 ++myTarget;
2245                 if(myTarget< args->targetLimit){
2246                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2247                     if(args->offsets){
2248                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2249                     }
2250                     ++myTarget;
2251                 }else{
2252                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2253                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2254                 }
2255
2256             }
2257             else{
2258                 /* Call the callback function*/
2259                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2260                 break;
2261             }
2262         }
2263         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2264             *err =U_BUFFER_OVERFLOW_ERROR;
2265             break;
2266         }
2267     }
2268 endloop:
2269     args->target = myTarget;
2270     args->source = mySource;
2271 }
2272
2273
2274 /***************************************************************
2275 *   Rules for ISO-2022-KR encoding
2276 *   i) The KSC5601 designator sequence should appear only once in a file,
2277 *      at the beginning of a line before any KSC5601 characters. This usually
2278 *      means that it appears by itself on the first line of the file
2279 *  ii) There are only 2 shifting sequences: SO to shift into double byte mode
2280 *      and SI to shift into single byte mode
2281 */
     /*
      * ISO-2022-KR fromUnicode variant that hands the whole conversion to
      * ucnv_MBCSFromUnicodeWithOffsets(), run on the sub-converter stored in
      * the ISO-2022 data (currentConverter) instead of the public converter.
      *
      * args: conversion arguments; args->converter is replaced by the
      *       sub-converter for the duration of the call and restored before
      *       returning.
      * err:  in/out error code, set by ucnv_MBCSFromUnicodeWithOffsets().
      *
      * The pending code point (fromUChar32) is copied into the sub-converter
      * before the call and copied back afterwards. On U_BUFFER_OVERFLOW_ERROR
      * the overflow bytes are moved from the sub-converter to the public one.
      */
2282 static void
2283 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2284
     /* remember the public converter so that it can be restored at the end */
2285     UConverter* saveConv = args->converter;
2286     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2287     args->converter=myConverterData->currentConverter;
2288
     /* carry the pending code point into the sub-converter and back out again */
2289     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2290     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2291     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2292
2293     if(*err == U_BUFFER_OVERFLOW_ERROR) {
     /* move the overflow bytes to the public converter and empty the sub-converter's buffer */
2294         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2295             uprv_memcpy(
2296                 saveConv->charErrorBuffer,
2297                 myConverterData->currentConverter->charErrorBuffer,
2298                 myConverterData->currentConverter->charErrorBufferLength);
2299         }
2300         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2301         myConverterData->currentConverter->charErrorBufferLength = 0;
2302     }
2303     args->converter=saveConv;
2304 }
2305
     /*
      * Converts UTF-16 text from args->source into ISO-2022-KR bytes in
      * args->target, optionally recording a source offset for every byte
      * written (when args->offsets is not NULL).
      *
      * args: in/out conversion arguments; source and target are advanced, and
      *       converter->fromUnicodeStatus is updated with the shift state.
      * err:  in/out error code. Set to U_BUFFER_OVERFLOW_ERROR when the target
      *       is full, U_ILLEGAL_CHAR_FOUND for SO/SI/ESC in the input or an
      *       unmatched surrogate, U_INVALID_CHAR_FOUND for a code point that
      *       has no acceptable mapping.
      *
      * Behaviour visible in this function:
      * - version 1 delegates to the _IBM variant.
      * - fromUnicodeStatus holds a boolean: non-zero while the output is in
      *   double-byte (after SO) mode. SO or SI is written whenever that mode
      *   changes between two characters.
      * - A mapping is accepted only if it is one byte <= 0x7f, or two bytes
      *   that are both in 0xa1..0xfe; two-byte results are written with 0x80
      *   subtracted from each byte.
      * - Bytes that do not fit into the target go to charErrorBuffer.
      * - On flush at the end of the input, while still in double-byte mode,
      *   SHIFT_IN_STR is written so that the stream ends in single-byte mode.
      */
2306 static void
2307 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2308
2309     const UChar *source = args->source;
2310     const UChar *sourceLimit = args->sourceLimit;
2311     unsigned char *target = (unsigned char *) args->target;
2312     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2313     int32_t* offsets = args->offsets;
2314     uint32_t targetByteUnit = 0x0000;
2315     UChar32 sourceChar = 0x0000;
2316     UBool isTargetByteDBCS;
2317     UBool oldIsTargetByteDBCS;
2318     UConverterDataISO2022 *converterData;
2319     UConverterSharedData* sharedData;
2320     UBool useFallback;
2321     int32_t length =0;
2322
2323     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2324     /* if the version is 1 then the user is requesting
2325      * conversion with ibm-25546 pass the arguments to
2326      * MBCS converter and return
2327      */
2328     if(converterData->version==1){
2329         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2330         return;
2331     }
2332
2333     /* initialize data */
2334     sharedData = converterData->currentConverter->sharedData;
2335     useFallback = args->converter->useFallback;
2336     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2337     oldIsTargetByteDBCS = isTargetByteDBCS;
2338
     /* NOTE(review): this repeats the assignment made three lines above */
2339     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
     /* a code point left pending by a previous call resumes at the surrogate handling below */
2340     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2341         goto getTrail;
2342     }
2343     while(source < sourceLimit){
2344
2345         targetByteUnit = missingCharMarker;
2346
2347         if(target < (unsigned char*) args->targetLimit){
2348             sourceChar = *source++;
2349
2350             /* do not convert SO/SI/ESC */
2351             if(IS_2022_CONTROL(sourceChar)) {
2352                 /* callback(illegal) */
2353                 *err=U_ILLEGAL_CHAR_FOUND;
2354                 args->converter->fromUChar32=sourceChar;
2355                 break;
2356             }
2357
2358             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2359             if(length < 0) {
2360                 length = -length;  /* fallback */
2361             }
2362             /* only DBCS or SBCS characters are expected*/
2363             /* DB characters with high bit set to 1 are expected */
     /* reject: no mapping, more than 2 bytes, a single byte above 0x7f, */
     /* or a byte pair in which either byte is outside 0xa1..0xfe        */
2364             if( length > 2 || length==0 ||
2365                 (length == 1 && targetByteUnit > 0x7f) ||
2366                 (length == 2 &&
2367                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2368                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2369             ) {
2370                 targetByteUnit=missingCharMarker;
2371             }
2372             if (targetByteUnit != missingCharMarker){
2373
2374                 oldIsTargetByteDBCS = isTargetByteDBCS;
2375                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2376                   /* append the shift sequence */
2377                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2378
     /* room for this one byte is guaranteed by the target check at the top of the loop */
2379                     if (isTargetByteDBCS)
2380                         *target++ = UCNV_SO;
2381                     else
2382                         *target++ = UCNV_SI;
2383                     if(offsets)
2384                         *(offsets++) = (int32_t)(source - args->source-1);
2385                 }
2386                 /* write the targetUniChar  to target */
2387                 if(targetByteUnit <= 0x00FF){
2388                     if( target < targetLimit){
2389                         *(target++) = (unsigned char) targetByteUnit;
2390                         if(offsets){
2391                             *(offsets++) = (int32_t)(source - args->source-1);
2392                         }
2393
2394                     }else{
2395                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2396                         *err = U_BUFFER_OVERFLOW_ERROR;
2397                     }
2398                 }else{
     /* double-byte output: each byte is written with 0x80 subtracted */
2399                     if(target < targetLimit){
2400                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2401                         if(offsets){
2402                             *(offsets++) = (int32_t)(source - args->source-1);
2403                         }
2404                         if(target < targetLimit){
2405                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2406                             if(offsets){
2407                                 *(offsets++) = (int32_t)(source - args->source-1);
2408                             }
2409                         }else{
2410                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2411                             *err = U_BUFFER_OVERFLOW_ERROR;
2412                         }
2413                     }else{
2414                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2415                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2416                         *err = U_BUFFER_OVERFLOW_ERROR;
2417                     }
2418                 }
2419
2420             }
2421             else{
2422                 /* oops.. the code point is unassigned
2423                  * set the error and reason
2424                  */
2425
2426                 /*check if the char is a First surrogate*/
2427                 if(U16_IS_SURROGATE(sourceChar)) {
2428                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2429 getTrail:
2430                         /*look ahead to find the trail surrogate*/
2431                         if(source <  sourceLimit) {
2432                             /* test the following code unit */
2433                             UChar trail=(UChar) *source;
2434                             if(U16_IS_TRAIL(trail)) {
2435                                 source++;
2436                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
     /* the assembled supplementary code point is reported without attempting a mapping */
2437                                 *err = U_INVALID_CHAR_FOUND;
2438                                 /* convert this surrogate code point */
2439                                 /* exit this condition tree */
2440                             } else {
2441                                 /* this is an unmatched lead code unit (1st surrogate) */
2442                                 /* callback(illegal) */
2443                                 *err=U_ILLEGAL_CHAR_FOUND;
2444                             }
2445                         } else {
2446                             /* no more input */
     /* the lead surrogate stays pending in fromUChar32 (stored below) */
2447                             *err = U_ZERO_ERROR;
2448                         }
2449                     } else {
2450                         /* this is an unmatched trail code unit (2nd surrogate) */
2451                         /* callback(illegal) */
2452                         *err=U_ILLEGAL_CHAR_FOUND;
2453                     }
2454                 } else {
2455                     /* callback(unassigned) for a BMP code point */
2456                     *err = U_INVALID_CHAR_FOUND;
2457                 }
2458
2459                 args->converter->fromUChar32=sourceChar;
2460                 break;
2461             }
2462         } /* end if(myTargetIndex<myTargetLength) */
2463         else{
2464             *err =U_BUFFER_OVERFLOW_ERROR;
2465             break;
2466         }
2467
2468     }/* end while(mySourceIndex<mySourceLength) */
2469
2470     /*
2471      * the end of the input stream and detection of truncated input
2472      * are handled by the framework, but for ISO-2022-KR conversion
2473      * we need to be in ASCII mode at the very end
2474      *
2475      * conditions:
2476      *   successful
2477      *   not in ASCII mode
2478      *   end of input and no truncated input
2479      */
2480     if( U_SUCCESS(*err) &&
2481         isTargetByteDBCS &&
2482         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2483     ) {
2484         int32_t sourceIndex;
2485
2486         /* we are switching to ASCII */
2487         isTargetByteDBCS=FALSE;
2488
2489         /* get the source index of the last input character */
2490         /*
2491          * TODO this would be simpler and more reliable if we used a pair
2492          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2493          * so that we could simply use the prevSourceIndex here;
2494          * this code gives an incorrect result for the rare case of an unmatched
2495          * trail surrogate that is alone in the last buffer of the text stream
2496          */
2497         sourceIndex=(int32_t)(source-args->source);
2498         if(sourceIndex>0) {
2499             --sourceIndex;
     /* step back once more when the last unit is a trail surrogate */
     /* that follows a lead surrogate or is the first unit in this buffer */
2500             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2501                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2502             ) {
2503                 --sourceIndex;
2504             }
2505         } else {
2506             sourceIndex=-1;
2507         }
2508
     /* presumably writes the byte to the target or, if it is full, to the overflow buffer -- confirm in fromUWriteUInt8() */
2509         fromUWriteUInt8(
2510             args->converter,
2511             SHIFT_IN_STR, 1,
2512             &target, (const char *)targetLimit,
2513             &offsets, sourceIndex,
2514             err);
2515     }
2516
2517     /*save the state and return */
2518     args->source = source;
2519     args->target = (char*)target;
2520     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2521 }
2522
2523 /************************ To Unicode ***************************************/
2524
     /*
      * ISO-2022-KR toUnicode variant that converts the input in segments with
      * ucnv_MBCSToUnicodeWithOffsets(), run on the sub-converter stored in the
      * ISO-2022 data (currentConverter).
      *
      * args: in/out conversion arguments of the public converter; source,
      *       target and offsets are advanced.
      * err:  in/out error code; the loop stops as soon as it indicates failure.
      *
      * Each iteration converts the bytes up to the limit returned by
      * getEndOfBuffer_2022() (per the comment below: the next escape sequence
      * or the end of the buffer) and then lets changeState_2022() process the
      * escape sequence. Partial input bytes (toUBytes) and overflow output
      * (UCharErrorBuffer) are moved between the public converter and the
      * sub-converter around each sub-conversion.
      */
2525 static void
2526 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2527                                                             UErrorCode* err){
2528     char const* sourceStart;
2529     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2530
2531     UConverterToUnicodeArgs subArgs;
2532     int32_t minArgsSize;
2533
2534     /* set up the subconverter arguments */
     /* copy no more than the smaller of the caller's declared size and this build's struct size */
2535     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2536         minArgsSize = args->size;
2537     } else {
2538         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2539     }
2540
2541     uprv_memcpy(&subArgs, args, minArgsSize);
2542     subArgs.size = (uint16_t)minArgsSize;
2543     subArgs.converter = myData->currentConverter;
2544
2545     /* remember the original start of the input for offsets */
2546     sourceStart = args->source;
2547
2548     if(myData->key != 0) {
2549         /* continue with a partial escape sequence */
2550         goto escape;
2551     }
2552
2553     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2554         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2555         subArgs.source = args->source;
2556         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2557         if(subArgs.source != subArgs.sourceLimit) {
2558             /*
2559              * get the current partial byte sequence
2560              *
2561              * it needs to be moved between the public and the subconverter
2562              * so that the conversion framework, which only sees the public
2563              * converter, can handle truncated and illegal input etc.
2564              */
2565             if(args->converter->toULength > 0) {
2566                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2567             }
2568             subArgs.converter->toULength = args->converter->toULength;
2569
2570             /*
2571              * Convert up to the end of the input, or to before the next escape character.
2572              * Does not handle conversion extensions because the preToU[] state etc.
2573              * is not copied.
2574              */
2575             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2576
2577             if(args->offsets != NULL && sourceStart != args->source) {
2578                 /* update offsets to base them on the actual start of the input */
     /* args->source still points at the start of this segment; negative offsets are left unchanged */
2579                 int32_t *offsets = args->offsets;
2580                 UChar *target = args->target;
2581                 int32_t delta = (int32_t)(args->source - sourceStart);
2582                 while(target < subArgs.target) {
2583                     if(*offsets >= 0) {
2584                         *offsets += delta;
2585                     }
2586                     ++offsets;
2587                     ++target;
2588                 }
2589             }
     /* publish the sub-converter's progress through the public arguments */
2590             args->source = subArgs.source;
2591             args->target = subArgs.target;
2592             args->offsets = subArgs.offsets;
2593
2594             /* copy input/error/overflow buffers */
2595             if(subArgs.converter->toULength > 0) {
2596                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2597             }
2598             args->converter->toULength = subArgs.converter->toULength;
2599
2600             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2601                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2602                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2603                                 subArgs.converter->UCharErrorBufferLength);
2604                 }
2605                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2606                 subArgs.converter->UCharErrorBufferLength = 0;
2607             }
2608         }
2609
2610         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2611             return;
2612         }
2613
2614 escape:
     /* also entered directly from the top when an escape sequence was left incomplete (myData->key != 0) */
2615         changeState_2022(args->converter,
2616                &(args->source),
2617                args->sourceLimit,
2618                ISO_2022_KR,
2619                err);
2620     }
2621 }
2622
     /*
      * Converts ISO-2022-KR bytes from args->source into UTF-16 in
      * args->target, optionally recording a source offset for every UChar
      * written (when args->offsets is not NULL).
      *
      * args: in/out conversion arguments; source and target are advanced.
      * err:  in/out error code. Set to U_BUFFER_OVERFLOW_ERROR when the target
      *       is full, U_ILLEGAL_ESCAPE_SEQUENCE for an SO immediately followed
      *       by SI (empty segment) or when changeState_2022() reports a
      *       failure, and to whatever toUnicodeCallback() sets for bytes that
      *       cannot be converted.
      *
      * Behaviour visible in this function:
      * - version 1 delegates to the _IBM variant.
      * - SI selects single-byte mode (toU2022State.g = 0), SO selects
      *   double-byte mode (g = 1), ESC is handed to changeState_2022().
      * - In double-byte mode two bytes in 0x21..0x7e are looked up with 0x80
      *   added to each; a lone lead byte at the end of the input is saved in
      *   toUBytes[0] and picked up again by the next call.
      * - In single-byte mode only bytes <= 0x7f are looked up.
      *
      * mySourceChar must be wider than 16 bits: for an illegal byte pair it
      * carries the marker bit 0x10000 so that the error callback reports both
      * consumed bytes even when the lead byte is 0x00.
      */
2623 static void
2624 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2625                                                             UErrorCode* err){
2626     char tempBuf[2];
2627     const char *mySource = ( char *) args->source;
2628     UChar *myTarget = args->target;
2629     const char *mySourceLimit = args->sourceLimit;
2630     UChar32 targetUniChar = 0x0000;
     /* 32-bit so that the 0x10000 "two illegal bytes" marker set below is not truncated */
2631     uint32_t mySourceChar = 0x0000;
2632     UConverterDataISO2022* myData;
2633     UConverterSharedData* sharedData ;
2634     UBool useFallback;
2635
2636     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2637     if(myData->version==1){
2638         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2639         return;
2640     }
2641
2642     /* initialize state */
2643     sharedData = myData->currentConverter->sharedData;
2644     useFallback = args->converter->useFallback;
2645
2646     if(myData->key != 0) {
2647         /* continue with a partial escape sequence */
2648         goto escape;
2649     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2650         /* continue with a partial double-byte character */
2651         mySourceChar = args->converter->toUBytes[0];
2652         args->converter->toULength = 0;
2653         goto getTrailByte;
2654     }
2655
2656     while(mySource< mySourceLimit){
2657
2658         if(myTarget < args->targetLimit){
2659
2660             mySourceChar= (unsigned char) *mySource++;
2661
2662             if(mySourceChar==UCNV_SI){
2663                 myData->toU2022State.g = 0;
2664                 if (myData->isEmptySegment) {
2665                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2666                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2667                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2668                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2669                     args->converter->toULength = 1;
2670                     args->target = myTarget;
2671                     args->source = mySource;
2672                     return;
2673                 }
2674                 /*consume the source */
2675                 continue;
2676             }else if(mySourceChar==UCNV_SO){
2677                 myData->toU2022State.g = 1;
2678                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2679                 /*consume the source */
2680                 continue;
2681             }else if(mySourceChar==ESC_2022){
     /* step back so that changeState_2022() sees the ESC byte itself */
2682                 mySource--;
2683 escape:
2684                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2685                 changeState_2022(args->converter,&(mySource),
2686                                 mySourceLimit, ISO_2022_KR, err);
2687                 if(U_FAILURE(*err)){
2688                     args->target = myTarget;
2689                     args->source = mySource;
2690                     return;
2691                 }
2692                 continue;
2693             }
2694
2695             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2696             if(myData->toU2022State.g == 1) {
2697                 if(mySource < mySourceLimit) {
2698                     int leadIsOk, trailIsOk;
2699                     uint8_t trailByte;
2700 getTrailByte:
2701                     targetUniChar = missingCharMarker;
2702                     trailByte = (uint8_t)*mySource;
2703                     /*
2704                      * Ticket 5691: consistent illegal sequences:
2705                      * - We include at least the first byte in the illegal sequence.
2706                      * - If any of the non-initial bytes could be the start of a character,
2707                      *   we stop the illegal sequence before the first one of those.
2708                      *
2709                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2710                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2711                      * Otherwise we convert or report the pair of bytes.
2712                      */
2713                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2714                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2715                     if (leadIsOk && trailIsOk) {
2716                         ++mySource;
     /* the lookup table is addressed with 0x80 added to each byte */
2717                         tempBuf[0] = (char)(mySourceChar + 0x80);
2718                         tempBuf[1] = (char)(trailByte + 0x80);
2719                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2720                         mySourceChar = (mySourceChar << 8) | trailByte;
2721                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2722                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2723                         ++mySource;
2724                         /* add another bit so that the code below writes 2 bytes in case of error */
2725                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2726                     }
2727                 } else {
     /* the lead byte is the last byte of this buffer: save it for the next call */
2728                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2729                     args->converter->toULength = 1;
2730                     break;
2731                 }
2732             }
2733             else if(mySourceChar <= 0x7f) {
2734                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2735             } else {
2736                 targetUniChar = 0xffff;
2737             }
2738             if(targetUniChar < 0xfffe){
2739                 if(args->offsets) {
     /* the offset points at the first source byte of the character just converted */
2740                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2741                 }
2742                 *(myTarget++)=(UChar)targetUniChar;
2743             }
2744             else {
2745                 /* Call the callback function*/
2746                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2747                 break;
2748             }
2749         }
2750         else{
2751             *err =U_BUFFER_OVERFLOW_ERROR;
2752             break;
2753         }
2754     }
2755     args->target = myTarget;
2756     args->source = mySource;
2757 }
2758
2759 /*************************** END ISO2022-KR *********************************/
2760
2761 /*************************** ISO-2022-CN *********************************
2762 *
2763 * Rules for ISO-2022-CN Encoding:
2764 * i)   The designator sequence must appear once on a line before any instance
2765 *      of character set it designates.
2766 * ii)  If two lines contain characters from the same character set, both lines
2767 *      must include the designator sequence.
2768 * iii) Once the designator sequence is known, a shifting sequence has to be found
2769 *      to invoke the  shifting
2770 * iv)  All lines start in ASCII and end in ASCII.
2771 * v)   Four shifting sequences are employed for this purpose:
2772 *
2773 *      Sequence    ASCII Eq    Charsets
2774 *      ----------  -------    ---------
2775 *      SI           <SI>        US-ASCII
2776 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2777 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2778 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2779 *
2780 * vi)
2781 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2782 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2783 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2784 *
2785 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2786 *       characters as defined in GB 2312-80, until
2787 *       another SOdesignation appears
2788 *
2789 *
2790 *      ESC $ ) E       Indicates the bytes following SO are as defined
2791 *       in ISO-IR-165 (for details, see section 2.1),
2792 *       until another SOdesignation appears
2793 *
2794 *      ESC $ ) G       Indicates the bytes following SO are as defined
2795 *       in CNS 11643-plane-1, until another
2796 *       SOdesignation appears
2797 *
2798 *      ESC $ * H       Indicates the two bytes immediately following
2799 *       SS2 is a Chinese character as defined in CNS
2800 *       11643-plane-2, until another SS2designation
2801 *       appears
2802 *       (Meaning <ESC>N must precede every 2 byte
2803 *        sequence.)
2804 *
2805 *      ESC $ + I       Indicates the immediate two bytes following SS3
2806 *       is a Chinese character as defined in CNS
2807 *       11643-plane-3, until another SS3designation
2808 *       appears
2809 *       (Meaning <ESC>O must precede every 2 byte
2810 *        sequence.)
2811 *
2812 *      ESC $ + J       Indicates the immediate two bytes following SS3
2813 *       is a Chinese character as defined in CNS
2814 *       11643-plane-4, until another SS3designation
2815 *       appears
2816 *       (In English: <ESC>O must precede every 2 byte
2817 *        sequence.)
2818 *
2819 *      ESC $ + K       Indicates the immediate two bytes following SS3
2820 *       is a Chinese character as defined in CNS
2821 *       11643-plane-5, until another SS3designation
2822 *       appears
2823 *
2824 *      ESC $ + L       Indicates the immediate two bytes following SS3
2825 *       is a Chinese character as defined in CNS
2826 *       11643-plane-6, until another SS3designation
2827 *       appears
2828 *
2829 *      ESC $ + M       Indicates the immediate two bytes following SS3
2830 *       is a Chinese character as defined in CNS
2831 *       11643-plane-7, until another SS3designation
2832 *       appears
2833 *
2834 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2835 *       has its own designation information before any Chinese characters
2836 *       appear
2837 *
2838 */
2839
2840 /* The following are defined this way to make the strings truly readonly */
     /*
      * Each string is a 4-byte designator escape sequence written with hex escapes;
      * read as ASCII the bytes are ESC (0x1B), '$' (0x24), an intermediate byte
      * ')' (0x29), '*' (0x2A) or '+' (0x2B), and a final byte.
      */
2841 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
2842 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
2843 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2844 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2845 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2846 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2847 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2848 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2849 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2850
2851 /********************** ISO2022-CN Data **************************/
     /*
      * Designator sequences in table order. The fromUnicode code indexes this table
      * with the charset value itself when it is below CNS_11643, and with
      * CNS_11643 + (cs - CNS_11643_1) for the CNS planes, so the order of the
      * entries must not change.
      */
2852 static const char* const escSeqCharsCN[10] ={
2853         SHIFT_IN_STR,                   /* 0 ASCII */
2854         GB_2312_80_STR,                 /* 1 GB2312_1 */
2855         ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2856         CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
2857         CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
2858         CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
2859         CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
2860         CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
2861         CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
2862         CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
2863 };
2864
2865 static void
2866 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2867     UConverter *cnv = args->converter;
2868     UConverterDataISO2022 *converterData;
2869     ISO2022State *pFromU2022State;
2870     uint8_t *target = (uint8_t *) args->target;
2871     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2872     const UChar* source = args->source;
2873     const UChar* sourceLimit = args->sourceLimit;
2874     int32_t* offsets = args->offsets;
2875     UChar32 sourceChar;
2876     char buffer[8];
2877     int32_t len;
2878     int8_t choices[3];
2879     int32_t choiceCount;
2880     uint32_t targetValue = 0;
2881     UBool useFallback;
2882
2883     /* set up the state */
2884     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2885     pFromU2022State   = &converterData->fromU2022State;
2886
2887     choiceCount = 0;
2888
2889     /* check if the last codepoint of previous buffer was a lead surrogate*/
2890     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2891         goto getTrail;
2892     }
2893
2894     while( source < sourceLimit){
2895         if(target < targetLimit){
2896
2897             sourceChar  = *(source++);
2898             /*check if the char is a First surrogate*/
2899              if(U16_IS_SURROGATE(sourceChar)) {
2900                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2901 getTrail:
2902                     /*look ahead to find the trail surrogate*/
2903                     if(source < sourceLimit) {
2904                         /* test the following code unit */
2905                         UChar trail=(UChar) *source;
2906                         if(U16_IS_TRAIL(trail)) {
2907                             source++;
2908                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2909                             cnv->fromUChar32=0x00;
2910                             /* convert this supplementary code point */
2911                             /* exit this condition tree */
2912                         } else {
2913                             /* this is an unmatched lead code unit (1st surrogate) */
2914                             /* callback(illegal) */
2915                             *err=U_ILLEGAL_CHAR_FOUND;
2916                             cnv->fromUChar32=sourceChar;
2917                             break;
2918                         }
2919                     } else {
2920                         /* no more input */
2921                         cnv->fromUChar32=sourceChar;
2922                         break;
2923                     }
2924                 } else {
2925                     /* this is an unmatched trail code unit (2nd surrogate) */
2926                     /* callback(illegal) */
2927                     *err=U_ILLEGAL_CHAR_FOUND;
2928                     cnv->fromUChar32=sourceChar;
2929                     break;
2930                 }
2931             }
2932
2933             /* do the conversion */
2934             if(sourceChar <= 0x007f ){
2935                 /* do not convert SO/SI/ESC */
2936                 if(IS_2022_CONTROL(sourceChar)) {
2937                     /* callback(illegal) */
2938                     *err=U_ILLEGAL_CHAR_FOUND;
2939                     cnv->fromUChar32=sourceChar;
2940                     break;
2941                 }
2942
2943                 /* US-ASCII */
2944                 if(pFromU2022State->g == 0) {
2945                     buffer[0] = (char)sourceChar;
2946                     len = 1;
2947                 } else {
2948                     buffer[0] = UCNV_SI;
2949                     buffer[1] = (char)sourceChar;
2950                     len = 2;
2951                     pFromU2022State->g = 0;
2952                     choiceCount = 0;
2953                 }
2954                 if(sourceChar == CR || sourceChar == LF) {
2955                     /* reset the state at the end of a line */
2956                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2957                     choiceCount = 0;
2958                 }
2959             }
2960             else{
2961                 /* convert U+0080..U+10ffff */
2962                 int32_t i;
2963                 int8_t cs, g;
2964
2965                 if(choiceCount == 0) {
2966                     /* try the current SO/G1 converter first */
2967                     choices[0] = pFromU2022State->cs[1];
2968
2969                     /* default to GB2312_1 if none is designated yet */
2970                     if(choices[0] == 0) {
2971                         choices[0] = GB2312_1;
2972                     }
2973
2974                     if(converterData->version == 0) {
2975                         /* ISO-2022-CN */
2976
2977                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2978                         if(choices[0] == GB2312_1) {
2979                             choices[1] = (int8_t)CNS_11643_1;
2980                         } else {
2981                             choices[1] = (int8_t)GB2312_1;
2982                         }
2983
2984                         choiceCount = 2;
2985                     } else if (converterData->version == 1) {
2986                         /* ISO-2022-CN-EXT */
2987
2988                         /* try one of the other converters */
2989                         switch(choices[0]) {
2990                         case GB2312_1:
2991                             choices[1] = (int8_t)CNS_11643_1;
2992                             choices[2] = (int8_t)ISO_IR_165;
2993                             break;
2994                         case ISO_IR_165:
2995                             choices[1] = (int8_t)GB2312_1;
2996                             choices[2] = (int8_t)CNS_11643_1;
2997                             break;
2998                         default: /* CNS_11643_x */
2999                             choices[1] = (int8_t)GB2312_1;
3000                             choices[2] = (int8_t)ISO_IR_165;
3001                             break;
3002                         }
3003
3004                         choiceCount = 3;
3005                     } else {
3006                         choices[0] = (int8_t)CNS_11643_1;
3007                         choices[1] = (int8_t)GB2312_1;
3008                     }
3009                 }
3010
3011                 cs = g = 0;
3012                 /*
3013                  * len==0: no mapping found yet
3014                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3015                  * len>0: found a roundtrip result, done
3016                  */
3017                 len = 0;
3018                 /*
3019                  * We will turn off useFallback after finding a fallback,
3020                  * but we still get fallbacks from PUA code points as usual.
3021                  * Therefore, we will also need to check that we don't overwrite
3022                  * an early fallback with a later one.
3023                  */
3024                 useFallback = cnv->useFallback;
3025
3026                 for(i = 0; i < choiceCount && len <= 0; ++i) {
3027                     int8_t cs0 = choices[i];
3028                     if(cs0 > 0) {
3029                         uint32_t value;
3030                         int32_t len2;
3031                         if(cs0 >= CNS_11643_0) {
3032                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3033                                         converterData->myConverterArray[CNS_11643],
3034                                         sourceChar,
3035                                         &value,
3036                                         useFallback,
3037                                         MBCS_OUTPUT_3);
3038                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3039                                 targetValue = value;
3040                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3041                                 if(len2 >= 0) {
3042                                     len = 2;
3043                                 } else {
3044                                     len = -2;
3045                                     useFallback = FALSE;
3046                                 }
3047                                 if(cs == CNS_11643_1) {
3048                                     g = 1;
3049                                 } else if(cs == CNS_11643_2) {
3050                                     g = 2;
3051                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3052                                     g = 3;
3053                                 } else {
3054                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3055                                     len = 0;
3056                                 }
3057                             }
3058                         } else {
3059                             /* GB2312_1 or ISO-IR-165 */
3060                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3061                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3062                                         converterData->myConverterArray[cs0],
3063                                         sourceChar,
3064                                         &value,
3065                                         useFallback,
3066                                         MBCS_OUTPUT_2);
3067                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3068                                 targetValue = value;
3069                                 len = len2;
3070                                 cs = cs0;
3071                                 g = 1;
3072                                 useFallback = FALSE;
3073                             }
3074                         }
3075                     }
3076                 }
3077
3078                 if(len != 0) {
3079                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3080
3081                     /* write the designation sequence if necessary */
3082                     if(cs != pFromU2022State->cs[g]) {
3083                         if(cs < CNS_11643) {
3084                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3085                         } else {
3086                             U_ASSERT(cs >= CNS_11643_1);
3087                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3088                         }
3089                         len = 4;
3090                         pFromU2022State->cs[g] = cs;
3091                         if(g == 1) {
3092                             /* changing the SO/G1 charset invalidates the choices[] */
3093                             choiceCount = 0;
3094                         }
3095                     }
3096
3097                     /* write the shift sequence if necessary */
3098                     if(g != pFromU2022State->g) {
3099                         switch(g) {
3100                         case 1:
3101                             buffer[len++] = UCNV_SO;
3102
3103                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3104                             pFromU2022State->g = 1;
3105                             break;
3106                         case 2:
3107                             buffer[len++] = 0x1b;
3108                             buffer[len++] = 0x4e;
3109                             break;
3110                         default: /* case 3 */
3111                             buffer[len++] = 0x1b;
3112                             buffer[len++] = 0x4f;
3113                             break;
3114                         }
3115                     }
3116
3117                     /* write the two output bytes */
3118                     buffer[len++] = (char)(targetValue >> 8);
3119                     buffer[len++] = (char)targetValue;
3120                 } else {
3121                     /* if we cannot find the character after checking all codepages
3122                      * then this is an error
3123                      */
3124                     *err = U_INVALID_CHAR_FOUND;
3125                     cnv->fromUChar32=sourceChar;
3126                     break;
3127                 }
3128             }
3129
3130             /* output len>0 bytes in buffer[] */
3131             if(len == 1) {
3132                 *target++ = buffer[0];
3133                 if(offsets) {
3134                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3135                 }
3136             } else if(len == 2 && (target + 2) <= targetLimit) {
3137                 *target++ = buffer[0];
3138                 *target++ = buffer[1];
3139                 if(offsets) {
3140                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3141                     *offsets++ = sourceIndex;
3142                     *offsets++ = sourceIndex;
3143                 }
3144             } else {
3145                 fromUWriteUInt8(
3146                     cnv,
3147                     buffer, len,
3148                     &target, (const char *)targetLimit,
3149                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3150                     err);
3151                 if(U_FAILURE(*err)) {
3152                     break;
3153                 }
3154             }
3155         } /* end if(myTargetIndex<myTargetLength) */
3156         else{
3157             *err =U_BUFFER_OVERFLOW_ERROR;
3158             break;
3159         }
3160
3161     }/* end while(mySourceIndex<mySourceLength) */
3162
3163     /*
3164      * the end of the input stream and detection of truncated input
3165      * are handled by the framework, but for ISO-2022-CN conversion
3166      * we need to be in ASCII mode at the very end
3167      *
3168      * conditions:
3169      *   successful
3170      *   not in ASCII mode
3171      *   end of input and no truncated input
3172      */
3173     if( U_SUCCESS(*err) &&
3174         pFromU2022State->g!=0 &&
3175         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3176     ) {
3177         int32_t sourceIndex;
3178
3179         /* we are switching to ASCII */
3180         pFromU2022State->g=0;
3181
3182         /* get the source index of the last input character */
3183         /*
3184          * TODO this would be simpler and more reliable if we used a pair
3185          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3186          * so that we could simply use the prevSourceIndex here;
3187          * this code gives an incorrect result for the rare case of an unmatched
3188          * trail surrogate that is alone in the last buffer of the text stream
3189          */
3190         sourceIndex=(int32_t)(source-args->source);
3191         if(sourceIndex>0) {
3192             --sourceIndex;
3193             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3194                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3195             ) {
3196                 --sourceIndex;
3197             }
3198         } else {
3199             sourceIndex=-1;
3200         }
3201
3202         fromUWriteUInt8(
3203             cnv,
3204             SHIFT_IN_STR, 1,
3205             &target, (const char *)targetLimit,
3206             &offsets, sourceIndex,
3207             err);
3208     }
3209
3210     /*save the state and return */
3211     args->source = source;
3212     args->target = (char*)target;
3213 }
3214
3215
/*
 * toUnicode conversion for the ISO-2022-CN variants, with optional offsets.
 *
 * Reads bytes from args->source and writes UTF-16 to args->target; when
 * args->offsets is not NULL it also records a source offset per output unit.
 * SI and SO switch the shift state, ESC sequences are handed to
 * changeState_2022(), CR and LF clear the whole ISO2022State, and in a
 * non-ASCII shift state pairs of bytes are converted via the sub-converter
 * selected by pToU2022State->cs[pToU2022State->g].
 *
 * State that spans calls: myData->key != 0 means a partial escape sequence
 * is pending; converter->toULength == 1 means the lead byte of a double-byte
 * character is stored in converter->toUBytes[0].
 *
 * On return args->source and args->target are updated and errors are
 * reported through *err.
 */
3216 static void
3217 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3218                                                UErrorCode* err){
3219     char tempBuf[3]; /* input for the sub-converter lookup: 2 bytes, or 3 with a CNS plane prefix */
3220     const char *mySource = (char *) args->source;
3221     UChar *myTarget = args->target;
3222     const char *mySourceLimit = args->sourceLimit;
3223     uint32_t targetUniChar = 0x0000;
3224     uint32_t mySourceChar = 0x0000;
3225     UConverterDataISO2022* myData;
3226     ISO2022State *pToU2022State;
3227
3228     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3229     pToU2022State = &myData->toU2022State;
3230
3231     if(myData->key != 0) {
3232         /* continue with a partial escape sequence */
3233         goto escape;
3234     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3235         /* continue with a partial double-byte character */
3236         mySourceChar = args->converter->toUBytes[0];
3237         args->converter->toULength = 0;
3238         targetUniChar = missingCharMarker;
3239         goto getTrailByte;
3240     }
3241
3242     while(mySource < mySourceLimit){
3243
3244         targetUniChar =missingCharMarker;
3245
3246         if(myTarget < args->targetLimit){
3247
3248             mySourceChar= (unsigned char) *mySource++;
3249
3250             switch(mySourceChar){
3251             case UCNV_SI:
                     /* SI: back to shift state 0; an SO directly followed by SI (empty segment) is reported as an error */
3252                 pToU2022State->g=0;
3253                 if (myData->isEmptySegment) {
3254                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
3255                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3256                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3257                     args->converter->toUBytes[0] = mySourceChar;
3258                     args->converter->toULength = 1;
3259                     args->target = myTarget;
3260                     args->source = mySource;
3261                     return;
3262                 }
3263                 continue;
3264
3265             case UCNV_SO:
                     /* SO is accepted only after a charset has been designated into cs[1] */
3266                 if(pToU2022State->cs[1] != 0) {
3267                     pToU2022State->g=1;
3268                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
3269                     continue;
3270                 } else {
3271                     /* illegal to have SO before a matching designator */
3272                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
                         /* leaves the switch with targetUniChar==missingCharMarker, so the callback branch below runs */
3273                     break;
3274                 }
3275
3276             case ESC_2022:
                     /* step back onto the ESC byte; NOTE(review): presumably changeState_2022() expects to start there - confirm */
3277                 mySource--;
3278 escape:
3279                 {
3280                     const char * mySourceBefore = mySource;
3281                     int8_t toULengthBefore = args->converter->toULength;
3282
3283                     changeState_2022(args->converter,&(mySource),
3284                         mySourceLimit, ISO_2022_CN,err);
3285
3286                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3287                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3288                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3289                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
                             /* the reported length is what was pending before plus the bytes consumed by this call */
3290                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3291                     }
3292                 }
3293
3294                 /* invalid or illegal escape sequence */
3295                 if(U_FAILURE(*err)){
3296                     args->target = myTarget;
3297                     args->source = mySource;
3298                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
3299                     return;
3300                 }
3301                 continue;
3302
3303             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3304
3305             case CR:
3306                 /*falls through*/
3307             case LF:
                     /* end of line: clear the whole state (designations and shift state) */
3308                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3309                 /* falls through */
3310             default:
3311                 /* convert one or two bytes */
3312                 myData->isEmptySegment = FALSE;
3313                 if(pToU2022State->g != 0) {
3314                     if(mySource < mySourceLimit) {
3315                         UConverterSharedData *cnv;
3316                         StateEnum tempState;
3317                         int32_t tempBufLen;
3318                         int leadIsOk, trailIsOk;
3319                         uint8_t trailByte;
3320 getTrailByte:
3321                         trailByte = (uint8_t)*mySource;
3322                         /*
3323                          * Ticket 5691: consistent illegal sequences:
3324                          * - We include at least the first byte in the illegal sequence.
3325                          * - If any of the non-initial bytes could be the start of a character,
3326                          *   we stop the illegal sequence before the first one of those.
3327                          *
3328                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3329                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3330                          * Otherwise we convert or report the pair of bytes.
3331                          */
3332                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3333                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3334                         if (leadIsOk && trailIsOk) {
3335                             ++mySource;
3336                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3337                             if(tempState >= CNS_11643_0) {
                                     /* CNS planes share one table: prefix the pair with 0x80 + the offset from CNS_11643_0 */
3338                                 cnv = myData->myConverterArray[CNS_11643];
3339                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3340                                 tempBuf[1] = (char) (mySourceChar);
3341                                 tempBuf[2] = (char) trailByte;
3342                                 tempBufLen = 3;
3343
3344                             }else{
3345                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3346                                 cnv = myData->myConverterArray[tempState];
3347                                 tempBuf[0] = (char) (mySourceChar);
3348                                 tempBuf[1] = (char) trailByte;
3349                                 tempBufLen = 2;
3350                             }
3351                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                                 /* keep both bytes in mySourceChar for the offsets and for error reporting below */
3352                             mySourceChar = (mySourceChar << 8) | trailByte;
3353                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3354                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3355                             ++mySource;
3356                             /* add another bit so that the code below writes 2 bytes in case of error */
3357                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3358                         }
3359                         if(pToU2022State->g>=2) {
3360                             /* return from a single-shift state to the previous one */
3361                             pToU2022State->g=pToU2022State->prevG;
3362                         }
3363                     } else {
                             /* the second byte is not in this buffer: store the lead byte; the next call resumes at getTrailByte */
3364                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3365                         args->converter->toULength = 1;
3366                         goto endloop;
3367                     }
3368                 }
3369                 else{
                         /* shift state 0: only 00..7f convert; other bytes keep missingCharMarker and go to the callback */
3370                     if(mySourceChar <= 0x7f) {
3371                         targetUniChar = (UChar) mySourceChar;
3372                     }
3373                 }
3374                 break;
3375             }
                 /* write the result: one UTF-16 unit, a surrogate pair, or an error via the callback helper */
3376             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3377                 if(args->offsets){
                         /* offset of the first input byte: 1 back for a single byte, 2 back for a byte pair */
3378                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3379                 }
3380                 *(myTarget++)=(UChar)targetUniChar;
3381             }
3382             else if(targetUniChar > missingCharMarker){
3383                 /* disassemble the surrogate pair and write to output*/
3384                 targetUniChar-=0x0010000;
3385                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3386                 if(args->offsets){
3387                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3388                 }
3389                 ++myTarget;
3390                 if(myTarget< args->targetLimit){
3391                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3392                     if(args->offsets){
3393                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3394                     }
3395                     ++myTarget;
3396                 }else{
                         /* no room for the trail surrogate: park it in the converter's UCharErrorBuffer */
3397                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3398                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3399                 }
3400
3401             }
3402             else{
3403                 /* Call the callback function*/
3404                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3405                 break;
3406             }
3407         }
3408         else{
3409             *err =U_BUFFER_OVERFLOW_ERROR;
3410             break;
3411         }
3412     }
3413 endloop:
3414     args->target = myTarget;
3415     args->source = mySource;
3416 }
3417
3418 static void
3419 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3420     UConverter *cnv = args->converter;
3421     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3422     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3423     char *p, *subchar;
3424     char buffer[8];
3425     int32_t length;
3426
3427     subchar=(char *)cnv->subChars;
3428     length=cnv->subCharLen; /* assume length==1 for most variants */
3429
3430     p = buffer;
3431     switch(myConverterData->locale[0]){
3432     case 'j':
3433         {
3434             int8_t cs;
3435
3436             if(pFromU2022State->g == 1) {
3437                 /* JIS7: switch from G1 to G0 */
3438                 pFromU2022State->g = 0;
3439                 *p++ = UCNV_SI;
3440             }
3441
3442             cs = pFromU2022State->cs[0];
3443             if(cs != ASCII && cs != JISX201) {
3444                 /* not in ASCII or JIS X 0201: switch to ASCII */
3445                 pFromU2022State->cs[0] = (int8_t)ASCII;
3446                 *p++ = '\x1b';
3447                 *p++ = '\x28';
3448                 *p++ = '\x42';
3449             }
3450
3451             *p++ = subchar[0];
3452             break;
3453         }
3454     case 'c':
3455         if(pFromU2022State->g != 0) {
3456             /* not in ASCII mode: switch to ASCII */
3457             pFromU2022State->g = 0;
3458             *p++ = UCNV_SI;
3459         }
3460         *p++ = subchar[0];
3461         break;
3462     case 'k':
3463         if(myConverterData->version == 0) {
3464             if(length == 1) {
3465                 if((UBool)args->converter->fromUnicodeStatus) {
3466                     /* in DBCS mode: switch to SBCS */
3467                     args->converter->fromUnicodeStatus = 0;
3468                     *p++ = UCNV_SI;
3469                 }
3470                 *p++ = subchar[0];
3471             } else /* length == 2*/ {
3472                 if(!(UBool)args->converter->fromUnicodeStatus) {
3473                     /* in SBCS mode: switch to DBCS */
3474                     args->converter->fromUnicodeStatus = 1;
3475                     *p++ = UCNV_SO;
3476                 }
3477                 *p++ = subchar[0];
3478                 *p++ = subchar[1];
3479             }
3480             break;
3481         } else {
3482             /* save the subconverter's substitution string */
3483             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3484             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3485
3486             /* set our substitution string into the subconverter */
3487             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3488             myConverterData->currentConverter->subCharLen = (int8_t)length;
3489
3490             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3491             args->converter = myConverterData->currentConverter;
3492             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3493             ucnv_cbFromUWriteSub(args, 0, err);
3494             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3495             args->converter = cnv;
3496
3497             /* restore the subconverter's substitution string */
3498             myConverterData->currentConverter->subChars = currentSubChars;
3499             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3500
3501             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3502                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3503                     uprv_memcpy(
3504                         cnv->charErrorBuffer,
3505                         myConverterData->currentConverter->charErrorBuffer,
3506                         myConverterData->currentConverter->charErrorBufferLength);
3507                 }
3508                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3509                 myConverterData->currentConverter->charErrorBufferLength = 0;
3510             }
3511             return;
3512         }
3513     default:
3514         /* not expected */
3515         break;
3516     }
3517     ucnv_cbFromUWriteBytes(args,
3518                            buffer, (int32_t)(p - buffer),
3519                            offsetIndex, err);
3520 }
3521
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 *
 * The field order is significant: _ISO_2022_SafeClone() passes the address of
 * currentConverter to ucnv_safeClone() together with a buffer size of
 * sizeof(UConverter)+sizeof(UAlignedMemory), so deadSpace must directly
 * follow currentConverter.
 */
struct cloneStruct
{
    /* the cloned ISO 2022 converter itself; _ISO_2022_SafeClone() returns its address */
    UConverter cnv;
    /* storage handed to ucnv_safeClone() for the clone of the current sub-converter */
    UConverter currentConverter;
    /* padding that lets the sub-converter clone be re-aligned within this struct */
    UAlignedMemory deadSpace;
    /* copy of the source converter's ISO 2022 data; cnv.extraInfo is pointed at it */
    UConverterDataISO2022 mydata;
};
3540
3541
3542 static UConverter *
3543 _ISO_2022_SafeClone(
3544             const UConverter *cnv,
3545             void *stackBuffer,
3546             int32_t *pBufferSize,
3547             UErrorCode *status)
3548 {
3549     struct cloneStruct * localClone;
3550     UConverterDataISO2022 *cnvData;
3551     int32_t i, size;
3552
3553     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3554         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3555         return NULL;
3556     }
3557
3558     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3559     localClone = (struct cloneStruct *)stackBuffer;
3560
3561     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3562
3563     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3564     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3565     localClone->cnv.isExtraLocal = TRUE;
3566
3567     /* share the subconverters */
3568
3569     if(cnvData->currentConverter != NULL) {
3570         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3571         localClone->mydata.currentConverter =
3572             ucnv_safeClone(cnvData->currentConverter,
3573                             &localClone->currentConverter,
3574                             &size, status);
3575         if(U_FAILURE(*status)) {
3576             return NULL;
3577         }
3578     }
3579
3580     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3581         if(cnvData->myConverterArray[i] != NULL) {
3582             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3583         }
3584     }
3585
3586     return &localClone->cnv;
3587 }
3588
3589 static void
3590 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3591                     const USetAdder *sa,
3592                     UConverterUnicodeSet which,
3593                     UErrorCode *pErrorCode)
3594 {
3595     int32_t i;
3596     UConverterDataISO2022* cnvData;
3597
3598     if (U_FAILURE(*pErrorCode)) {
3599         return;
3600     }
3601 #ifdef U_ENABLE_GENERIC_ISO_2022
3602     if (cnv->sharedData == &_ISO2022Data) {
3603         /* We use UTF-8 in this case */
3604         sa->addRange(sa->set, 0, 0xd7FF);
3605         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3606         return;
3607     }
3608 #endif
3609
3610     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3611
3612     /* open a set and initialize it with code points that are algorithmically round-tripped */
3613     switch(cnvData->locale[0]){
3614     case 'j':
3615         /* include JIS X 0201 which is hardcoded */
3616         sa->add(sa->set, 0xa5);
3617         sa->add(sa->set, 0x203e);
3618         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3619             /* include Latin-1 for some variants of JP */
3620             sa->addRange(sa->set, 0, 0xff);
3621         } else {
3622             /* include ASCII for JP */
3623             sa->addRange(sa->set, 0, 0x7f);
3624         }
3625         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3626             /*
3627              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3628              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3629              * use half-width Katakana.
3630              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3631              * half-width Katakana via the ESC ( I sequence.
3632              * However, we only emit (fromUnicode) half-width Katakana according to the
3633              * definition of each variant.
3634              *
3635              * When including fallbacks,
3636              * we need to include half-width Katakana Unicode code points for all JP variants because
3637              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3638              */
3639             /* include half-width Katakana for JP */
3640             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3641         }
3642         break;
3643     case 'c':
3644     case 'z':
3645         /* include ASCII for CN */
3646         sa->addRange(sa->set, 0, 0x7f);
3647         break;
3648     case 'k':
3649         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3650         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3651                 cnvData->currentConverter, sa, which, pErrorCode);
3652         /* the loop over myConverterArray[] will simply not find another converter */
3653         break;
3654     default:
3655         break;
3656     }
3657
3658 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3659             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3660                 cnvData->version==0 && i==CNS_11643
3661             ) {
3662                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3663                 ucnv_MBCSGetUnicodeSetForBytes(
3664                         cnvData->myConverterArray[i],
3665                         sa, UCNV_ROUNDTRIP_SET,
3666                         0, 0x81, 0x82,
3667                         pErrorCode);
3668             }
3669 #endif
3670
3671     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3672         UConverterSetFilter filter;
3673         if(cnvData->myConverterArray[i]!=NULL) {
3674             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3675                 cnvData->version==0 && i==CNS_11643
3676             ) {
3677                 /*
3678                  * Version-specific for CN:
3679                  * CN version 0 does not map CNS planes 3..7 although
3680                  * they are all available in the CNS conversion table;
3681                  * CN version 1 (-EXT) does map them all.
3682                  * The two versions create different Unicode sets.
3683                  */
3684                 filter=UCNV_SET_FILTER_2022_CN;
3685             } else if(cnvData->locale[0]=='j' && i==JISX208) {
3686                 /*
3687                  * Only add code points that map to Shift-JIS codes
3688                  * corresponding to JIS X 0208.
3689                  */
3690                 filter=UCNV_SET_FILTER_SJIS;
3691             } else if(i==KSC5601) {
3692                 /*
3693                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3694                  * are broader than GR94.
3695                  */
3696                 filter=UCNV_SET_FILTER_GR94DBCS;
3697             } else {
3698                 filter=UCNV_SET_FILTER_NONE;
3699             }
3700             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3701         }
3702     }
3703
3704     /*
3705      * ISO 2022 converters must not convert SO/SI/ESC despite what
3706      * sub-converters do by themselves.
3707      * Remove these characters from the set.
3708      */
3709     sa->remove(sa->set, 0x0e);
3710     sa->remove(sa->set, 0x0f);
3711     sa->remove(sa->set, 0x1b);
3712
3713     /* ISO 2022 converters do not convert C1 controls either */
3714     sa->removeRange(sa->set, 0x80, 0x9f);
3715 }
3716
/*
 * Function table for the generic ISO 2022 converter.
 * This is a positional initializer: the entries must stay in the member order
 * of UConverterImpl, which is declared outside this part of the file.
 * NOTE(review): the slot descriptions below are inferred from the names of the
 * installed functions - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022, /* converter type */

    NULL,
    NULL,

    /* open/close/reset are shared by all ISO 2022 variants in this file */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    /* generic variant enabled: own toUnicode function, UTF-8 functions for fromUnicode */
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    /* generic variant disabled: no conversion functions are installed */
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet, /* reached as impl->getUnicodeSet */

    NULL,
    NULL
};
/*
 * Static description of the generic ISO 2022 converter.
 * Positional initializer for UConverterStaticData, which is declared outside
 * this part of the file.
 * NOTE(review): field meanings that are marked "presumably" are assumptions -
 * confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "ISO_2022", /* converter name */
    2022, /* presumably the codepage number (the JP/KR/CN variants below use 0) - TODO confirm */
    UCNV_IBM,
    UCNV_ISO_2022, /* converter type, same constant as in the function table */
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - TODO confirm */
    1, /* presumably the length of the sequence above - TODO confirm */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared-data object for the generic ISO 2022 converter; it ties together the
 * static description (_ISO2022StaticData) and the function table (_ISO2022Impl).
 * _ISO_2022_GetUnicodeSet() compares cnv->sharedData with the address of this
 * object to recognize the generic variant (when U_ENABLE_GENERIC_ISO_2022 is defined).
 * Positional initializer for UConverterSharedData, declared outside this part
 * of the file.
 */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData), /* size of this structure */
    ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking a static object - TODO confirm */
    NULL,
    NULL,
    &_ISO2022StaticData, /* static description defined above */
    FALSE,
    &_ISO2022Impl, /* function table defined above */
    0, UCNV_MBCS_TABLE_INITIALIZER
};
3775
3776 /*************JP****************/
/*
 * Function table for the ISO-2022-JP converter.
 * This is a positional initializer: the entries must stay in the member order
 * of UConverterImpl, which is declared outside this part of the file.
 * NOTE(review): the slot descriptions below are inferred from the names of the
 * installed functions - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022, /* converter type */

    NULL,
    NULL,

    /* open/close/reset are shared by all ISO 2022 variants in this file */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /*
     * The same offsets-aware JP function is installed twice per direction,
     * presumably for the plain and the with-offsets entry points - TODO confirm.
     */
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet, /* reached as impl->getUnicodeSet */

    NULL,
    NULL
};
/*
 * Static description of the ISO-2022-JP converter.
 * Positional initializer for UConverterStaticData, which is declared outside
 * this part of the file.
 * NOTE(review): field meanings that are marked "presumably" are assumptions -
 * confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "ISO_2022_JP", /* converter name */
    0,
    UCNV_IBM,
    UCNV_ISO_2022, /* converter type, same constant as in the function table */
    1,
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - TODO confirm */
    1, /* presumably the length of the sequence above - TODO confirm */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3818
/* The anonymous namespace keeps the JP shared-data object local to this translation unit. */
namespace {

/*
 * Shared-data object for the ISO-2022-JP converter; it ties together the
 * static description (_ISO2022JPStaticData) and the function table (_ISO2022JPImpl).
 * Positional initializer for UConverterSharedData, declared outside this part
 * of the file.
 */
const UConverterSharedData _ISO2022JPData={
    sizeof(UConverterSharedData), /* size of this structure */
    ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking a static object - TODO confirm */
    NULL,
    NULL,
    &_ISO2022JPStaticData, /* static description defined above */
    FALSE,
    &_ISO2022JPImpl, /* function table defined above */
    0, UCNV_MBCS_TABLE_INITIALIZER
};

}  // namespace
3833
3834 /************* KR ***************/
/*
 * Function table for the ISO-2022-KR converter.
 * This is a positional initializer: the entries must stay in the member order
 * of UConverterImpl, which is declared outside this part of the file.
 * NOTE(review): the slot descriptions below are inferred from the names of the
 * installed functions - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022, /* converter type */

    NULL,
    NULL,

    /* open/close/reset are shared by all ISO 2022 variants in this file */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /*
     * The same offsets-aware KR function is installed twice per direction,
     * presumably for the plain and the with-offsets entry points - TODO confirm.
     */
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet, /* reached as impl->getUnicodeSet */

    NULL,
    NULL
};
/*
 * Static description of the ISO-2022-KR converter.
 * Positional initializer for UConverterStaticData, which is declared outside
 * this part of the file.
 * NOTE(review): field meanings that are marked "presumably" are assumptions -
 * confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "ISO_2022_KR", /* converter name */
    0,
    UCNV_IBM,
    UCNV_ISO_2022, /* converter type, same constant as in the function table */
    1,
    3, /* max 3 bytes per UChar: SO+DBCS */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - TODO confirm */
    1, /* presumably the length of the sequence above - TODO confirm */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3876
/* The anonymous namespace keeps the KR shared-data object local to this translation unit. */
namespace {

/*
 * Shared-data object for the ISO-2022-KR converter; it ties together the
 * static description (_ISO2022KRStaticData) and the function table (_ISO2022KRImpl).
 * Positional initializer for UConverterSharedData, declared outside this part
 * of the file.
 */
const UConverterSharedData _ISO2022KRData={
    sizeof(UConverterSharedData), /* size of this structure */
    ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking a static object - TODO confirm */
    NULL,
    NULL,
    &_ISO2022KRStaticData, /* static description defined above */
    FALSE,
    &_ISO2022KRImpl, /* function table defined above */
    0, UCNV_MBCS_TABLE_INITIALIZER
};

}  // namespace
3891
3892 /*************** CN ***************/
/*
 * Function table for the ISO-2022-CN converter.
 * This is a positional initializer: the entries must stay in the member order
 * of UConverterImpl, which is declared outside this part of the file.
 * NOTE(review): the slot descriptions below are inferred from the names of the
 * installed functions - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022, /* converter type */

    NULL,
    NULL,

    /* open/close/reset are shared by all ISO 2022 variants in this file */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /*
     * The same offsets-aware CN function is installed twice per direction,
     * presumably for the plain and the with-offsets entry points - TODO confirm.
     */
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet, /* reached as impl->getUnicodeSet */

    NULL,
    NULL
};
/*
 * Static description of the ISO-2022-CN converter.
 * Positional initializer for UConverterStaticData, which is declared outside
 * this part of the file.
 * NOTE(review): field meanings that are marked "presumably" are assumptions -
 * confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "ISO_2022_CN", /* converter name */
    0,
    UCNV_IBM,
    UCNV_ISO_2022, /* converter type, same constant as in the function table */
    1,
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - TODO confirm */
    1, /* presumably the length of the sequence above - TODO confirm */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3935
/* The anonymous namespace keeps the CN shared-data object local to this translation unit. */
namespace {

/*
 * Shared-data object for the ISO-2022-CN converter; it ties together the
 * static description (_ISO2022CNStaticData) and the function table (_ISO2022CNImpl).
 * Positional initializer for UConverterSharedData, declared outside this part
 * of the file.
 */
const UConverterSharedData _ISO2022CNData={
    sizeof(UConverterSharedData), /* size of this structure */
    ~((uint32_t) 0), /* all bits set; presumably a reference counter value marking a static object - TODO confirm */
    NULL,
    NULL,
    &_ISO2022CNStaticData, /* static description defined above */
    FALSE,
    &_ISO2022CNImpl, /* function table defined above */
    0, UCNV_MBCS_TABLE_INITIALIZER
};

}  // namespace
3950
3951 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */