icuSources/common/ucnv2022.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2008, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv2022.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000feb03
  12 *   created by: Markus W. Scherer
  13 *
  14 *   Change history:
  15 *
  16 *   06/29/2000  helena  Major rewrite of the callback APIs.
  17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
  18 *                       Changed implementation of toUnicode
  19 *                       function
  20 *   08/21/2000  Ram     Added support for ISO-2022-KR
  21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
  22 *                       ucnvebdc.c
  23 *   09/20/2000  Ram     Added support for ISO-2022-CN
  24 *                       Added implementations for getNextUChar()
  25 *                       for specific 2022 country variants.
  26 *   10/31/2000  Ram     Implemented offsets logic functions
  27 */
  28
  29 #include "unicode/utypes.h"
  30
  31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uset.h"
  35 #include "unicode/ucnv_err.h"
  36 #include "unicode/ucnv_cb.h"
  37 #include "ucnv_imp.h"
  38 #include "ucnv_bld.h"
  39 #include "ucnv_cnv.h"
  40 #include "ucnvmbcs.h"
  41 #include "cstring.h"
  42 #include "cmemory.h"
  43
  /*
   * Number of elements in a true array (not a pointer), as an int32_t.
   * The whole expansion is parenthesized so that it behaves as a single
   * primary expression wherever the macro is used.
   */
  #define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
  45
  46 #ifdef U_ENABLE_GENERIC_ISO_2022
  47 /*
  48  * I am disabling the generic ISO-2022 converter after proposing to do so on
  49  * the icu mailing list two days ago.
  50  *
  51  * Reasons:
  52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  53  *    its designation sequences, single shifts with return to the previous state,
  54  *    switch-with-no-return to UTF-16BE or similar, etc.
  55  *    This is unlike the language-specific variants like ISO-2022-JP which
  56  *    require a much smaller repertoire of ISO-2022 features.
  57  *    These variants continue to be supported.
  58  * 2. I believe that no one is really using the generic ISO-2022 converter
  59  *    but rather always one of the language-specific variants.
  60  *    Note that ICU's generic ISO-2022 converter has always output one escape
  61  *    sequence followed by UTF-8 for the whole stream.
  62  * 3. Switching between subcharsets is extremely slow, because each time
  63  *    the previous converter is closed and a new one opened,
  64  *    without any kind of caching, least-recently-used list, etc.
  65  * 4. The code is currently buggy, and given the above it does not seem
  66  *    reasonable to spend the time on maintenance.
  67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  68  *    This means, for example, that when ISO-8859-7 is designated, the following
  69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  70  *    The ICU ISO-2022 converter does not handle this - and has no information
  71  *    about which subconverter would have to be shifted vs. which is designed
  72  *    for 7-bit ISO-2022.
  73  *
  74  * Markus Scherer 2003-dec-03
  75  */
  76 #endif
  77
  78 static const char SHIFT_IN_STR[]  = "\x0F";   /* the SI control byte (0x0F) as a NUL-terminated string */
  79 static const char SHIFT_OUT_STR[] = "\x0E";   /* the SO control byte (0x0E) as a NUL-terminated string */
  80
  81 #define CR      0x0D   /* carriage return */
  82 #define LF      0x0A   /* line feed */
  83 #define H_TAB   0x09   /* horizontal tab */
  84 #define V_TAB   0x0B   /* vertical tab */
  85 #define SPACE   0x20   /* space */
  86
  87 enum {   /* inclusive code point range; presumably the Unicode half-width Katakana characters (see HWKANA_7BIT) */
  88     HWKANA_START=0xff61,   /* first code point of the range */
  89     HWKANA_END=0xff9f      /* last code point of the range */
  90 };
  91
  92 /*
  93  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
  94  * as bytes 21..7E. (Subtract 0x80.)
  95  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
  96  * as bytes 20..7F. (Subtract 0x80.)
  97  * Do not encode C1 control codes with native bytes 80..9F
  98  * as bytes 00..1F (C0 control codes).
  99  */
 100 enum {   /* native (8-bit) byte ranges of 94- and 96-character sets; subtract 0x80 for the ISO 2022 form */
 101     GR94_START=0xa1,   /* first native byte of a 94-character set */
 102     GR94_END=0xfe,     /* last native byte of a 94-character set */
 103     GR96_START=0xa0,   /* first native byte of a 96-character set */
 104     GR96_END=0xff      /* last native byte of a 96-character set */
 105 };
 106
 107 /*
 108  * ISO 2022 control codes must not be converted from Unicode
 109  * because they would mess up the byte stream.
 110  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 111  * corresponding to SO, SI, and ESC.
 112  */
 /*
  * c is compared as an unsigned 32-bit value so that a negative signed c is
  * rejected before the shift; the shift count is therefore always in 0..0x1f.
  * c is evaluated more than once: do not pass an expression with side effects.
  */
 #define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
 114
 115 /* for ISO-2022-JP and -CN implementations */
 116 typedef enum  {   /* charset/state identifiers; note that the JP and CN groups below reuse the values 1..3 */
 117         /* shared values */
 118         INVALID_STATE=-1,
 119         ASCII = 0,
 120
 121         SS2_STATE=0x10,
 122         SS3_STATE,   /* 0x11 */
 123
 124         /* JP */
 125         ISO8859_1 = 1 ,
 126         ISO8859_7 = 2 ,
 127         JISX201  = 3,
 128         JISX208 = 4,   /* JISX208..KSC5601 is the range tested by IS_JP_DBCS() */
 129         JISX212 = 5,
 130         GB2312  =6,
 131         KSC5601 =7,
 132         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
 133
 134         /* CN */
 135         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
 136         GB2312_1=1,
 137         ISO_IR_165=2,
 138         CNS_11643=3,
 139
 140         /*
 141          * these are used in StateEnum and ISO2022State variables,
 142          * but CNS_11643 must be used to index into myConverterArray[]
 143          */
 144         CNS_11643_0=0x20,
 145         CNS_11643_1,   /* 0x21 */
 146         CNS_11643_2,   /* 0x22 */
 147         CNS_11643_3,   /* 0x23 */
 148         CNS_11643_4,   /* 0x24 */
 149         CNS_11643_5,   /* 0x25 */
 150         CNS_11643_6,   /* 0x26 */
 151         CNS_11643_7    /* 0x27 */
 152 } StateEnum;
 153
 /*
  * TRUE if the StateEnum charset value cs lies in the JISX208..KSC5601 range,
  * i.e. names one of the double-byte charsets of the JP variant.
  * cs is evaluated twice.
  */
 #define IS_JP_DBCS(cs) (JISX208 <= (cs) && (cs) <= KSC5601)

 /* Charset mask: the single bit that stands for charset number cs. */
 #define CSM(cs) ((uint16_t)1 << (cs))
 158
 159 /*
 160  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 161  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 162  *
 163  * Note: The converter uses some leniency:
 164  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 165  *   all versions, not just JIS7 and JIS8.
 166  * - ICU does not distinguish between different versions of JIS X 0208.
 167  */
 168 static const uint16_t jpCharsetMasks[5]={   /* indexed by converter version 0..4; each entry ORs the CSM() bits of the allowed charsets */
 169     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),   /* version 0 */
 170     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),   /* version 1: adds JISX212 */
 171     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),   /* version 2: adds GB2312, KSC5601, ISO8859_1, ISO8859_7 */
 172     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),   /* version 3: same mask as version 2 */
 173     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)    /* version 4: same mask as version 2 */
 174 };
 175
 176 typedef enum {   /* type of UConverterDataISO2022.currentType; _ISO2022Open() initializes that field to ASCII1 */
 177         ASCII1=0,
 178         LATIN1,   /* 1 */
 179         SBCS,     /* 2 */
 180         DBCS,     /* 3 */
 181         MBCS,     /* 4 */
 182         HWKANA    /* 5 */
 183 }Cnv2022Type;
 184
 185 typedef struct ISO2022State {   /* one direction's shift/designation state; _ISO2022Reset() clears it with memset(0) */
 186     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
 187     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
 188     int8_t prevG;       /* g before single shift (SS2 or SS3) */
 189 } ISO2022State;
 190
 191 #define UCNV_OPTIONS_VERSION_MASK 0xf   /* _ISO2022Open() takes the version from these low four bits of its options argument */
 192 #define UCNV_2022_MAX_CONVERTERS 10     /* number of slots in UConverterDataISO2022.myConverterArray[] */
 193
 194 typedef struct{   /* per-converter data, allocated by _ISO2022Open() and stored in UConverter.extraInfo */
 195     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];   /* cached sub-converter data, indexed by StateEnum value; unused slots stay NULL */
 196     UConverter *currentConverter;   /* opened with ucnv_open() for the ko variant; closed in _ISO2022Close() */
 197     Cnv2022Type currentType;
 198     ISO2022State toU2022State, fromU2022State;   /* separate state for each conversion direction */
 199     uint32_t key;       /* set to 0 by a to-Unicode reset; presumably the accumulated escape-sequence key - confirm against the escape parser */
 200     uint32_t version;   /* options & UCNV_OPTIONS_VERSION_MASK; the ko and zh branches store 0 unless version 1 was requested */
 201 #ifdef U_ENABLE_GENERIC_ISO_2022
 202     UBool isFirstBuffer;
 203 #endif
 204     UBool isEmptySegment;   /* set to FALSE by a to-Unicode reset */
 205     char name[30];          /* converter name as returned by _ISO2022getName(), e.g. "ISO_2022,locale=ko,version=1" */
 206     char locale[3];         /* "ja", "ko" or "cn"; left empty for the generic converter */
 207 }UConverterDataISO2022;
 208
 209 /* Protos */
 210 /* ISO-2022 ----------------------------------------------------------------- */
 211
 212 /*Forward declaration */
 213 U_CFUNC void   /* UTF-8 from-Unicode function; its definition is outside this part of the file */
 214 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
 215                       UErrorCode * err);
 216 U_CFUNC void   /* the same conversion with offsets tracking, judging by the name - confirm at the definition */
 217 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
 218                                     UErrorCode * err);
 219
 220 #define ESC_2022 0x1B /* ESC, the byte that starts every ISO 2022 escape sequence */
 221
 222 typedef enum   /* values stored in escSeqStateTable_Value_2022[] */
 223 {
 224         INVALID_2022 = -1, /* does not correspond to a valid ISO 2022 escape sequence */
 225         VALID_NON_TERMINAL_2022 = 0, /* a valid prefix of an ISO 2022 escape sequence; more bytes are needed */
 226         VALID_TERMINAL_2022 = 1, /* a complete, valid ISO 2022 escape sequence */
 227         VALID_MAYBE_TERMINAL_2022 = 2 /* matches one ISO 2022 escape sequence so far, but further characters might match a longer one */
 228 } UCNV_TableStates_2022;
 229
 230 /*
 231 * The way these state transition arrays work is:
 232 * ex : ESC$B is the sequence for JISX208
 233 *      a) First Iteration: char is ESC
 234 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 235 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 236 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 237 *             value of x is stored at escSeqStateTable_Key_2022[0]
 238 *        iii) Save this index as offset
 239 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 240 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 241 *     b) Switch on this state and continue to next char
 242 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 243 *             which is normalize_esq_chars_2022[36] == 4
 244 *         ii) x is currently 1(from above)
 245 *               x<<=5 -- x is now 32
 246 *               x+=normalize_esq_chars_2022[36]
 247 *               now x is 36
 248 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 249 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 250 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 251 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 252 *     c) Switch on this state and continue to next char
 253 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 254 *        ii) x is currently 36 (from above)
 255 *            x<<=5 -- x is now 1152
 256 *            x+=normalize_esq_chars_2022[66]
 257 *            now x is 1161
 258 *       iii) Search for this value in escSeqStateTable_Key_2022[]
 259 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 260 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 261 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 262 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
 263 */
 264
 265
 266 /*Below are the 3 arrays depicting a state transition table*/
 267 static const int8_t normalize_esq_chars_2022[256] = {   /* indexed by byte value; nonzero entries are the 5-bit codes combined into escape-sequence keys */
 268 /*       0      1       2       3       4      5       6        7       8       9           */
 269
 270          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 271         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 272         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0   /* 20..29: 27 is ESC */
 273         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0   /* 30..39: 36 '$', 37 '%', 38 '&' */
 274         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0   /* 40..49: '(' ')' '*' '+' at 40..43, '-' '.' '/' at 45..47 */
 275         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 276         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12   /* 60..69: '@' at 64, 'A'..'E' at 65..69 */
 277         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28   /* 70..79: 'F'..'O' */
 278         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0   /* 80..89: 'R' at 82 */
 279         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0   /* 90..99: 'Z' at 90 */
 280         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 281         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 282         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 283         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 284         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 285         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 286         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 287         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 288         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 289         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 290         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 291         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 292         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 293         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 294         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 295         ,0     ,0      ,0      ,0      ,0      ,0   /* 250..255 */
 296 };
 297
 298 #ifdef U_ENABLE_GENERIC_ISO_2022
 299 /*
 300  * When the generic ISO-2022 converter is completely removed, not just disabled
 301  * per #ifdef, then the following state table and the associated tables that are
 302  * dimensioned with MAX_STATES_2022 should be trimmed.
 303  *
 304  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 305  * the associated escape sequences starting with ESC ( B should be removed.
 306  * This includes the ones with key values 1097 and all of the ones above 1000000.
 307  *
 308  * For the latter, the tables can simply be truncated.
 309  * For the former, since the tables must be kept parallel, it is probably best
 310  * to simply duplicate an adjacent table cell, parallel in all tables.
 311  *
 312  * It may make sense to restructure the tables, especially by using small search
 313  * tables for the variants instead of indexing them parallel to the table here.
 314  */
 315 #endif
 316
 317 #define MAX_STATES_2022 74   /* number of entries in each of the parallel state tables dimensioned with it */
 318 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {   /* escape-sequence keys in ascending order; the index of a key is the offset into the parallel tables */
 319 /*   0           1           2           3           4           5           6           7           8           9           */
 320
 321      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
 322     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
 323     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
 324     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
 325     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
 326     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
 327     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
 328     ,35947631   ,35947635   ,35947636   ,35947638
 329 };
 330
 331 #ifdef U_ENABLE_GENERIC_ISO_2022
 332
 333 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {   /* converter name per key index (parallel to escSeqStateTable_Key_2022); NULL where no name is assigned */
 334  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
 335
 336      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
 337     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
 338     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
 339     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
 340     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
 341     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
 342     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
 343     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
 344 };
 345
 346 #endif
 347
 348 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {   /* UCNV_TableStates_2022 value per key index (parallel to escSeqStateTable_Key_2022) */
 349 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
 350      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 351     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 352     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
 353     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 354     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 355     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 356     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 357     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 358 };
 359
 360
 361 /* Type def for refactoring changeState_2022 code*/
 362 typedef enum{   /* identifies the ISO-2022 variant; the generic value 0 exists only when U_ENABLE_GENERIC_ISO_2022 is defined */
 363 #ifdef U_ENABLE_GENERIC_ISO_2022
 364     ISO_2022=0,
 365 #endif
 366     ISO_2022_JP=1,
 367     ISO_2022_KR=2,
 368     ISO_2022_CN=3
 369 } Variant2022;
 370
 371 /*********** ISO 2022 Converter Protos ***********/
 372 static void   /* allocates cnv->extraInfo and configures the converter for the requested locale/version */
 373 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
 374
 375 static void   /* releases what _ISO2022Open() acquired */
 376  _ISO2022Close(UConverter *converter);
 377
 378 static void   /* resets the to-Unicode and/or from-Unicode state, depending on choice */
 379 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
 380
 381 static const char*   /* returns the name stored in the extra data, or NULL if there is none */
 382 _ISO2022getName(const UConverter* cnv);
 383
 384 static void   /* defined outside this part of the file */
 385 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
 386
 387 static UConverter *   /* defined outside this part of the file */
 388 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
 389
 390 #ifdef U_ENABLE_GENERIC_ISO_2022
 391 static void
 392 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
 393 #endif
 394
 395 /*const UConverterSharedData _ISO2022Data;   NOTE(review): _ISO2022Open() refers to _ISO2022Data when U_ENABLE_GENERIC_ISO_2022 is defined, but this declaration is commented out */
 396 static const UConverterSharedData _ISO2022JPData;   /* forward declarations; _ISO2022Open() stores the address of one of these in cnv->sharedData */
 397 static const UConverterSharedData _ISO2022KRData;
 398 static const UConverterSharedData _ISO2022CNData;
 399
 400 /*************** Converter implementations ******************/
 401
 402 /* The purpose of this function is to get around gcc compiler warnings. */
 403 static U_INLINE void
 404 fromUWriteUInt8(UConverter *cnv,
 405                  const char *bytes, int32_t length,
 406                  uint8_t **target, const char *targetLimit,
 407                  int32_t **offsets,
 408                  int32_t sourceIndex,
 409                  UErrorCode *pErrorCode)
 410 {
 411     char *targetChars = (char *)*target;
 412     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
 413                          offsets, sourceIndex, pErrorCode);
 414     *target = (uint8_t*)targetChars;
 415
 416 }
 417
 418 static U_INLINE void
 419 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
 420     if(myConverterData->version == 1) {
 421         UConverter *cnv = myConverterData->currentConverter;
 422
 423         cnv->toUnicodeStatus=0;     /* offset */
 424         cnv->mode=0;                /* state */
 425         cnv->toULength=0;           /* byteIndex */
 426     }
 427 }
 428
 429 static U_INLINE void
 430 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
 431    /* in ISO-2022-KR the designator sequence appears only once
 432     * in a file so we append it only once
 433     */
 434     if( converter->charErrorBufferLength==0){
 435
 436         converter->charErrorBufferLength = 4;
 437         converter->charErrorBuffer[0] = 0x1b;
 438         converter->charErrorBuffer[1] = 0x24;
 439         converter->charErrorBuffer[2] = 0x29;
 440         converter->charErrorBuffer[3] = 0x43;
 441     }
 442     if(myConverterData->version == 1) {
 443         UConverter *cnv = myConverterData->currentConverter;
 444
 445         cnv->fromUChar32=0;
 446         cnv->fromUnicodeStatus=1;   /* prevLength */
 447     }
 448 }
 449
 450 static void
 451 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
 452
 453     char myLocale[6]={' ',' ',' ',' ',' ',' '};
 454
 455     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
 456     if(cnv->extraInfo != NULL) {
 457         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
 458         uint32_t version;
 459
 460         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
 461         myConverterData->currentType = ASCII1;
 462         cnv->fromUnicodeStatus =FALSE;
 463         if(locale){
 464             uprv_strncpy(myLocale, locale, sizeof(myLocale));
 465         }
 466         version = options & UCNV_OPTIONS_VERSION_MASK;
 467         myConverterData->version = version;
 468         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
 469             (myLocale[2]=='_' || myLocale[2]=='\0'))
 470         {
 471             size_t len=0;
 472             /* open the required converters and cache them */
 473             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
 474                 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
 475             }
 476             myConverterData->myConverterArray[JISX208]      = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
 477             if(jpCharsetMasks[version]&CSM(JISX212)) {
 478                 myConverterData->myConverterArray[JISX212]  = ucnv_loadSharedData("jisx-212", NULL, errorCode);
 479             }
 480             if(jpCharsetMasks[version]&CSM(GB2312)) {
 481                 myConverterData->myConverterArray[GB2312]   = ucnv_loadSharedData("ibm-5478", NULL, errorCode);   /* gb_2312_80-1 */
 482             }
 483             if(jpCharsetMasks[version]&CSM(KSC5601)) {
 484                 myConverterData->myConverterArray[KSC5601]  = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
 485             }
 486
 487             /* set the function pointers to appropriate funtions */
 488             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 489             uprv_strcpy(myConverterData->locale,"ja");
 490
 491             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
 492             len = uprv_strlen(myConverterData->name);
 493             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
 494             myConverterData->name[len+1]='\0';
 495         }
 496         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
 497             (myLocale[2]=='_' || myLocale[2]=='\0'))
 498         {
 499             if (version==1){
 500                 myConverterData->currentConverter=
 501                     ucnv_open("icu-internal-25546",errorCode);
 502
 503                 if (U_FAILURE(*errorCode)) {
 504                     _ISO2022Close(cnv);
 505                     return;
 506                 }
 507
 508                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
 509                 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
 510                 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
 511             }else{
 512                 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
 513
 514                 if (U_FAILURE(*errorCode)) {
 515                     _ISO2022Close(cnv);
 516                     return;
 517                 }
 518
 519                 myConverterData->version = 0;
 520                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
 521             }
 522
 523             /* initialize the state variables */
 524             setInitialStateToUnicodeKR(cnv, myConverterData);
 525             setInitialStateFromUnicodeKR(cnv, myConverterData);
 526
 527             /* set the function pointers to appropriate funtions */
 528             cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
 529             uprv_strcpy(myConverterData->locale,"ko");
 530         }
 531         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
 532             (myLocale[2]=='_' || myLocale[2]=='\0'))
 533         {
 534
 535             /* open the required converters and cache them */
 536             myConverterData->myConverterArray[GB2312_1]         = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
 537             if(version==1) {
 538                 myConverterData->myConverterArray[ISO_IR_165]   = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
 539             }
 540             myConverterData->myConverterArray[CNS_11643]        = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
 541
 542
 543             /* set the function pointers to appropriate funtions */
 544             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
 545             uprv_strcpy(myConverterData->locale,"cn");
 546
 547             if (version==1){
 548                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
 549             }else{
 550                 myConverterData->version = 0;
 551                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
 552             }
 553         }
 554         else{
 555 #ifdef U_ENABLE_GENERIC_ISO_2022
 556             myConverterData->isFirstBuffer = TRUE;
 557
 558             /* append the UTF-8 escape sequence */
 559             cnv->charErrorBufferLength = 3;
 560             cnv->charErrorBuffer[0] = 0x1b;
 561             cnv->charErrorBuffer[1] = 0x25;
 562             cnv->charErrorBuffer[2] = 0x42;
 563
 564             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
 565             /* initialize the state variables */
 566             uprv_strcpy(myConverterData->name,"ISO_2022");
 567 #else
 568             *errorCode = U_UNSUPPORTED_ERROR;
 569             return;
 570 #endif
 571         }
 572
 573         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
 574
 575         if(U_FAILURE(*errorCode)) {
 576             _ISO2022Close(cnv);
 577         }
 578     } else {
 579         *errorCode = U_MEMORY_ALLOCATION_ERROR;
 580     }
 581 }
 582
 583
 584 static void
 585 _ISO2022Close(UConverter *converter) {
 586     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
 587     UConverterSharedData **array = myData->myConverterArray;
 588     int32_t i;
 589
 590     if (converter->extraInfo != NULL) {
 591         /*close the array of converter pointers and free the memory*/
 592         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
 593             if(array[i]!=NULL) {
 594                 ucnv_unloadSharedDataIfReady(array[i]);
 595             }
 596         }
 597
 598         ucnv_close(myData->currentConverter);
 599
 600         if(!converter->isExtraLocal){
 601             uprv_free (converter->extraInfo);
 602             converter->extraInfo = NULL;
 603         }
 604     }
 605 }
 606
 607 static void
 608 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
 609     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
 610     if(choice<=UCNV_RESET_TO_UNICODE) {
 611         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
 612         myConverterData->key = 0;
 613         myConverterData->isEmptySegment = FALSE;
 614     }
 615     if(choice!=UCNV_RESET_TO_UNICODE) {
 616         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
 617     }
 618 #ifdef U_ENABLE_GENERIC_ISO_2022
 619     if(myConverterData->locale[0] == 0){
 620         if(choice<=UCNV_RESET_TO_UNICODE) {
 621             myConverterData->isFirstBuffer = TRUE;
 622             myConverterData->key = 0;
 623             if (converter->mode == UCNV_SO){
 624                 ucnv_close (myConverterData->currentConverter);
 625                 myConverterData->currentConverter=NULL;
 626             }
 627             converter->mode = UCNV_SI;
 628         }
 629         if(choice!=UCNV_RESET_TO_UNICODE) {
 630             /* re-append UTF-8 escape sequence */
 631             converter->charErrorBufferLength = 3;
 632             converter->charErrorBuffer[0] = 0x1b;
 633             converter->charErrorBuffer[1] = 0x28;
 634             converter->charErrorBuffer[2] = 0x42;
 635         }
 636     }
 637     else
 638 #endif
 639     {
 640         /* reset the state variables */
 641         if(myConverterData->locale[0] == 'k'){
 642             if(choice<=UCNV_RESET_TO_UNICODE) {
 643                 setInitialStateToUnicodeKR(converter, myConverterData);
 644             }
 645             if(choice!=UCNV_RESET_TO_UNICODE) {
 646                 setInitialStateFromUnicodeKR(converter, myConverterData);
 647             }
 648         }
 649     }
 650 }
 651
 652 static const char*
 653 _ISO2022getName(const UConverter* cnv){
 654     if(cnv->extraInfo){
 655         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
 656         return myData->name;
 657     }
 658     return NULL;
 659 }
 660
 661
 662 /*************** to unicode *******************/
 663 /****************************************************************************
 664  * Recognized escape sequences are
 665  * <ESC>(B  ASCII
 666  * <ESC>.A  ISO-8859-1
 667  * <ESC>.F  ISO-8859-7
 668  * <ESC>(J  JISX-201
 669  * <ESC>(I  JISX-201
 670  * <ESC>$B  JISX-208
 671  * <ESC>$@  JISX-208
 672  * <ESC>$(D JISX-212
 673  * <ESC>$A  GB2312
 674  * <ESC>$(C KSC5601
 675  */
/*
 * ISO-2022-JP: charset/shift state selected by a completed escape sequence.
 * changeState_2022() indexes this table with the offset reported by
 * getKey_2022() (the position of the matched key in escSeqStateTable_Key_2022).
 * INVALID_STATE entries make changeState_2022() report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE for this variant.
 * The entry order must stay aligned with the escape sequence tables.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 687
 688 /*************** to unicode *******************/
/*
 * ISO-2022-CN: charset/shift state selected by a completed escape sequence.
 * changeState_2022() indexes this table with the offset reported by
 * getKey_2022(); INVALID_STATE entries are reported as
 * U_UNSUPPORTED_ESCAPE_SEQUENCE for this variant.
 * The entry order must stay aligned with the escape sequence tables.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 700
 701
 702 static UCNV_TableStates_2022
 703 getKey_2022(char c,int32_t* key,int32_t* offset){
 704     int32_t togo;
 705     int32_t low = 0;
 706     int32_t hi = MAX_STATES_2022;
 707     int32_t oldmid=0;
 708
 709     togo = normalize_esq_chars_2022[(uint8_t)c];
 710     if(togo == 0) {
 711         /* not a valid character anywhere in an escape sequence */
 712         *key = 0;
 713         *offset = 0;
 714         return INVALID_2022;
 715     }
 716     togo = (*key << 5) + togo;
 717
 718     while (hi != low)  /*binary search*/{
 719
 720         register int32_t mid = (hi+low) >> 1; /*Finds median*/
 721
 722         if (mid == oldmid)
 723             break;
 724
 725         if (escSeqStateTable_Key_2022[mid] > togo){
 726             hi = mid;
 727         }
 728         else if (escSeqStateTable_Key_2022[mid] < togo){
 729             low = mid;
 730         }
 731         else /*we found it*/{
 732             *key = togo;
 733             *offset = mid;
 734             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
 735         }
 736         oldmid = mid;
 737
 738     }
 739
 740     *key = 0;
 741     *offset = 0;
 742     return INVALID_2022;
 743 }
 744
/*
 * Runs the escape sequence state machine to determine the escape sequence -
 * codepage correspondence and updates the toUnicode state accordingly.
 *
 * Bytes are consumed from *source (which is advanced) up to sourceLimit.
 * Each consumed byte is appended to _this->toUBytes and fed to getKey_2022().
 *
 * @param _this       the ISO-2022 converter; its extraInfo holds the UConverterDataISO2022
 * @param source      in/out pointer to the current input position
 * @param sourceLimit end of the available input
 * @param var         the ISO-2022 variant whose designations are accepted
 * @param err         set to U_ILLEGAL_ESCAPE_SEQUENCE for a sequence that is not
 *                    recognized at all, or to U_UNSUPPORTED_ESCAPE_SEQUENCE for a
 *                    recognized sequence that this variant/version does not accept
 *
 * On return:
 * - incomplete sequence (input exhausted): myData2022->key!=0 and the bytes
 *   seen so far remain in toUBytes; *err is not modified
 * - success: the designation/shift was applied and toULength is reset to 0
 * - U_ILLEGAL_ESCAPE_SEQUENCE: only the first byte stays in toUBytes, all
 *   later bytes are backed out (see below)
 * - U_UNSUPPORTED_ESCAPE_SEQUENCE: toUCallbackReason is set to UCNV_UNASSIGNED
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    /* continue from the key of a sequence that was split across buffers, if any */
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of sequence bytes already stored from a previous buffer */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            /* getKey_2022() has already reset key to 0 */
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* remember the key so that an incomplete sequence can be continued with the next buffer */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        /* offset indexes the matched sequence; interpret it according to the variant */
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            /*
             * NOTE(review): myUConverter is not declared anywhere in this function;
             * unless it is provided elsewhere, this branch cannot compile when
             * U_ENABLE_GENERIC_ISO_2022 is defined - confirm before enabling.
             */
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the current G0/G1 invocation before switching to G2 */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* accept only charsets that are enabled for this converter version */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    /* accept only charsets that are enabled for this converter version */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the current G0/G1 invocation before switching to G2 */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        /* remember the current G0/G1 invocation before switching to G3 */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is not accepted by version 0 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    /*fall through*/
                case GB2312_1:
                    /*fall through*/
                case CNS_11643_1:
                    /* these charsets are designated to G1 */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* CNS 11643 plane 2 is designated to G2 */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            /* only the sequence at table offset 0x30 is accepted for this variant */
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}
 974
 975 /*Checks the characters of the buffer against valid 2022 escape sequences
 976 *if the match we return a pointer to the initial start of the sequence otherwise
 977 *we return sourceLimit
 978 */
 979 /*for 2022 looks ahead in the stream
 980  *to determine the longest possible convertible
 981  *data stream
 982  */
 983 static U_INLINE const char*
 984 getEndOfBuffer_2022(const char** source,
 985                    const char* sourceLimit,
 986                    UBool flush){
 987
 988     const char* mySource = *source;
 989
 990 #ifdef U_ENABLE_GENERIC_ISO_2022
 991     if (*source >= sourceLimit)
 992         return sourceLimit;
 993
 994     do{
 995
 996         if (*mySource == ESC_2022){
 997             int8_t i;
 998             int32_t key = 0;
 999             int32_t offset;
1000             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1001
1002             /* Kludge: I could not
1003             * figure out the reason for validating an escape sequence
1004             * twice - once here and once in changeState_2022().
1005             * is it possible to have an ESC character in a ISO2022
1006             * byte stream which is valid in a code page? Is it legal?
1007             */
1008             for (i=0;
1009             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1010             i++) {
1011                 value =  getKey_2022(*(mySource+i), &key, &offset);
1012             }
1013             if (value > 0 || *mySource==ESC_2022)
1014                 return mySource;
1015
1016             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1017                 return sourceLimit;
1018         }
1019     }while (++mySource < sourceLimit);
1020
1021     return sourceLimit;
1022 #else
1023     while(mySource < sourceLimit && *mySource != ESC_2022) {
1024         ++mySource;
1025     }
1026     return mySource;
1027 #endif
1028 }
1029
1030
1031 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1032  * any future change in _MBCSFromUChar32() function should be reflected here.
1033  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1034  */
1035 static U_INLINE int32_t
1036 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1037                                          UChar32 c,
1038                                          uint32_t* value,
1039                                          UBool useFallback,
1040                                          int outputType)
1041 {
1042     const int32_t *cx;
1043     const uint16_t *table;
1044     uint32_t stage2Entry;
1045     uint32_t myValue;
1046     int32_t length;
1047     const uint8_t *p;
1048     /*
1049      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1050      * Use internal version of ucnv_open() that verifies that the new structures are available,
1051      * else U_INTERNAL_PROGRAM_ERROR.
1052      */
1053     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1054     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1055         table=sharedData->mbcs.fromUnicodeTable;
1056         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1057         /* get the bytes and the length for the output */
1058         if(outputType==MBCS_OUTPUT_2){
1059             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1060             if(myValue<=0xff) {
1061                 length=1;
1062             } else {
1063                 length=2;
1064             }
1065         } else /* outputType==MBCS_OUTPUT_3 */ {
1066             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1067             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1068             if(myValue<=0xff) {
1069                 length=1;
1070             } else if(myValue<=0xffff) {
1071                 length=2;
1072             } else {
1073                 length=3;
1074             }
1075         }
1076         /* is this code point assigned, or do we use fallbacks? */
1077         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1078             /* assigned */
1079             *value=myValue;
1080             return length;
1081         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1082             /*
1083              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1084              * There is no way with this data structure for fallback output
1085              * to be a zero byte.
1086              */
1087             *value=myValue;
1088             return -length;
1089         }
1090     }
1091
1092     cx=sharedData->mbcs.extIndexes;
1093     if(cx!=NULL) {
1094         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1095     }
1096
1097     /* unassigned */
1098     return 0;
1099 }
1100
1101 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1102  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1103  * @param retval pointer to output byte
1104  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1105  */
1106 static U_INLINE int32_t
1107 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1108                                        UChar32 c,
1109                                        uint32_t* retval,
1110                                        UBool useFallback)
1111 {
1112     const uint16_t *table;
1113     int32_t value;
1114     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1115     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1116         return 0;
1117     }
1118     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1119     table=sharedData->mbcs.fromUnicodeTable;
1120     /* get the byte for the output */
1121     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1122     /* is this code point assigned, or do we use fallbacks? */
1123     *retval=(uint32_t)(value&0xff);
1124     if(value>=0xf00) {
1125         return 1;  /* roundtrip */
1126     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1127         return -1;  /* fallback taken */
1128     } else {
1129         return 0;  /* no mapping */
1130     }
1131 }
1132
/*
 * Convert a strict EUC DBCS value to its ISO 2022 form.
 *
 * The two low-order bytes of value must each be in the range A1..FE;
 * if so, 0x80 is subtracted from each of them, which moves them to the
 * ISO 2022 range 21..7E.
 * Return 0 if out of range.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t leadByte = (value >> 8) & 0xff;
    const uint32_t trailByte = value & 0xff;

    if(leadByte < 0xa1 || leadByte > 0xfe) {
        return 0;  /* not valid for ISO 2022 */
    }
    if(trailByte < 0xa1 || trailByte > 0xfe) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1149
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Disabled code, kept for reference only.
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* add 0x80 to each of the two bytes, then verify that both landed in A1..FE */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1167
1168 #ifdef U_ENABLE_GENERIC_ISO_2022
1169
1170 /**********************************************************************************
1171 *  ISO-2022 Converter
1172 *
1173 *
1174 */
1175
/*
 * toUnicode implementation for the generic ISO-2022 converter
 * (only compiled when U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * Alternates between two steps until the input is exhausted or the function
 * returns early:
 * 1. When no escape sequence is in progress (key==0), the bytes up to the next
 *    ESC (or the end of the input) are converted with the currently selected
 *    sub-converter; an "ASCII" converter is opened if none is selected yet.
 * 2. changeState_2022() then consumes the escape sequence.
 *
 * @param args conversion arguments; source/target are advanced,
 *             args->converter is temporarily swapped for the sub-converter
 * @param err  receives errors from the sub-converter or from changeState_2022()
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no charset selected yet: start with an ASCII sub-converter */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* run the sub-converter in place of this converter, then restore it */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /*
                 * Note on grouping: the last two tests share one pair of parentheses;
                 * because && binds tighter than ||, the whole condition is still the
                 * OR of the four individual tests.
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing an escape sequence from the previous buffer */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1279
1280 #endif
1281
1282 /*
1283  * To Unicode Callback helper function
1284  */
1285 static void
1286 toUnicodeCallback(UConverter *cnv,
1287                   const uint32_t sourceChar, const uint32_t targetUniChar,
1288                   UErrorCode* err){
1289     if(sourceChar>0xff){
1290         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1291         cnv->toUBytes[1] = (uint8_t)sourceChar;
1292         cnv->toULength = 2;
1293     }
1294     else{
1295         cnv->toUBytes[0] =(char) sourceChar;
1296         cnv->toULength = 1;
1297     }
1298
1299     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1300         *err = U_INVALID_CHAR_FOUND;
1301     }
1302     else{
1303         *err = U_ILLEGAL_CHAR_FOUND;
1304     }
1305 }
1306
1307 /**************************************ISO-2022-JP*************************************************/
1308
1309 /************************************** IMPORTANT **************************************************
1310 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1311 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1312 * The converter iterates over each Unicode codepoint
1313 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1314 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1315 * would do as far as possible.
1316 *
1317 * If the implementation of these macros or structure of sharedData struct change in the future, make
1318 * sure that ISO-2022 is also changed.
1319 ***************************************************************************************************
1320 */
1321
1322 /***************************************************************************************************
1323 * Rules for ISO-2022-jp encoding
1324 * (i)   Escape sequences must be fully contained within a line they should not
1325 *       span new lines or CRs
1326 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1327 *       JIS-Roman character escape sequence should follow before the line terminates
1328 * (iii) If the first character on the line is represented by two bytes then a two
1329 *       byte character escape sequence should precede it
1330 * (iv)  If no escape sequence is encountered then the characters are ASCII
1331 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1332 *       and invoked with SS2 (ESC N).
1333 * (vi)  If there is any G0 designation in text, there must be a switch to
1334 *       ASCII or to JIS X 0201-Roman before a space character (but not
1335 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1336 *       characters such as tab or CRLF.
* (vii) Supported encodings:
1338 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1339 *
1340 *  source : RFC-1554
1341 *
1342 *          JISX201, JISX208,JISX212 : new .cnv data files created
1343 *          KSC5601 : alias to ibm-949 mapping table
1344 *          GB2312 : alias to ibm-1386 mapping table
1345 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
1347 */
1348
1349 /* preference order of JP charsets:
      * when converting from Unicode, the charsets that are not currently
      * designated are tried in this order, after the current G0 and G2 charsets
      * and restricted to those allowed by jpCharsetMasks[] for the converter version */
1350 static const StateEnum jpCharsetPref[]={
1351     ASCII,
1352     JISX201,
1353     ISO8859_1,
1354     ISO8859_7,
1355     JISX208,
1356     JISX212,
1357     GB2312,
1358     KSC5601,
1359     HWKANA_7BIT
1360 };
1361
1362 /*
1363  * The escape sequences must be in order of the enum constants like JISX201  = 3,
1364  * not in order of jpCharsetPref[]!
      *
      * The table is indexed with the charset value (cs) when a designation
      * sequence is written to the output. The number of bytes to copy from each
      * entry is taken from the parallel escSeqCharsLen[] array.
1365  */
1366 static const char escSeqChars[][6] ={
1367     "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
1368     "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
1369     "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
1370     "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
1371     "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
1372     "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
1373     "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
1374     "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
1375     "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
1376
1377 };
/*
 * Byte lengths of the escape sequences in escSeqChars[].
 * Entries are in the same order as escSeqChars[] and are indexed with the
 * same charset value.
 */
1378 static  const int8_t escSeqCharsLen[] ={
1379     3, /* length of <ESC>(B  ASCII       */
1380     3, /* length of <ESC>.A  ISO-8859-1  */
1381     3, /* length of <ESC>.F  ISO-8859-7  */
1382     3, /* length of <ESC>(J  JISX-201    */
1383     3, /* length of <ESC>$B  JISX-208    */
1384     4, /* length of <ESC>$(D JISX-212    */
1385     3, /* length of <ESC>$A  GB2312      */
1386     4, /* length of <ESC>$(C KSC5601     */
1387     3  /* length of <ESC>(I  HWKANA_7BIT */
1388 };
1389
1390 /*
1391 * The iteration over various code pages works this way:
1392 * i)   Get the currentState from myConverterData->currentState
1393 * ii)  Check if the character is mapped to a valid character in the currentState
1394 *      Yes ->  a) set the initIterState to currentState
1395 *       b) remain in this state until an invalid character is found
1396 *      No  ->  a) go to the next code page and find the character
1397 * iii) Before changing the state, increment the current state and check if it
1398 *      is equal to the initIterState
1399 *      Yes ->  The character cannot be represented in any of the supported encodings:
1400 *       break and return a U_INVALID_CHAR_FOUND error
1401 *      No  ->  Continue and find the character in next code page
1402 *
1403 *
1404 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1405 */
1406
/*
 * Map a JIS X 0201 (Roman) byte 00..7F to Unicode.
 * Only two positions differ from ASCII: 5C maps to U+00A5 and 7E maps to U+203E;
 * every other value is returned unchanged.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1420
/*
 * Map a Unicode code point to a JIS X 0201 (Roman) byte 00..7F.
 * U+00A5 and U+203E take the 5C and 7E positions, so U+005C and U+007E
 * themselves are unmappable. Returns U+FFFE for every unmappable input.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these byte positions are occupied by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return value <= 0x7f ? value : 0xfffe;
    }
}
1435
/*
 * Convert a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0) to the corresponding pair of JIS X 0208 bytes in the 21..7E range.
 * Returns 0 if the pair lies beyond the JIS X 0208 range (value > 0xEFFC).
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint32_t row;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* lead bytes up to 9F and lead bytes E0..EF use different offsets; each lead covers two rows */
    row = (lead - (lead <= 0x9f00 ? 0x7000 : 0xb000)) << 1;

    if(trail > 0x9e) {
        /* trail 9F..FC: second row of the pair */
        return row | (uint32_t)(trail - 0x7e);
    }

    /* trail up to 9E: first row of the pair; Shift-JIS skips 7F in the trail range */
    row -= 0x100;
    return row | (uint32_t)(trail - (trail <= 0x7e ? 0x1f : 0x20));
}
1471
/*
 * Convert a pair of JIS X 0208 bytes (each expected in 21..7E) to Shift-JIS
 * and store the result in bytes[0] (lead) and bytes[1] (trail).
 * An out-of-range input byte is turned into a byte value that is not valid
 * for Shift-JIS (either explicitly set to 0 or left as an equally invalid
 * value by the arithmetic), so that the converter consuming the pair
 * rejects it.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t half;
    uint8_t sjisLead = 0;   /* 0 marks an invalid lead byte */
    uint8_t sjisTrail = 0;  /* 0 marks an invalid trail byte */

    if((c1 & 1) == 0) {
        /* even first byte: second row of the Shift-JIS lead byte, trail 9F..FC */
        if((uint8_t)(c2 - 0x21) <= (0x7e - 0x21)) {
            sjisTrail = (uint8_t)(c2 + 0x7e);
        }
    } else {
        /* odd first byte: first row of the Shift-JIS lead byte, trail skips 7F */
        ++c1;
        if(c2 <= 0x7e) {
            sjisTrail = (uint8_t)(c2 + (c2 <= 0x5f ? 0x1f : 0x20));
        }
    }

    /* two JIS rows share one Shift-JIS lead byte */
    half = (uint8_t)(c1 >> 1);
    if(half <= 0x2f) {
        sjisLead = (uint8_t)(half + 0x70);
    } else if(half <= 0x3f) {
        sjisLead = (uint8_t)(half + 0xb0);
    }

    bytes[0] = (char)sjisLead;
    bytes[1] = (char)sjisTrail;
}
1508
1509 /*
1510  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1511  * Katakana.
1512  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1513  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1514  * These were the only fallbacks in ICU's jisx-208.ucm file.
      *
      * The table is indexed with (code point - HWKANA_START) and has one entry
      * for each code point in HWKANA_START..HWKANA_END. The from-Unicode
      * conversion uses an entry directly as the two-byte output value
      * (high byte first).
1515  */
1516 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1517     0x2123,  /* U+FF61 */
1518     0x2156,
1519     0x2157,
1520     0x2122,
1521     0x2126,
1522     0x2572,
1523     0x2521,
1524     0x2523,
1525     0x2525,
1526     0x2527,
1527     0x2529,
1528     0x2563,
1529     0x2565,
1530     0x2567,
1531     0x2543,
1532     0x213C,  /* U+FF70 */
1533     0x2522,
1534     0x2524,
1535     0x2526,
1536     0x2528,
1537     0x252A,
1538     0x252B,
1539     0x252D,
1540     0x252F,
1541     0x2531,
1542     0x2533,
1543     0x2535,
1544     0x2537,
1545     0x2539,
1546     0x253B,
1547     0x253D,
1548     0x253F,  /* U+FF80 */
1549     0x2541,
1550     0x2544,
1551     0x2546,
1552     0x2548,
1553     0x254A,
1554     0x254B,
1555     0x254C,
1556     0x254D,
1557     0x254E,
1558     0x254F,
1559     0x2552,
1560     0x2555,
1561     0x2558,
1562     0x255B,
1563     0x255E,
1564     0x255F,  /* U+FF90 */
1565     0x2560,
1566     0x2561,
1567     0x2562,
1568     0x2564,
1569     0x2566,
1570     0x2568,
1571     0x2569,
1572     0x256A,
1573     0x256B,
1574     0x256C,
1575     0x256D,
1576     0x256F,
1577     0x2573,
1578     0x212B,
1579     0x212C   /* U+FF9F */
1580 };
1581
/*
 * Convert UTF-16 text from args->source to ISO-2022-JP (or one of its
 * variants, selected by converterData->version) bytes in args->target,
 * optionally recording a source offset for each output byte in args->offsets.
 *
 * For each code point a list of candidate charsets (choices[]) is tried:
 * the current G0 and G2 charsets first, then the remaining charsets from
 * jpCharsetPref[] that jpCharsetMasks[] allows for this version.
 * A roundtrip mapping ends the search; a fallback mapping is remembered
 * while the search continues for a roundtrip in the remaining charsets.
 * SI, a designation escape sequence and SO or ESC N are written before the
 * character bytes when the output state has to change.
 *
 * On return args->source and args->target are updated. *err is set to
 *   U_ILLEGAL_CHAR_FOUND    for an unpaired surrogate or an SO/SI/ESC in the input,
 *   U_INVALID_CHAR_FOUND    if no candidate charset maps the code point,
 *   U_BUFFER_OVERFLOW_ERROR if the target is full while input remains.
 * For the first two, the offending code point is stored in cnv->fromUChar32.
 * If args->flush is set and all input was consumed without error, the output
 * is returned to the SI state with ASCII designated to G0.
 */
1582 static void
1583 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1584     UConverter *cnv = args->converter;
1585     UConverterDataISO2022 *converterData;
1586     ISO2022State *pFromU2022State;
1587     uint8_t *target = (uint8_t *) args->target;
1588     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1589     const UChar* source = args->source;
1590     const UChar* sourceLimit = args->sourceLimit;
1591     int32_t* offsets = args->offsets;
1592     UChar32 sourceChar;
1593     char buffer[8]; /* staging area for all bytes produced by one code point */
1594     int32_t len, outLen;
1595     int8_t choices[10]; /* candidate charsets in the order in which they are tried */
1596     int32_t choiceCount;
1597     uint32_t targetValue = 0;
1598     UBool useFallback;
1599
1600     int32_t i;
1601     int8_t cs, g; /* charset selected for the current code point, and the G set (0..2) it is used from */
1602
1603     /* set up the state */
1604     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1605     pFromU2022State   = &converterData->fromU2022State;
1606
1607     choiceCount = 0;
1608
1609     /* check if the last codepoint of previous buffer was a lead surrogate*/
1610     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1611         goto getTrail;
1612     }
1613
1614     while(source < sourceLimit) {
1615         if(target < targetLimit) {
1616
1617             sourceChar  = *(source++);
1618             /*check if the char is a First surrogate*/
1619             if(UTF_IS_SURROGATE(sourceChar)) {
1620                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1621 getTrail:
1622                     /*look ahead to find the trail surrogate*/
1623                     if(source < sourceLimit) {
1624                         /* test the following code unit */
1625                         UChar trail=(UChar) *source;
1626                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1627                             source++;
1628                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1629                             cnv->fromUChar32=0x00;
1630                             /* convert this supplementary code point */
1631                             /* exit this condition tree */
1632                         } else {
1633                             /* this is an unmatched lead code unit (1st surrogate) */
1634                             /* callback(illegal) */
1635                             *err=U_ILLEGAL_CHAR_FOUND;
1636                             cnv->fromUChar32=sourceChar;
1637                             break;
1638                         }
1639                     } else {
1640                         /* no more input */
1641                         cnv->fromUChar32=sourceChar;
1642                         break;
1643                     }
1644                 } else {
1645                     /* this is an unmatched trail code unit (2nd surrogate) */
1646                     /* callback(illegal) */
1647                     *err=U_ILLEGAL_CHAR_FOUND;
1648                     cnv->fromUChar32=sourceChar;
1649                     break;
1650                 }
1651             }
1652
1653             /* do not convert SO/SI/ESC */
1654             if(IS_2022_CONTROL(sourceChar)) {
1655                 /* callback(illegal) */
1656                 *err=U_ILLEGAL_CHAR_FOUND;
1657                 cnv->fromUChar32=sourceChar;
1658                 break;
1659             }
1660
1661             /* do the conversion */
1662
1663             if(choiceCount == 0) { /* choices[] not built yet or invalidated by a state change: rebuild it */
1664                 uint16_t csm;
1665
1666                 /*
1667                  * The csm variable keeps track of which charsets are allowed
1668                  * and not used yet while building the choices[].
1669                  */
1670                 csm = jpCharsetMasks[converterData->version];
1671                 choiceCount = 0;
1672
1673                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1674                 if(converterData->version == 3 || converterData->version == 4) {
1675                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1676                 }
1677                 /* Do not try single-byte half-width Katakana for other versions. */
1678                 csm &= ~CSM(HWKANA_7BIT);
1679
1680                 /* try the current G0 charset */
1681                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1682                 csm &= ~CSM(cs);
1683
1684                 /* try the current G2 charset */
1685                 if((cs = pFromU2022State->cs[2]) != 0) {
1686                     choices[choiceCount++] = cs;
1687                     csm &= ~CSM(cs);
1688                 }
1689
1690                 /* try all the other possible charsets */
1691                 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1692                     cs = (int8_t)jpCharsetPref[i];
1693                     if(CSM(cs) & csm) {
1694                         choices[choiceCount++] = cs;
1695                         csm &= ~CSM(cs);
1696                     }
1697                 }
1698             }
1699
1700             cs = g = 0;
1701             /*
1702              * len==0: no mapping found yet
1703              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1704              * len>0: found a roundtrip result, done
1705              */
1706             len = 0;
1707             /*
1708              * We will turn off useFallback after finding a fallback,
1709              * but we still get fallbacks from PUA code points as usual.
1710              * Therefore, we will also need to check that we don't overwrite
1711              * an early fallback with a later one.
1712              */
1713             useFallback = cnv->useFallback;
1714
1715             for(i = 0; i < choiceCount && len <= 0; ++i) {
1716                 uint32_t value;
1717                 int32_t len2;
1718                 int8_t cs0 = choices[i];
1719                 switch(cs0) {
1720                 case ASCII:
1721                     if(sourceChar <= 0x7f) {
1722                         targetValue = (uint32_t)sourceChar;
1723                         len = 1;
1724                         cs = cs0;
1725                         g = 0;
1726                     }
1727                     break;
1728                 case ISO8859_1:
1729                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1730                         targetValue = (uint32_t)sourceChar - 0x80;
1731                         len = 1;
1732                         cs = cs0;
1733                         g = 2;
1734                     }
1735                     break;
1736                 case HWKANA_7BIT:
1737                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1738                         if(converterData->version==3) {
1739                             /* JIS7: use G1 (SO) */
1740                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1741                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1742                             len = 1;
1743                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1744                             g = 1;
1745                         } else if(converterData->version==4) {
1746                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1747                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1748                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1749                             len = 1;
1750
1751                             cs = pFromU2022State->cs[0];
1752                             if(IS_JP_DBCS(cs)) {
1753                                 /* switch from a DBCS charset to JISX201 */
1754                                 cs = (int8_t)JISX201;
1755                             }
1756                             /* else stay in the current G0 charset */
1757                             g = 0;
1758                         }
1759                         /* else do not use HWKANA_7BIT with other versions */
1760                     }
1761                     break;
1762                 case JISX201:
1763                     /* G0 SBCS */
1764                     value = jisx201FromU(sourceChar);
1765                     if(value <= 0x7f) {
1766                         targetValue = value;
1767                         len = 1;
1768                         cs = cs0;
1769                         g = 0;
1770                         useFallback = FALSE;
1771                     }
1772                     break;
1773                 case JISX208:
1774                     /* G0 DBCS from Shift-JIS table */
1775                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1776                                 converterData->myConverterArray[cs0],
1777                                 sourceChar, &value,
1778                                 useFallback, MBCS_OUTPUT_2);
1779                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1780                         value = _2022FromSJIS(value);
1781                         if(value != 0) {
1782                             targetValue = value;
1783                             len = len2;
1784                             cs = cs0;
1785                             g = 0;
1786                             useFallback = FALSE;
1787                         }
1788                     } else if(len == 0 && useFallback &&
1789                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                         /* hardcoded fallback from half-width Katakana; the table value is used as is */
1790                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1791                         len = -2;
1792                         cs = cs0;
1793                         g = 0;
1794                         useFallback = FALSE;
1795                     }
1796                     break;
1797                 case ISO8859_7:
1798                     /* G0 SBCS forced to 7-bit output */
1799                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1800                                 converterData->myConverterArray[cs0],
1801                                 sourceChar, &value,
1802                                 useFallback);
1803                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1804                         targetValue = value - 0x80;
1805                         len = len2;
1806                         cs = cs0;
1807                         g = 2;
1808                         useFallback = FALSE;
1809                     }
1810                     break;
1811                 default:
1812                     /* G0 DBCS */
1813                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1814                                 converterData->myConverterArray[cs0],
1815                                 sourceChar, &value,
1816                                 useFallback, MBCS_OUTPUT_2);
1817                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1818                         if(cs0 == KSC5601) {
1819                             /*
1820                              * Check for valid bytes for the encoding scheme.
1821                              * This is necessary because the sub-converter (windows-949)
1822                              * has a broader encoding scheme than is valid for 2022.
1823                              */
1824                             value = _2022FromGR94DBCS(value);
1825                             if(value == 0) {
1826                                 break;
1827                             }
1828                         }
1829                         targetValue = value;
1830                         len = len2;
1831                         cs = cs0;
1832                         g = 0;
1833                         useFallback = FALSE;
1834                     }
1835                     break;
1836                 }
1837             }
1838
1839             if(len != 0) {
1840                 if(len < 0) {
1841                     len = -len;  /* fallback */
1842                 }
1843                 outLen = 0; /* count output bytes */
1844
1845                 /* write SI if necessary (only for JIS7) */
1846                 if(pFromU2022State->g == 1 && g == 0) {
1847                     buffer[outLen++] = UCNV_SI;
1848                     pFromU2022State->g = 0;
1849                 }
1850
1851                 /* write the designation sequence if necessary */
1852                 if(cs != pFromU2022State->cs[g]) {
1853                     int32_t escLen = escSeqCharsLen[cs];
1854                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1855                     outLen += escLen;
1856                     pFromU2022State->cs[g] = cs;
1857
1858                     /* invalidate the choices[] */
1859                     choiceCount = 0;
1860                 }
1861
1862                 /* write the shift sequence if necessary */
1863                 if(g != pFromU2022State->g) {
1864                     switch(g) {
1865                     /* case 0 handled before writing escapes */
1866                     case 1:
1867                         buffer[outLen++] = UCNV_SO;
1868                         pFromU2022State->g = 1;
1869                         break;
1870                     default: /* case 2 */
                         /* ESC N; pFromU2022State->g is deliberately left unchanged here */
1871                         buffer[outLen++] = 0x1b;
1872                         buffer[outLen++] = 0x4e;
1873                         break;
1874                     /* no case 3: no SS3 in ISO-2022-JP-x */
1875                     }
1876                 }
1877
1878                 /* write the output bytes */
1879                 if(len == 1) {
1880                     buffer[outLen++] = (char)targetValue;
1881                 } else /* len == 2 */ {
1882                     buffer[outLen++] = (char)(targetValue >> 8);
1883                     buffer[outLen++] = (char)targetValue;
1884                 }
1885             } else {
1886                 /*
1887                  * if we cannot find the character after checking all codepages
1888                  * then this is an error
1889                  */
1890                 *err = U_INVALID_CHAR_FOUND;
1891                 cnv->fromUChar32=sourceChar;
1892                 break;
1893             }
1894
1895             if(sourceChar == CR || sourceChar == LF) {
1896                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1897                 pFromU2022State->cs[2] = 0;
1898                 choiceCount = 0;
1899             }
1900
1901             /* output outLen>0 bytes in buffer[] */
1902             if(outLen == 1) {
1903                 *target++ = buffer[0];
1904                 if(offsets) {
1905                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1906                 }
1907             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1908                 *target++ = buffer[0];
1909                 *target++ = buffer[1];
1910                 if(offsets) {
1911                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1912                     *offsets++ = sourceIndex;
1913                     *offsets++ = sourceIndex;
1914                 }
1915             } else {
                 /* general case: let the helper write buffer[] to the target and report any error through *err */
1916                 fromUWriteUInt8(
1917                     cnv,
1918                     buffer, outLen,
1919                     &target, (const char *)targetLimit,
1920                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1921                     err);
1922                 if(U_FAILURE(*err)) {
1923                     break;
1924                 }
1925             }
1926         } /* end if(target < targetLimit) */
1927         else{
1928             *err =U_BUFFER_OVERFLOW_ERROR;
1929             break;
1930         }
1931
1932     }/* end while(source < sourceLimit) */
1933
1934     /*
1935      * the end of the input stream and detection of truncated input
1936      * are handled by the framework, but for ISO-2022-JP conversion
1937      * we need to be in ASCII mode at the very end
1938      *
1939      * conditions:
1940      *   successful
1941      *   in SO mode or not in ASCII mode
1942      *   end of input and no truncated input
1943      */
1944     if( U_SUCCESS(*err) &&
1945         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1946         args->flush && source>=sourceLimit && cnv->fromUChar32==0
1947     ) {
1948         int32_t sourceIndex;
1949
1950         outLen = 0;
1951
1952         if(pFromU2022State->g != 0) {
1953             buffer[outLen++] = UCNV_SI;
1954             pFromU2022State->g = 0;
1955         }
1956
1957         if(pFromU2022State->cs[0] != ASCII) {
1958             int32_t escLen = escSeqCharsLen[ASCII];
1959             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1960             outLen += escLen;
1961             pFromU2022State->cs[0] = (int8_t)ASCII;
1962         }
1963
1964         /* get the source index of the last input character */
1965         /*
1966          * TODO this would be simpler and more reliable if we used a pair
1967          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1968          * so that we could simply use the prevSourceIndex here;
1969          * this code gives an incorrect result for the rare case of an unmatched
1970          * trail surrogate that is alone in the last buffer of the text stream
1971          */
1972         sourceIndex=(int32_t)(source-args->source);
1973         if(sourceIndex>0) {
1974             --sourceIndex;
1975             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1976                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1977             ) {
1978                 --sourceIndex;
1979             }
1980         } else {
1981             sourceIndex=-1;
1982         }
1983
1984         fromUWriteUInt8(
1985             cnv,
1986             buffer, outLen,
1987             &target, (const char *)targetLimit,
1988             &offsets, sourceIndex,
1989             err);
1990     }
1991
1992     /*save the state and return */
1993     args->source = source;
1994     args->target = (char*)target;
1995 }
1996
1997 /*************** to unicode *******************/
1998
1999 static void
2000 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2001                                                UErrorCode* err){
2002     char tempBuf[2];
2003     const char *mySource = (char *) args->source;
2004     UChar *myTarget = args->target;
2005     const char *mySourceLimit = args->sourceLimit;
2006     uint32_t targetUniChar = 0x0000;
2007     uint32_t mySourceChar = 0x0000;
2008     uint32_t tmpSourceChar = 0x0000;
2009     UConverterDataISO2022* myData;
2010     ISO2022State *pToU2022State;
2011     StateEnum cs;
2012
2013     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2014     pToU2022State = &myData->toU2022State;
2015
2016     if(myData->key != 0) {
2017         /* continue with a partial escape sequence */
2018         goto escape;
2019     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2020         /* continue with a partial double-byte character */
2021         mySourceChar = args->converter->toUBytes[0];
2022         args->converter->toULength = 0;
2023         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2024         targetUniChar = missingCharMarker;
2025         goto getTrailByte;
2026     }
2027
2028     while(mySource < mySourceLimit){
2029
2030         targetUniChar =missingCharMarker;
2031
2032         if(myTarget < args->targetLimit){
2033
2034             mySourceChar= (unsigned char) *mySource++;
2035
2036             switch(mySourceChar) {
2037             case UCNV_SI:
2038                 if(myData->version==3) {
2039                     pToU2022State->g=0;
2040                     continue;
2041                 } else {
2042                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2043                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2044                     break;
2045                 }
2046
2047             case UCNV_SO:
2048                 if(myData->version==3) {
2049                     /* JIS7: switch to G1 half-width Katakana */
2050                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2051                     pToU2022State->g=1;
2052                     continue;
2053                 } else {
2054                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2055                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2056                     break;
2057                 }
2058
2059             case ESC_2022:
2060                 mySource--;
2061 escape:
2062                 {
2063                     const char * mySourceBefore = mySource;
2064                     int8_t toULengthBefore = args->converter->toULength;
2065
2066                     changeState_2022(args->converter,&(mySource),
2067                         mySourceLimit, ISO_2022_JP,err);
2068
2069                     /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2070                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2071                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2072                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2073                         args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
2074                     }
2075                 }
2076
2077                 /* invalid or illegal escape sequence */
2078                 if(U_FAILURE(*err)){
2079                     args->target = myTarget;
2080                     args->source = mySource;
2081                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2082                     return;
2083                 }
2084                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2085                 if(myData->key==0) {
2086                     myData->isEmptySegment = TRUE;
2087                 }
2088                 continue;
2089
2090             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2091
2092             case CR:
2093                 /*falls through*/
2094             case LF:
2095                 /* automatically reset to single-byte mode */
2096                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2097                     pToU2022State->cs[0] = (int8_t)ASCII;
2098                 }
2099                 pToU2022State->cs[2] = 0;
2100                 pToU2022State->g = 0;
2101                 /* falls through */
2102             default:
2103                 /* convert one or two bytes */
2104                 myData->isEmptySegment = FALSE;
2105                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2106                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2107                     !IS_JP_DBCS(cs)
2108                 ) {
2109                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2110                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2111
2112                     /* return from a single-shift state to the previous one */
2113                     if(pToU2022State->g >= 2) {
2114                         pToU2022State->g=pToU2022State->prevG;
2115                     }
2116                 } else switch(cs) {
2117                 case ASCII:
2118                     if(mySourceChar <= 0x7f) {
2119                         targetUniChar = mySourceChar;
2120                     }
2121                     break;
2122                 case ISO8859_1:
2123                     if(mySourceChar <= 0x7f) {
2124                         targetUniChar = mySourceChar + 0x80;
2125                     }
2126                     /* return from a single-shift state to the previous one */
2127                     pToU2022State->g=pToU2022State->prevG;
2128                     break;
2129                 case ISO8859_7:
2130                     if(mySourceChar <= 0x7f) {
2131                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2132                         targetUniChar =
2133                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2134                                 myData->myConverterArray[cs],
2135                                 mySourceChar + 0x80);
2136                     }
2137                     /* return from a single-shift state to the previous one */
2138                     pToU2022State->g=pToU2022State->prevG;
2139                     break;
2140                 case JISX201:
2141                     if(mySourceChar <= 0x7f) {
2142                         targetUniChar = jisx201ToU(mySourceChar);
2143                     }
2144                     break;
2145                 case HWKANA_7BIT:
2146                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2147                         /* 7-bit halfwidth Katakana */
2148                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2149                     }
2150                     break;
2151                 default:
2152                     /* G0 DBCS */
2153                     if(mySource < mySourceLimit) {
2154                         int leadIsOk, trailIsOk;
2155                         uint8_t trailByte;
2156 getTrailByte:
2157                         trailByte = (uint8_t)*mySource;
2158                         /*
2159                          * Ticket 5691: consistent illegal sequences:
2160                          * - We include at least the first byte in the illegal sequence.
2161                          * - If any of the non-initial bytes could be the start of a character,
2162                          *   we stop the illegal sequence before the first one of those.
2163                          *
2164                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2165                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2166                          * Otherwise we convert or report the pair of bytes.
2167                          */
2168                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2169                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2170                         if (leadIsOk && trailIsOk) {
2171                             ++mySource;
2172                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2173                             if(cs == JISX208) {
2174                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2175                                 mySourceChar = tmpSourceChar;
2176                             } else {
2177                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2178                                 mySourceChar = tmpSourceChar;
2179                                 if (cs == KSC5601) {
2180                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2181                                 }
2182                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2183                                 tempBuf[1] = (char)(tmpSourceChar);
2184                             }
2185                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2186                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2187                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2188                             ++mySource;
2189                             /* add another bit so that the code below writes 2 bytes in case of error */
2190                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2191                         }
2192                     } else {
2193                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2194                         args->converter->toULength = 1;
2195                         goto endloop;
2196                     }
2197                 }  /* End of inner switch */
2198                 break;
2199             }  /* End of outer switch */
2200             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2201                 if(args->offsets){
2202                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2203                 }
2204                 *(myTarget++)=(UChar)targetUniChar;
2205             }
2206             else if(targetUniChar > missingCharMarker){
2207                 /* disassemble the surrogate pair and write to output*/
2208                 targetUniChar-=0x0010000;
2209                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2210                 if(args->offsets){
2211                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2212                 }
2213                 ++myTarget;
2214                 if(myTarget< args->targetLimit){
2215                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2216                     if(args->offsets){
2217                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2218                     }
2219                     ++myTarget;
2220                 }else{
2221                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2222                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2223                 }
2224
2225             }
2226             else{
2227                 /* Call the callback function*/
2228                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2229                 break;
2230             }
2231         }
2232         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2233             *err =U_BUFFER_OVERFLOW_ERROR;
2234             break;
2235         }
2236     }
2237 endloop:
2238     args->target = myTarget;
2239     args->source = mySource;
2240 }
2241
2242
2243 /***************************************************************
2244 *   Rules for ISO-2022-KR encoding
2245 *   i) The KSC5601 designator sequence should appear only once in a file,
2246 *      at the beginning of a line before any KSC5601 characters. This usually
2247 *      means that it appears by itself on the first line of the file
2248 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
2249 *      and SI to shift into single byte mode
2250 */
2251 static void
2252 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2253
2254     UConverter* saveConv = args->converter;
2255     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2256     args->converter=myConverterData->currentConverter;
2257
2258     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2259     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2260     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2261
2262     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2263         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2264             uprv_memcpy(
2265                 saveConv->charErrorBuffer,
2266                 myConverterData->currentConverter->charErrorBuffer,
2267                 myConverterData->currentConverter->charErrorBufferLength);
2268         }
2269         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2270         myConverterData->currentConverter->charErrorBufferLength = 0;
2271     }
2272     args->converter=saveConv;
2273 }
2274
2275 static void
2276 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2277
2278     const UChar *source = args->source;
2279     const UChar *sourceLimit = args->sourceLimit;
2280     unsigned char *target = (unsigned char *) args->target;
2281     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2282     int32_t* offsets = args->offsets;
2283     uint32_t targetByteUnit = 0x0000;
2284     UChar32 sourceChar = 0x0000;
2285     UBool isTargetByteDBCS;
2286     UBool oldIsTargetByteDBCS;
2287     UConverterDataISO2022 *converterData;
2288     UConverterSharedData* sharedData;
2289     UBool useFallback;
2290     int32_t length =0;
2291
2292     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2293     /* if the version is 1 then the user is requesting
2294      * conversion with ibm-25546 pass the arguments to
2295      * MBCS converter and return
2296      */
2297     if(converterData->version==1){
2298         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2299         return;
2300     }
2301
2302     /* initialize data */
2303     sharedData = converterData->currentConverter->sharedData;
2304     useFallback = args->converter->useFallback;
2305     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2306     oldIsTargetByteDBCS = isTargetByteDBCS;
2307
2308     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2309     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2310         goto getTrail;
2311     }
2312     while(source < sourceLimit){
2313
2314         targetByteUnit = missingCharMarker;
2315
2316         if(target < (unsigned char*) args->targetLimit){
2317             sourceChar = *source++;
2318
2319             /* do not convert SO/SI/ESC */
2320             if(IS_2022_CONTROL(sourceChar)) {
2321                 /* callback(illegal) */
2322                 *err=U_ILLEGAL_CHAR_FOUND;
2323                 args->converter->fromUChar32=sourceChar;
2324                 break;
2325             }
2326
2327             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2328             if(length < 0) {
2329                 length = -length;  /* fallback */
2330             }
2331             /* only DBCS or SBCS characters are expected*/
2332             /* DB characters with high bit set to 1 are expected */
2333             if( length > 2 || length==0 ||
2334                 (length == 1 && targetByteUnit > 0x7f) ||
2335                 (length == 2 &&
2336                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2337                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2338             ) {
2339                 targetByteUnit=missingCharMarker;
2340             }
2341             if (targetByteUnit != missingCharMarker){
2342
2343                 oldIsTargetByteDBCS = isTargetByteDBCS;
2344                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2345                   /* append the shift sequence */
2346                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2347
2348                     if (isTargetByteDBCS)
2349                         *target++ = UCNV_SO;
2350                     else
2351                         *target++ = UCNV_SI;
2352                     if(offsets)
2353                         *(offsets++) = (int32_t)(source - args->source-1);
2354                 }
2355                 /* write the targetUniChar  to target */
2356                 if(targetByteUnit <= 0x00FF){
2357                     if( target < targetLimit){
2358                         *(target++) = (unsigned char) targetByteUnit;
2359                         if(offsets){
2360                             *(offsets++) = (int32_t)(source - args->source-1);
2361                         }
2362
2363                     }else{
2364                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2365                         *err = U_BUFFER_OVERFLOW_ERROR;
2366                     }
2367                 }else{
2368                     if(target < targetLimit){
2369                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2370                         if(offsets){
2371                             *(offsets++) = (int32_t)(source - args->source-1);
2372                         }
2373                         if(target < targetLimit){
2374                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2375                             if(offsets){
2376                                 *(offsets++) = (int32_t)(source - args->source-1);
2377                             }
2378                         }else{
2379                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2380                             *err = U_BUFFER_OVERFLOW_ERROR;
2381                         }
2382                     }else{
2383                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2384                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2385                         *err = U_BUFFER_OVERFLOW_ERROR;
2386                     }
2387                 }
2388
2389             }
2390             else{
2391                 /* oops.. the code point is unassingned
2392                  * set the error and reason
2393                  */
2394
2395                 /*check if the char is a First surrogate*/
2396                 if(UTF_IS_SURROGATE(sourceChar)) {
2397                     if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2398 getTrail:
2399                         /*look ahead to find the trail surrogate*/
2400                         if(source <  sourceLimit) {
2401                             /* test the following code unit */
2402                             UChar trail=(UChar) *source;
2403                             if(UTF_IS_SECOND_SURROGATE(trail)) {
2404                                 source++;
2405                                 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2406                                 *err = U_INVALID_CHAR_FOUND;
2407                                 /* convert this surrogate code point */
2408                                 /* exit this condition tree */
2409                             } else {
2410                                 /* this is an unmatched lead code unit (1st surrogate) */
2411                                 /* callback(illegal) */
2412                                 *err=U_ILLEGAL_CHAR_FOUND;
2413                             }
2414                         } else {
2415                             /* no more input */
2416                             *err = U_ZERO_ERROR;
2417                         }
2418                     } else {
2419                         /* this is an unmatched trail code unit (2nd surrogate) */
2420                         /* callback(illegal) */
2421                         *err=U_ILLEGAL_CHAR_FOUND;
2422                     }
2423                 } else {
2424                     /* callback(unassigned) for a BMP code point */
2425                     *err = U_INVALID_CHAR_FOUND;
2426                 }
2427
2428                 args->converter->fromUChar32=sourceChar;
2429                 break;
2430             }
2431         } /* end if(myTargetIndex<myTargetLength) */
2432         else{
2433             *err =U_BUFFER_OVERFLOW_ERROR;
2434             break;
2435         }
2436
2437     }/* end while(mySourceIndex<mySourceLength) */
2438
2439     /*
2440      * the end of the input stream and detection of truncated input
2441      * are handled by the framework, but for ISO-2022-KR conversion
2442      * we need to be in ASCII mode at the very end
2443      *
2444      * conditions:
2445      *   successful
2446      *   not in ASCII mode
2447      *   end of input and no truncated input
2448      */
2449     if( U_SUCCESS(*err) &&
2450         isTargetByteDBCS &&
2451         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2452     ) {
2453         int32_t sourceIndex;
2454
2455         /* we are switching to ASCII */
2456         isTargetByteDBCS=FALSE;
2457
2458         /* get the source index of the last input character */
2459         /*
2460          * TODO this would be simpler and more reliable if we used a pair
2461          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2462          * so that we could simply use the prevSourceIndex here;
2463          * this code gives an incorrect result for the rare case of an unmatched
2464          * trail surrogate that is alone in the last buffer of the text stream
2465          */
2466         sourceIndex=(int32_t)(source-args->source);
2467         if(sourceIndex>0) {
2468             --sourceIndex;
2469             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2470                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2471             ) {
2472                 --sourceIndex;
2473             }
2474         } else {
2475             sourceIndex=-1;
2476         }
2477
2478         fromUWriteUInt8(
2479             args->converter,
2480             SHIFT_IN_STR, 1,
2481             &target, (const char *)targetLimit,
2482             &offsets, sourceIndex,
2483             err);
2484     }
2485
2486     /*save the state and return */
2487     args->source = source;
2488     args->target = (char*)target;
2489     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2490 }
2491
2492 /************************ To Unicode ***************************************/
2493
2494 static void
2495 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2496                                                             UErrorCode* err){
2497     char const* sourceStart;
2498     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2499
2500     UConverterToUnicodeArgs subArgs;
2501     int32_t minArgsSize;
2502
2503     /* set up the subconverter arguments */
2504     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2505         minArgsSize = args->size;
2506     } else {
2507         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2508     }
2509
2510     uprv_memcpy(&subArgs, args, minArgsSize);
2511     subArgs.size = (uint16_t)minArgsSize;
2512     subArgs.converter = myData->currentConverter;
2513
2514     /* remember the original start of the input for offsets */
2515     sourceStart = args->source;
2516
2517     if(myData->key != 0) {
2518         /* continue with a partial escape sequence */
2519         goto escape;
2520     }
2521
2522     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2523         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2524         subArgs.source = args->source;
2525         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2526         if(subArgs.source != subArgs.sourceLimit) {
2527             /*
2528              * get the current partial byte sequence
2529              *
2530              * it needs to be moved between the public and the subconverter
2531              * so that the conversion framework, which only sees the public
2532              * converter, can handle truncated and illegal input etc.
2533              */
2534             if(args->converter->toULength > 0) {
2535                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2536             }
2537             subArgs.converter->toULength = args->converter->toULength;
2538
2539             /*
2540              * Convert up to the end of the input, or to before the next escape character.
2541              * Does not handle conversion extensions because the preToU[] state etc.
2542              * is not copied.
2543              */
2544             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2545
2546             if(args->offsets != NULL && sourceStart != args->source) {
2547                 /* update offsets to base them on the actual start of the input */
2548                 int32_t *offsets = args->offsets;
2549                 UChar *target = args->target;
2550                 int32_t delta = (int32_t)(args->source - sourceStart);
2551                 while(target < subArgs.target) {
2552                     if(*offsets >= 0) {
2553                         *offsets += delta;
2554                     }
2555                     ++offsets;
2556                     ++target;
2557                 }
2558             }
2559             args->source = subArgs.source;
2560             args->target = subArgs.target;
2561             args->offsets = subArgs.offsets;
2562
2563             /* copy input/error/overflow buffers */
2564             if(subArgs.converter->toULength > 0) {
2565                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2566             }
2567             args->converter->toULength = subArgs.converter->toULength;
2568
2569             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2570                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2571                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2572                                 subArgs.converter->UCharErrorBufferLength);
2573                 }
2574                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2575                 subArgs.converter->UCharErrorBufferLength = 0;
2576             }
2577         }
2578
2579         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2580             return;
2581         }
2582
2583 escape:
2584         changeState_2022(args->converter,
2585                &(args->source),
2586                args->sourceLimit,
2587                ISO_2022_KR,
2588                err);
2589     }
2590 }
2591
2592 static void
2593 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2594                                                             UErrorCode* err){
2595     char tempBuf[2];
2596     const char *mySource = ( char *) args->source;
2597     UChar *myTarget = args->target;
2598     const char *mySourceLimit = args->sourceLimit;
2599     UChar32 targetUniChar = 0x0000;
2600     UChar mySourceChar = 0x0000;
2601     UConverterDataISO2022* myData;
2602     UConverterSharedData* sharedData ;
2603     UBool useFallback;
2604
2605     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2606     if(myData->version==1){
2607         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2608         return;
2609     }
2610
2611     /* initialize state */
2612     sharedData = myData->currentConverter->sharedData;
2613     useFallback = args->converter->useFallback;
2614
2615     if(myData->key != 0) {
2616         /* continue with a partial escape sequence */
2617         goto escape;
2618     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2619         /* continue with a partial double-byte character */
2620         mySourceChar = args->converter->toUBytes[0];
2621         args->converter->toULength = 0;
2622         goto getTrailByte;
2623     }
2624
2625     while(mySource< mySourceLimit){
2626
2627         if(myTarget < args->targetLimit){
2628
2629             mySourceChar= (unsigned char) *mySource++;
2630
2631             if(mySourceChar==UCNV_SI){
2632                 myData->toU2022State.g = 0;
2633                 if (myData->isEmptySegment) {
2634                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2635                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2636                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2637                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2638                     args->converter->toULength = 1;
2639                     args->target = myTarget;
2640                     args->source = mySource;
2641                     return;
2642                 }
2643                 /*consume the source */
2644                 continue;
2645             }else if(mySourceChar==UCNV_SO){
2646                 myData->toU2022State.g = 1;
2647                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2648                 /*consume the source */
2649                 continue;
2650             }else if(mySourceChar==ESC_2022){
2651                 mySource--;
2652 escape:
2653                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2654                 changeState_2022(args->converter,&(mySource),
2655                                 mySourceLimit, ISO_2022_KR, err);
2656                 if(U_FAILURE(*err)){
2657                     args->target = myTarget;
2658                     args->source = mySource;
2659                     return;
2660                 }
2661                 continue;
2662             }
2663
2664             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2665             if(myData->toU2022State.g == 1) {
2666                 if(mySource < mySourceLimit) {
2667                     int leadIsOk, trailIsOk;
2668                     uint8_t trailByte;
2669 getTrailByte:
2670                     targetUniChar = missingCharMarker;
2671                     trailByte = (uint8_t)*mySource;
2672                     /*
2673                      * Ticket 5691: consistent illegal sequences:
2674                      * - We include at least the first byte in the illegal sequence.
2675                      * - If any of the non-initial bytes could be the start of a character,
2676                      *   we stop the illegal sequence before the first one of those.
2677                      *
2678                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2679                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2680                      * Otherwise we convert or report the pair of bytes.
2681                      */
2682                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2683                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2684                     if (leadIsOk && trailIsOk) {
2685                         ++mySource;
2686                         tempBuf[0] = (char)(mySourceChar + 0x80);
2687                         tempBuf[1] = (char)(trailByte + 0x80);
2688                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2689                         mySourceChar = (mySourceChar << 8) | trailByte;
2690                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2691                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2692                         ++mySource;
2693                         /* add another bit so that the code below writes 2 bytes in case of error */
2694                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2695                     }
2696                 } else {
2697                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2698                     args->converter->toULength = 1;
2699                     break;
2700                 }
2701             }
2702             else if(mySourceChar <= 0x7f) {
2703                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2704             } else {
2705                 targetUniChar = 0xffff;
2706             }
2707             if(targetUniChar < 0xfffe){
2708                 if(args->offsets) {
2709                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2710                 }
2711                 *(myTarget++)=(UChar)targetUniChar;
2712             }
2713             else {
2714                 /* Call the callback function*/
2715                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2716                 break;
2717             }
2718         }
2719         else{
2720             *err =U_BUFFER_OVERFLOW_ERROR;
2721             break;
2722         }
2723     }
2724     args->target = myTarget;
2725     args->source = mySource;
2726 }
2727
2728 /*************************** END ISO2022-KR *********************************/
2729
2730 /*************************** ISO-2022-CN *********************************
2731 *
2732 * Rules for ISO-2022-CN Encoding:
2733 * i)   The designator sequence must appear once on a line before any instance
2734 *      of character set it designates.
2735 * ii)  If two lines contain characters from the same character set, both lines
2736 *      must include the designator sequence.
2737 * iii) Once the designator sequence is known, a shifting sequence has to be found
2738 *      to invoke the  shifting
2739 * iv)  All lines start in ASCII and end in ASCII.
2740 * v)   Four shifting sequences are employed for this purpose:
2741 *
2742 *      Sequence    ASCII Eq    Charsets
2743 *      ----------  -------    ---------
2744 *      SI           <SI>        US-ASCII
2745 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2746 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2747 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2748 *
2749 * vi)
2750 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2751 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2752 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2753 *
2754 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2755 *       characters as defined in GB 2312-80, until
2756 *       another SOdesignation appears
2757 *
2758 *
2759 *      ESC $ ) E       Indicates the bytes following SO are as defined
2760 *       in ISO-IR-165 (for details, see section 2.1),
2761 *       until another SOdesignation appears
2762 *
2763 *      ESC $ ) G       Indicates the bytes following SO are as defined
2764 *       in CNS 11643-plane-1, until another
2765 *       SOdesignation appears
2766 *
2767 *      ESC $ * H       Indicates the two bytes immediately following
2768 *       SS2 is a Chinese character as defined in CNS
2769 *       11643-plane-2, until another SS2designation
2770 *       appears
2771 *       (Meaning <ESC>N must precede every 2 byte
2772 *        sequence.)
2773 *
2774 *      ESC $ + I       Indicates the immediate two bytes following SS3
2775 *       is a Chinese character as defined in CNS
2776 *       11643-plane-3, until another SS3designation
2777 *       appears
2778 *       (Meaning <ESC>O must precede every 2 byte
2779 *        sequence.)
2780 *
2781 *      ESC $ + J       Indicates the immediate two bytes following SS3
2782 *       is a Chinese character as defined in CNS
2783 *       11643-plane-4, until another SS3designation
2784 *       appears
2785 *       (In English: <ESC>O must precede every 2 byte
2786 *        sequence.)
2787 *
2788 *      ESC $ + K       Indicates the immediate two bytes following SS3
2789 *       is a Chinese character as defined in CNS
2790 *       11643-plane-5, until another SS3designation
2791 *       appears
2792 *
2793 *      ESC $ + L       Indicates the immediate two bytes following SS3
2794 *       is a Chinese character as defined in CNS
2795 *       11643-plane-6, until another SS3designation
2796 *       appears
2797 *
2798 *      ESC $ + M       Indicates the immediate two bytes following SS3
2799 *       is a Chinese character as defined in CNS
2800 *       11643-plane-7, until another SS3designation
2801 *       appears
2802 *
2803 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2804 *       has its own designation information before any Chinese characters
2805 *       appear
2806 *
2807 */
2808
2809 /* The following are defined this way to make the strings truly read-only */
2810 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";               /* ESC $ ) A */
2811 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";               /* ESC $ ) E */
2812 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";   /* ESC $ ) G */
2813 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";   /* ESC $ * H */
2814 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";   /* ESC $ + I */
2815 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";   /* ESC $ + J */
2816 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";   /* ESC $ + K */
2817 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";   /* ESC $ + L */
2818 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";   /* ESC $ + M */
2819
2820 /********************** ISO2022-CN Data **************************/
     /*
      * Byte sequences written by the ISO-2022-CN fromUnicode code when it
      * designates a charset.
      *
      * UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC indexes this table
      * with the charset id itself when the id is below CNS_11643, and with
      * CNS_11643 + (id - CNS_11643_1) for the CNS 11643 planes, and copies
      * 4 bytes from the selected entry.  The order of the entries therefore
      * has to stay in step with the charset id enumeration.
      */
2821 static const char* const escSeqCharsCN[10] ={
2822         SHIFT_IN_STR,           /* ASCII */
2823         GB_2312_80_STR,
2824         ISO_IR_165_STR,
2825         CNS_11643_1992_Plane_1_STR,
2826         CNS_11643_1992_Plane_2_STR,
2827         CNS_11643_1992_Plane_3_STR,
2828         CNS_11643_1992_Plane_4_STR,
2829         CNS_11643_1992_Plane_5_STR,
2830         CNS_11643_1992_Plane_6_STR,
2831         CNS_11643_1992_Plane_7_STR
2832 };
2833
     /*
      * Converts UTF-16 text to ISO-2022-CN (converterData->version == 0) or
      * ISO-2022-CN-EXT (other versions) bytes, optionally recording offsets.
      *
      * Reads args->source up to args->sourceLimit and writes bytes to
      * args->target up to args->targetLimit.  When args->offsets is not NULL,
      * a source index is stored for each output byte.  args->source and
      * args->target are updated before returning.
      *
      * Per-character handling:
      * - U+0000..U+007F: written as one byte, preceded by SI when the current
      *   state is not G0.  Characters matched by IS_2022_CONTROL are rejected.
      *   CR and LF additionally clear the whole fromUnicode ISO2022State.
      * - Everything else: looked up in the candidate charsets in choices[].
      *   The output is an optional 4-byte designation sequence, an optional
      *   shift (SO, ESC N or ESC O) and the two character bytes.
      *
      * Errors reported through *err:
      * - U_ILLEGAL_CHAR_FOUND: unpaired surrogate or IS_2022_CONTROL character;
      *   the offending code point is stored in cnv->fromUChar32.
      * - U_INVALID_CHAR_FOUND: no candidate charset maps the code point;
      *   the code point is stored in cnv->fromUChar32.
      * - U_BUFFER_OVERFLOW_ERROR: the target is full while input remains.
      * - whatever fromUWriteUInt8 sets.
      *
      * A lead surrogate at the very end of the input is saved in
      * cnv->fromUChar32 without an error and picked up again through the
      * getTrail label on the next call.
      *
      * When args->flush is set, all input was consumed successfully and the
      * state is not G0, a final SI is written so that the output ends in
      * ASCII mode.
      */
2834 static void
2835 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2836     UConverter *cnv = args->converter;
2837     UConverterDataISO2022 *converterData;
2838     ISO2022State *pFromU2022State;
2839     uint8_t *target = (uint8_t *) args->target;
2840     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2841     const UChar* source = args->source;
2842     const UChar* sourceLimit = args->sourceLimit;
2843     int32_t* offsets = args->offsets;
2844     UChar32 sourceChar;
2845     char buffer[8]; /* worst case: 4 (designation) + 2 (ESC N / ESC O) + 2 (character bytes) */
2846     int32_t len;
2847     int8_t choices[3]; /* candidate charsets in lookup order; only valid while choiceCount != 0 */
2848     int32_t choiceCount;
2849     uint32_t targetValue = 0;
2850     UBool useFallback;
2851
2852     /* set up the state */
2853     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2854     pFromU2022State   = &converterData->fromU2022State;
2855
     /* choices[] is filled in lazily by the first non-ASCII character */
2856     choiceCount = 0;
2857
2858     /* check if the last codepoint of previous buffer was a lead surrogate*/
     /* (this jumps into the middle of the loop body below) */
2859     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2860         goto getTrail;
2861     }
2862
2863     while( source < sourceLimit){
2864         if(target < targetLimit){
2865
2866             sourceChar  = *(source++);
2867             /*check if the char is a First surrogate*/
2868              if(UTF_IS_SURROGATE(sourceChar)) {
2869                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2870 getTrail:
2871                     /*look ahead to find the trail surrogate*/
2872                     if(source < sourceLimit) {
2873                         /* test the following code unit */
2874                         UChar trail=(UChar) *source;
2875                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2876                             source++;
2877                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2878                             cnv->fromUChar32=0x00;
2879                             /* convert this supplementary code point */
2880                             /* exit this condition tree */
2881                         } else {
2882                             /* this is an unmatched lead code unit (1st surrogate) */
2883                             /* callback(illegal) */
2884                             *err=U_ILLEGAL_CHAR_FOUND;
2885                             cnv->fromUChar32=sourceChar;
2886                             break;
2887                         }
2888                     } else {
2889                         /* no more input */
2890                         cnv->fromUChar32=sourceChar;
2891                         break;
2892                     }
2893                 } else {
2894                     /* this is an unmatched trail code unit (2nd surrogate) */
2895                     /* callback(illegal) */
2896                     *err=U_ILLEGAL_CHAR_FOUND;
2897                     cnv->fromUChar32=sourceChar;
2898                     break;
2899                 }
2900             }
2901
2902             /* do the conversion */
2903             if(sourceChar <= 0x007f ){
2904                 /* do not convert SO/SI/ESC */
2905                 if(IS_2022_CONTROL(sourceChar)) {
2906                     /* callback(illegal) */
2907                     *err=U_ILLEGAL_CHAR_FOUND;
2908                     cnv->fromUChar32=sourceChar;
2909                     break;
2910                 }
2911
2912                 /* US-ASCII */
2913                 if(pFromU2022State->g == 0) {
2914                     buffer[0] = (char)sourceChar;
2915                     len = 1;
2916                 } else {
                     /* shifted out: return to G0 with SI before the ASCII byte */
2917                     buffer[0] = UCNV_SI;
2918                     buffer[1] = (char)sourceChar;
2919                     len = 2;
2920                     pFromU2022State->g = 0;
2921                     choiceCount = 0;
2922                 }
2923                 if(sourceChar == CR || sourceChar == LF) {
2924                     /* reset the state at the end of a line */
2925                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2926                     choiceCount = 0;
2927                 }
2928             }
2929             else{
2930                 /* convert U+0080..U+10ffff */
2931                 int32_t i;
2932                 int8_t cs, g;
2933
2934                 if(choiceCount == 0) {
2935                     /* try the current SO/G1 converter first */
2936                     choices[0] = pFromU2022State->cs[1];
2937
2938                     /* default to GB2312_1 if none is designated yet */
2939                     if(choices[0] == 0) {
2940                         choices[0] = GB2312_1;
2941                     }
2942
2943                     if(converterData->version == 0) {
2944                         /* ISO-2022-CN */
2945
2946                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2947                         if(choices[0] == GB2312_1) {
2948                             choices[1] = (int8_t)CNS_11643_1;
2949                         } else {
2950                             choices[1] = (int8_t)GB2312_1;
2951                         }
2952
2953                         choiceCount = 2;
2954                     } else {
2955                         /* ISO-2022-CN-EXT */
2956
2957                         /* try one of the other converters */
2958                         switch(choices[0]) {
2959                         case GB2312_1:
2960                             choices[1] = (int8_t)CNS_11643_1;
2961                             choices[2] = (int8_t)ISO_IR_165;
2962                             break;
2963                         case ISO_IR_165:
2964                             choices[1] = (int8_t)GB2312_1;
2965                             choices[2] = (int8_t)CNS_11643_1;
2966                             break;
2967                         default: /* CNS_11643_x */
2968                             choices[1] = (int8_t)GB2312_1;
2969                             choices[2] = (int8_t)ISO_IR_165;
2970                             break;
2971                         }
2972
2973                         choiceCount = 3;
2974                     }
2975                 }
2976
2977                 cs = g = 0;
2978                 /*
2979                  * len==0: no mapping found yet
2980                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2981                  * len>0: found a roundtrip result, done
2982                  */
2983                 len = 0;
2984                 /*
2985                  * We will turn off useFallback after finding a fallback,
2986                  * but we still get fallbacks from PUA code points as usual.
2987                  * Therefore, we will also need to check that we don't overwrite
2988                  * an early fallback with a later one.
2989                  */
2990                 useFallback = cnv->useFallback;
2991
2992                 for(i = 0; i < choiceCount && len <= 0; ++i) {
2993                     int8_t cs0 = choices[i];
2994                     if(cs0 > 0) {
2995                         uint32_t value;
2996                         int32_t len2;
2997                         if(cs0 >= CNS_11643_0) {
                             /*
                              * All CNS 11643 planes share one converter with 3-byte output;
                              * the plane is derived from the top byte of value below.
                              */
2998                             len2 = MBCS_FROM_UCHAR32_ISO2022(
2999                                         converterData->myConverterArray[CNS_11643],
3000                                         sourceChar,
3001                                         &value,
3002                                         useFallback,
3003                                         MBCS_OUTPUT_3);
3004                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3005                                 targetValue = value;
3006                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3007                                 if(len2 >= 0) {
3008                                     len = 2;
3009                                 } else {
3010                                     len = -2;
3011                                     useFallback = FALSE;
3012                                 }
                                 /* plane 1 goes through SO/G1, plane 2 through SS2/G2, planes 3..7 through SS3/G3 */
3013                                 if(cs == CNS_11643_1) {
3014                                     g = 1;
3015                                 } else if(cs == CNS_11643_2) {
3016                                     g = 2;
3017                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3018                                     g = 3;
3019                                 } else {
3020                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3021                                     len = 0;
3022                                 }
3023                             }
3024                         } else {
3025                             /* GB2312_1 or ISO-IR-165 */
3026                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3027                                         converterData->myConverterArray[cs0],
3028                                         sourceChar,
3029                                         &value,
3030                                         useFallback,
3031                                         MBCS_OUTPUT_2);
3032                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3033                                 targetValue = value;
3034                                 len = len2;
3035                                 cs = cs0;
3036                                 g = 1;
3037                                 useFallback = FALSE;
3038                             }
3039                         }
3040                     }
3041                 }
3042
3043                 if(len != 0) {
3044                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3045
3046                     /* write the designation sequence if necessary */
3047                     if(cs != pFromU2022State->cs[g]) {
3048                         if(cs < CNS_11643) {
3049                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3050                         } else {
3051                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3052                         }
3053                         len = 4;
3054                         pFromU2022State->cs[g] = cs;
3055                         if(g == 1) {
3056                             /* changing the SO/G1 charset invalidates the choices[] */
3057                             choiceCount = 0;
3058                         }
3059                     }
3060
3061                     /* write the shift sequence if necessary */
3062                     if(g != pFromU2022State->g) {
3063                         switch(g) {
3064                         case 1:
3065                             buffer[len++] = UCNV_SO;
3066
3067                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3068                             pFromU2022State->g = 1;
3069                             break;
3070                         case 2:
                             /* ESC N (SS2) */
3071                             buffer[len++] = 0x1b;
3072                             buffer[len++] = 0x4e;
3073                             break;
3074                         default: /* case 3 */
                             /* ESC O (SS3) */
3075                             buffer[len++] = 0x1b;
3076                             buffer[len++] = 0x4f;
3077                             break;
3078                         }
3079                     }
3080
3081                     /* write the two output bytes */
3082                     buffer[len++] = (char)(targetValue >> 8);
3083                     buffer[len++] = (char)targetValue;
3084                 } else {
3085                     /* if we cannot find the character after checking all codepages
3086                      * then this is an error
3087                      */
3088                     *err = U_INVALID_CHAR_FOUND;
3089                     cnv->fromUChar32=sourceChar;
3090                     break;
3091                 }
3092             }
3093
3094             /* output len>0 bytes in buffer[] */
3095             if(len == 1) {
                 /* room for one byte is guaranteed by the target < targetLimit test above */
3096                 *target++ = buffer[0];
3097                 if(offsets) {
3098                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3099                 }
3100             } else if(len == 2 && (target + 2) <= targetLimit) {
3101                 *target++ = buffer[0];
3102                 *target++ = buffer[1];
3103                 if(offsets) {
3104                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3105                     *offsets++ = sourceIndex;
3106                     *offsets++ = sourceIndex;
3107                 }
3108             } else {
                 /*
                  * Longer sequences, or two bytes that do not fit.
                  * NOTE(review): fromUWriteUInt8 is defined outside this block;
                  * presumably it copes with a partially full target and sets
                  * *err on overflow - confirm against its definition.
                  */
3109                 fromUWriteUInt8(
3110                     cnv,
3111                     buffer, len,
3112                     &target, (const char *)targetLimit,
3113                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3114                     err);
3115                 if(U_FAILURE(*err)) {
3116                     break;
3117                 }
3118             }
3119         } /* end if(target < targetLimit) */
3120         else{
3121             *err =U_BUFFER_OVERFLOW_ERROR;
3122             break;
3123         }
3124
3125     }/* end while(source < sourceLimit) */
3126
3127     /*
3128      * the end of the input stream and detection of truncated input
3129      * are handled by the framework, but for ISO-2022-CN conversion
3130      * we need to be in ASCII mode at the very end
3131      *
3132      * conditions:
3133      *   successful
3134      *   not in ASCII mode
3135      *   end of input and no truncated input
3136      */
3137     if( U_SUCCESS(*err) &&
3138         pFromU2022State->g!=0 &&
3139         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3140     ) {
3141         int32_t sourceIndex;
3142
3143         /* we are switching to ASCII */
3144         pFromU2022State->g=0;
3145
3146         /* get the source index of the last input character */
3147         /*
3148          * TODO this would be simpler and more reliable if we used a pair
3149          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3150          * so that we could simply use the prevSourceIndex here;
3151          * this code gives an incorrect result for the rare case of an unmatched
3152          * trail surrogate that is alone in the last buffer of the text stream
3153          */
3154         sourceIndex=(int32_t)(source-args->source);
3155         if(sourceIndex>0) {
3156             --sourceIndex;
             /* step back once more if the last character was a surrogate pair */
3157             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3158                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3159             ) {
3160                 --sourceIndex;
3161             }
3162         } else {
3163             sourceIndex=-1;
3164         }
3165
3166         fromUWriteUInt8(
3167             cnv,
3168             SHIFT_IN_STR, 1,
3169             &target, (const char *)targetLimit,
3170             &offsets, sourceIndex,
3171             err);
3172     }
3173
3174     /*save the state and return */
3175     args->source = source;
3176     args->target = (char*)target;
3177 }
3178
3179
     /*
      * Converts ISO-2022-CN / ISO-2022-CN-EXT bytes to UTF-16, optionally
      * recording offsets.
      *
      * Reads args->source up to args->sourceLimit and writes UChars to
      * args->target up to args->targetLimit; both pointers are updated before
      * returning.  When args->offsets is not NULL, each output UChar gets the
      * source position just past its character minus 1 (single byte) or
      * 2 (byte pair).
      *
      * Byte handling:
      * - SI selects G0.  SO selects G1, but only if a G1 charset has been
      *   designated; otherwise the byte is passed to toUnicodeCallback.
      * - ESC hands control to changeState_2022.
      * - CR and LF clear the whole toUnicode ISO2022State and are then
      *   converted like any other byte.
      * - In a shifted state (g != 0) bytes are consumed in pairs and looked up
      *   with ucnv_MBCSSimpleGetNextUChar.  After a character in G2/G3 the
      *   state returns to prevG.
      * - In G0, bytes up to 0x7f are copied; anything else is passed to
      *   toUnicodeCallback.
      *
      * An SO that is followed by SI or by a complete escape sequence without
      * any character in between (isEmptySegment) is reported as
      * U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR.
      *
      * State carried between calls: myData->key != 0 resumes a partial escape
      * sequence; converter->toULength == 1 resumes a double-byte character
      * whose lead byte is stored in converter->toUBytes[0].
      *
      * U_BUFFER_OVERFLOW_ERROR is set when the target is full while input
      * remains.
      */
3180 static void
3181 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3182                                                UErrorCode* err){
3183     char tempBuf[3]; /* lookup input: 2 bytes, or plane byte + 2 bytes for CNS 11643 */
3184     const char *mySource = (char *) args->source;
3185     UChar *myTarget = args->target;
3186     const char *mySourceLimit = args->sourceLimit;
3187     uint32_t targetUniChar = 0x0000;
3188     uint32_t mySourceChar = 0x0000;
3189     UConverterDataISO2022* myData;
3190     ISO2022State *pToU2022State;
3191
3192     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3193     pToU2022State = &myData->toU2022State;
3194
     /* both branches below jump into the middle of the loop body */
3195     if(myData->key != 0) {
3196         /* continue with a partial escape sequence */
3197         goto escape;
3198     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3199         /* continue with a partial double-byte character */
3200         mySourceChar = args->converter->toUBytes[0];
3201         args->converter->toULength = 0;
3202         targetUniChar = missingCharMarker;
3203         goto getTrailByte;
3204     }
3205
3206     while(mySource < mySourceLimit){
3207
3208         targetUniChar =missingCharMarker;
3209
3210         if(myTarget < args->targetLimit){
3211
3212             mySourceChar= (unsigned char) *mySource++;
3213
3214             switch(mySourceChar){
3215             case UCNV_SI:
3216                 pToU2022State->g=0;
3217                 if (myData->isEmptySegment) {
3218                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
3219                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3220                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3221                     args->converter->toUBytes[0] = mySourceChar;
3222                     args->converter->toULength = 1;
3223                     args->target = myTarget;
3224                     args->source = mySource;
3225                     return;
3226                 }
3227                 continue;
3228
3229             case UCNV_SO:
3230                 if(pToU2022State->cs[1] != 0) {
3231                     pToU2022State->g=1;
3232                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
3233                     continue;
3234                 } else {
3235                     /* illegal to have SO before a matching designator */
3236                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
                     /* targetUniChar is still missingCharMarker, so the callback branch below runs */
3237                     break;
3238                 }
3239
3240             case ESC_2022:
                 /* un-read the ESC so that changeState_2022 sees the whole sequence */
3241                 mySource--;
3242 escape:
3243                 {
3244                     const char * mySourceBefore = mySource;
3245                     int8_t toULengthBefore = args->converter->toULength;
3246
3247                     changeState_2022(args->converter,&(mySource),
3248                         mySourceLimit, ISO_2022_CN,err);
3249
3250                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3251                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3252                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3253                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
                         /* previously buffered length plus the bytes consumed by this call */
3254                         args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
3255                     }
3256                 }
3257
3258                 /* invalid or illegal escape sequence */
3259                 if(U_FAILURE(*err)){
3260                     args->target = myTarget;
3261                     args->source = mySource;
3262                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
3263                     return;
3264                 }
3265                 continue;
3266
3267             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3268
3269             case CR:
3270                 /*falls through*/
3271             case LF:
3272                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3273                 /* falls through */
3274             default:
3275                 /* convert one or two bytes */
3276                 myData->isEmptySegment = FALSE;
3277                 if(pToU2022State->g != 0) {
3278                     if(mySource < mySourceLimit) {
3279                         UConverterSharedData *cnv;
3280                         StateEnum tempState;
3281                         int32_t tempBufLen;
3282                         int leadIsOk, trailIsOk;
3283                         uint8_t trailByte;
3284 getTrailByte:
3285                         trailByte = (uint8_t)*mySource;
3286                         /*
3287                          * Ticket 5691: consistent illegal sequences:
3288                          * - We include at least the first byte in the illegal sequence.
3289                          * - If any of the non-initial bytes could be the start of a character,
3290                          *   we stop the illegal sequence before the first one of those.
3291                          *
3292                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3293                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3294                          * Otherwise we convert or report the pair of bytes.
3295                          */
                         /* the uint8_t cast turns "0x21 <= b && b <= 0x7e" into a single comparison */
3296                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3297                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3298                         if (leadIsOk && trailIsOk) {
3299                             ++mySource;
3300                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3301                             if(tempState >= CNS_11643_0) {
                                 /* CNS 11643: one shared converter, selected plane encoded as a leading byte */
3302                                 cnv = myData->myConverterArray[CNS_11643];
3303                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3304                                 tempBuf[1] = (char) (mySourceChar);
3305                                 tempBuf[2] = (char) trailByte;
3306                                 tempBufLen = 3;
3307
3308                             }else{
3309                                 cnv = myData->myConverterArray[tempState];
3310                                 tempBuf[0] = (char) (mySourceChar);
3311                                 tempBuf[1] = (char) trailByte;
3312                                 tempBufLen = 2;
3313                             }
3314                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3315                             mySourceChar = (mySourceChar << 8) | trailByte;
3316                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3317                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3318                             ++mySource;
3319                             /* add another bit so that the code below writes 2 bytes in case of error */
3320                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3321                         }
3322                         if(pToU2022State->g>=2) {
3323                             /* return from a single-shift state to the previous one */
3324                             pToU2022State->g=pToU2022State->prevG;
3325                         }
3326                     } else {
                         /* lead byte at the end of the input: store it for the next call */
3327                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3328                         args->converter->toULength = 1;
3329                         goto endloop;
3330                     }
3331                 }
3332                 else{
                     /* G0: only bytes up to 0x7f convert; others stay missingCharMarker */
3333                     if(mySourceChar <= 0x7f) {
3334                         targetUniChar = (UChar) mySourceChar;
3335                     }
3336                 }
3337                 break;
3338             }
             /* result below 0xfffe: write a single UChar */
3339             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3340                 if(args->offsets){
3341                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3342                 }
3343                 *(myTarget++)=(UChar)targetUniChar;
3344             }
3345             else if(targetUniChar > missingCharMarker){
3346                 /* disassemble the surrogate pair and write to output*/
3347                 targetUniChar-=0x0010000;
3348                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3349                 if(args->offsets){
3350                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3351                 }
3352                 ++myTarget;
3353                 if(myTarget< args->targetLimit){
3354                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3355                     if(args->offsets){
3356                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3357                     }
3358                     ++myTarget;
3359                 }else{
                     /* no room for the trail surrogate: append it to the converter's UCharErrorBuffer */
3360                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3361                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3362                 }
3363
3364             }
3365             else{
3366                 /* Call the callback function*/
3367                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3368                 break;
3369             }
3370         }
3371         else{
3372             *err =U_BUFFER_OVERFLOW_ERROR;
3373             break;
3374         }
3375     }
3376 endloop:
3377     args->target = myTarget;
3378     args->source = mySource;
3379 }
3380
     /*
      * Writes the converter's substitution character for an ISO-2022 variant,
      * preceded by the shift/designation bytes that the selected branch
      * requires, and updates the fromUnicode state to match.
      *
      * The branch is chosen by the first letter of myConverterData->locale:
      * - 'j': SI if g == 1; ESC ( B if cs[0] is neither ASCII nor JISX201;
      *        then subchar[0].
      * - 'c': SI if g != 0; then subchar[0].
      * - 'k', version 0: for a 1-byte subchar, SI (if fromUnicodeStatus is
      *        non-zero) and subchar[0]; otherwise SO (if fromUnicodeStatus is
      *        zero) and subchar[0..1].
      * - 'k', other versions: the substitution is delegated to
      *        myConverterData->currentConverter and this function returns
      *        without calling ucnv_cbFromUWriteBytes.
      * - anything else: nothing is buffered, so ucnv_cbFromUWriteBytes is
      *        called with a length of 0.
      *
      * args         conversion arguments; args->converter is temporarily
      *              replaced in the delegating 'k' branch and restored.
      * offsetIndex  passed through to ucnv_cbFromUWriteBytes.
      * err          error code, passed through to the write callbacks.
      */
3381 static void
3382 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3383     UConverter *cnv = args->converter;
3384     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3385     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3386     char *p, *subchar;
3387     char buffer[8];
3388     int32_t length;
3389
3390     subchar=(char *)cnv->subChars;
3391     length=cnv->subCharLen; /* assume length==1 for most variants */
3392
     /* p is the write cursor into buffer[]; p - buffer is the number of bytes to emit */
3393     p = buffer;
3394     switch(myConverterData->locale[0]){
3395     case 'j':
3396         {
3397             int8_t cs;
3398
3399             if(pFromU2022State->g == 1) {
3400                 /* JIS7: switch from G1 to G0 */
3401                 pFromU2022State->g = 0;
3402                 *p++ = UCNV_SI;
3403             }
3404
3405             cs = pFromU2022State->cs[0];
3406             if(cs != ASCII && cs != JISX201) {
3407                 /* not in ASCII or JIS X 0201: switch to ASCII */
                 /* (the three bytes below are ESC ( B) */
3408                 pFromU2022State->cs[0] = (int8_t)ASCII;
3409                 *p++ = '\x1b';
3410                 *p++ = '\x28';
3411                 *p++ = '\x42';
3412             }
3413
3414             *p++ = subchar[0];
3415             break;
3416         }
3417     case 'c':
3418         if(pFromU2022State->g != 0) {
3419             /* not in ASCII mode: switch to ASCII */
3420             pFromU2022State->g = 0;
3421             *p++ = UCNV_SI;
3422         }
3423         *p++ = subchar[0];
3424         break;
3425     case 'k':
3426         if(myConverterData->version == 0) {
3427             if(length == 1) {
3428                 if((UBool)args->converter->fromUnicodeStatus) {
3429                     /* in DBCS mode: switch to SBCS */
3430                     args->converter->fromUnicodeStatus = 0;
3431                     *p++ = UCNV_SI;
3432                 }
3433                 *p++ = subchar[0];
3434             } else /* length == 2*/ {
3435                 if(!(UBool)args->converter->fromUnicodeStatus) {
3436                     /* in SBCS mode: switch to DBCS */
3437                     args->converter->fromUnicodeStatus = 1;
3438                     *p++ = UCNV_SO;
3439                 }
3440                 *p++ = subchar[0];
3441                 *p++ = subchar[1];
3442             }
3443             break;
3444         } else {
3445             /* save the subconverter's substitution string */
3446             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3447             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3448
3449             /* set our substitution string into the subconverter */
3450             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3451             myConverterData->currentConverter->subCharLen = (int8_t)length;
3452
3453             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
             /* NOTE(review): this passes 0, not offsetIndex, to ucnv_cbFromUWriteSub - confirm that is intended */
3454             args->converter = myConverterData->currentConverter;
3455             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3456             ucnv_cbFromUWriteSub(args, 0, err);
3457             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3458             args->converter = cnv;
3459
3460             /* restore the subconverter's substitution string */
3461             myConverterData->currentConverter->subChars = currentSubChars;
3462             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3463
             /* on overflow, move the bytes the subconverter buffered into this converter's charErrorBuffer */
3464             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3465                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3466                     uprv_memcpy(
3467                         cnv->charErrorBuffer,
3468                         myConverterData->currentConverter->charErrorBuffer,
3469                         myConverterData->currentConverter->charErrorBufferLength);
3470                 }
3471                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3472                 myConverterData->currentConverter->charErrorBufferLength = 0;
3473             }
3474             return;
3475         }
3476     default:
3477         /* not expected */
3478         break;
3479     }
3480     ucnv_cbFromUWriteBytes(args,
3481                            buffer, (int32_t)(p - buffer),
3482                            offsetIndex, err);
3483 }
3484
3485 /*
3486  * Structure for cloning an ISO 2022 converter into a single memory block.
3487  * ucnv_safeClone() of the converter will align the entire cloneStruct,
3488  * and then ucnv_safeClone() of the sub-converter may additionally align
3489  * currentConverter inside the cloneStruct, for which we need the deadSpace
3490  * after currentConverter.
3491  * This is because UAlignedMemory may be larger than the actually
3492  * necessary alignment size for the platform.
3493  * The other cloneStruct fields will not be moved around,
3494  * and are aligned properly with cloneStruct's alignment.
3495  */
3496 struct cloneStruct
3497 {
3498     UConverter cnv;              /* the clone itself; _ISO_2022_SafeClone returns its address */
3499     UConverter currentConverter; /* storage that the sub-converter is cloned into */
3500     UAlignedMemory deadSpace;    /* slack so the sub-converter clone can be re-aligned */
3501     UConverterDataISO2022 mydata; /* copy of the extra data; cnv.extraInfo is pointed here */
3502 };
3503
3504
3505 static UConverter *
3506 _ISO_2022_SafeClone(
3507             const UConverter *cnv,
3508             void *stackBuffer,
3509             int32_t *pBufferSize,
3510             UErrorCode *status)
3511 {
3512     struct cloneStruct * localClone;
3513     UConverterDataISO2022 *cnvData;
3514     int32_t i, size;
3515
3516     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3517         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3518         return NULL;
3519     }
3520
3521     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3522     localClone = (struct cloneStruct *)stackBuffer;
3523
3524     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3525
3526     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3527     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3528     localClone->cnv.isExtraLocal = TRUE;
3529
3530     /* share the subconverters */
3531
3532     if(cnvData->currentConverter != NULL) {
3533         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3534         localClone->mydata.currentConverter =
3535             ucnv_safeClone(cnvData->currentConverter,
3536                             &localClone->currentConverter,
3537                             &size, status);
3538         if(U_FAILURE(*status)) {
3539             return NULL;
3540         }
3541     }
3542
3543     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3544         if(cnvData->myConverterArray[i] != NULL) {
3545             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3546         }
3547     }
3548
3549     return &localClone->cnv;
3550 }
3551
3552 static void
3553 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3554                     const USetAdder *sa,
3555                     UConverterUnicodeSet which,
3556                     UErrorCode *pErrorCode)
3557 {
3558     int32_t i;
3559     UConverterDataISO2022* cnvData;
3560
3561     if (U_FAILURE(*pErrorCode)) {
3562         return;
3563     }
3564 #ifdef U_ENABLE_GENERIC_ISO_2022
3565     if (cnv->sharedData == &_ISO2022Data) {
3566         /* We use UTF-8 in this case */
3567         sa->addRange(sa->set, 0, 0xd7FF);
3568         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3569         return;
3570     }
3571 #endif
3572
3573     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3574
3575     /* open a set and initialize it with code points that are algorithmically round-tripped */
3576     switch(cnvData->locale[0]){
3577     case 'j':
3578         /* include JIS X 0201 which is hardcoded */
3579         sa->add(sa->set, 0xa5);
3580         sa->add(sa->set, 0x203e);
3581         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3582             /* include Latin-1 for some variants of JP */
3583             sa->addRange(sa->set, 0, 0xff);
3584         } else {
3585             /* include ASCII for JP */
3586             sa->addRange(sa->set, 0, 0x7f);
3587         }
3588         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3589             /*
3590              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3591              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3592              * use half-width Katakana.
3593              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3594              * half-width Katakana via the ESC ( I sequence.
3595              * However, we only emit (fromUnicode) half-width Katakana according to the
3596              * definition of each variant.
3597              *
3598              * When including fallbacks,
3599              * we need to include half-width Katakana Unicode code points for all JP variants because
3600              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3601              */
3602             /* include half-width Katakana for JP */
3603             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3604         }
3605         break;
3606     case 'c':
3607     case 'z':
3608         /* include ASCII for CN */
3609         sa->addRange(sa->set, 0, 0x7f);
3610         break;
3611     case 'k':
3612         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3613         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3614                 cnvData->currentConverter, sa, which, pErrorCode);
3615         /* the loop over myConverterArray[] will simply not find another converter */
3616         break;
3617     default:
3618         break;
3619     }
3620
3621 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3622             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3623                 cnvData->version==0 && i==CNS_11643
3624             ) {
3625                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3626                 ucnv_MBCSGetUnicodeSetForBytes(
3627                         cnvData->myConverterArray[i],
3628                         sa, UCNV_ROUNDTRIP_SET,
3629                         0, 0x81, 0x82,
3630                         pErrorCode);
3631             }
3632 #endif
3633
3634     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3635         UConverterSetFilter filter;
3636         if(cnvData->myConverterArray[i]!=NULL) {
3637             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3638                 cnvData->version==0 && i==CNS_11643
3639             ) {
3640                 /*
3641                  * Version-specific for CN:
3642                  * CN version 0 does not map CNS planes 3..7 although
3643                  * they are all available in the CNS conversion table;
3644                  * CN version 1 (-EXT) does map them all.
3645                  * The two versions create different Unicode sets.
3646                  */
3647                 filter=UCNV_SET_FILTER_2022_CN;
3648             } else if(cnvData->locale[0]=='j' && i==JISX208) {
3649                 /*
3650                  * Only add code points that map to Shift-JIS codes
3651                  * corresponding to JIS X 0208.
3652                  */
3653                 filter=UCNV_SET_FILTER_SJIS;
3654             } else if(i==KSC5601) {
3655                 /*
3656                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3657                  * are broader than GR94.
3658                  */
3659                 filter=UCNV_SET_FILTER_GR94DBCS;
3660             } else {
3661                 filter=UCNV_SET_FILTER_NONE;
3662             }
3663             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3664         }
3665     }
3666
3667     /*
3668      * ISO 2022 converters must not convert SO/SI/ESC despite what
3669      * sub-converters do by themselves.
3670      * Remove these characters from the set.
3671      */
3672     sa->remove(sa->set, 0x0e);
3673     sa->remove(sa->set, 0x0f);
3674     sa->remove(sa->set, 0x1b);
3675
3676     /* ISO 2022 converters do not convert C1 controls either */
3677     sa->removeRange(sa->set, 0x80, 0x9f);
3678 }
3679
3680 static const UConverterImpl _ISO2022Impl={
3681     UCNV_ISO_2022,
3682
3683     NULL,
3684     NULL,
3685
3686     _ISO2022Open,
3687     _ISO2022Close,
3688     _ISO2022Reset,
3689
3690 #ifdef U_ENABLE_GENERIC_ISO_2022
3691     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3692     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3693     ucnv_fromUnicode_UTF8,
3694     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3695 #else
3696     NULL,
3697     NULL,
3698     NULL,
3699     NULL,
3700 #endif
3701     NULL,
3702
3703     NULL,
3704     _ISO2022getName,
3705     _ISO_2022_WriteSub,
3706     _ISO_2022_SafeClone,
3707     _ISO_2022_GetUnicodeSet
3708 };
3709 static const UConverterStaticData _ISO2022StaticData={
3710     sizeof(UConverterStaticData),
3711     "ISO_2022",
3712     2022,
3713     UCNV_IBM,
3714     UCNV_ISO_2022,
3715     1,
3716     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3717     { 0x1a, 0, 0, 0 },
3718     1,
3719     FALSE,
3720     FALSE,
3721     0,
3722     0,
3723     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3724 };
3725 const UConverterSharedData _ISO2022Data={
3726     sizeof(UConverterSharedData),
3727     ~((uint32_t) 0),
3728     NULL,
3729     NULL,
3730     &_ISO2022StaticData,
3731     FALSE,
3732     &_ISO2022Impl,
3733     0
3734 };
3735
3736 /*************JP****************/
3737 static const UConverterImpl _ISO2022JPImpl={
3738     UCNV_ISO_2022,
3739
3740     NULL,
3741     NULL,
3742
3743     _ISO2022Open,
3744     _ISO2022Close,
3745     _ISO2022Reset,
3746
3747     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3748     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3749     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3750     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3751     NULL,
3752
3753     NULL,
3754     _ISO2022getName,
3755     _ISO_2022_WriteSub,
3756     _ISO_2022_SafeClone,
3757     _ISO_2022_GetUnicodeSet
3758 };
3759 static const UConverterStaticData _ISO2022JPStaticData={
3760     sizeof(UConverterStaticData),
3761     "ISO_2022_JP",
3762     0,
3763     UCNV_IBM,
3764     UCNV_ISO_2022,
3765     1,
3766     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3767     { 0x1a, 0, 0, 0 },
3768     1,
3769     FALSE,
3770     FALSE,
3771     0,
3772     0,
3773     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3774 };
3775 static const UConverterSharedData _ISO2022JPData={
3776     sizeof(UConverterSharedData),
3777     ~((uint32_t) 0),
3778     NULL,
3779     NULL,
3780     &_ISO2022JPStaticData,
3781     FALSE,
3782     &_ISO2022JPImpl,
3783     0
3784 };
3785
3786 /************* KR ***************/
3787 static const UConverterImpl _ISO2022KRImpl={
3788     UCNV_ISO_2022,
3789
3790     NULL,
3791     NULL,
3792
3793     _ISO2022Open,
3794     _ISO2022Close,
3795     _ISO2022Reset,
3796
3797     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3798     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3799     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3800     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3801     NULL,
3802
3803     NULL,
3804     _ISO2022getName,
3805     _ISO_2022_WriteSub,
3806     _ISO_2022_SafeClone,
3807     _ISO_2022_GetUnicodeSet
3808 };
3809 static const UConverterStaticData _ISO2022KRStaticData={
3810     sizeof(UConverterStaticData),
3811     "ISO_2022_KR",
3812     0,
3813     UCNV_IBM,
3814     UCNV_ISO_2022,
3815     1,
3816     3, /* max 3 bytes per UChar: SO+DBCS */
3817     { 0x1a, 0, 0, 0 },
3818     1,
3819     FALSE,
3820     FALSE,
3821     0,
3822     0,
3823     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3824 };
3825 static const UConverterSharedData _ISO2022KRData={
3826     sizeof(UConverterSharedData),
3827     ~((uint32_t) 0),
3828     NULL,
3829     NULL,
3830     &_ISO2022KRStaticData,
3831     FALSE,
3832     &_ISO2022KRImpl,
3833     0
3834 };
3835
3836 /*************** CN ***************/
3837 static const UConverterImpl _ISO2022CNImpl={
3838
3839     UCNV_ISO_2022,
3840
3841     NULL,
3842     NULL,
3843
3844     _ISO2022Open,
3845     _ISO2022Close,
3846     _ISO2022Reset,
3847
3848     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3849     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3850     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3851     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3852     NULL,
3853
3854     NULL,
3855     _ISO2022getName,
3856     _ISO_2022_WriteSub,
3857     _ISO_2022_SafeClone,
3858     _ISO_2022_GetUnicodeSet
3859 };
3860 static const UConverterStaticData _ISO2022CNStaticData={
3861     sizeof(UConverterStaticData),
3862     "ISO_2022_CN",
3863     0,
3864     UCNV_IBM,
3865     UCNV_ISO_2022,
3866     1,
3867     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3868     { 0x1a, 0, 0, 0 },
3869     1,
3870     FALSE,
3871     FALSE,
3872     0,
3873     0,
3874     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3875 };
3876 static const UConverterSharedData _ISO2022CNData={
3877     sizeof(UConverterSharedData),
3878     ~((uint32_t) 0),
3879     NULL,
3880     NULL,
3881     &_ISO2022CNStaticData,
3882     FALSE,
3883     &_ISO2022CNImpl,
3884     0
3885 };
3886
3887
3888
3889 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */