icuSources/common/ucnv2022.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2016, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv2022.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000feb03
  12 *   created by: Markus W. Scherer
  13 *
  14 *   Change history:
  15 *
  16 *   06/29/2000  helena  Major rewrite of the callback APIs.
  17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
  18 *                       Changed implementation of toUnicode
  19 *                       function
  20 *   08/21/2000  Ram     Added support for ISO-2022-KR
  21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
  22 *                       ucnvebdc.c
  23 *   09/20/2000  Ram     Added support for ISO-2022-CN
  24 *                       Added implementations for getNextUChar()
  25 *                       for specific 2022 country variants.
  26 *   10/31/2000  Ram     Implemented offsets logic functions
  27 */
  28
  29 #include "unicode/utypes.h"
  30
  31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uset.h"
  35 #include "unicode/ucnv_err.h"
  36 #include "unicode/ucnv_cb.h"
  37 #include "unicode/utf16.h"
  38 #include "ucnv_imp.h"
  39 #include "ucnv_bld.h"
  40 #include "ucnv_cnv.h"
  41 #include "ucnvmbcs.h"
  42 #include "cstring.h"
  43 #include "cmemory.h"
  44 #include "uassert.h"
  45
  46 #ifdef U_ENABLE_GENERIC_ISO_2022
  47 /*
  48  * I am disabling the generic ISO-2022 converter after proposing to do so on
  49  * the icu mailing list two days ago.
  50  *
  51  * Reasons:
  52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  53  *    its designation sequences, single shifts with return to the previous state,
  54  *    switch-with-no-return to UTF-16BE or similar, etc.
  55  *    This is unlike the language-specific variants like ISO-2022-JP which
  56  *    require a much smaller repertoire of ISO-2022 features.
  57  *    These variants continue to be supported.
  58  * 2. I believe that no one is really using the generic ISO-2022 converter
  59  *    but rather always one of the language-specific variants.
  60  *    Note that ICU's generic ISO-2022 converter has always output one escape
  61  *    sequence followed by UTF-8 for the whole stream.
  62  * 3. Switching between subcharsets is extremely slow, because each time
  63  *    the previous converter is closed and a new one opened,
  64  *    without any kind of caching, least-recently-used list, etc.
  65  * 4. The code is currently buggy, and given the above it does not seem
  66  *    reasonable to spend the time on maintenance.
  67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  68  *    This means, for example, that when ISO-8859-7 is designated, the following
  69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  70  *    The ICU ISO-2022 converter does not handle this - and has no information
  71  *    about which subconverter would have to be shifted vs. which is designed
  72  *    for 7-bit ISO-2022.
  73  *
  74  * Markus Scherer 2003-dec-03
  75  */
  76 #endif
  77
  78 #if !UCONFIG_ONLY_HTML_CONVERSION
  79 static const char SHIFT_IN_STR[]  = "\x0F";
  80 // static const char SHIFT_OUT_STR[] = "\x0E";
  81 #endif
  82
  83 #define CR      0x0D
  84 #define LF      0x0A
  85 #define H_TAB   0x09
  86 #define V_TAB   0x0B
  87 #define SPACE   0x20
  88
  89 enum {
  90     HWKANA_START=0xff61,
  91     HWKANA_END=0xff9f
  92 };
  93
  94 /*
  95  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
  96  * as bytes 21..7E. (Subtract 0x80.)
  97  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
  98  * as bytes 20..7F. (Subtract 0x80.)
  99  * Do not encode C1 control codes with native bytes 80..9F
 100  * as bytes 00..1F (C0 control codes).
 101  */
 102 enum {
 103     GR94_START=0xa1,
 104     GR94_END=0xfe,
 105     GR96_START=0xa0,
 106     GR96_END=0xff
 107 };
 108
 109 /*
 110  * ISO 2022 control codes must not be converted from Unicode
 111  * because they would mess up the byte stream.
 112  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 113  * corresponding to SO, SI, and ESC.
 114  */
 115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
 116
 117 /* for ISO-2022-JP and -CN implementations */
 118 typedef enum  {
 119         /* shared values */
 120         INVALID_STATE=-1,
 121         ASCII = 0,
 122
 123         SS2_STATE=0x10,
 124         SS3_STATE,
 125
 126         /* JP */
 127         ISO8859_1 = 1 ,
 128         ISO8859_7 = 2 ,
 129         JISX201  = 3,
 130         JISX208 = 4,
 131         JISX212 = 5,
 132         GB2312  =6,
 133         KSC5601 =7,
 134         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
 135
 136         /* CN */
 137         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
 138         GB2312_1=1,
 139         ISO_IR_165=2,
 140         CNS_11643=3,
 141
 142         /*
 143          * these are used in StateEnum and ISO2022State variables,
 144          * but CNS_11643 must be used to index into myConverterArray[]
 145          */
 146         CNS_11643_0=0x20,
 147         CNS_11643_1,
 148         CNS_11643_2,
 149         CNS_11643_3,
 150         CNS_11643_4,
 151         CNS_11643_5,
 152         CNS_11643_6,
 153         CNS_11643_7
 154 } StateEnum;
 155
 156 /* is the StateEnum charset value for a DBCS charset? */
 157 #if UCONFIG_ONLY_HTML_CONVERSION
 158 #define IS_JP_DBCS(cs) (JISX208==(cs))
 159 #else
 160 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
 161 #endif
 162
 163 #define CSM(cs) ((uint16_t)1<<(cs))
 164
 165 /*
 166  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 167  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 168  *
 169  * Note: The converter uses some leniency:
 170  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 171  *   all versions, not just JIS7 and JIS8.
 172  * - ICU does not distinguish between different versions of JIS X 0208.
 173  */
 174 #if UCONFIG_ONLY_HTML_CONVERSION
 175 enum { MAX_JA_VERSION=0 };
 176 #else
 177 enum { MAX_JA_VERSION=4 };
 178 #endif
 179 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
 180     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
 181 #if !UCONFIG_ONLY_HTML_CONVERSION
 182     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
 183     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 184     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 185     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
 186 #endif
 187 };
 188
 189 typedef enum {
 190         ASCII1=0,
 191         LATIN1,
 192         SBCS,
 193         DBCS,
 194         MBCS,
 195         HWKANA
 196 }Cnv2022Type;
 197
 198 typedef struct ISO2022State {
 199     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
 200     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
 201     int8_t prevG;       /* g before single shift (SS2 or SS3) */
 202 } ISO2022State;
 203
 204 #define UCNV_OPTIONS_VERSION_MASK 0xf
 205 #define UCNV_2022_MAX_CONVERTERS 10
 206
 207 typedef struct{
 208     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
 209     UConverter *currentConverter;
 210     Cnv2022Type currentType;
 211     ISO2022State toU2022State, fromU2022State;
 212     uint32_t key;
 213     uint32_t version;
 214 #ifdef U_ENABLE_GENERIC_ISO_2022
 215     UBool isFirstBuffer;
 216 #endif
 217     UBool isEmptySegment;
 218     char name[30];
 219     char locale[3];
 220 }UConverterDataISO2022;
 221
 222 /* Protos */
 223 /* ISO-2022 ----------------------------------------------------------------- */
 224
 225 /*Forward declaration */
 226 U_CFUNC void
 227 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
 228                       UErrorCode * err);
 229 U_CFUNC void
 230 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
 231                                     UErrorCode * err);
 232
 233 #define ESC_2022 0x1B /*ESC*/
 234
 235 typedef enum
 236 {
 237         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
 238         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
 239         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
 240         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
 241 } UCNV_TableStates_2022;
 242
 243 /*
 244 * The way these state transition arrays work is:
 245 * ex : ESC$B is the sequence for JISX208
 246 *      a) First Iteration: char is ESC
 247 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 248 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 249 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 250 *             value of x is stored at escSeqStateTable_Key_2022[0]
 251 *        iii) Save this index as offset
 252 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 253 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 254 *     b) Switch on this state and continue to next char
 255 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 256 *             which is normalize_esq_chars_2022[36] == 4
 257 *         ii) x is currently 1(from above)
 258 *               x<<=5 -- x is now 32
 259 *               x+=normalize_esq_chars_2022[36]
 260 *               now x is 36
 261 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 262 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 263 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 264 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 265 *     c) Switch on this state and continue to next char
 266 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 267 *        ii) x is currently 36 (from above)
 268 *            x<<=5 -- x is now 1152
 269 *            x+=normalize_esq_chars_2022[66]
 270 *            now x is 1161
 271 *       iii) Search for this value in escSeqStateTable_Key_2022[]
 272 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 273 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 274 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 275 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
 276 */
 277
 278
 279 /*Below are the 3 arrays depicting a state transition table*/
 280 static const int8_t normalize_esq_chars_2022[256] = {
 281 /*       0      1       2       3       4      5       6        7       8       9           */
 282
 283          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 284         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 285         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
 286         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
 287         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
 288         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 289         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
 290         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
 291         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
 292         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 293         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 294         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 295         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 296         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 297         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 298         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 299         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 300         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 301         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 302         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 303         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 304         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 305         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 306         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 307         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 308         ,0     ,0      ,0      ,0      ,0      ,0
 309 };
 310
 311 #ifdef U_ENABLE_GENERIC_ISO_2022
 312 /*
 313  * When the generic ISO-2022 converter is completely removed, not just disabled
 314  * per #ifdef, then the following state table and the associated tables that are
 315  * dimensioned with MAX_STATES_2022 should be trimmed.
 316  *
 317  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 318  * the associated escape sequences starting with ESC ( B should be removed.
 319  * This includes the ones with key values 1097 and all of the ones above 1000000.
 320  *
 321  * For the latter, the tables can simply be truncated.
 322  * For the former, since the tables must be kept parallel, it is probably best
 323  * to simply duplicate an adjacent table cell, parallel in all tables.
 324  *
 325  * It may make sense to restructure the tables, especially by using small search
 326  * tables for the variants instead of indexing them parallel to the table here.
 327  */
 328 #endif
 329
 330 #define MAX_STATES_2022 74
 331 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
 332 /*   0           1           2           3           4           5           6           7           8           9           */
 333
 334      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
 335     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
 336     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
 337     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
 338     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
 339     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
 340     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
 341     ,35947631   ,35947635   ,35947636   ,35947638
 342 };
 343
 344 #ifdef U_ENABLE_GENERIC_ISO_2022
 345
 346 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 347  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
 348
 349      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
 350     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
 351     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
 352     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
 353     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
 354     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
 355     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
 356     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
 357 };
 358
 359 #endif
 360
 361 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
 362 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
 363      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 364     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 365     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
 366     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 367     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 368     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 369     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 370     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 371 };
 372
 373 /* Type def for refactoring changeState_2022 code*/
 374 typedef enum{
 375 #ifdef U_ENABLE_GENERIC_ISO_2022
 376     ISO_2022=0,
 377 #endif
 378     ISO_2022_JP=1,
 379 #if !UCONFIG_ONLY_HTML_CONVERSION
 380     ISO_2022_KR=2,
 381     ISO_2022_CN=3
 382 #endif
 383 } Variant2022;
 384
 385 /*********** ISO 2022 Converter Protos ***********/
 386 static void
 387 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
 388
 389 static void
 390  _ISO2022Close(UConverter *converter);
 391
 392 static void
 393 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
 394
 395 static const char*
 396 _ISO2022getName(const UConverter* cnv);
 397
 398 static void
 399 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
 400
 401 static UConverter *
 402 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
 403
 404 #ifdef U_ENABLE_GENERIC_ISO_2022
 405 static void
 406 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
 407 #endif
 408
 409 namespace {
 410
 411 /*const UConverterSharedData _ISO2022Data;*/
 412 extern const UConverterSharedData _ISO2022JPData;
 413
 414 #if !UCONFIG_ONLY_HTML_CONVERSION
 415 extern const UConverterSharedData _ISO2022KRData;
 416 extern const UConverterSharedData _ISO2022CNData;
 417 #endif
 418
 419 }  // namespace
 420
 421 /*************** Converter implementations ******************/
 422
 423 /* The purpose of this function is to get around gcc compiler warnings. */
 424 static inline void
 425 fromUWriteUInt8(UConverter *cnv,
 426                  const char *bytes, int32_t length,
 427                  uint8_t **target, const char *targetLimit,
 428                  int32_t **offsets,
 429                  int32_t sourceIndex,
 430                  UErrorCode *pErrorCode)
 431 {
 432     char *targetChars = (char *)*target;
 433     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
 434                          offsets, sourceIndex, pErrorCode);
 435     *target = (uint8_t*)targetChars;
 436
 437 }
 438
 439 static inline void
 440 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
 441     if(myConverterData->version == 1) {
 442         UConverter *cnv = myConverterData->currentConverter;
 443
 444         cnv->toUnicodeStatus=0;     /* offset */
 445         cnv->mode=0;                /* state */
 446         cnv->toULength=0;           /* byteIndex */
 447     }
 448 }
 449
 450 static inline void
 451 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
 452    /* in ISO-2022-KR the designator sequence appears only once
 453     * in a file so we append it only once
 454     */
 455     if( converter->charErrorBufferLength==0){
 456
 457         converter->charErrorBufferLength = 4;
 458         converter->charErrorBuffer[0] = 0x1b;
 459         converter->charErrorBuffer[1] = 0x24;
 460         converter->charErrorBuffer[2] = 0x29;
 461         converter->charErrorBuffer[3] = 0x43;
 462     }
 463     if(myConverterData->version == 1) {
 464         UConverter *cnv = myConverterData->currentConverter;
 465
 466         cnv->fromUChar32=0;
 467         cnv->fromUnicodeStatus=1;   /* prevLength */
 468     }
 469 }
 470
 471 static void
 472 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
 473
 474     char myLocale[6]={' ',' ',' ',' ',' ',' '};
 475
 476     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
 477     if(cnv->extraInfo != NULL) {
 478         UConverterNamePieces stackPieces;
 479         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
 480         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
 481         uint32_t version;
 482
 483         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
 484
 485         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
 486         myConverterData->currentType = ASCII1;
 487         cnv->fromUnicodeStatus =FALSE;
 488         if(pArgs->locale){
 489             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
 490         }
 491         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
 492         myConverterData->version = version;
 493         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
 494             (myLocale[2]=='_' || myLocale[2]=='\0'))
 495         {
 496             /* open the required converters and cache them */
 497             if(version>MAX_JA_VERSION) {
 498                 // ICU 55 fails to open a converter for an unsupported version.
 499                 // Previously, it fell back to version 0, but that would yield
 500                 // unexpected behavior.
 501                 *errorCode = U_MISSING_RESOURCE_ERROR;
 502                 return;
 503             }
 504             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
 505                 myConverterData->myConverterArray[ISO8859_7] =
 506                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
 507             }
 508             myConverterData->myConverterArray[JISX208] =
 509                 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
 510             if(jpCharsetMasks[version]&CSM(JISX212)) {
 511                 myConverterData->myConverterArray[JISX212] =
 512                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
 513             }
 514             if(jpCharsetMasks[version]&CSM(GB2312)) {
 515                 myConverterData->myConverterArray[GB2312] =
 516                     ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
 517             }
 518             if(jpCharsetMasks[version]&CSM(KSC5601)) {
 519                 myConverterData->myConverterArray[KSC5601] =
 520                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
 521             }
 522
 523             /* set the function pointers to appropriate funtions */
 524             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 525             uprv_strcpy(myConverterData->locale,"ja");
 526
 527             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
 528             size_t len = uprv_strlen(myConverterData->name);
 529             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
 530             myConverterData->name[len+1]='\0';
 531         }
 532 #if !UCONFIG_ONLY_HTML_CONVERSION
 533         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
 534             (myLocale[2]=='_' || myLocale[2]=='\0'))
 535         {
 536             if(version>1) {
 537                 // ICU 55 fails to open a converter for an unsupported version.
 538                 // Previously, it fell back to version 0, but that would yield
 539                 // unexpected behavior.
 540                 *errorCode = U_MISSING_RESOURCE_ERROR;
 541                 return;
 542             }
 543             const char *cnvName;
 544             if(version==1) {
 545                 cnvName="icu-internal-25546";
 546             } else {
 547                 cnvName="ibm-949";
 548                 myConverterData->version=version=0;
 549             }
 550             if(pArgs->onlyTestIsLoadable) {
 551                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
 552                 uprv_free(cnv->extraInfo);
 553                 cnv->extraInfo=NULL;
 554                 return;
 555             } else {
 556                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
 557                 if (U_FAILURE(*errorCode)) {
 558                     _ISO2022Close(cnv);
 559                     return;
 560                 }
 561
 562                 if(version==1) {
 563                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
 564                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
 565                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
 566                 }else{
 567                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
 568                 }
 569
 570                 /* initialize the state variables */
 571                 setInitialStateToUnicodeKR(cnv, myConverterData);
 572                 setInitialStateFromUnicodeKR(cnv, myConverterData);
 573
 574                 /* set the function pointers to appropriate funtions */
 575                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
 576                 uprv_strcpy(myConverterData->locale,"ko");
 577             }
 578         }
 579         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
 580             (myLocale[2]=='_' || myLocale[2]=='\0'))
 581         {
 582             if(version>2) {
 583                 // ICU 55 fails to open a converter for an unsupported version.
 584                 // Previously, it fell back to version 0, but that would yield
 585                 // unexpected behavior.
 586                 *errorCode = U_MISSING_RESOURCE_ERROR;
 587                 return;
 588             }
 589
 590             /* open the required converters and cache them */
 591             myConverterData->myConverterArray[GB2312_1] =
 592                 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
 593             if(version==1) {
 594                 myConverterData->myConverterArray[ISO_IR_165] =
 595                     ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
 596             }
 597             myConverterData->myConverterArray[CNS_11643] =
 598                 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
 599
 600
 601             /* set the function pointers to appropriate funtions */
 602             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
 603             uprv_strcpy(myConverterData->locale,"cn");
 604
 605             if (version==0){
 606                 myConverterData->version = 0;
 607                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
 608             }else if (version==1){
 609                 myConverterData->version = 1;
 610                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
 611             }else {
 612                 myConverterData->version = 2;
 613                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
 614             }
 615         }
 616 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
 617         else{
 618 #ifdef U_ENABLE_GENERIC_ISO_2022
 619             myConverterData->isFirstBuffer = TRUE;
 620
 621             /* append the UTF-8 escape sequence */
 622             cnv->charErrorBufferLength = 3;
 623             cnv->charErrorBuffer[0] = 0x1b;
 624             cnv->charErrorBuffer[1] = 0x25;
 625             cnv->charErrorBuffer[2] = 0x42;
 626
 627             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
 628             /* initialize the state variables */
 629             uprv_strcpy(myConverterData->name,"ISO_2022");
 630 #else
 631             *errorCode = U_MISSING_RESOURCE_ERROR;
 632             // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
 633             // data loading error code.
 634             return;
 635 #endif
 636         }
 637
 638         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
 639
 640         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
 641             _ISO2022Close(cnv);
 642         }
 643     } else {
 644         *errorCode = U_MEMORY_ALLOCATION_ERROR;
 645     }
 646 }
 647
 648
 649 static void
 650 _ISO2022Close(UConverter *converter) {
 651     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
 652     UConverterSharedData **array = myData->myConverterArray;
 653     int32_t i;
 654
 655     if (converter->extraInfo != NULL) {
 656         /*close the array of converter pointers and free the memory*/
 657         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
 658             if(array[i]!=NULL) {
 659                 ucnv_unloadSharedDataIfReady(array[i]);
 660             }
 661         }
 662
 663         ucnv_close(myData->currentConverter);
 664
 665         if(!converter->isExtraLocal){
 666             uprv_free (converter->extraInfo);
 667             converter->extraInfo = NULL;
 668         }
 669     }
 670 }
 671
 672 static void
 673 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
 674     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
 675     if(choice<=UCNV_RESET_TO_UNICODE) {
 676         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
 677         myConverterData->key = 0;
 678         myConverterData->isEmptySegment = FALSE;
 679     }
 680     if(choice!=UCNV_RESET_TO_UNICODE) {
 681         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
 682     }
 683 #ifdef U_ENABLE_GENERIC_ISO_2022
 684     if(myConverterData->locale[0] == 0){
 685         if(choice<=UCNV_RESET_TO_UNICODE) {
 686             myConverterData->isFirstBuffer = TRUE;
 687             myConverterData->key = 0;
 688             if (converter->mode == UCNV_SO){
 689                 ucnv_close (myConverterData->currentConverter);
 690                 myConverterData->currentConverter=NULL;
 691             }
 692             converter->mode = UCNV_SI;
 693         }
 694         if(choice!=UCNV_RESET_TO_UNICODE) {
 695             /* re-append UTF-8 escape sequence */
 696             converter->charErrorBufferLength = 3;
 697             converter->charErrorBuffer[0] = 0x1b;
 698             converter->charErrorBuffer[1] = 0x28;
 699             converter->charErrorBuffer[2] = 0x42;
 700         }
 701     }
 702     else
 703 #endif
 704     {
 705         /* reset the state variables */
 706         if(myConverterData->locale[0] == 'k'){
 707             if(choice<=UCNV_RESET_TO_UNICODE) {
 708                 setInitialStateToUnicodeKR(converter, myConverterData);
 709             }
 710             if(choice!=UCNV_RESET_TO_UNICODE) {
 711                 setInitialStateFromUnicodeKR(converter, myConverterData);
 712             }
 713         }
 714     }
 715 }
 716
 717 static const char*
 718 _ISO2022getName(const UConverter* cnv){
 719     if(cnv->extraInfo){
 720         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
 721         return myData->name;
 722     }
 723     return NULL;
 724 }
 725
 726
 727 /*************** to unicode *******************/
 728 /****************************************************************************
 729  * Recognized escape sequences are
 730  * <ESC>(B  ASCII
 731  * <ESC>.A  ISO-8859-1
 732  * <ESC>.F  ISO-8859-7
 733  * <ESC>(J  JISX-201
 734  * <ESC>(I  JISX-201
 735  * <ESC>$B  JISX-208
 736  * <ESC>$@  JISX-208
 737  * <ESC>$(D JISX-212
 738  * <ESC>$A  GB2312
 739  * <ESC>$(C KSC5601
      *
      * The table below is indexed with the offset that getKey_2022() stores for
      * a recognized escape sequence. changeState_2022() reads the entry as the
      * StateEnum to act on for ISO-2022-JP; an INVALID_STATE entry makes it
      * report U_UNSUPPORTED_ESCAPE_SEQUENCE.
      * The row comment gives the last digit of the index; each row holds ten
      * entries.
 740  */
 741 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
 742 /*      0                1               2               3               4               5               6               7               8               9    */
 743     INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 744     ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
 745     ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 746     ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
 747     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 748     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 749     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 750     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 751 };
 752
 753 #if !UCONFIG_ONLY_HTML_CONVERSION
 754 /*************** to unicode *******************/
     /*
      * ISO-2022-CN table, indexed with the offset that getKey_2022() stores for
      * a recognized escape sequence. changeState_2022() reads the entry as the
      * StateEnum to act on; an INVALID_STATE entry makes it report
      * U_UNSUPPORTED_ESCAPE_SEQUENCE.
      * The row comment gives the last digit of the index; each row holds ten
      * entries.
      */
 755 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
 756 /*      0                1               2               3               4               5               6               7               8               9    */
 757      INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 758     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 759     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 760     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 761     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
 762     ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 763     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 764     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
 765 };
 766 #endif
 767
 768
 769 static UCNV_TableStates_2022
 770 getKey_2022(char c,int32_t* key,int32_t* offset){
 771     int32_t togo;
 772     int32_t low = 0;
 773     int32_t hi = MAX_STATES_2022;
 774     int32_t oldmid=0;
 775
 776     togo = normalize_esq_chars_2022[(uint8_t)c];
 777     if(togo == 0) {
 778         /* not a valid character anywhere in an escape sequence */
 779         *key = 0;
 780         *offset = 0;
 781         return INVALID_2022;
 782     }
 783     togo = (*key << 5) + togo;
 784
 785     while (hi != low)  /*binary search*/{
 786
 787         int32_t mid = (hi+low) >> 1; /*Finds median*/
 788
 789         if (mid == oldmid)
 790             break;
 791
 792         if (escSeqStateTable_Key_2022[mid] > togo){
 793             hi = mid;
 794         }
 795         else if (escSeqStateTable_Key_2022[mid] < togo){
 796             low = mid;
 797         }
 798         else /*we found it*/{
 799             *key = togo;
 800             *offset = mid;
 801             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
 802         }
 803         oldmid = mid;
 804
 805     }
 806
 807     *key = 0;
 808     *offset = 0;
 809     return INVALID_2022;
 810 }
 811
 812 /*runs through a state machine to determine the escape sequence - codepage correspondence
      *
      * Reads bytes from *source (advancing it) up to sourceLimit, appends each one
      * to _this->toUBytes[] and feeds it to getKey_2022(), starting from the key
      * saved in the converter data so that a sequence split across buffers can be
      * continued.
      *
      * Outcomes:
      * - Input ends while the sequence is still a valid prefix: the key is saved
      *   and the function returns without setting an error.
      * - The bytes match no sequence: *err = U_ILLEGAL_ESCAPE_SEQUENCE. Only the
      *   first byte stays in toUBytes[]; the others are backed out, either by
      *   moving *source back or by storing them in preToU[] for replay.
      * - A complete sequence was matched: the action depends on var (see the
      *   switch below). A sequence the variant does not accept yields
      *   U_UNSUPPORTED_ESCAPE_SEQUENCE and toUCallbackReason = UCNV_UNASSIGNED.
      *   On success toULength is reset to 0.
 813  */
 814 static void
 815 changeState_2022(UConverter* _this,
 816                 const char** source,
 817                 const char* sourceLimit,
 818                 Variant2022 var,
 819                 UErrorCode* err){
 820     UCNV_TableStates_2022 value;
 821     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
 822     uint32_t key = myData2022->key;
 823     int32_t offset = 0;
         /* number of bytes already held in toUBytes[] when this call started */
 824     int8_t initialToULength = _this->toULength;
 825     char c;
 826
 827     value = VALID_NON_TERMINAL_2022;
 828     while (*source < sourceLimit) {
 829         c = *(*source)++;
             /* keep every consumed byte so it can be reported or backed out later */
 830         _this->toUBytes[_this->toULength++]=(uint8_t)c;
 831         value = getKey_2022(c,(int32_t *) &key, &offset);
 832
 833         switch (value){
 834
 835         case VALID_NON_TERMINAL_2022 :
 836             /* continue with the loop */
 837             break;
 838
 839         case VALID_TERMINAL_2022:
 840             key = 0;
 841             goto DONE;
 842
 843         case INVALID_2022:
 844             goto DONE;
 845
 846         case VALID_MAYBE_TERMINAL_2022:
 847 #ifdef U_ENABLE_GENERIC_ISO_2022
 848             /* ESC ( B is ambiguous only for ISO_2022 itself */
 849             if(var == ISO_2022) {
 850                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
 851                 _this->toULength = 0;
 852
 853                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
 854
 855                 /* continue with the loop */
 856                 value = VALID_NON_TERMINAL_2022;
 857                 break;
 858             } else
 859 #endif
 860             {
 861                 /* not ISO_2022 itself, finish here */
 862                 value = VALID_TERMINAL_2022;
 863                 key = 0;
 864                 goto DONE;
 865             }
 866         }
 867     }
 868
 869 DONE:
         /* persist the matcher state; it is nonzero only for an unfinished sequence */
 870     myData2022->key = key;
 871
 872     if (value == VALID_NON_TERMINAL_2022) {
 873         /* indicate that the escape sequence is incomplete: key!=0 */
 874         return;
 875     } else if (value == INVALID_2022 ) {
 876         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 877     } else /* value == VALID_TERMINAL_2022 */ {
 878         switch(var){
 879 #ifdef U_ENABLE_GENERIC_ISO_2022
 880         case ISO_2022:
 881         {
 882             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
 883             if(chosenConverterName == NULL) {
 884                 /* SS2 or SS3 */
 885                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 886                 _this->toUCallbackReason = UCNV_UNASSIGNED;
 887                 return;
 888             }
 889
 890             _this->mode = UCNV_SI;
 891             ucnv_close(myData2022->currentConverter);
                 /* NOTE(review): myUConverter has no declaration visible in this function - confirm it is declared elsewhere */
 892             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
 893             if(U_SUCCESS(*err)) {
 894                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
 895                 _this->mode = UCNV_SO;
 896             }
 897             break;
 898         }
 899 #endif
 900         case ISO_2022_JP:
 901             {
 902                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
 903                 switch(tempState) {
 904                 case INVALID_STATE:
 905                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 906                     break;
 907                 case SS2_STATE:
 908                     if(myData2022->toU2022State.cs[2]!=0) {
                             /* remember the current g in prevG unless g is already 2 or higher */
 909                         if(myData2022->toU2022State.g<2) {
 910                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 911                         }
 912                         myData2022->toU2022State.g=2;
 913                     } else {
 914                         /* illegal to have SS2 before a matching designator */
 915                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 916                     }
 917                     break;
 918                 /* case SS3_STATE: not used in ISO-2022-JP-x */
 919                 case ISO8859_1:
 920                 case ISO8859_7:
                         /* accept only charsets enabled in the mask for this converter version */
 921                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 922                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 923                     } else {
 924                         /* G2 charset for SS2 */
 925                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
 926                     }
 927                     break;
 928                 default:
 929                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 930                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 931                     } else {
 932                         /* G0 charset */
 933                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
 934                     }
 935                     break;
 936                 }
 937             }
 938             break;
 939 #if !UCONFIG_ONLY_HTML_CONVERSION
 940         case ISO_2022_CN:
 941             {
 942                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
 943                 switch(tempState) {
 944                 case INVALID_STATE:
 945                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 946                     break;
 947                 case SS2_STATE:
 948                     if(myData2022->toU2022State.cs[2]!=0) {
 949                         if(myData2022->toU2022State.g<2) {
 950                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 951                         }
 952                         myData2022->toU2022State.g=2;
 953                     } else {
 954                         /* illegal to have SS2 before a matching designator */
 955                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 956                     }
 957                     break;
 958                 case SS3_STATE:
 959                     if(myData2022->toU2022State.cs[3]!=0) {
 960                         if(myData2022->toU2022State.g<2) {
 961                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 962                         }
 963                         myData2022->toU2022State.g=3;
 964                     } else {
 965                         /* illegal to have SS3 before a matching designator */
 966                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 967                     }
 968                     break;
 969                 case ISO_IR_165:
                         /* version 0 rejects ISO-IR-165; other versions store it in cs[1] below */
 970                     if(myData2022->version==0) {
 971                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 972                         break;
 973                     }
 974                     U_FALLTHROUGH;
 975                 case GB2312_1:
 976                     U_FALLTHROUGH;
 977                 case CNS_11643_1:
 978                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
 979                     break;
 980                 case CNS_11643_2:
 981                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
 982                     break;
 983                 default:
 984                     /* other CNS 11643 planes */
 985                     if(myData2022->version==0) {
 986                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 987                     } else {
 988                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
 989                     }
 990                     break;
 991                 }
 992             }
 993             break;
 994         case ISO_2022_KR:
                 /* table offset 0x30 is the only escape sequence accepted for ISO-2022-KR */
 995             if(offset==0x30){
 996                 /* nothing to be done, just accept this one escape sequence */
 997             } else {
 998                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 999             }
1000             break;
1001 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
1002
1003         default:
1004             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1005             break;
1006         }
1007     }
1008     if(U_SUCCESS(*err)) {
             /* the sequence was fully handled; no bytes remain pending */
1009         _this->toULength = 0;
1010     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1011         if(_this->toULength>1) {
1012             /*
1013              * Ticket 5691: consistent illegal sequences:
1014              * - We include at least the first byte (ESC) in the illegal sequence.
1015              * - If any of the non-initial bytes could be the start of a character,
1016              *   we stop the illegal sequence before the first one of those.
1017              *   In escape sequences, all following bytes are "printable", that is,
1018              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1019              *   they are valid single/lead bytes.
1020              *   For simplicity, we always only report the initial ESC byte as the
1021              *   illegal sequence and back out all other bytes we looked at.
1022              */
1023             /* Back out some bytes. */
1024             int8_t backOutDistance=_this->toULength-1;
1025             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1026             if(backOutDistance<=bytesFromThisBuffer) {
1027                 /* same as initialToULength<=1 */
1028                 *source-=backOutDistance;
1029             } else {
1030                 /* Back out bytes from the previous buffer: Need to replay them. */
1031                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1032                 /* same as -(initialToULength-1) */
1033                 /* preToULength is negative! */
1034                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1035                 *source-=bytesFromThisBuffer;
1036             }
1037             _this->toULength=1;
1038         }
1039     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1040         _this->toUCallbackReason = UCNV_UNASSIGNED;
1041     }
1042 }
1043
1044 #if !UCONFIG_ONLY_HTML_CONVERSION
1045 /*Checks the characters of the buffer against valid 2022 escape sequences
1046 *if the match we return a pointer to the initial start of the sequence otherwise
1047 *we return sourceLimit
1048 */
1049 /*for 2022 looks ahead in the stream
1050  *to determine the longest possible convertible
1051  *data stream
1052  */
1053 static inline const char*
1054 getEndOfBuffer_2022(const char** source,
1055                    const char* sourceLimit,
1056                    UBool /*flush*/){
1057
1058     const char* mySource = *source;
1059
1060 #ifdef U_ENABLE_GENERIC_ISO_2022
1061     if (*source >= sourceLimit)
1062         return sourceLimit;
1063
1064     do{
1065
1066         if (*mySource == ESC_2022){
1067             int8_t i;
1068             int32_t key = 0;
1069             int32_t offset;
1070             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1071
1072             /* Kludge: I could not
1073             * figure out the reason for validating an escape sequence
1074             * twice - once here and once in changeState_2022().
1075             * is it possible to have an ESC character in a ISO2022
1076             * byte stream which is valid in a code page? Is it legal?
1077             */
1078             for (i=0;
1079             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1080             i++) {
1081                 value =  getKey_2022(*(mySource+i), &key, &offset);
1082             }
1083             if (value > 0 || *mySource==ESC_2022)
1084                 return mySource;
1085
1086             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1087                 return sourceLimit;
1088         }
1089     }while (++mySource < sourceLimit);
1090
1091     return sourceLimit;
1092 #else
1093     while(mySource < sourceLimit && *mySource != ESC_2022) {
1094         ++mySource;
1095     }
1096     return mySource;
1097 #endif
1098 }
1099 #endif
1100
1101 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1102  * any future change in _MBCSFromUChar32() function should be reflected here.
1103  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1104  */
1105 static inline int32_t
1106 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1107                                          UChar32 c,
1108                                          uint32_t* value,
1109                                          UBool useFallback,
1110                                          int outputType)
1111 {
1112     const int32_t *cx;
1113     const uint16_t *table;
1114     uint32_t stage2Entry;
1115     uint32_t myValue;
1116     int32_t length;
1117     const uint8_t *p;
1118     /*
1119      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1120      * Use internal version of ucnv_open() that verifies that the new structures are available,
1121      * else U_INTERNAL_PROGRAM_ERROR.
1122      */
1123     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1124     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1125         table=sharedData->mbcs.fromUnicodeTable;
1126         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1127         /* get the bytes and the length for the output */
1128         if(outputType==MBCS_OUTPUT_2){
1129             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1130             if(myValue<=0xff) {
1131                 length=1;
1132             } else {
1133                 length=2;
1134             }
1135         } else /* outputType==MBCS_OUTPUT_3 */ {
1136             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1137             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1138             if(myValue<=0xff) {
1139                 length=1;
1140             } else if(myValue<=0xffff) {
1141                 length=2;
1142             } else {
1143                 length=3;
1144             }
1145         }
1146         /* is this code point assigned, or do we use fallbacks? */
1147         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1148             /* assigned */
1149             *value=myValue;
1150             return length;
1151         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1152             /*
1153              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1154              * There is no way with this data structure for fallback output
1155              * to be a zero byte.
1156              */
1157             *value=myValue;
1158             return -length;
1159         }
1160     }
1161
1162     cx=sharedData->mbcs.extIndexes;
1163     if(cx!=NULL) {
1164         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1165     }
1166
1167     /* unassigned */
1168     return 0;
1169 }
1170
1171 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1172  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1173  * @param retval pointer to output byte
1174  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1175  */
1176 static inline int32_t
1177 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1178                                        UChar32 c,
1179                                        uint32_t* retval,
1180                                        UBool useFallback)
1181 {
1182     const uint16_t *table;
1183     int32_t value;
1184     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1185     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1186         return 0;
1187     }
1188     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1189     table=sharedData->mbcs.fromUnicodeTable;
1190     /* get the byte for the output */
1191     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1192     /* is this code point assigned, or do we use fallbacks? */
1193     *retval=(uint32_t)(value&0xff);
1194     if(value>=0xf00) {
1195         return 1;  /* roundtrip */
1196     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1197         return -1;  /* fallback taken */
1198     } else {
1199         return 0;  /* no mapping */
1200     }
1201 }
1202
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range. This includes values wider than two bytes:
 * the lead byte is taken as value>>8 without truncation, so any bits above
 * bit 15 push it out of the A1..FE range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t lead = value >> 8;
    const uint32_t trail = value & 0xff;

    if( lead >= 0xa1 && lead <= 0xfe &&
        trail >= 0xa1 && trail <= 0xfe
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1219
1220 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given a 2-byte 2022
 * code point with each byte in the range 21..7E, it returns the value with
 * 0x80 added to each byte, which puts each byte in the range A1..FE.
 * Any other input, including values wider than two bytes, is returned
 * unchanged.
 * (This function sits inside an "#if 0" region and is currently not compiled.)
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t lead = value >> 8;
    const uint32_t trail = value & 0xff;

    if( lead >= 0x21 && lead <= 0x7e &&
        trail >= 0x21 && trail <= 0x7e) {
        return value + 0x8080;  /* shift up to a1..fe byte range */
    } else {
        return value;
    }
}
1236 #endif
1237
1238 #ifdef U_ENABLE_GENERIC_ISO_2022
1239
1240 /**********************************************************************************
1241 *  ISO-2022 Converter
1242 *
     *  To-Unicode conversion for the generic ISO-2022 converter.
     *
     *  Repeats, while input remains:
     *  - If no escape sequence is pending (key == 0), the bytes up to the limit
     *    returned by getEndOfBuffer_2022() are passed to the current
     *    sub-converter through ucnv_toUnicode(). An "ASCII" converter is opened
     *    first if there is no current sub-converter yet.
     *  - The function returns early on any error, when the input is exhausted,
     *    when offsets are requested and progress was made, or when the
     *    sub-converter holds partial input before the limit.
     *  - Otherwise changeState_2022() is called with variant ISO_2022 to process
     *    the escape sequence at the current position.
1243 *
1244 */
1245
1246 static void
1247 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1248                                                            UErrorCode* err){
1249     const char* mySourceLimit, *realSourceLimit;
1250     const char* sourceStart;
1251     const UChar* myTargetStart;
1252     UConverter* saveThis;
1253     UConverterDataISO2022* myData;
1254     int8_t length;
1255
1256     saveThis = args->converter;
1257     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1258
1259     realSourceLimit = args->sourceLimit;
1260     while (args->source < realSourceLimit) {
1261         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1262             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1263             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1264
1265             if(args->source < mySourceLimit) {
1266                 if(myData->currentConverter==NULL) {
1267                     myData->currentConverter = ucnv_open("ASCII",err);
1268                     if(U_FAILURE(*err)){
1269                         return;
1270                     }
1271
1272                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1273                     saveThis->mode = UCNV_SO;
1274                 }
1275
1276                 /* convert to before the ESC or until the end of the buffer */
1277                 myData->isFirstBuffer=FALSE;
1278                 sourceStart = args->source;
1279                 myTargetStart = args->target;
                     /* temporarily point args at the sub-converter; restored right after the call */
1280                 args->converter = myData->currentConverter;
1281                 ucnv_toUnicode(args->converter,
1282                     &args->target,
1283                     args->targetLimit,
1284                     &args->source,
1285                     mySourceLimit,
1286                     args->offsets,
1287                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
1288                     err);
1289                 args->converter = saveThis;
1290
1291                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1292                     /* move the overflow buffer */
1293                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1294                     myData->currentConverter->UCharErrorBufferLength = 0;
1295                     if(length > 0) {
1296                         uprv_memcpy(saveThis->UCharErrorBuffer,
1297                                     myData->currentConverter->UCharErrorBuffer,
1298                                     length*U_SIZEOF_UCHAR);
1299                     }
1300                     return;
1301                 }
1302
1303                 /*
1304                  * At least one of:
1305                  * -Error while converting
1306                  * -Done with entire buffer
1307                  * -Need to write offsets or update the current offset
1308                  *  (leave that up to the code in ucnv.c)
1309                  *
1310                  * or else we just stopped at an ESC byte and continue with changeState_2022()
1311                  */
                     /*
                      * The third parenthesis below closes only after the toULength test;
                      * since && binds tighter than ||, this reads as
                      * (offsets != NULL && progress made) || (partial input before the limit).
                      */
1312                 if (U_FAILURE(*err) ||
1313                     (args->source == realSourceLimit) ||
1314                     (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1315                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1316                 ) {
1317                     /* copy partial or error input for truncated detection and error handling */
1318                     if(U_FAILURE(*err)) {
1319                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1320                         if(length > 0) {
1321                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1322                         }
1323                     } else {
1324                         length = saveThis->toULength = myData->currentConverter->toULength;
1325                         if(length > 0) {
1326                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1327                             if(args->source < mySourceLimit) {
1328                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1329                             }
1330                         }
1331                     }
1332                     return;
1333                 }
1334             }
1335         }
1336
             /* reached with a pending escape sequence (key != 0), or after converting up to the limit found above */
1337         sourceStart = args->source;
1338         changeState_2022(args->converter,
1339                &(args->source),
1340                realSourceLimit,
1341                ISO_2022,
1342                err);
1343         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1344             /* let the ucnv.c code update its current offset */
1345             return;
1346         }
1347     }
1348 }
1349
1350 #endif
1351
1352 /*
1353  * To Unicode Callback helper function
1354  */
1355 static void
1356 toUnicodeCallback(UConverter *cnv,
1357                   const uint32_t sourceChar, const uint32_t targetUniChar,
1358                   UErrorCode* err){
1359     if(sourceChar>0xff){
1360         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1361         cnv->toUBytes[1] = (uint8_t)sourceChar;
1362         cnv->toULength = 2;
1363     }
1364     else{
1365         cnv->toUBytes[0] =(char) sourceChar;
1366         cnv->toULength = 1;
1367     }
1368
1369     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1370         *err = U_INVALID_CHAR_FOUND;
1371     }
1372     else{
1373         *err = U_ILLEGAL_CHAR_FOUND;
1374     }
1375 }
1376
1377 /**************************************ISO-2022-JP*************************************************/
1378
1379 /************************************** IMPORTANT **************************************************
1380 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1381 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1382 * The converter iterates over each Unicode codepoint
1383 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1384 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1385 * would do as far as possible.
1386 *
1387 * If the implementation of these macros or structure of sharedData struct change in the future, make
1388 * sure that ISO-2022 is also changed.
1389 ***************************************************************************************************
1390 */
1391
1392 /***************************************************************************************************
1393 * Rules for ISO-2022-jp encoding
1394 * (i)   Escape sequences must be fully contained within a line; they should not
1395 *       span new lines or CRs
1396 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1397 *       JIS-Roman character escape sequence should follow before the line terminates
1398 * (iii) If the first character on the line is represented by two bytes then a two
1399 *       byte character escape sequence should precede it
1400 * (iv)  If no escape sequence is encountered then the characters are ASCII
1401 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1402 *       and invoked with SS2 (ESC N).
1403 * (vi)  If there is any G0 designation in text, there must be a switch to
1404 *       ASCII or to JIS X 0201-Roman before a space character (but not
1405 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1406 *       characters such as tab or CRLF.
1407 * (vii) Supported encodings:
1408 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1409 *
1410 *  source : RFC-1554
1411 *
1412 *          JISX201, JISX208,JISX212 : new .cnv data files created
1413 *          KSC5601 : alias to ibm-949 mapping table
1414 *          GB2312 : alias to ibm-1386 mapping table
1415 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1416 *          ISO-8859-7 : alias to ibm-9409 mapping table
1417 */
1418
1419 /* preference order of JP charsets */
1420 static const StateEnum jpCharsetPref[]={
1421     ASCII,
1422     JISX201,
1423     ISO8859_1,
1424     JISX208,
1425     ISO8859_7,
1426     JISX212,
1427     GB2312,
1428     KSC5601,
1429     HWKANA_7BIT
1430 };
1431
1432 /*
1433  * The escape sequences must be in order of the enum constants like JISX201  = 3,
1434  * not in order of jpCharsetPref[]!
1435  */
/*
 * Designation escape sequences, indexed by charset enum value.
 * Each entry is ESC (0x1B) followed by the intermediate/final bytes shown.
 */
static const char escSeqChars[][6] = {
    "\x1B" "(B",    /* ASCII       : 1B 28 42    */
    "\x1B" ".A",    /* ISO-8859-1  : 1B 2E 41    */
    "\x1B" ".F",    /* ISO-8859-7  : 1B 2E 46    */
    "\x1B" "(J",    /* JISX-201    : 1B 28 4A    */
    "\x1B" "$B",    /* JISX-208    : 1B 24 42    */
    "\x1B" "$(D",   /* JISX-212    : 1B 24 28 44 */
    "\x1B" "$A",    /* GB2312      : 1B 24 41    */
    "\x1B" "$(C",   /* KSC5601     : 1B 24 28 43 */
    "\x1B" "(I"     /* HWKANA_7BIT : 1B 28 49    */
};
/* Byte length of each designation escape sequence, in the same order as escSeqChars[]. */
static const int8_t escSeqCharsLen[] = {
    3,  /* ASCII       <ESC>(B  */
    3,  /* ISO-8859-1  <ESC>.A  */
    3,  /* ISO-8859-7  <ESC>.F  */
    3,  /* JISX-201    <ESC>(J  */
    3,  /* JISX-208    <ESC>$B  */
    4,  /* JISX-212    <ESC>$(D */
    3,  /* GB2312      <ESC>$A  */
    4,  /* KSC5601     <ESC>$(C */
    3   /* HWKANA_7BIT <ESC>(I  */
};
1459
1460 /*
1461 * The iteration over various code pages works this way:
1462 * i)   Get the currentState from myConverterData->currentState
1463 * ii)  Check if the character is mapped to a valid character in the currentState
1464 *      Yes ->  a) set the initIterState to currentState
1465 *       b) remain in this state until an invalid character is found
1466 *      No  ->  a) go to the next code page and find the character
1467 * iii) Before changing the state, increment the current state and check whether
1468 *      it is equal to the initIterState
1469 *      Yes ->  A character that cannot be represented in any of the supported encodings
1470 *       break and return a U_INVALID_CHARACTER error
1471 *      No  ->  Continue and find the character in next code page
1472 *
1473 *
1474 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1475 */
1476
1477 /* Map 00..7F to Unicode according to JIS X 0201. */
/*
 * JIS X 0201 (7-bit half) to Unicode.
 * Only two positions differ from the identity mapping:
 * 0x5C maps to U+00A5 and 0x7E maps to U+203E.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1490
1491 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
/*
 * Unicode to JIS X 0201 (7-bit half).
 * U+00A5 maps to 0x5C and U+203E maps to 0x7E. U+0000..U+007F map to
 * themselves except U+005C and U+007E, which are unmappable.
 * Returns 0xfffe for every unmappable input.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte positions are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1505
1506 /*
1507  * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1508  * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1509  * Return 0 if the byte pair is out of range.
1510  */
/*
 * Map a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0) to the corresponding JIS X 0208 pair of 21..7E bytes.
 * Returns 0 when the pair lies beyond the JIS X 0208 range (value > 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    const uint8_t trail = (uint8_t)value;

    /* Rebase the lead byte; leads up to 0x9f and leads from 0xe0 use different offsets. */
    uint32_t result = value & 0xff00;
    result -= (result <= 0x9f00) ? 0x7000 : 0xb000;
    result <<= 1;

    if(trail > 0x9e) {
        /* trail 9F..FC: second row of the pair produced by this lead byte */
        return result | (trail - 0x7e);
    }

    /* trail up to 9E: first row of the pair; the offset depends on which side of 7E the trail is */
    result -= 0x100;
    return result | (trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}
1541
1542 /*
1543  * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
1544  * If either byte is outside 21..7E make sure that the result is not valid
1545  * for Shift-JIS so that the converter catches it.
1546  * Some invalid byte values already turn into equally invalid Shift-JIS
1547  * byte values and need not be tested explicitly.
1548  */
/*
 * Map a JIS X 0208 byte pair (c1, c2) to Shift-JIS, written to bytes[0..1].
 * A trail byte outside 21..7E that would not already come out invalid is
 * replaced by 0, and a lead byte that lands outside the two handled ranges
 * is replaced by 0, so that the result is not valid Shift-JIS.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if((c1 & 1) == 0) {
        /* even lead byte: trail 21..7E moves up by 0x7e */
        c2 = ((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) ? (uint8_t)(c2 + 0x7e) : 0;
    } else {
        /* odd lead byte: round up before halving; trail offset depends on its range */
        ++c1;
        if(c2 > 0x7e) {
            c2 = 0;  /* invalid */
        } else if(c2 > 0x5f) {
            c2 += 0x20;
        } else {
            c2 += 0x1f;
        }
    }

    c1 >>= 1;
    if(c1 > 0x3f) {
        c1 = 0;  /* invalid */
    } else if(c1 > 0x2f) {
        c1 += 0xb0;
    } else {
        c1 += 0x70;
    }

    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1578
1579 /*
1580  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1581  * Katakana.
1582  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1583  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1584  * These were the only fallbacks in ICU's jisx-208.ucm file.
1585  */
1586 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1587     0x2123,  /* U+FF61 */
1588     0x2156,
1589     0x2157,
1590     0x2122,
1591     0x2126,
1592     0x2572,
1593     0x2521,
1594     0x2523,
1595     0x2525,
1596     0x2527,
1597     0x2529,
1598     0x2563,
1599     0x2565,
1600     0x2567,
1601     0x2543,
1602     0x213C,  /* U+FF70 */
1603     0x2522,
1604     0x2524,
1605     0x2526,
1606     0x2528,
1607     0x252A,
1608     0x252B,
1609     0x252D,
1610     0x252F,
1611     0x2531,
1612     0x2533,
1613     0x2535,
1614     0x2537,
1615     0x2539,
1616     0x253B,
1617     0x253D,
1618     0x253F,  /* U+FF80 */
1619     0x2541,
1620     0x2544,
1621     0x2546,
1622     0x2548,
1623     0x254A,
1624     0x254B,
1625     0x254C,
1626     0x254D,
1627     0x254E,
1628     0x254F,
1629     0x2552,
1630     0x2555,
1631     0x2558,
1632     0x255B,
1633     0x255E,
1634     0x255F,  /* U+FF90 */
1635     0x2560,
1636     0x2561,
1637     0x2562,
1638     0x2564,
1639     0x2566,
1640     0x2568,
1641     0x2569,
1642     0x256A,
1643     0x256B,
1644     0x256C,
1645     0x256D,
1646     0x256F,
1647     0x2573,
1648     0x212B,
1649     0x212C   /* U+FF9F */
1650 };
1651
1652 static void
1653 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
         /*
          * Converts UTF-16 text in [args->source, args->sourceLimit) to
          * ISO-2022-JP family bytes in [args->target, args->targetLimit),
          * optionally recording per-byte source indexes in args->offsets.
          *
          * Per code point the function:
          *   1. assembles surrogate pairs (a pending lead surrogate from a
          *      previous call is resumed from cnv->fromUChar32),
          *   2. rejects SO/SI/ESC (IS_2022_CONTROL) with U_ILLEGAL_CHAR_FOUND,
          *   3. tries the charsets listed in choices[] until a roundtrip
          *      mapping is found, remembering at most one fallback mapping,
          *   4. emits SI, a designation escape and/or a shift sequence as
          *      needed, followed by the one or two character bytes.
          *
          * Error codes set here: U_ILLEGAL_CHAR_FOUND (unpaired surrogate or
          * 2022 control), U_INVALID_CHAR_FOUND (no charset maps the code
          * point) and U_BUFFER_OVERFLOW_ERROR (no target space at the start
          * of a character). In the first two cases the offending code point
          * is stored in cnv->fromUChar32.
          *
          * When args->flush is set, all input is consumed and no error is
          * pending, the output is returned to ASCII in G0 (SI and/or the ASCII
          * designation escape). On return args->source and args->target are
          * updated to the positions reached.
          */
1654     UConverter *cnv = args->converter;
1655     UConverterDataISO2022 *converterData;
1656     ISO2022State *pFromU2022State;
1657     uint8_t *target = (uint8_t *) args->target;
1658     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1659     const UChar* source = args->source;
1660     const UChar* sourceLimit = args->sourceLimit;
1661     int32_t* offsets = args->offsets;
1662     UChar32 sourceChar;
         /* staging area for everything emitted for one code point: SI, designation escape, shift, 1..2 bytes */
1663     char buffer[8];
1664     int32_t len, outLen;
         /* ordered list of charsets to try; rebuilt whenever choiceCount is 0 */
1665     int8_t choices[10];
1666     int32_t choiceCount;
1667     uint32_t targetValue = 0;
1668     UBool useFallback;
1669
1670     int32_t i;
1671     int8_t cs, g;
1672
1673     /* set up the state */
1674     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1675     pFromU2022State   = &converterData->fromU2022State;
1676
         /* 0 forces choices[] to be built for the first character */
1677     choiceCount = 0;
1678
1679     /* check if the last codepoint of previous buffer was a lead surrogate*/
         /* jumps into the loop body, so it requires target space just like the loop does */
1680     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1681         goto getTrail;
1682     }
1683
1684     while(source < sourceLimit) {
1685         if(target < targetLimit) {
1686
1687             sourceChar  = *(source++);
1688             /*check if the char is a First surrogate*/
1689             if(U16_IS_SURROGATE(sourceChar)) {
1690                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1691 getTrail:
1692                     /*look ahead to find the trail surrogate*/
1693                     if(source < sourceLimit) {
1694                         /* test the following code unit */
1695                         UChar trail=(UChar) *source;
1696                         if(U16_IS_TRAIL(trail)) {
1697                             source++;
1698                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1699                             cnv->fromUChar32=0x00;
1700                             /* convert this supplementary code point */
1701                             /* exit this condition tree */
1702                         } else {
1703                             /* this is an unmatched lead code unit (1st surrogate) */
1704                             /* callback(illegal) */
1705                             *err=U_ILLEGAL_CHAR_FOUND;
1706                             cnv->fromUChar32=sourceChar;
1707                             break;
1708                         }
1709                     } else {
1710                         /* no more input */
                             /* keep the lead surrogate pending for the next call; no error is set here */
1711                         cnv->fromUChar32=sourceChar;
1712                         break;
1713                     }
1714                 } else {
1715                     /* this is an unmatched trail code unit (2nd surrogate) */
1716                     /* callback(illegal) */
1717                     *err=U_ILLEGAL_CHAR_FOUND;
1718                     cnv->fromUChar32=sourceChar;
1719                     break;
1720                 }
1721             }
1722
1723             /* do not convert SO/SI/ESC */
1724             if(IS_2022_CONTROL(sourceChar)) {
1725                 /* callback(illegal) */
1726                 *err=U_ILLEGAL_CHAR_FOUND;
1727                 cnv->fromUChar32=sourceChar;
1728                 break;
1729             }
1730
1731             /* do the conversion */
1732
                 /* (re)build choices[]: it is invalidated below after a new designation and after CR/LF */
1733             if(choiceCount == 0) {
1734                 uint16_t csm;
1735
1736                 /*
1737                  * The csm variable keeps track of which charsets are allowed
1738                  * and not used yet while building the choices[].
1739                  */
1740                 csm = jpCharsetMasks[converterData->version];
1741                 choiceCount = 0;
1742
1743                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1744                 if(converterData->version == 3 || converterData->version == 4) {
1745                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1746                 }
1747                 /* Do not try single-byte half-width Katakana for other versions. */
1748                 csm &= ~CSM(HWKANA_7BIT);
1749
1750                 /* try the current G0 charset */
1751                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1752                 csm &= ~CSM(cs);
1753
1754                 /* try the current G2 charset */
1755                 if((cs = pFromU2022State->cs[2]) != 0) {
1756                     choices[choiceCount++] = cs;
1757                     csm &= ~CSM(cs);
1758                 }
1759
1760                 /* try all the other possible charsets */
1761                 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1762                     cs = (int8_t)jpCharsetPref[i];
1763                     if(CSM(cs) & csm) {
1764                         choices[choiceCount++] = cs;
1765                         csm &= ~CSM(cs);
1766                     }
1767                 }
1768             }
1769
1770             cs = g = 0;
1771             /*
1772              * len==0: no mapping found yet
1773              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1774              * len>0: found a roundtrip result, done
1775              */
1776             len = 0;
1777             /*
1778              * We will turn off useFallback after finding a fallback,
1779              * but we still get fallbacks from PUA code points as usual.
1780              * Therefore, we will also need to check that we don't overwrite
1781              * an early fallback with a later one.
1782              */
1783             useFallback = cnv->useFallback;
1784
                 /* the loop ends at the first roundtrip (len>0); after a fallback (len<0) it keeps searching */
1785             for(i = 0; i < choiceCount && len <= 0; ++i) {
1786                 uint32_t value;
1787                 int32_t len2;
1788                 int8_t cs0 = choices[i];
1789                 switch(cs0) {
1790                 case ASCII:
1791                     if(sourceChar <= 0x7f) {
1792                         targetValue = (uint32_t)sourceChar;
1793                         len = 1;
1794                         cs = cs0;
1795                         g = 0;
1796                     }
1797                     break;
1798                 case ISO8859_1:
1799                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                             /* output the 7-bit form; the charset is used via G2 */
1800                         targetValue = (uint32_t)sourceChar - 0x80;
1801                         len = 1;
1802                         cs = cs0;
1803                         g = 2;
1804                     }
1805                     break;
1806                 case HWKANA_7BIT:
1807                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1808                         if(converterData->version==3) {
1809                             /* JIS7: use G1 (SO) */
1810                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1811                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1812                             len = 1;
1813                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1814                             g = 1;
1815                         } else if(converterData->version==4) {
1816                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1817                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1818                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1819                             len = 1;
1820
1821                             cs = pFromU2022State->cs[0];
1822                             if(IS_JP_DBCS(cs)) {
1823                                 /* switch from a DBCS charset to JISX201 */
1824                                 cs = (int8_t)JISX201;
1825                             }
1826                             /* else stay in the current G0 charset */
1827                             g = 0;
1828                         }
1829                         /* else do not use HWKANA_7BIT with other versions */
1830                     }
1831                     break;
1832                 case JISX201:
1833                     /* G0 SBCS */
1834                     value = jisx201FromU(sourceChar);
1835                     if(value <= 0x7f) {
1836                         targetValue = value;
1837                         len = 1;
1838                         cs = cs0;
1839                         g = 0;
1840                         useFallback = FALSE;
1841                     }
1842                     break;
1843                 case JISX208:
1844                     /* G0 DBCS from Shift-JIS table */
1845                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1846                                 converterData->myConverterArray[cs0],
1847                                 sourceChar, &value,
1848                                 useFallback, MBCS_OUTPUT_2);
1849                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1850                         value = _2022FromSJIS(value);
1851                         if(value != 0) {
1852                             targetValue = value;
1853                             len = len2;
1854                             cs = cs0;
1855                             g = 0;
1856                             useFallback = FALSE;
1857                         }
1858                     } else if(len == 0 && useFallback &&
1859                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                             /* hardcoded half-width to full-width Katakana fallback, recorded as a fallback (len<0) */
1860                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1861                         len = -2;
1862                         cs = cs0;
1863                         g = 0;
1864                         useFallback = FALSE;
1865                     }
1866                     break;
1867                 case ISO8859_7:
1868                     /* G0 SBCS forced to 7-bit output */
1869                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1870                                 converterData->myConverterArray[cs0],
1871                                 sourceChar, &value,
1872                                 useFallback);
                         /* accept a roundtrip, or a fallback only if none was found earlier; only GR96 values qualify */
1873                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1874                         targetValue = value - 0x80;
1875                         len = len2;
1876                         cs = cs0;
1877                         g = 2;
1878                         useFallback = FALSE;
1879                     }
1880                     break;
1881                 default:
1882                     /* G0 DBCS */
1883                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1884                                 converterData->myConverterArray[cs0],
1885                                 sourceChar, &value,
1886                                 useFallback, MBCS_OUTPUT_2);
1887                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1888                         if(cs0 == KSC5601) {
1889                             /*
1890                              * Check for valid bytes for the encoding scheme.
1891                              * This is necessary because the sub-converter (windows-949)
1892                              * has a broader encoding scheme than is valid for 2022.
1893                              */
1894                             value = _2022FromGR94DBCS(value);
1895                             if(value == 0) {
1896                                 break;
1897                             }
1898                         }
1899                         targetValue = value;
1900                         len = len2;
1901                         cs = cs0;
1902                         g = 0;
1903                         useFallback = FALSE;
1904                     }
1905                     break;
1906                 }
1907             }
1908
1909             if(len != 0) {
1910                 if(len < 0) {
1911                     len = -len;  /* fallback */
1912                 }
1913                 outLen = 0; /* count output bytes */
1914
1915                 /* write SI if necessary (only for JIS7) */
1916                 if(pFromU2022State->g == 1 && g == 0) {
1917                     buffer[outLen++] = UCNV_SI;
1918                     pFromU2022State->g = 0;
1919                 }
1920
1921                 /* write the designation sequence if necessary */
1922                 if(cs != pFromU2022State->cs[g]) {
1923                     int32_t escLen = escSeqCharsLen[cs];
1924                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1925                     outLen += escLen;
1926                     pFromU2022State->cs[g] = cs;
1927
1928                     /* invalidate the choices[] */
1929                     choiceCount = 0;
1930                 }
1931
1932                 /* write the shift sequence if necessary */
1933                 if(g != pFromU2022State->g) {
1934                     switch(g) {
1935                     /* case 0 handled before writing escapes */
1936                     case 1:
1937                         buffer[outLen++] = UCNV_SO;
1938                         pFromU2022State->g = 1;
1939                         break;
1940                     default: /* case 2 */
                             /* ESC N; the stored g is left unchanged here, unlike the SO case above */
1941                         buffer[outLen++] = 0x1b;
1942                         buffer[outLen++] = 0x4e;
1943                         break;
1944                     /* no case 3: no SS3 in ISO-2022-JP-x */
1945                     }
1946                 }
1947
1948                 /* write the output bytes */
1949                 if(len == 1) {
1950                     buffer[outLen++] = (char)targetValue;
1951                 } else /* len == 2 */ {
1952                     buffer[outLen++] = (char)(targetValue >> 8);
1953                     buffer[outLen++] = (char)targetValue;
1954                 }
1955             } else {
1956                 /*
1957                  * if we cannot find the character after checking all codepages
1958                  * then this is an error
1959                  */
1960                 *err = U_INVALID_CHAR_FOUND;
1961                 cnv->fromUChar32=sourceChar;
1962                 break;
1963             }
1964
1965             if(sourceChar == CR || sourceChar == LF) {
1966                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1967                 pFromU2022State->cs[2] = 0;
1968                 choiceCount = 0;
1969             }
1970
1971             /* output outLen>0 bytes in buffer[] */
                 /* one byte always fits because target < targetLimit was checked at the top of this iteration */
1972             if(outLen == 1) {
1973                 *target++ = buffer[0];
1974                 if(offsets) {
1975                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1976                 }
1977             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1978                 *target++ = buffer[0];
1979                 *target++ = buffer[1];
1980                 if(offsets) {
1981                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1982                     *offsets++ = sourceIndex;
1983                     *offsets++ = sourceIndex;
1984                 }
1985             } else {
                     /* longer output, or two bytes that do not fit: delegate to fromUWriteUInt8(), which may set *err */
1986                 fromUWriteUInt8(
1987                     cnv,
1988                     buffer, outLen,
1989                     &target, (const char *)targetLimit,
1990                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1991                     err);
1992                 if(U_FAILURE(*err)) {
1993                     break;
1994                 }
1995             }
1996         } /* end if(target < targetLimit) */
1997         else{
1998             *err =U_BUFFER_OVERFLOW_ERROR;
1999             break;
2000         }
2001
2002     }/* end while(source < sourceLimit) */
2003
2004     /*
2005      * the end of the input stream and detection of truncated input
2006      * are handled by the framework, but for ISO-2022-JP conversion
2007      * we need to be in ASCII mode at the very end
2008      *
2009      * conditions:
2010      *   successful
2011      *   in SO mode or not in ASCII mode
2012      *   end of input and no truncated input
2013      */
2014     if( U_SUCCESS(*err) &&
2015         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2016         args->flush && source>=sourceLimit && cnv->fromUChar32==0
2017     ) {
2018         int32_t sourceIndex;
2019
2020         outLen = 0;
2021
2022         if(pFromU2022State->g != 0) {
2023             buffer[outLen++] = UCNV_SI;
2024             pFromU2022State->g = 0;
2025         }
2026
2027         if(pFromU2022State->cs[0] != ASCII) {
2028             int32_t escLen = escSeqCharsLen[ASCII];
2029             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2030             outLen += escLen;
2031             pFromU2022State->cs[0] = (int8_t)ASCII;
2032         }
2033
2034         /* get the source index of the last input character */
2035         /*
2036          * TODO this would be simpler and more reliable if we used a pair
2037          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2038          * so that we could simply use the prevSourceIndex here;
2039          * this code gives an incorrect result for the rare case of an unmatched
2040          * trail surrogate that is alone in the last buffer of the text stream
2041          */
2042         sourceIndex=(int32_t)(source-args->source);
2043         if(sourceIndex>0) {
2044             --sourceIndex;
                 /* step back once more when the last unit is a trail surrogate (first in the buffer or preceded by a lead) */
2045             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2046                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2047             ) {
2048                 --sourceIndex;
2049             }
2050         } else {
                 /* nothing was consumed from this buffer */
2051             sourceIndex=-1;
2052         }
2053
2054         fromUWriteUInt8(
2055             cnv,
2056             buffer, outLen,
2057             &target, (const char *)targetLimit,
2058             &offsets, sourceIndex,
2059             err);
2060     }
2061
2062     /*save the state and return */
2063     args->source = source;
2064     args->target = (char*)target;
2065 }
2066
2067 /*************** to unicode *******************/
2068
2069 static void
2070 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2071                                                UErrorCode* err){
2072     char tempBuf[2];
2073     const char *mySource = (char *) args->source;
2074     UChar *myTarget = args->target;
2075     const char *mySourceLimit = args->sourceLimit;
2076     uint32_t targetUniChar = 0x0000;
2077     uint32_t mySourceChar = 0x0000;
2078     uint32_t tmpSourceChar = 0x0000;
2079     UConverterDataISO2022* myData;
2080     ISO2022State *pToU2022State;
2081     StateEnum cs;
2082
2083     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2084     pToU2022State = &myData->toU2022State;
2085
2086     if(myData->key != 0) {
2087         /* continue with a partial escape sequence */
2088         goto escape;
2089     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2090         /* continue with a partial double-byte character */
2091         mySourceChar = args->converter->toUBytes[0];
2092         args->converter->toULength = 0;
2093         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2094         targetUniChar = missingCharMarker;
2095         goto getTrailByte;
2096     }
2097
2098     while(mySource < mySourceLimit){
2099
2100         targetUniChar =missingCharMarker;
2101
2102         if(myTarget < args->targetLimit){
2103
2104             mySourceChar= (unsigned char) *mySource++;
2105
2106             switch(mySourceChar) {
2107             case UCNV_SI:
2108                 if(myData->version==3) {
2109                     pToU2022State->g=0;
2110                     continue;
2111                 } else {
2112                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2113                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2114                     break;
2115                 }
2116
2117             case UCNV_SO:
2118                 if(myData->version==3) {
2119                     /* JIS7: switch to G1 half-width Katakana */
2120                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2121                     pToU2022State->g=1;
2122                     continue;
2123                 } else {
2124                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2125                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2126                     break;
2127                 }
2128
2129             case ESC_2022:
2130                 mySource--;
2131 escape:
2132                 {
2133                     const char * mySourceBefore = mySource;
2134                     int8_t toULengthBefore = args->converter->toULength;
2135
2136                     changeState_2022(args->converter,&(mySource),
2137                         mySourceLimit, ISO_2022_JP,err);
2138
2139                     /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2140                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2141                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2142                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2143                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2144                     }
2145                 }
2146
2147                 /* invalid or illegal escape sequence */
2148                 if(U_FAILURE(*err)){
2149                     args->target = myTarget;
2150                     args->source = mySource;
2151                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2152                     return;
2153                 }
2154                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2155                 if(myData->key==0) {
2156                     myData->isEmptySegment = TRUE;
2157                 }
2158                 continue;
2159
2160             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2161
2162             case CR:
2163             case LF:
2164                 /* automatically reset to single-byte mode */
2165                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2166                     pToU2022State->cs[0] = (int8_t)ASCII;
2167                 }
2168                 pToU2022State->cs[2] = 0;
2169                 pToU2022State->g = 0;
2170                 U_FALLTHROUGH;
2171             default:
2172                 /* convert one or two bytes */
2173                 myData->isEmptySegment = FALSE;
2174                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2175                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2176                     !IS_JP_DBCS(cs)
2177                 ) {
2178                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2179                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2180
2181                     /* return from a single-shift state to the previous one */
2182                     if(pToU2022State->g >= 2) {
2183                         pToU2022State->g=pToU2022State->prevG;
2184                     }
2185                 } else switch(cs) {
2186                 case ASCII:
2187                     if(mySourceChar <= 0x7f) {
2188                         targetUniChar = mySourceChar;
2189                     }
2190                     break;
2191                 case ISO8859_1:
2192                     if(mySourceChar <= 0x7f) {
2193                         targetUniChar = mySourceChar + 0x80;
2194                     }
2195                     /* return from a single-shift state to the previous one */
2196                     pToU2022State->g=pToU2022State->prevG;
2197                     break;
2198                 case ISO8859_7:
2199                     if(mySourceChar <= 0x7f) {
2200                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2201                         targetUniChar =
2202                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2203                                 myData->myConverterArray[cs],
2204                                 mySourceChar + 0x80);
2205                     }
2206                     /* return from a single-shift state to the previous one */
2207                     pToU2022State->g=pToU2022State->prevG;
2208                     break;
2209                 case JISX201:
2210                     if(mySourceChar <= 0x7f) {
2211                         targetUniChar = jisx201ToU(mySourceChar);
2212                     }
2213                     break;
2214                 case HWKANA_7BIT:
2215                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2216                         /* 7-bit halfwidth Katakana */
2217                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2218                     }
2219                     break;
2220                 default:
2221                     /* G0 DBCS */
2222                     if(mySource < mySourceLimit) {
2223                         int leadIsOk, trailIsOk;
2224                         uint8_t trailByte;
2225 getTrailByte:
2226                         trailByte = (uint8_t)*mySource;
2227                         /*
2228                          * Ticket 5691: consistent illegal sequences:
2229                          * - We include at least the first byte in the illegal sequence.
2230                          * - If any of the non-initial bytes could be the start of a character,
2231                          *   we stop the illegal sequence before the first one of those.
2232                          *
2233                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2234                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2235                          * Otherwise we convert or report the pair of bytes.
2236                          */
2237                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2238                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2239                         if (leadIsOk && trailIsOk) {
2240                             ++mySource;
2241                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2242                             if(cs == JISX208) {
2243                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2244                                 mySourceChar = tmpSourceChar;
2245                             } else {
2246                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2247                                 mySourceChar = tmpSourceChar;
2248                                 if (cs == KSC5601) {
2249                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2250                                 }
2251                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2252                                 tempBuf[1] = (char)(tmpSourceChar);
2253                             }
2254                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2255                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2256                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2257                             ++mySource;
2258                             /* add another bit so that the code below writes 2 bytes in case of error */
2259                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2260                         }
2261                     } else {
2262                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2263                         args->converter->toULength = 1;
2264                         goto endloop;
2265                     }
2266                 }  /* End of inner switch */
2267                 break;
2268             }  /* End of outer switch */
2269             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2270                 if(args->offsets){
2271                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2272                 }
2273                 *(myTarget++)=(UChar)targetUniChar;
2274             }
2275             else if(targetUniChar > missingCharMarker){
2276                 /* disassemble the surrogate pair and write to output*/
2277                 targetUniChar-=0x0010000;
2278                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2279                 if(args->offsets){
2280                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2281                 }
2282                 ++myTarget;
2283                 if(myTarget< args->targetLimit){
2284                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2285                     if(args->offsets){
2286                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2287                     }
2288                     ++myTarget;
2289                 }else{
2290                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2291                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2292                 }
2293
2294             }
2295             else{
2296                 /* Call the callback function*/
2297                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2298                 break;
2299             }
2300         }
2301         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2302             *err =U_BUFFER_OVERFLOW_ERROR;
2303             break;
2304         }
2305     }
2306 endloop:
2307     args->target = myTarget;
2308     args->source = mySource;
2309 }
2310
2311
2312 #if !UCONFIG_ONLY_HTML_CONVERSION
2313 /***************************************************************
2314 *   Rules for ISO-2022-KR encoding
2315 *   i) The KSC5601 designator sequence should appear only once in a file,
2316 *      at the beginning of a line before any KSC5601 characters. This usually
2317 *      means that it appears by itself on the first line of the file
2318 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
2319 *      and SI to shift into single byte mode
2320 */
2321 static void
2322 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2323
2324     UConverter* saveConv = args->converter;
2325     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2326     args->converter=myConverterData->currentConverter;
2327
2328     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2329     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2330     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2331
2332     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2333         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2334             uprv_memcpy(
2335                 saveConv->charErrorBuffer,
2336                 myConverterData->currentConverter->charErrorBuffer,
2337                 myConverterData->currentConverter->charErrorBufferLength);
2338         }
2339         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2340         myConverterData->currentConverter->charErrorBufferLength = 0;
2341     }
2342     args->converter=saveConv;
2343 }
2344
2345 static void
2346 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2347
2348     const UChar *source = args->source;
2349     const UChar *sourceLimit = args->sourceLimit;
2350     unsigned char *target = (unsigned char *) args->target;
2351     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2352     int32_t* offsets = args->offsets;
2353     uint32_t targetByteUnit = 0x0000;
2354     UChar32 sourceChar = 0x0000;
2355     UBool isTargetByteDBCS;
2356     UBool oldIsTargetByteDBCS;
2357     UConverterDataISO2022 *converterData;
2358     UConverterSharedData* sharedData;
2359     UBool useFallback;
2360     int32_t length =0;
2361
2362     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2363     /* if the version is 1 then the user is requesting
2364      * conversion with ibm-25546 pass the arguments to
2365      * MBCS converter and return
2366      */
2367     if(converterData->version==1){
2368         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2369         return;
2370     }
2371
2372     /* initialize data */
2373     sharedData = converterData->currentConverter->sharedData;
2374     useFallback = args->converter->useFallback;
2375     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2376     oldIsTargetByteDBCS = isTargetByteDBCS;
2377
2378     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2379     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2380         goto getTrail;
2381     }
2382     while(source < sourceLimit){
2383
2384         targetByteUnit = missingCharMarker;
2385
2386         if(target < (unsigned char*) args->targetLimit){
2387             sourceChar = *source++;
2388
2389             /* do not convert SO/SI/ESC */
2390             if(IS_2022_CONTROL(sourceChar)) {
2391                 /* callback(illegal) */
2392                 *err=U_ILLEGAL_CHAR_FOUND;
2393                 args->converter->fromUChar32=sourceChar;
2394                 break;
2395             }
2396
2397             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2398             if(length < 0) {
2399                 length = -length;  /* fallback */
2400             }
2401             /* only DBCS or SBCS characters are expected*/
2402             /* DB characters with high bit set to 1 are expected */
2403             if( length > 2 || length==0 ||
2404                 (length == 1 && targetByteUnit > 0x7f) ||
2405                 (length == 2 &&
2406                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2407                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2408             ) {
2409                 targetByteUnit=missingCharMarker;
2410             }
2411             if (targetByteUnit != missingCharMarker){
2412
2413                 oldIsTargetByteDBCS = isTargetByteDBCS;
2414                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2415                   /* append the shift sequence */
2416                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2417
2418                     if (isTargetByteDBCS)
2419                         *target++ = UCNV_SO;
2420                     else
2421                         *target++ = UCNV_SI;
2422                     if(offsets)
2423                         *(offsets++) = (int32_t)(source - args->source-1);
2424                 }
2425                 /* write the targetUniChar  to target */
2426                 if(targetByteUnit <= 0x00FF){
2427                     if( target < targetLimit){
2428                         *(target++) = (unsigned char) targetByteUnit;
2429                         if(offsets){
2430                             *(offsets++) = (int32_t)(source - args->source-1);
2431                         }
2432
2433                     }else{
2434                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2435                         *err = U_BUFFER_OVERFLOW_ERROR;
2436                     }
2437                 }else{
2438                     if(target < targetLimit){
2439                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2440                         if(offsets){
2441                             *(offsets++) = (int32_t)(source - args->source-1);
2442                         }
2443                         if(target < targetLimit){
2444                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2445                             if(offsets){
2446                                 *(offsets++) = (int32_t)(source - args->source-1);
2447                             }
2448                         }else{
2449                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2450                             *err = U_BUFFER_OVERFLOW_ERROR;
2451                         }
2452                     }else{
2453                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2454                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2455                         *err = U_BUFFER_OVERFLOW_ERROR;
2456                     }
2457                 }
2458
2459             }
2460             else{
2461                 /* oops.. the code point is unassingned
2462                  * set the error and reason
2463                  */
2464
2465                 /*check if the char is a First surrogate*/
2466                 if(U16_IS_SURROGATE(sourceChar)) {
2467                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2468 getTrail:
2469                         /*look ahead to find the trail surrogate*/
2470                         if(source <  sourceLimit) {
2471                             /* test the following code unit */
2472                             UChar trail=(UChar) *source;
2473                             if(U16_IS_TRAIL(trail)) {
2474                                 source++;
2475                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2476                                 *err = U_INVALID_CHAR_FOUND;
2477                                 /* convert this surrogate code point */
2478                                 /* exit this condition tree */
2479                             } else {
2480                                 /* this is an unmatched lead code unit (1st surrogate) */
2481                                 /* callback(illegal) */
2482                                 *err=U_ILLEGAL_CHAR_FOUND;
2483                             }
2484                         } else {
2485                             /* no more input */
2486                             *err = U_ZERO_ERROR;
2487                         }
2488                     } else {
2489                         /* this is an unmatched trail code unit (2nd surrogate) */
2490                         /* callback(illegal) */
2491                         *err=U_ILLEGAL_CHAR_FOUND;
2492                     }
2493                 } else {
2494                     /* callback(unassigned) for a BMP code point */
2495                     *err = U_INVALID_CHAR_FOUND;
2496                 }
2497
2498                 args->converter->fromUChar32=sourceChar;
2499                 break;
2500             }
2501         } /* end if(myTargetIndex<myTargetLength) */
2502         else{
2503             *err =U_BUFFER_OVERFLOW_ERROR;
2504             break;
2505         }
2506
2507     }/* end while(mySourceIndex<mySourceLength) */
2508
2509     /*
2510      * the end of the input stream and detection of truncated input
2511      * are handled by the framework, but for ISO-2022-KR conversion
2512      * we need to be in ASCII mode at the very end
2513      *
2514      * conditions:
2515      *   successful
2516      *   not in ASCII mode
2517      *   end of input and no truncated input
2518      */
2519     if( U_SUCCESS(*err) &&
2520         isTargetByteDBCS &&
2521         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2522     ) {
2523         int32_t sourceIndex;
2524
2525         /* we are switching to ASCII */
2526         isTargetByteDBCS=FALSE;
2527
2528         /* get the source index of the last input character */
2529         /*
2530          * TODO this would be simpler and more reliable if we used a pair
2531          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2532          * so that we could simply use the prevSourceIndex here;
2533          * this code gives an incorrect result for the rare case of an unmatched
2534          * trail surrogate that is alone in the last buffer of the text stream
2535          */
2536         sourceIndex=(int32_t)(source-args->source);
2537         if(sourceIndex>0) {
2538             --sourceIndex;
2539             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2540                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2541             ) {
2542                 --sourceIndex;
2543             }
2544         } else {
2545             sourceIndex=-1;
2546         }
2547
2548         fromUWriteUInt8(
2549             args->converter,
2550             SHIFT_IN_STR, 1,
2551             &target, (const char *)targetLimit,
2552             &offsets, sourceIndex,
2553             err);
2554     }
2555
2556     /*save the state and return */
2557     args->source = source;
2558     args->target = (char*)target;
2559     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2560 }
2561
2562 /************************ To Unicode ***************************************/
2563
2564 static void
2565 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2566                                                             UErrorCode* err){
2567     char const* sourceStart;
2568     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2569
2570     UConverterToUnicodeArgs subArgs;
2571     int32_t minArgsSize;
2572
2573     /* set up the subconverter arguments */
2574     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2575         minArgsSize = args->size;
2576     } else {
2577         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2578     }
2579
2580     uprv_memcpy(&subArgs, args, minArgsSize);
2581     subArgs.size = (uint16_t)minArgsSize;
2582     subArgs.converter = myData->currentConverter;
2583
2584     /* remember the original start of the input for offsets */
2585     sourceStart = args->source;
2586
2587     if(myData->key != 0) {
2588         /* continue with a partial escape sequence */
2589         goto escape;
2590     }
2591
2592     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2593         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2594         subArgs.source = args->source;
2595         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2596         if(subArgs.source != subArgs.sourceLimit) {
2597             /*
2598              * get the current partial byte sequence
2599              *
2600              * it needs to be moved between the public and the subconverter
2601              * so that the conversion framework, which only sees the public
2602              * converter, can handle truncated and illegal input etc.
2603              */
2604             if(args->converter->toULength > 0) {
2605                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2606             }
2607             subArgs.converter->toULength = args->converter->toULength;
2608
2609             /*
2610              * Convert up to the end of the input, or to before the next escape character.
2611              * Does not handle conversion extensions because the preToU[] state etc.
2612              * is not copied.
2613              */
2614             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2615
2616             if(args->offsets != NULL && sourceStart != args->source) {
2617                 /* update offsets to base them on the actual start of the input */
2618                 int32_t *offsets = args->offsets;
2619                 UChar *target = args->target;
2620                 int32_t delta = (int32_t)(args->source - sourceStart);
2621                 while(target < subArgs.target) {
2622                     if(*offsets >= 0) {
2623                         *offsets += delta;
2624                     }
2625                     ++offsets;
2626                     ++target;
2627                 }
2628             }
2629             args->source = subArgs.source;
2630             args->target = subArgs.target;
2631             args->offsets = subArgs.offsets;
2632
2633             /* copy input/error/overflow buffers */
2634             if(subArgs.converter->toULength > 0) {
2635                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2636             }
2637             args->converter->toULength = subArgs.converter->toULength;
2638
2639             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2640                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2641                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2642                                 subArgs.converter->UCharErrorBufferLength);
2643                 }
2644                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2645                 subArgs.converter->UCharErrorBufferLength = 0;
2646             }
2647         }
2648
2649         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2650             return;
2651         }
2652
2653 escape:
2654         changeState_2022(args->converter,
2655                &(args->source),
2656                args->sourceLimit,
2657                ISO_2022_KR,
2658                err);
2659     }
2660 }
2661
2662 static void
2663 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2664                                                             UErrorCode* err){
2665     char tempBuf[2];
2666     const char *mySource = ( char *) args->source;
2667     UChar *myTarget = args->target;
2668     const char *mySourceLimit = args->sourceLimit;
2669     UChar32 targetUniChar = 0x0000;
2670     UChar mySourceChar = 0x0000;
2671     UConverterDataISO2022* myData;
2672     UConverterSharedData* sharedData ;
2673     UBool useFallback;
2674
2675     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2676     if(myData->version==1){
2677         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2678         return;
2679     }
2680
2681     /* initialize state */
2682     sharedData = myData->currentConverter->sharedData;
2683     useFallback = args->converter->useFallback;
2684
2685     if(myData->key != 0) {
2686         /* continue with a partial escape sequence */
2687         goto escape;
2688     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2689         /* continue with a partial double-byte character */
2690         mySourceChar = args->converter->toUBytes[0];
2691         args->converter->toULength = 0;
2692         goto getTrailByte;
2693     }
2694
2695     while(mySource< mySourceLimit){
2696
2697         if(myTarget < args->targetLimit){
2698
2699             mySourceChar= (unsigned char) *mySource++;
2700
2701             if(mySourceChar==UCNV_SI){
2702                 myData->toU2022State.g = 0;
2703                 if (myData->isEmptySegment) {
2704                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2705                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2706                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2707                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2708                     args->converter->toULength = 1;
2709                     args->target = myTarget;
2710                     args->source = mySource;
2711                     return;
2712                 }
2713                 /*consume the source */
2714                 continue;
2715             }else if(mySourceChar==UCNV_SO){
2716                 myData->toU2022State.g = 1;
2717                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2718                 /*consume the source */
2719                 continue;
2720             }else if(mySourceChar==ESC_2022){
2721                 mySource--;
2722 escape:
2723                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2724                 changeState_2022(args->converter,&(mySource),
2725                                 mySourceLimit, ISO_2022_KR, err);
2726                 if(U_FAILURE(*err)){
2727                     args->target = myTarget;
2728                     args->source = mySource;
2729                     return;
2730                 }
2731                 continue;
2732             }
2733
2734             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2735             if(myData->toU2022State.g == 1) {
2736                 if(mySource < mySourceLimit) {
2737                     int leadIsOk, trailIsOk;
2738                     uint8_t trailByte;
2739 getTrailByte:
2740                     targetUniChar = missingCharMarker;
2741                     trailByte = (uint8_t)*mySource;
2742                     /*
2743                      * Ticket 5691: consistent illegal sequences:
2744                      * - We include at least the first byte in the illegal sequence.
2745                      * - If any of the non-initial bytes could be the start of a character,
2746                      *   we stop the illegal sequence before the first one of those.
2747                      *
2748                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2749                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2750                      * Otherwise we convert or report the pair of bytes.
2751                      */
2752                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2753                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2754                     if (leadIsOk && trailIsOk) {
2755                         ++mySource;
2756                         tempBuf[0] = (char)(mySourceChar + 0x80);
2757                         tempBuf[1] = (char)(trailByte + 0x80);
2758                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2759                         mySourceChar = (mySourceChar << 8) | trailByte;
2760                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2761                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2762                         ++mySource;
2763                         /* add another bit so that the code below writes 2 bytes in case of error */
2764                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2765                     }
2766                 } else {
2767                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2768                     args->converter->toULength = 1;
2769                     break;
2770                 }
2771             }
2772             else if(mySourceChar <= 0x7f) {
2773                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2774             } else {
2775                 targetUniChar = 0xffff;
2776             }
2777             if(targetUniChar < 0xfffe){
2778                 if(args->offsets) {
2779                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2780                 }
2781                 *(myTarget++)=(UChar)targetUniChar;
2782             }
2783             else {
2784                 /* Call the callback function*/
2785                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2786                 break;
2787             }
2788         }
2789         else{
2790             *err =U_BUFFER_OVERFLOW_ERROR;
2791             break;
2792         }
2793     }
2794     args->target = myTarget;
2795     args->source = mySource;
2796 }
2797
2798 /*************************** END ISO2022-KR *********************************/
2799
2800 /*************************** ISO-2022-CN *********************************
2801 *
2802 * Rules for ISO-2022-CN Encoding:
2803 * i)   The designator sequence must appear once on a line before any instance
2804 *      of character set it designates.
2805 * ii)  If two lines contain characters from the same character set, both lines
2806 *      must include the designator sequence.
2807 * iii) Once the designator sequence is known, a shifting sequence has to be found
2808 *      to invoke the designated character set
2809 * iv)  All lines start in ASCII and end in ASCII.
2810 * v)   Four shifting sequences are employed for this purpose:
2811 *
2812 *      Sequence    ASCII Eq    Charsets
2813 *      ----------  -------    ---------
2814 *      SI           <SI>        US-ASCII
2815 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2816 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2817 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2818 *
2819 * vi)
2820 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2821 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2822 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2823 *
2824 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2825 *       characters as defined in GB 2312-80, until
2826 *       another SOdesignation appears
2827 *
2828 *
2829 *      ESC $ ) E       Indicates the bytes following SO are as defined
2830 *       in ISO-IR-165 (for details, see section 2.1),
2831 *       until another SOdesignation appears
2832 *
2833 *      ESC $ ) G       Indicates the bytes following SO are as defined
2834 *       in CNS 11643-plane-1, until another
2835 *       SOdesignation appears
2836 *
2837 *      ESC $ * H       Indicates the two bytes immediately following
2838 *       SS2 is a Chinese character as defined in CNS
2839 *       11643-plane-2, until another SS2designation
2840 *       appears
2841 *       (Meaning <ESC>N must precede every 2 byte
2842 *        sequence.)
2843 *
2844 *      ESC $ + I       Indicates the immediate two bytes following SS3
2845 *       is a Chinese character as defined in CNS
2846 *       11643-plane-3, until another SS3designation
2847 *       appears
2848 *       (Meaning <ESC>O must precede every 2 byte
2849 *        sequence.)
2850 *
2851 *      ESC $ + J       Indicates the immediate two bytes following SS3
2852 *       is a Chinese character as defined in CNS
2853 *       11643-plane-4, until another SS3designation
2854 *       appears
2855 *       (In English: <ESC>O must precede every 2 byte
2856 *        sequence.)
2857 *
2858 *      ESC $ + K       Indicates the immediate two bytes following SS3
2859 *       is a Chinese character as defined in CNS
2860 *       11643-plane-5, until another SS3designation
2861 *       appears
2862 *
2863 *      ESC $ + L       Indicates the immediate two bytes following SS3
2864 *       is a Chinese character as defined in CNS
2865 *       11643-plane-6, until another SS3designation
2866 *       appears
2867 *
2868 *      ESC $ + M       Indicates the immediate two bytes following SS3
2869 *       is a Chinese character as defined in CNS
2870 *       11643-plane-7, until another SS3designation
2871 *       appears
2872 *
2873 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2874 *       has its own designation information before any Chinese characters
2875 *       appear
2876 *
2877 */
2878
2879 /* The following are defined this way to make the strings truly readonly */
2880 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2881 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2882 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2883 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2884 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2885 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2886 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2887 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2888 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2889
2890 /********************** ISO2022-CN Data **************************/
2891 static const char* const escSeqCharsCN[10] ={
2892         SHIFT_IN_STR,                   /* 0 ASCII */
2893         GB_2312_80_STR,                 /* 1 GB2312_1 */
2894         ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2895         CNS_11643_1992_Plane_1_STR,
2896         CNS_11643_1992_Plane_2_STR,
2897         CNS_11643_1992_Plane_3_STR,
2898         CNS_11643_1992_Plane_4_STR,
2899         CNS_11643_1992_Plane_5_STR,
2900         CNS_11643_1992_Plane_6_STR,
2901         CNS_11643_1992_Plane_7_STR
2902 };
2903
2904 static void
2905 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2906     UConverter *cnv = args->converter;
2907     UConverterDataISO2022 *converterData;
2908     ISO2022State *pFromU2022State;
2909     uint8_t *target = (uint8_t *) args->target;
2910     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2911     const UChar* source = args->source;
2912     const UChar* sourceLimit = args->sourceLimit;
2913     int32_t* offsets = args->offsets;
2914     UChar32 sourceChar;
2915     char buffer[8];
2916     int32_t len;
2917     int8_t choices[3];
2918     int32_t choiceCount;
2919     uint32_t targetValue = 0;
2920     UBool useFallback;
2921
2922     /* set up the state */
2923     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2924     pFromU2022State   = &converterData->fromU2022State;
2925
2926     choiceCount = 0;
2927
2928     /* check if the last codepoint of previous buffer was a lead surrogate*/
2929     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2930         goto getTrail;
2931     }
2932
2933     while( source < sourceLimit){
2934         if(target < targetLimit){
2935
2936             sourceChar  = *(source++);
2937             /*check if the char is a First surrogate*/
2938              if(U16_IS_SURROGATE(sourceChar)) {
2939                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2940 getTrail:
2941                     /*look ahead to find the trail surrogate*/
2942                     if(source < sourceLimit) {
2943                         /* test the following code unit */
2944                         UChar trail=(UChar) *source;
2945                         if(U16_IS_TRAIL(trail)) {
2946                             source++;
2947                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2948                             cnv->fromUChar32=0x00;
2949                             /* convert this supplementary code point */
2950                             /* exit this condition tree */
2951                         } else {
2952                             /* this is an unmatched lead code unit (1st surrogate) */
2953                             /* callback(illegal) */
2954                             *err=U_ILLEGAL_CHAR_FOUND;
2955                             cnv->fromUChar32=sourceChar;
2956                             break;
2957                         }
2958                     } else {
2959                         /* no more input */
2960                         cnv->fromUChar32=sourceChar;
2961                         break;
2962                     }
2963                 } else {
2964                     /* this is an unmatched trail code unit (2nd surrogate) */
2965                     /* callback(illegal) */
2966                     *err=U_ILLEGAL_CHAR_FOUND;
2967                     cnv->fromUChar32=sourceChar;
2968                     break;
2969                 }
2970             }
2971
2972             /* do the conversion */
2973             if(sourceChar <= 0x007f ){
2974                 /* do not convert SO/SI/ESC */
2975                 if(IS_2022_CONTROL(sourceChar)) {
2976                     /* callback(illegal) */
2977                     *err=U_ILLEGAL_CHAR_FOUND;
2978                     cnv->fromUChar32=sourceChar;
2979                     break;
2980                 }
2981
2982                 /* US-ASCII */
2983                 if(pFromU2022State->g == 0) {
2984                     buffer[0] = (char)sourceChar;
2985                     len = 1;
2986                 } else {
2987                     buffer[0] = UCNV_SI;
2988                     buffer[1] = (char)sourceChar;
2989                     len = 2;
2990                     pFromU2022State->g = 0;
2991                     choiceCount = 0;
2992                 }
2993                 if(sourceChar == CR || sourceChar == LF) {
2994                     /* reset the state at the end of a line */
2995                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2996                     choiceCount = 0;
2997                 }
2998             }
2999             else{
3000                 /* convert U+0080..U+10ffff */
3001                 int32_t i;
3002                 int8_t cs, g;
3003
3004                 if(choiceCount == 0) {
3005                     /* try the current SO/G1 converter first */
3006                     choices[0] = pFromU2022State->cs[1];
3007
3008                     /* default to GB2312_1 if none is designated yet */
3009                     if(choices[0] == 0) {
3010                         choices[0] = GB2312_1;
3011                     }
3012
3013                     if(converterData->version == 0) {
3014                         /* ISO-2022-CN */
3015
3016                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3017                         if(choices[0] == GB2312_1) {
3018                             choices[1] = (int8_t)CNS_11643_1;
3019                         } else {
3020                             choices[1] = (int8_t)GB2312_1;
3021                         }
3022
3023                         choiceCount = 2;
3024                     } else if (converterData->version == 1) {
3025                         /* ISO-2022-CN-EXT */
3026
3027                         /* try one of the other converters */
3028                         switch(choices[0]) {
3029                         case GB2312_1:
3030                             choices[1] = (int8_t)CNS_11643_1;
3031                             choices[2] = (int8_t)ISO_IR_165;
3032                             break;
3033                         case ISO_IR_165:
3034                             choices[1] = (int8_t)GB2312_1;
3035                             choices[2] = (int8_t)CNS_11643_1;
3036                             break;
3037                         default: /* CNS_11643_x */
3038                             choices[1] = (int8_t)GB2312_1;
3039                             choices[2] = (int8_t)ISO_IR_165;
3040                             break;
3041                         }
3042
3043                         choiceCount = 3;
3044                     } else {
3045                         choices[0] = (int8_t)CNS_11643_1;
3046                         choices[1] = (int8_t)GB2312_1;
3047                     }
3048                 }
3049
3050                 cs = g = 0;
3051                 /*
3052                  * len==0: no mapping found yet
3053                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3054                  * len>0: found a roundtrip result, done
3055                  */
3056                 len = 0;
3057                 /*
3058                  * We will turn off useFallback after finding a fallback,
3059                  * but we still get fallbacks from PUA code points as usual.
3060                  * Therefore, we will also need to check that we don't overwrite
3061                  * an early fallback with a later one.
3062                  */
3063                 useFallback = cnv->useFallback;
3064
3065                 for(i = 0; i < choiceCount && len <= 0; ++i) {
3066                     int8_t cs0 = choices[i];
3067                     if(cs0 > 0) {
3068                         uint32_t value;
3069                         int32_t len2;
3070                         if(cs0 >= CNS_11643_0) {
3071                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3072                                         converterData->myConverterArray[CNS_11643],
3073                                         sourceChar,
3074                                         &value,
3075                                         useFallback,
3076                                         MBCS_OUTPUT_3);
3077                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3078                                 targetValue = value;
3079                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3080                                 if(len2 >= 0) {
3081                                     len = 2;
3082                                 } else {
3083                                     len = -2;
3084                                     useFallback = FALSE;
3085                                 }
3086                                 if(cs == CNS_11643_1) {
3087                                     g = 1;
3088                                 } else if(cs == CNS_11643_2) {
3089                                     g = 2;
3090                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3091                                     g = 3;
3092                                 } else {
3093                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3094                                     len = 0;
3095                                 }
3096                             }
3097                         } else {
3098                             /* GB2312_1 or ISO-IR-165 */
3099                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3100                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3101                                         converterData->myConverterArray[cs0],
3102                                         sourceChar,
3103                                         &value,
3104                                         useFallback,
3105                                         MBCS_OUTPUT_2);
3106                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3107                                 targetValue = value;
3108                                 len = len2;
3109                                 cs = cs0;
3110                                 g = 1;
3111                                 useFallback = FALSE;
3112                             }
3113                         }
3114                     }
3115                 }
3116
3117                 if(len != 0) {
3118                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3119
3120                     /* write the designation sequence if necessary */
3121                     if(cs != pFromU2022State->cs[g]) {
3122                         if(cs < CNS_11643) {
3123                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3124                         } else {
3125                             U_ASSERT(cs >= CNS_11643_1);
3126                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3127                         }
3128                         len = 4;
3129                         pFromU2022State->cs[g] = cs;
3130                         if(g == 1) {
3131                             /* changing the SO/G1 charset invalidates the choices[] */
3132                             choiceCount = 0;
3133                         }
3134                     }
3135
3136                     /* write the shift sequence if necessary */
3137                     if(g != pFromU2022State->g) {
3138                         switch(g) {
3139                         case 1:
3140                             buffer[len++] = UCNV_SO;
3141
3142                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3143                             pFromU2022State->g = 1;
3144                             break;
3145                         case 2:
3146                             buffer[len++] = 0x1b;
3147                             buffer[len++] = 0x4e;
3148                             break;
3149                         default: /* case 3 */
3150                             buffer[len++] = 0x1b;
3151                             buffer[len++] = 0x4f;
3152                             break;
3153                         }
3154                     }
3155
3156                     /* write the two output bytes */
3157                     buffer[len++] = (char)(targetValue >> 8);
3158                     buffer[len++] = (char)targetValue;
3159                 } else {
3160                     /* if we cannot find the character after checking all codepages
3161                      * then this is an error
3162                      */
3163                     *err = U_INVALID_CHAR_FOUND;
3164                     cnv->fromUChar32=sourceChar;
3165                     break;
3166                 }
3167             }
3168
3169             /* output len>0 bytes in buffer[] */
3170             if(len == 1) {
3171                 *target++ = buffer[0];
3172                 if(offsets) {
3173                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3174                 }
3175             } else if(len == 2 && (target + 2) <= targetLimit) {
3176                 *target++ = buffer[0];
3177                 *target++ = buffer[1];
3178                 if(offsets) {
3179                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3180                     *offsets++ = sourceIndex;
3181                     *offsets++ = sourceIndex;
3182                 }
3183             } else {
3184                 fromUWriteUInt8(
3185                     cnv,
3186                     buffer, len,
3187                     &target, (const char *)targetLimit,
3188                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3189                     err);
3190                 if(U_FAILURE(*err)) {
3191                     break;
3192                 }
3193             }
3194         } /* end if(myTargetIndex<myTargetLength) */
3195         else{
3196             *err =U_BUFFER_OVERFLOW_ERROR;
3197             break;
3198         }
3199
3200     }/* end while(mySourceIndex<mySourceLength) */
3201
3202     /*
3203      * the end of the input stream and detection of truncated input
3204      * are handled by the framework, but for ISO-2022-CN conversion
3205      * we need to be in ASCII mode at the very end
3206      *
3207      * conditions:
3208      *   successful
3209      *   not in ASCII mode
3210      *   end of input and no truncated input
3211      */
3212     if( U_SUCCESS(*err) &&
3213         pFromU2022State->g!=0 &&
3214         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3215     ) {
3216         int32_t sourceIndex;
3217
3218         /* we are switching to ASCII */
3219         pFromU2022State->g=0;
3220
3221         /* get the source index of the last input character */
3222         /*
3223          * TODO this would be simpler and more reliable if we used a pair
3224          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3225          * so that we could simply use the prevSourceIndex here;
3226          * this code gives an incorrect result for the rare case of an unmatched
3227          * trail surrogate that is alone in the last buffer of the text stream
3228          */
3229         sourceIndex=(int32_t)(source-args->source);
3230         if(sourceIndex>0) {
3231             --sourceIndex;
3232             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3233                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3234             ) {
3235                 --sourceIndex;
3236             }
3237         } else {
3238             sourceIndex=-1;
3239         }
3240
3241         fromUWriteUInt8(
3242             cnv,
3243             SHIFT_IN_STR, 1,
3244             &target, (const char *)targetLimit,
3245             &offsets, sourceIndex,
3246             err);
3247     }
3248
3249     /*save the state and return */
3250     args->source = source;
3251     args->target = (char*)target;
3252 }
3253
3254
3255 static void
3256 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3257                                                UErrorCode* err){
3258     char tempBuf[3];
3259     const char *mySource = (char *) args->source;
3260     UChar *myTarget = args->target;
3261     const char *mySourceLimit = args->sourceLimit;
3262     uint32_t targetUniChar = 0x0000;
3263     uint32_t mySourceChar = 0x0000;
3264     UConverterDataISO2022* myData;
3265     ISO2022State *pToU2022State;
3266
3267     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3268     pToU2022State = &myData->toU2022State;
3269
3270     if(myData->key != 0) {
3271         /* continue with a partial escape sequence */
3272         goto escape;
3273     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3274         /* continue with a partial double-byte character */
3275         mySourceChar = args->converter->toUBytes[0];
3276         args->converter->toULength = 0;
3277         targetUniChar = missingCharMarker;
3278         goto getTrailByte;
3279     }
3280
3281     while(mySource < mySourceLimit){
3282
3283         targetUniChar =missingCharMarker;
3284
3285         if(myTarget < args->targetLimit){
3286
3287             mySourceChar= (unsigned char) *mySource++;
3288
3289             switch(mySourceChar){
3290             case UCNV_SI:
3291                 pToU2022State->g=0;
3292                 if (myData->isEmptySegment) {
3293                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
3294                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3295                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3296                     args->converter->toUBytes[0] = mySourceChar;
3297                     args->converter->toULength = 1;
3298                     args->target = myTarget;
3299                     args->source = mySource;
3300                     return;
3301                 }
3302                 continue;
3303
3304             case UCNV_SO:
3305                 if(pToU2022State->cs[1] != 0) {
3306                     pToU2022State->g=1;
3307                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
3308                     continue;
3309                 } else {
3310                     /* illegal to have SO before a matching designator */
3311                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
3312                     break;
3313                 }
3314
3315             case ESC_2022:
3316                 mySource--;
3317 escape:
3318                 {
3319                     const char * mySourceBefore = mySource;
3320                     int8_t toULengthBefore = args->converter->toULength;
3321
3322                     changeState_2022(args->converter,&(mySource),
3323                         mySourceLimit, ISO_2022_CN,err);
3324
3325                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3326                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3327                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3328                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
3329                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3330                     }
3331                 }
3332
3333                 /* invalid or illegal escape sequence */
3334                 if(U_FAILURE(*err)){
3335                     args->target = myTarget;
3336                     args->source = mySource;
3337                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
3338                     return;
3339                 }
3340                 continue;
3341
3342             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3343
3344             case CR:
3345             case LF:
3346                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3347                 U_FALLTHROUGH;
3348             default:
3349                 /* convert one or two bytes */
3350                 myData->isEmptySegment = FALSE;
3351                 if(pToU2022State->g != 0) {
3352                     if(mySource < mySourceLimit) {
3353                         UConverterSharedData *cnv;
3354                         StateEnum tempState;
3355                         int32_t tempBufLen;
3356                         int leadIsOk, trailIsOk;
3357                         uint8_t trailByte;
3358 getTrailByte:
3359                         trailByte = (uint8_t)*mySource;
3360                         /*
3361                          * Ticket 5691: consistent illegal sequences:
3362                          * - We include at least the first byte in the illegal sequence.
3363                          * - If any of the non-initial bytes could be the start of a character,
3364                          *   we stop the illegal sequence before the first one of those.
3365                          *
3366                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3367                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3368                          * Otherwise we convert or report the pair of bytes.
3369                          */
3370                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3371                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3372                         if (leadIsOk && trailIsOk) {
3373                             ++mySource;
3374                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3375                             if(tempState >= CNS_11643_0) {
3376                                 cnv = myData->myConverterArray[CNS_11643];
3377                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3378                                 tempBuf[1] = (char) (mySourceChar);
3379                                 tempBuf[2] = (char) trailByte;
3380                                 tempBufLen = 3;
3381
3382                             }else{
3383                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3384                                 cnv = myData->myConverterArray[tempState];
3385                                 tempBuf[0] = (char) (mySourceChar);
3386                                 tempBuf[1] = (char) trailByte;
3387                                 tempBufLen = 2;
3388                             }
3389                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3390                             mySourceChar = (mySourceChar << 8) | trailByte;
3391                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3392                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3393                             ++mySource;
3394                             /* add another bit so that the code below writes 2 bytes in case of error */
3395                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3396                         }
3397                         if(pToU2022State->g>=2) {
3398                             /* return from a single-shift state to the previous one */
3399                             pToU2022State->g=pToU2022State->prevG;
3400                         }
3401                     } else {
3402                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3403                         args->converter->toULength = 1;
3404                         goto endloop;
3405                     }
3406                 }
3407                 else{
3408                     if(mySourceChar <= 0x7f) {
3409                         targetUniChar = (UChar) mySourceChar;
3410                     }
3411                 }
3412                 break;
3413             }
3414             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3415                 if(args->offsets){
3416                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3417                 }
3418                 *(myTarget++)=(UChar)targetUniChar;
3419             }
3420             else if(targetUniChar > missingCharMarker){
3421                 /* disassemble the surrogate pair and write to output*/
3422                 targetUniChar-=0x0010000;
3423                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3424                 if(args->offsets){
3425                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3426                 }
3427                 ++myTarget;
3428                 if(myTarget< args->targetLimit){
3429                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3430                     if(args->offsets){
3431                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3432                     }
3433                     ++myTarget;
3434                 }else{
3435                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3436                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3437                 }
3438
3439             }
3440             else{
3441                 /* Call the callback function*/
3442                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3443                 break;
3444             }
3445         }
3446         else{
3447             *err =U_BUFFER_OVERFLOW_ERROR;
3448             break;
3449         }
3450     }
3451 endloop:
3452     args->target = myTarget;
3453     args->source = mySource;
3454 }
3455 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3456
3457 static void
3458 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3459     UConverter *cnv = args->converter;
3460     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3461     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3462     char *p, *subchar;
3463     char buffer[8];
3464     int32_t length;
3465
3466     subchar=(char *)cnv->subChars;
3467     length=cnv->subCharLen; /* assume length==1 for most variants */
3468
3469     p = buffer;
3470     switch(myConverterData->locale[0]){
3471     case 'j':
3472         {
3473             int8_t cs;
3474
3475             if(pFromU2022State->g == 1) {
3476                 /* JIS7: switch from G1 to G0 */
3477                 pFromU2022State->g = 0;
3478                 *p++ = UCNV_SI;
3479             }
3480
3481             cs = pFromU2022State->cs[0];
3482             if(cs != ASCII && cs != JISX201) {
3483                 /* not in ASCII or JIS X 0201: switch to ASCII */
3484                 pFromU2022State->cs[0] = (int8_t)ASCII;
3485                 *p++ = '\x1b';
3486                 *p++ = '\x28';
3487                 *p++ = '\x42';
3488             }
3489
3490             *p++ = subchar[0];
3491             break;
3492         }
3493     case 'c':
3494         if(pFromU2022State->g != 0) {
3495             /* not in ASCII mode: switch to ASCII */
3496             pFromU2022State->g = 0;
3497             *p++ = UCNV_SI;
3498         }
3499         *p++ = subchar[0];
3500         break;
3501     case 'k':
3502         if(myConverterData->version == 0) {
3503             if(length == 1) {
3504                 if((UBool)args->converter->fromUnicodeStatus) {
3505                     /* in DBCS mode: switch to SBCS */
3506                     args->converter->fromUnicodeStatus = 0;
3507                     *p++ = UCNV_SI;
3508                 }
3509                 *p++ = subchar[0];
3510             } else /* length == 2*/ {
3511                 if(!(UBool)args->converter->fromUnicodeStatus) {
3512                     /* in SBCS mode: switch to DBCS */
3513                     args->converter->fromUnicodeStatus = 1;
3514                     *p++ = UCNV_SO;
3515                 }
3516                 *p++ = subchar[0];
3517                 *p++ = subchar[1];
3518             }
3519             break;
3520         } else {
3521             /* save the subconverter's substitution string */
3522             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3523             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3524
3525             /* set our substitution string into the subconverter */
3526             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3527             myConverterData->currentConverter->subCharLen = (int8_t)length;
3528
3529             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3530             args->converter = myConverterData->currentConverter;
3531             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3532             ucnv_cbFromUWriteSub(args, 0, err);
3533             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3534             args->converter = cnv;
3535
3536             /* restore the subconverter's substitution string */
3537             myConverterData->currentConverter->subChars = currentSubChars;
3538             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3539
3540             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3541                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3542                     uprv_memcpy(
3543                         cnv->charErrorBuffer,
3544                         myConverterData->currentConverter->charErrorBuffer,
3545                         myConverterData->currentConverter->charErrorBufferLength);
3546                 }
3547                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3548                 myConverterData->currentConverter->charErrorBufferLength = 0;
3549             }
3550             return;
3551         }
3552     default:
3553         /* not expected */
3554         break;
3555     }
3556     ucnv_cbFromUWriteBytes(args,
3557                            buffer, (int32_t)(p - buffer),
3558                            offsetIndex, err);
3559 }
3560
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 *
 * The member order matters: _ISO_2022_SafeClone() passes the address of
 * currentConverter together with a size of
 * sizeof(UConverter) + sizeof(UAlignedMemory) to the nested ucnv_safeClone(),
 * so deadSpace has to follow currentConverter directly.
 */
struct cloneStruct
{
    /* the cloned ISO 2022 converter; _ISO_2022_SafeClone() returns the address of this member */
    UConverter cnv;
    /* storage offered to ucnv_safeClone() for the clone of the current sub-converter */
    UConverter currentConverter;
    /* padding after currentConverter, counted in the size given to the nested clone */
    UAlignedMemory deadSpace;
    /* copy of the source converter's UConverterDataISO2022; cnv.extraInfo is pointed here */
    UConverterDataISO2022 mydata;
};
3579
3580
3581 static UConverter *
3582 _ISO_2022_SafeClone(
3583             const UConverter *cnv,
3584             void *stackBuffer,
3585             int32_t *pBufferSize,
3586             UErrorCode *status)
3587 {
3588     struct cloneStruct * localClone;
3589     UConverterDataISO2022 *cnvData;
3590     int32_t i, size;
3591
3592     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3593         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3594         return NULL;
3595     }
3596
3597     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3598     localClone = (struct cloneStruct *)stackBuffer;
3599
3600     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3601
3602     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3603     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3604     localClone->cnv.isExtraLocal = TRUE;
3605
3606     /* share the subconverters */
3607
3608     if(cnvData->currentConverter != NULL) {
3609         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3610         localClone->mydata.currentConverter =
3611             ucnv_safeClone(cnvData->currentConverter,
3612                             &localClone->currentConverter,
3613                             &size, status);
3614         if(U_FAILURE(*status)) {
3615             return NULL;
3616         }
3617     }
3618
3619     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3620         if(cnvData->myConverterArray[i] != NULL) {
3621             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3622         }
3623     }
3624
3625     return &localClone->cnv;
3626 }
3627
3628 static void
3629 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3630                     const USetAdder *sa,
3631                     UConverterUnicodeSet which,
3632                     UErrorCode *pErrorCode)
3633 {
3634     int32_t i;
3635     UConverterDataISO2022* cnvData;
3636
3637     if (U_FAILURE(*pErrorCode)) {
3638         return;
3639     }
3640 #ifdef U_ENABLE_GENERIC_ISO_2022
3641     if (cnv->sharedData == &_ISO2022Data) {
3642         /* We use UTF-8 in this case */
3643         sa->addRange(sa->set, 0, 0xd7FF);
3644         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3645         return;
3646     }
3647 #endif
3648
3649     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3650
3651     /* open a set and initialize it with code points that are algorithmically round-tripped */
3652     switch(cnvData->locale[0]){
3653     case 'j':
3654         /* include JIS X 0201 which is hardcoded */
3655         sa->add(sa->set, 0xa5);
3656         sa->add(sa->set, 0x203e);
3657         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3658             /* include Latin-1 for some variants of JP */
3659             sa->addRange(sa->set, 0, 0xff);
3660         } else {
3661             /* include ASCII for JP */
3662             sa->addRange(sa->set, 0, 0x7f);
3663         }
3664         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3665             /*
3666              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3667              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3668              * use half-width Katakana.
3669              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3670              * half-width Katakana via the ESC ( I sequence.
3671              * However, we only emit (fromUnicode) half-width Katakana according to the
3672              * definition of each variant.
3673              *
3674              * When including fallbacks,
3675              * we need to include half-width Katakana Unicode code points for all JP variants because
3676              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3677              */
3678             /* include half-width Katakana for JP */
3679             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3680         }
3681         break;
3682 #if !UCONFIG_ONLY_HTML_CONVERSION
3683     case 'c':
3684     case 'z':
3685         /* include ASCII for CN */
3686         sa->addRange(sa->set, 0, 0x7f);
3687         break;
3688     case 'k':
3689         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3690         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3691                 cnvData->currentConverter, sa, which, pErrorCode);
3692         /* the loop over myConverterArray[] will simply not find another converter */
3693         break;
3694 #endif
3695     default:
3696         break;
3697     }
3698
3699 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3700             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3701                 cnvData->version==0 && i==CNS_11643
3702             ) {
3703                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3704                 ucnv_MBCSGetUnicodeSetForBytes(
3705                         cnvData->myConverterArray[i],
3706                         sa, UCNV_ROUNDTRIP_SET,
3707                         0, 0x81, 0x82,
3708                         pErrorCode);
3709             }
3710 #endif
3711
3712     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3713         UConverterSetFilter filter;
3714         if(cnvData->myConverterArray[i]!=NULL) {
3715             if(cnvData->locale[0]=='j' && i==JISX208) {
3716                 /*
3717                  * Only add code points that map to Shift-JIS codes
3718                  * corresponding to JIS X 0208.
3719                  */
3720                 filter=UCNV_SET_FILTER_SJIS;
3721 #if !UCONFIG_ONLY_HTML_CONVERSION
3722             } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3723                        cnvData->version==0 && i==CNS_11643) {
3724                 /*
3725                  * Version-specific for CN:
3726                  * CN version 0 does not map CNS planes 3..7 although
3727                  * they are all available in the CNS conversion table;
3728                  * CN version 1 (-EXT) does map them all.
3729                  * The two versions create different Unicode sets.
3730                  */
3731                 filter=UCNV_SET_FILTER_2022_CN;
3732             } else if(i==KSC5601) {
3733                 /*
3734                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3735                  * are broader than GR94.
3736                  */
3737                 filter=UCNV_SET_FILTER_GR94DBCS;
3738 #endif
3739             } else {
3740                 filter=UCNV_SET_FILTER_NONE;
3741             }
3742             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3743         }
3744     }
3745
3746     /*
3747      * ISO 2022 converters must not convert SO/SI/ESC despite what
3748      * sub-converters do by themselves.
3749      * Remove these characters from the set.
3750      */
3751     sa->remove(sa->set, 0x0e);
3752     sa->remove(sa->set, 0x0f);
3753     sa->remove(sa->set, 0x1b);
3754
3755     /* ISO 2022 converters do not convert C1 controls either */
3756     sa->removeRange(sa->set, 0x80, 0x9f);
3757 }
3758
/*
 * Function table for the generic "ISO_2022" converter.
 * The entries are positional; which slot each one fills is defined by the
 * UConverterImpl declaration, which is not part of this section of the file.
 * The notes below are inferred from the names of the functions plugged in
 * here - confirm against that declaration before relying on them.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    /* open / close / reset */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    /* generic converter enabled: ISO 2022 toUnicode functions, UTF-8 fromUnicode functions */
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    /* generic converter disabled: the same four slots are left empty */
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    /* name, substitution writing, cloning and Unicode set are shared by all ISO 2022 tables in this file */
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the generic "ISO_2022" converter.
 * The values are positional; the field they belong to is defined by the
 * UConverterStaticData declaration, which is not visible in this section.
 * NOTE(review): { 0x1a, 0, 0, 0 } followed by 1 looks like the substitution
 * bytes and their length - confirm against the struct declaration.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData),
    "ISO_2022",
    2022,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data object for the generic ISO_2022 converter, built from the
 * static data and function table defined directly above.
 * _ISO_2022_GetUnicodeSet() compares cnv->sharedData against the address of
 * this object when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
const UConverterSharedData _ISO2022Data=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3809
3810 /*************JP****************/
/*
 * Function table for the ISO-2022-JP converter.
 * The entries are positional; the slot each one fills is defined by the
 * UConverterImpl declaration, which is not visible in this section of the
 * file - confirm the slot notes below against it.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    /* open / close / reset are shared with the other ISO 2022 tables */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /* the same OFFSETS_LOGIC function is installed in both toUnicode slots and in both fromUnicode slots */
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the ISO-2022-JP converter.
 * The values are positional; the field they belong to is defined by the
 * UConverterStaticData declaration, which is not visible in this section.
 * Apart from the name, the third value and the maximum byte count, the
 * values are the same as in _ISO2022StaticData.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_JP",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3852
namespace {

/*
 * Shared data object for ISO-2022-JP: ties _ISO2022JPStaticData to the
 * _ISO2022JPImpl function table. Defined in an unnamed namespace, so it is
 * local to this file.
 */
const UConverterSharedData _ISO2022JPData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);

}  // namespace
3859
3860 #if !UCONFIG_ONLY_HTML_CONVERSION
3861 /************* KR ***************/
/*
 * Function table for the ISO-2022-KR converter
 * (only compiled when UCONFIG_ONLY_HTML_CONVERSION is off).
 * The entries are positional; the slot each one fills is defined by the
 * UConverterImpl declaration, which is not visible in this section of the
 * file - confirm the slot notes below against it.
 */
static const UConverterImpl _ISO2022KRImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    /* open / close / reset are shared with the other ISO 2022 tables */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /* the same OFFSETS_LOGIC function is installed in both toUnicode slots and in both fromUnicode slots */
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the ISO-2022-KR converter.
 * The values are positional; the field they belong to is defined by the
 * UConverterStaticData declaration, which is not visible in this section.
 * Apart from the name and the maximum byte count, the values are the same
 * as in _ISO2022JPStaticData.
 */
static const UConverterStaticData _ISO2022KRStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_KR",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3903
namespace {

/*
 * Shared data object for ISO-2022-KR: ties _ISO2022KRStaticData to the
 * _ISO2022KRImpl function table. Defined in an unnamed namespace, so it is
 * local to this file.
 */
const UConverterSharedData _ISO2022KRData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);

}  // namespace
3910
3911 /*************** CN ***************/
/*
 * Function table for the ISO-2022-CN converter
 * (only compiled when UCONFIG_ONLY_HTML_CONVERSION is off).
 * The entries are positional; the slot each one fills is defined by the
 * UConverterImpl declaration, which is not visible in this section of the
 * file - confirm the slot notes below against it.
 */
static const UConverterImpl _ISO2022CNImpl={

    UCNV_ISO_2022,

    NULL,
    NULL,

    /* open / close / reset are shared with the other ISO 2022 tables */
    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    /* the same OFFSETS_LOGIC function is installed in both toUnicode slots and in both fromUnicode slots */
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet,

    NULL,
    NULL
};
/*
 * Static description of the ISO-2022-CN converter.
 * The values are positional; the field they belong to is defined by the
 * UConverterStaticData declaration, which is not visible in this section.
 * Apart from the name, the values are the same as in _ISO2022KRStaticData.
 */
static const UConverterStaticData _ISO2022CNStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_CN",
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3954
namespace {

/*
 * Shared data object for ISO-2022-CN: ties _ISO2022CNStaticData to the
 * _ISO2022CNImpl function table. Defined in an unnamed namespace, so it is
 * local to this file.
 */
const UConverterSharedData _ISO2022CNData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);

}  // namespace
3961 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3962
3963 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */