icuSources/common/ucnv2022.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2006,2008 International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv2022.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000feb03
  12 *   created by: Markus W. Scherer
  13 *
  14 *   Change history:
  15 *
  16 *   06/29/2000  helena  Major rewrite of the callback APIs.
  17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
  18 *                       Changed implementation of toUnicode
  19 *                       function
  20 *   08/21/2000  Ram     Added support for ISO-2022-KR
   21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
  22 *                       ucnvebdc.c
  23 *   09/20/2000  Ram     Added support for ISO-2022-CN
  24 *                       Added implementations for getNextUChar()
  25 *                       for specific 2022 country variants.
  26 *   10/31/2000  Ram     Implemented offsets logic functions
  27 */
  28
  29 #include "unicode/utypes.h"
  30
  31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uset.h"
  35 #include "unicode/ucnv_err.h"
  36 #include "unicode/ucnv_cb.h"
  37 #include "ucnv_imp.h"
  38 #include "ucnv_bld.h"
  39 #include "ucnv_cnv.h"
  40 #include "ucnvmbcs.h"
  41 #include "cstring.h"
  42 #include "cmemory.h"
  43
  44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  45
  46 #ifdef U_ENABLE_GENERIC_ISO_2022
  47 /*
  48  * I am disabling the generic ISO-2022 converter after proposing to do so on
  49  * the icu mailing list two days ago.
  50  *
  51  * Reasons:
  52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  53  *    its designation sequences, single shifts with return to the previous state,
  54  *    switch-with-no-return to UTF-16BE or similar, etc.
  55  *    This is unlike the language-specific variants like ISO-2022-JP which
  56  *    require a much smaller repertoire of ISO-2022 features.
  57  *    These variants continue to be supported.
  58  * 2. I believe that no one is really using the generic ISO-2022 converter
  59  *    but rather always one of the language-specific variants.
  60  *    Note that ICU's generic ISO-2022 converter has always output one escape
  61  *    sequence followed by UTF-8 for the whole stream.
  62  * 3. Switching between subcharsets is extremely slow, because each time
  63  *    the previous converter is closed and a new one opened,
  64  *    without any kind of caching, least-recently-used list, etc.
  65  * 4. The code is currently buggy, and given the above it does not seem
  66  *    reasonable to spend the time on maintenance.
  67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  68  *    This means, for example, that when ISO-8859-7 is designated, the following
  69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  70  *    The ICU ISO-2022 converter does not handle this - and has no information
  71  *    about which subconverter would have to be shifted vs. which is designed
  72  *    for 7-bit ISO-2022.
  73  *
  74  * Markus Scherer 2003-dec-03
  75  */
  76 #endif
  77
  78 static const char SHIFT_IN_STR[]  = "\x0F";
  79 static const char SHIFT_OUT_STR[] = "\x0E";
  80
  81 #define CR      0x0D
  82 #define LF      0x0A
  83 #define H_TAB   0x09
  84 #define V_TAB   0x0B
  85 #define SPACE   0x20
  86
  87 /*
  88  * ISO 2022 control codes must not be converted from Unicode
  89  * because they would mess up the byte stream.
  90  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
  91  * corresponding to SO, SI, and ESC.
  92  */
  93 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
  94
  95 /* for ISO-2022-JP and -CN implementations */
  96 typedef enum  {
  97         /* shared values */
  98         INVALID_STATE=-1,
  99         ASCII = 0,
 100
 101         SS2_STATE=0x10,
 102         SS3_STATE,
 103
 104         /* JP */
 105         ISO8859_1 = 1 ,
 106         ISO8859_7 = 2 ,
 107         JISX201  = 3,
 108         JISX208 = 4,
 109         JISX212 = 5,
 110         GB2312  =6,
 111         KSC5601 =7,
 112         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
 113
 114         /* CN */
 115         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
 116         GB2312_1=1,
 117         ISO_IR_165=2,
 118         CNS_11643=3,
 119
 120         /*
 121          * these are used in StateEnum and ISO2022State variables,
 122          * but CNS_11643 must be used to index into myConverterArray[]
 123          */
 124         CNS_11643_0=0x20,
 125         CNS_11643_1,
 126         CNS_11643_2,
 127         CNS_11643_3,
 128         CNS_11643_4,
 129         CNS_11643_5,
 130         CNS_11643_6,
 131         CNS_11643_7
 132 } StateEnum;
 133
 134 /* is the StateEnum charset value for a DBCS charset? */
 135 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
 136
 137 #define CSM(cs) ((uint16_t)1<<(cs))
 138
 139 /*
 140  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 141  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 142  *
 143  * Note: The converter uses some leniency:
 144  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 145  *   all versions, not just JIS7 and JIS8.
 146  * - ICU does not distinguish between different versions of JIS X 0208.
 147  */
 148 static const uint16_t jpCharsetMasks[5]={
 149     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
 150     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
 151     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 152     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 153     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
 154 };
 155
 156 typedef enum {
 157         ASCII1=0,
 158         LATIN1,
 159         SBCS,
 160         DBCS,
 161         MBCS,
 162         HWKANA
 163 }Cnv2022Type;
 164
 165 typedef struct ISO2022State {
 166     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
 167     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
 168     int8_t prevG;       /* g before single shift (SS2 or SS3) */
 169 } ISO2022State;
 170
 171 #define UCNV_OPTIONS_VERSION_MASK 0xf
 172 #define UCNV_2022_MAX_CONVERTERS 10
 173
 174 typedef struct{
 175     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
 176     UConverter *currentConverter;
 177     Cnv2022Type currentType;
 178     ISO2022State toU2022State, fromU2022State;
 179     uint32_t key;
 180     uint32_t version;
 181 #ifdef U_ENABLE_GENERIC_ISO_2022
 182     UBool isFirstBuffer;
 183 #endif
 184     UBool isEmptySegment;
 185     char name[30];
 186     char locale[3];
 187 }UConverterDataISO2022;
 188
 189 /* Protos */
 190 /* ISO-2022 ----------------------------------------------------------------- */
 191
 192 /*Forward declaration */
 193 U_CFUNC void
 194 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
 195                       UErrorCode * err);
 196 U_CFUNC void
 197 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
 198                                     UErrorCode * err);
 199
 200 #define ESC_2022 0x1B /*ESC*/
 201
 202 typedef enum
 203 {
 204         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
 205         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
 206         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
 207         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
 208 } UCNV_TableStates_2022;
 209
 210 /*
 211 * The way these state transition arrays work is:
 212 * ex : ESC$B is the sequence for JISX208
 213 *      a) First Iteration: char is ESC
 214 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 215 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 216 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 217 *             value of x is stored at escSeqStateTable_Key_2022[0]
 218 *        iii) Save this index as offset
 219 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 220 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 221 *     b) Switch on this state and continue to next char
 222 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 223 *             which is normalize_esq_chars_2022[36] == 4
 224 *         ii) x is currently 1(from above)
 225 *               x<<=5 -- x is now 32
 226 *               x+=normalize_esq_chars_2022[36]
 227 *               now x is 36
 228 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 229 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 230 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 231 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 232 *     c) Switch on this state and continue to next char
 233 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 234 *        ii) x is currently 36 (from above)
 235 *            x<<=5 -- x is now 1152
 236 *            x+=normalize_esq_chars_2022[66]
 237 *            now x is 1161
 238 *       iii) Search for this value in escSeqStateTable_Key_2022[]
  239 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
  240 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
  241 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
  242 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
 243 */
 244
 245
  246 /*Below are the 3 arrays depicting a state transition table*/
/*
 * normalize_esq_chars_2022[] is indexed by a byte value (0..255) and yields a
 * small code in the range 0..29. As described in the worked example above,
 * the codes of successive escape-sequence bytes are folded into one key with
 * key = (key<<5) + code, and that key is then looked up in
 * escSeqStateTable_Key_2022[].
 *
 * Non-zero entries in this table (all others are 0):
 *   27 ESC -> 1    '$' -> 4    '%' -> 7    '&' -> 29
 *   '(' -> 2   ')' -> 24   '*' -> 26   '+' -> 27
 *   '-' -> 3   '.' -> 23   '/' -> 6    '@' -> 5
 *   'A'..'M' -> 8..20      'N' -> 25   'O' -> 28
 *   'R' -> 21  'Z' -> 22
 *
 * Each row below holds 10 entries; the column header gives index % 10.
 */
  247 static const int8_t normalize_esq_chars_2022[256] = {
  248 /*       0      1       2       3       4      5       6        7       8       9           */
  249
  250          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  251         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  252         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
  253         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
  254         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
  255         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  256         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
  257         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
  258         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
  259         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  260         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  261         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  262         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  263         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  264         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  265         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  266         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  267         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  268         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  269         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  270         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  271         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  272         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  273         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  274         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
  275         ,0     ,0      ,0      ,0      ,0      ,0
  276 };
 277
 278 #ifdef U_ENABLE_GENERIC_ISO_2022
 279 /*
 280  * When the generic ISO-2022 converter is completely removed, not just disabled
 281  * per #ifdef, then the following state table and the associated tables that are
 282  * dimensioned with MAX_STATES_2022 should be trimmed.
 283  *
 284  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 285  * the associated escape sequences starting with ESC ( B should be removed.
 286  * This includes the ones with key values 1097 and all of the ones above 1000000.
 287  *
 288  * For the latter, the tables can simply be truncated.
 289  * For the former, since the tables must be kept parallel, it is probably best
 290  * to simply duplicate an adjacent table cell, parallel in all tables.
 291  *
 292  * It may make sense to restructure the tables, especially by using small search
 293  * tables for the variants instead of indexing them parallel to the table here.
 294  */
 295 #endif
 296
  297 #define MAX_STATES_2022 74
/*
 * Keys of all recognized (partial and complete) escape sequences, listed in
 * ascending order. A key is built from the normalize_esq_chars_2022[] codes of
 * the sequence bytes with key = (key<<5) + code; for example ESC alone is 1,
 * ESC $ is 36 and ESC $ B is 1161.
 * The index at which a key is found here is the index into the parallel
 * tables dimensioned with MAX_STATES_2022, so the tables must be kept
 * parallel.
 * Each row below holds 10 entries; the column header gives index % 10.
 */
  298 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
  299 /*   0           1           2           3           4           5           6           7           8           9           */
  300
  301      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
  302     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
  303     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
  304     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
  305     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
  306     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
  307     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
  308     ,35947631   ,35947635   ,35947636   ,35947638
  309 };
 310
  311 #ifdef U_ENABLE_GENERIC_ISO_2022
  312
/*
 * Converter names, parallel to escSeqStateTable_Key_2022[]: entry i is the
 * name associated with the escape sequence whose key is stored at index i of
 * the key table. Entries are NULL where the table associates no converter
 * name with the key. This table is only compiled when
 * U_ENABLE_GENERIC_ISO_2022 is defined.
 */
  313 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
  314  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
  315
  316      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
  317     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
  318     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
  319     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
  320     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
  321     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
  322     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
  323     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
  324 };
  325
  326 #endif
 327
/*
 * State of each escape sequence, parallel to escSeqStateTable_Key_2022[]:
 * entry i tells whether the sequence whose key is stored at index i of the key
 * table is only a prefix of a longer sequence (VALID_NON_TERMINAL_2022), a
 * complete sequence (VALID_TERMINAL_2022), or a complete sequence that may
 * also be extended into another one (VALID_MAYBE_TERMINAL_2022; used only at
 * index 10, key 1097).
 * Each row below holds 10 entries; the column header gives index % 10.
 */
  328 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
  329 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
  330      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  331     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  332     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
  333     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  334     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  335     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  336     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  337     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
  338 };
 339
 340
 341 /* Type def for refactoring changeState_2022 code*/
 342 typedef enum{
 343 #ifdef U_ENABLE_GENERIC_ISO_2022
 344     ISO_2022=0,
 345 #endif
 346     ISO_2022_JP=1,
 347     ISO_2022_KR=2,
 348     ISO_2022_CN=3
 349 } Variant2022;
 350
 351 /*********** ISO 2022 Converter Protos ***********/
 352 static void
 353 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
 354
 355 static void
 356  _ISO2022Close(UConverter *converter);
 357
 358 static void
 359 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
 360
 361 static const char*
 362 _ISO2022getName(const UConverter* cnv);
 363
 364 static void
 365 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
 366
 367 static UConverter *
 368 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
 369
 370 #ifdef U_ENABLE_GENERIC_ISO_2022
 371 static void
 372 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
 373 #endif
 374
 375 /*const UConverterSharedData _ISO2022Data;*/
 376 static const UConverterSharedData _ISO2022JPData;
 377 static const UConverterSharedData _ISO2022KRData;
 378 static const UConverterSharedData _ISO2022CNData;
 379
 380 /*************** Converter implementations ******************/
 381
 382 /* The purpose of this function is to get around gcc compiler warnings. */
 383 static U_INLINE void
 384 fromUWriteUInt8(UConverter *cnv,
 385                  const char *bytes, int32_t length,
 386                  uint8_t **target, const char *targetLimit,
 387                  int32_t **offsets,
 388                  int32_t sourceIndex,
 389                  UErrorCode *pErrorCode)
 390 {
 391     char *targetChars = (char *)*target;
 392     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
 393                          offsets, sourceIndex, pErrorCode);
 394     *target = (uint8_t*)targetChars;
 395
 396 }
 397
 398 static U_INLINE void
 399 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
 400     if(myConverterData->version == 1) {
 401         UConverter *cnv = myConverterData->currentConverter;
 402
 403         cnv->toUnicodeStatus=0;     /* offset */
 404         cnv->mode=0;                /* state */
 405         cnv->toULength=0;           /* byteIndex */
 406     }
 407 }
 408
 409 static U_INLINE void
 410 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
 411    /* in ISO-2022-KR the designator sequence appears only once
 412     * in a file so we append it only once
 413     */
 414     if( converter->charErrorBufferLength==0){
 415
 416         converter->charErrorBufferLength = 4;
 417         converter->charErrorBuffer[0] = 0x1b;
 418         converter->charErrorBuffer[1] = 0x24;
 419         converter->charErrorBuffer[2] = 0x29;
 420         converter->charErrorBuffer[3] = 0x43;
 421     }
 422     if(myConverterData->version == 1) {
 423         UConverter *cnv = myConverterData->currentConverter;
 424
 425         cnv->fromUChar32=0;
 426         cnv->fromUnicodeStatus=1;   /* prevLength */
 427     }
 428 }
 429
 430 static void
 431 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
 432
 433     char myLocale[6]={' ',' ',' ',' ',' ',' '};
 434
 435     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
 436     if(cnv->extraInfo != NULL) {
 437         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
 438         uint32_t version;
 439
 440         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
 441         myConverterData->currentType = ASCII1;
 442         cnv->fromUnicodeStatus =FALSE;
 443         if(locale){
 444             uprv_strncpy(myLocale, locale, sizeof(myLocale));
 445         }
 446         version = options & UCNV_OPTIONS_VERSION_MASK;
 447         myConverterData->version = version;
 448         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
 449             (myLocale[2]=='_' || myLocale[2]=='\0'))
 450         {
 451             size_t len=0;
 452             /* open the required converters and cache them */
 453             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
 454                 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
 455             }
 456             myConverterData->myConverterArray[JISX201]      = ucnv_loadSharedData("JISX0201", NULL, errorCode);
 457             myConverterData->myConverterArray[JISX208]      = ucnv_loadSharedData("jisx-208", NULL, errorCode);
 458             if(jpCharsetMasks[version]&CSM(JISX212)) {
 459                 myConverterData->myConverterArray[JISX212]  = ucnv_loadSharedData("jisx-212", NULL, errorCode);
 460             }
 461             if(jpCharsetMasks[version]&CSM(GB2312)) {
 462                 myConverterData->myConverterArray[GB2312]   = ucnv_loadSharedData("ibm-5478", NULL, errorCode);   /* gb_2312_80-1 */
 463             }
 464             if(jpCharsetMasks[version]&CSM(KSC5601)) {
 465                 myConverterData->myConverterArray[KSC5601]  = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
 466             }
 467
 468             /* set the function pointers to appropriate funtions */
 469             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 470             uprv_strcpy(myConverterData->locale,"ja");
 471
 472             uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
 473             len = uprv_strlen(myConverterData->name);
 474             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
 475             myConverterData->name[len+1]='\0';
 476         }
 477         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
 478             (myLocale[2]=='_' || myLocale[2]=='\0'))
 479         {
 480             if (version==1){
 481                 myConverterData->currentConverter=
 482                     ucnv_open("icu-internal-25546",errorCode);
 483
 484                 if (U_FAILURE(*errorCode)) {
 485                     _ISO2022Close(cnv);
 486                     return;
 487                 }
 488
 489                 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
 490                 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
 491                 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
 492             }else{
 493                 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
 494
 495                 if (U_FAILURE(*errorCode)) {
 496                     _ISO2022Close(cnv);
 497                     return;
 498                 }
 499
 500                 myConverterData->version = 0;
 501                 uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
 502             }
 503
 504             /* initialize the state variables */
 505             setInitialStateToUnicodeKR(cnv, myConverterData);
 506             setInitialStateFromUnicodeKR(cnv, myConverterData);
 507
 508             /* set the function pointers to appropriate funtions */
 509             cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
 510             uprv_strcpy(myConverterData->locale,"ko");
 511         }
 512         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
 513             (myLocale[2]=='_' || myLocale[2]=='\0'))
 514         {
 515
 516             /* open the required converters and cache them */
 517             myConverterData->myConverterArray[GB2312_1]         = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
 518             if(version==1) {
 519                 myConverterData->myConverterArray[ISO_IR_165]   = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
 520             }
 521             myConverterData->myConverterArray[CNS_11643]        = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
 522
 523
 524             /* set the function pointers to appropriate funtions */
 525             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
 526             uprv_strcpy(myConverterData->locale,"cn");
 527
 528             if (version==1){
 529                 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
 530             }else{
 531                 myConverterData->version = 0;
 532                 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
 533             }
 534         }
 535         else{
 536 #ifdef U_ENABLE_GENERIC_ISO_2022
 537             myConverterData->isFirstBuffer = TRUE;
 538
 539             /* append the UTF-8 escape sequence */
 540             cnv->charErrorBufferLength = 3;
 541             cnv->charErrorBuffer[0] = 0x1b;
 542             cnv->charErrorBuffer[1] = 0x25;
 543             cnv->charErrorBuffer[2] = 0x42;
 544
 545             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
 546             /* initialize the state variables */
 547             uprv_strcpy(myConverterData->name,"ISO_2022");
 548 #else
 549             *errorCode = U_UNSUPPORTED_ERROR;
 550             return;
 551 #endif
 552         }
 553
 554         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
 555
 556         if(U_FAILURE(*errorCode)) {
 557             _ISO2022Close(cnv);
 558         }
 559     } else {
 560         *errorCode = U_MEMORY_ALLOCATION_ERROR;
 561     }
 562 }
 563
 564
 565 static void
 566 _ISO2022Close(UConverter *converter) {
 567     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
 568     UConverterSharedData **array = myData->myConverterArray;
 569     int32_t i;
 570
 571     if (converter->extraInfo != NULL) {
 572         /*close the array of converter pointers and free the memory*/
 573         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
 574             if(array[i]!=NULL) {
 575                 ucnv_unloadSharedDataIfReady(array[i]);
 576             }
 577         }
 578
 579         ucnv_close(myData->currentConverter);
 580
 581         if(!converter->isExtraLocal){
 582             uprv_free (converter->extraInfo);
 583             converter->extraInfo = NULL;
 584         }
 585     }
 586 }
 587
 588 static void
 589 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
 590     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
 591     if(choice<=UCNV_RESET_TO_UNICODE) {
 592         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
 593         myConverterData->key = 0;
 594         myConverterData->isEmptySegment = FALSE;
 595     }
 596     if(choice!=UCNV_RESET_TO_UNICODE) {
 597         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
 598     }
 599 #ifdef U_ENABLE_GENERIC_ISO_2022
 600     if(myConverterData->locale[0] == 0){
 601         if(choice<=UCNV_RESET_TO_UNICODE) {
 602             myConverterData->isFirstBuffer = TRUE;
 603             myConverterData->key = 0;
 604             if (converter->mode == UCNV_SO){
 605                 ucnv_close (myConverterData->currentConverter);
 606                 myConverterData->currentConverter=NULL;
 607             }
 608             converter->mode = UCNV_SI;
 609         }
 610         if(choice!=UCNV_RESET_TO_UNICODE) {
 611             /* re-append UTF-8 escape sequence */
 612             converter->charErrorBufferLength = 3;
 613             converter->charErrorBuffer[0] = 0x1b;
 614             converter->charErrorBuffer[1] = 0x28;
 615             converter->charErrorBuffer[2] = 0x42;
 616         }
 617     }
 618     else
 619 #endif
 620     {
 621         /* reset the state variables */
 622         if(myConverterData->locale[0] == 'k'){
 623             if(choice<=UCNV_RESET_TO_UNICODE) {
 624                 setInitialStateToUnicodeKR(converter, myConverterData);
 625             }
 626             if(choice!=UCNV_RESET_TO_UNICODE) {
 627                 setInitialStateFromUnicodeKR(converter, myConverterData);
 628             }
 629         }
 630     }
 631 }
 632
 633 static const char*
 634 _ISO2022getName(const UConverter* cnv){
 635     if(cnv->extraInfo){
 636         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
 637         return myData->name;
 638     }
 639     return NULL;
 640 }
 641
 642
/*************** to unicode *******************/
/****************************************************************************
 * Recognized escape sequences are
 * <ESC>(B  ASCII
 * <ESC>.A  ISO-8859-1
 * <ESC>.F  ISO-8859-7
 * <ESC>(J  JISX-201
 * <ESC>(I  JISX-201 (labelled HWKANA_7BIT in escSeqChars[])
 * <ESC>$B  JISX-208
 * <ESC>$@  JISX-208
 * <ESC>$(D JISX-212
 * <ESC>$A  GB2312
 * <ESC>$(C KSC5601
 */
/*
 * ISO-2022-JP lookup table, indexed by the escape-sequence table offset that
 * getKey_2022() reports for a complete escape sequence.
 * changeState_2022() reads nextStateToUnicodeJP[offset] and treats
 * INVALID_STATE as U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The entries are positional: do not reorder them.
 */
static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 668
/*************** to unicode *******************/
/*
 * ISO-2022-CN lookup table, indexed by the escape-sequence table offset that
 * getKey_2022() reports for a complete escape sequence.
 * changeState_2022() reads nextStateToUnicodeCN[offset] and treats
 * INVALID_STATE as U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The entries are positional: do not reorder them.
 */
static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 681
 682
 683 static UCNV_TableStates_2022
 684 getKey_2022(char c,int32_t* key,int32_t* offset){
 685     int32_t togo;
 686     int32_t low = 0;
 687     int32_t hi = MAX_STATES_2022;
 688     int32_t oldmid=0;
 689
 690     togo = normalize_esq_chars_2022[(uint8_t)c];
 691     if(togo == 0) {
 692         /* not a valid character anywhere in an escape sequence */
 693         *key = 0;
 694         *offset = 0;
 695         return INVALID_2022;
 696     }
 697     togo = (*key << 5) + togo;
 698
 699     while (hi != low)  /*binary search*/{
 700
 701         register int32_t mid = (hi+low) >> 1; /*Finds median*/
 702
 703         if (mid == oldmid)
 704             break;
 705
 706         if (escSeqStateTable_Key_2022[mid] > togo){
 707             hi = mid;
 708         }
 709         else if (escSeqStateTable_Key_2022[mid] < togo){
 710             low = mid;
 711         }
 712         else /*we found it*/{
 713             *key = togo;
 714             *offset = mid;
 715             return escSeqStateTable_Value_2022[mid];
 716         }
 717         oldmid = mid;
 718
 719     }
 720
 721     *key = 0;
 722     *offset = 0;
 723     return INVALID_2022;
 724 }
 725
 726 /*runs through a state machine to determine the escape sequence - codepage correspondance
 727  */
 728 static void
 729 changeState_2022(UConverter* _this,
 730                 const char** source,
 731                 const char* sourceLimit,
 732                 Variant2022 var,
 733                 UErrorCode* err){
 734     UCNV_TableStates_2022 value;
 735     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
 736     uint32_t key = myData2022->key;
 737     int32_t offset = 0;
 738     int8_t initialToULength = _this->toULength;
 739     char c;
 740
 741     value = VALID_NON_TERMINAL_2022;
 742     while (*source < sourceLimit) {
 743         c = *(*source)++;
 744         _this->toUBytes[_this->toULength++]=(uint8_t)c;
 745         value = getKey_2022(c,(int32_t *) &key, &offset);
 746
 747         switch (value){
 748
 749         case VALID_NON_TERMINAL_2022 :
 750             /* continue with the loop */
 751             break;
 752
 753         case VALID_TERMINAL_2022:
 754             key = 0;
 755             goto DONE;
 756
 757         case INVALID_2022:
 758             goto DONE;
 759
 760         case VALID_MAYBE_TERMINAL_2022:
 761 #ifdef U_ENABLE_GENERIC_ISO_2022
 762             /* ESC ( B is ambiguous only for ISO_2022 itself */
 763             if(var == ISO_2022) {
 764                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
 765                 _this->toULength = 0;
 766
 767                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
 768
 769                 /* continue with the loop */
 770                 value = VALID_NON_TERMINAL_2022;
 771                 break;
 772             } else
 773 #endif
 774             {
 775                 /* not ISO_2022 itself, finish here */
 776                 value = VALID_TERMINAL_2022;
 777                 key = 0;
 778                 goto DONE;
 779             }
 780         }
 781     }
 782
 783 DONE:
 784     myData2022->key = key;
 785
 786     if (value == VALID_NON_TERMINAL_2022) {
 787         /* indicate that the escape sequence is incomplete: key!=0 */
 788         return;
 789     } else if (value == INVALID_2022 ) {
 790         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 791     } else /* value == VALID_TERMINAL_2022 */ {
 792         switch(var){
 793 #ifdef U_ENABLE_GENERIC_ISO_2022
 794         case ISO_2022:
 795         {
 796             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
 797             if(chosenConverterName == NULL) {
 798                 /* SS2 or SS3 */
 799                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 800                 return;
 801             }
 802
 803             _this->mode = UCNV_SI;
 804             ucnv_close(myData2022->currentConverter);
 805             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
 806             if(U_SUCCESS(*err)) {
 807                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
 808                 _this->mode = UCNV_SO;
 809             }
 810             break;
 811         }
 812 #endif
 813         case ISO_2022_JP:
 814             {
 815                 StateEnum tempState=nextStateToUnicodeJP[offset];
 816                 switch(tempState) {
 817                 case INVALID_STATE:
 818                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 819                     break;
 820                 case SS2_STATE:
 821                     if(myData2022->toU2022State.cs[2]!=0) {
 822                         if(myData2022->toU2022State.g<2) {
 823                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 824                         }
 825                         myData2022->toU2022State.g=2;
 826                     } else {
 827                         /* illegal to have SS2 before a matching designator */
 828                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 829                     }
 830                     break;
 831                 /* case SS3_STATE: not used in ISO-2022-JP-x */
 832                 case ISO8859_1:
 833                 case ISO8859_7:
 834                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 835                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 836                     } else {
 837                         /* G2 charset for SS2 */
 838                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
 839                     }
 840                     break;
 841                 default:
 842                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 843                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 844                     } else {
 845                         /* G0 charset */
 846                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
 847                     }
 848                     break;
 849                 }
 850             }
 851             break;
 852         case ISO_2022_CN:
 853             {
 854                 StateEnum tempState=nextStateToUnicodeCN[offset];
 855                 switch(tempState) {
 856                 case INVALID_STATE:
 857                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 858                     break;
 859                 case SS2_STATE:
 860                     if(myData2022->toU2022State.cs[2]!=0) {
 861                         if(myData2022->toU2022State.g<2) {
 862                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 863                         }
 864                         myData2022->toU2022State.g=2;
 865                     } else {
 866                         /* illegal to have SS2 before a matching designator */
 867                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 868                     }
 869                     break;
 870                 case SS3_STATE:
 871                     if(myData2022->toU2022State.cs[3]!=0) {
 872                         if(myData2022->toU2022State.g<2) {
 873                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 874                         }
 875                         myData2022->toU2022State.g=3;
 876                     } else {
 877                         /* illegal to have SS3 before a matching designator */
 878                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 879                     }
 880                     break;
 881                 case ISO_IR_165:
 882                     if(myData2022->version==0) {
 883                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 884                         break;
 885                     }
 886                     /*fall through*/
 887                 case GB2312_1:
 888                     /*fall through*/
 889                 case CNS_11643_1:
 890                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
 891                     break;
 892                 case CNS_11643_2:
 893                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
 894                     break;
 895                 default:
 896                     /* other CNS 11643 planes */
 897                     if(myData2022->version==0) {
 898                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 899                     } else {
 900                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
 901                     }
 902                     break;
 903                 }
 904             }
 905             break;
 906         case ISO_2022_KR:
 907             if(offset==0x30){
 908                 /* nothing to be done, just accept this one escape sequence */
 909             } else {
 910                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 911             }
 912             break;
 913
 914         default:
 915             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 916             break;
 917         }
 918     }
 919     if(U_SUCCESS(*err)) {
 920         _this->toULength = 0;
 921     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
 922         if(_this->toULength>1) {
 923             /*
 924              * Ticket 5691: consistent illegal sequences:
 925              * - We include at least the first byte (ESC) in the illegal sequence.
 926              * - If any of the non-initial bytes could be the start of a character,
 927              *   we stop the illegal sequence before the first one of those.
 928              *   In escape sequences, all following bytes are "printable", that is,
 929              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
 930              *   they are valid single/lead bytes.
 931              *   For simplicity, we always only report the initial ESC byte as the
 932              *   illegal sequence and back out all other bytes we looked at.
 933              */
 934             /* Back out some bytes. */
 935             int8_t backOutDistance=_this->toULength-1;
 936             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
 937             if(backOutDistance<=bytesFromThisBuffer) {
 938                 /* same as initialToULength<=1 */
 939                 *source-=backOutDistance;
 940             } else {
 941                 /* Back out bytes from the previous buffer: Need to replay them. */
 942                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
 943                 /* same as -(initialToULength-1) */
 944                 /* preToULength is negative! */
 945                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
 946                 *source-=bytesFromThisBuffer;
 947             }
 948             _this->toULength=1;
 949         }
 950     }
 951 }
 952
 953 /*Checks the characters of the buffer against valid 2022 escape sequences
 954 *if the match we return a pointer to the initial start of the sequence otherwise
 955 *we return sourceLimit
 956 */
 957 /*for 2022 looks ahead in the stream
 958  *to determine the longest possible convertible
 959  *data stream
 960  */
 961 static U_INLINE const char*
 962 getEndOfBuffer_2022(const char** source,
 963                    const char* sourceLimit,
 964                    UBool flush){
 965
 966     const char* mySource = *source;
 967
 968 #ifdef U_ENABLE_GENERIC_ISO_2022
 969     if (*source >= sourceLimit)
 970         return sourceLimit;
 971
 972     do{
 973
 974         if (*mySource == ESC_2022){
 975             int8_t i;
 976             int32_t key = 0;
 977             int32_t offset;
 978             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
 979
 980             /* Kludge: I could not
 981             * figure out the reason for validating an escape sequence
 982             * twice - once here and once in changeState_2022().
 983             * is it possible to have an ESC character in a ISO2022
 984             * byte stream which is valid in a code page? Is it legal?
 985             */
 986             for (i=0;
 987             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
 988             i++) {
 989                 value =  getKey_2022(*(mySource+i), &key, &offset);
 990             }
 991             if (value > 0 || *mySource==ESC_2022)
 992                 return mySource;
 993
 994             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
 995                 return sourceLimit;
 996         }
 997     }while (++mySource < sourceLimit);
 998
 999     return sourceLimit;
1000 #else
1001     while(mySource < sourceLimit && *mySource != ESC_2022) {
1002         ++mySource;
1003     }
1004     return mySource;
1005 #endif
1006 }
1007
1008
1009 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1010  * any future change in _MBCSFromUChar32() function should be reflected in
1011  * this macro
1012  */
1013 static U_INLINE void
1014 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1015                                          UChar32 c,
1016                                          uint32_t* value,
1017                                          UBool useFallback,
1018                                          int32_t *length,
1019                                          int outputType)
1020 {
1021     const int32_t *cx;
1022     const uint16_t *table;
1023     uint32_t stage2Entry;
1024     uint32_t myValue;
1025     const uint8_t *p;
1026     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1027     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1028         table=sharedData->mbcs.fromUnicodeTable;
1029         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1030         /* get the bytes and the length for the output */
1031         if(outputType==MBCS_OUTPUT_2){
1032             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1033             if(myValue<=0xff) {
1034                 *length=1;
1035             } else {
1036                 *length=2;
1037             }
1038         } else /* outputType==MBCS_OUTPUT_3 */ {
1039             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1040             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1041             if(myValue<=0xff) {
1042                 *length=1;
1043             } else if(myValue<=0xffff) {
1044                 *length=2;
1045             } else {
1046                 *length=3;
1047             }
1048         }
1049         /* is this code point assigned, or do we use fallbacks? */
1050         if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
1051             (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
1052         ) {
1053             /*
1054              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1055              * There is no way with this data structure for fallback output
1056              * to be a zero byte.
1057              */
1058             /* assigned */
1059             *value=myValue;
1060             return;
1061         }
1062     }
1063
1064     cx=sharedData->mbcs.extIndexes;
1065     if(cx!=NULL) {
1066         *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1067         return;
1068     }
1069
1070     /* unassigned */
1071     *length=0;
1072 }
1073
1074 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1075  * any future change in _MBCSSingleFromUChar32() function should be reflected in
1076  * this macro
1077  */
1078 static U_INLINE void
1079 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1080                                        UChar32 c,
1081                                        uint32_t* retval,
1082                                        UBool useFallback)
1083 {
1084     const uint16_t *table;
1085     int32_t value;
1086     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1087     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1088         *retval=(uint16_t)-1;
1089         return;
1090     }
1091     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1092     table=sharedData->mbcs.fromUnicodeTable;
1093     /* get the byte for the output */
1094     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1095     /* is this code point assigned, or do we use fallbacks? */
1096     if(useFallback ? value>=0x800 : value>=0xc00) {
1097         value &=0xff;
1098     } else {
1099         value= -1;
1100     }
1101     *retval=(uint16_t) value;
1102 }
1103
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * toUnicode implementation of the generic ISO-2022 converter.
 *
 * Alternates between two steps until the input is used up or something
 * needs to be reported to the caller:
 * 1. unless an escape sequence is already in progress (key!=0), convert the
 *    bytes up to the next ESC (or the end of the input) with the current
 *    sub-converter, opening an "ASCII" converter if none is set yet;
 * 2. let changeState_2022() process the escape sequence.
 *
 * Overflow output, error bytes and partial input of the sub-converter are
 * copied into this converter so that the caller finds them in the usual place.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart)) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
1217
1218 /*
1219  * To Unicode Callback helper function
1220  */
1221 static void
1222 toUnicodeCallback(UConverter *cnv,
1223                   const uint32_t sourceChar, const uint32_t targetUniChar,
1224                   UErrorCode* err){
1225     if(sourceChar>0xff){
1226         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1227         cnv->toUBytes[1] = (uint8_t)sourceChar;
1228         cnv->toULength = 2;
1229     }
1230     else{
1231         cnv->toUBytes[0] =(char) sourceChar;
1232         cnv->toULength = 1;
1233     }
1234
1235     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1236         *err = U_INVALID_CHAR_FOUND;
1237     }
1238     else{
1239         *err = U_ILLEGAL_CHAR_FOUND;
1240     }
1241 }
1242
1243 /**************************************ISO-2022-JP*************************************************/
1244
1245 /************************************** IMPORTANT **************************************************
1246 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1247 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1248 * The converter iterates over each Unicode codepoint
1249 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1250 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1251 * would do as far as possible.
1252 *
1253 * If the implementation of these macros or structure of sharedData struct change in the future, make
1254 * sure that ISO-2022 is also changed.
1255 ***************************************************************************************************
1256 */
1257
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line. They should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
1284
/*
 * preference order of JP charsets
 *
 * Note that this order differs from the numeric order of the StateEnum
 * constants, which is the order used by escSeqChars[].
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1297
/*
 * Escape sequences for the JP charsets, one row per charset.
 *
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 *
 * Each row is a 6-byte array; the bytes after the sequence are zero.
 * The length of each sequence is listed in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Lengths in bytes of the escape sequences in escSeqChars[],
 * listed in the same order as the rows of that table.
 */
static  const int32_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1325
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if it
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings:
*              break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in the next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1342
/*
 * fromUnicode implementation for the ISO-2022-JP variants.
 *
 * Reads UTF-16 from args->source..args->sourceLimit and writes bytes to
 * args->target..args->targetLimit; args->source and args->target are updated
 * on return. If args->offsets is non-NULL, source indexes are recorded for
 * the bytes that are written.
 *
 * For each code point:
 *  - a lead surrogate is combined with the following trail surrogate; when the
 *    input ends right after a lead surrogate it is kept in
 *    converter->fromUChar32 and picked up again on the next call
 *    (the "goto getTrail" at the top)
 *  - unmatched surrogates and SO/SI/ESC set U_ILLEGAL_CHAR_FOUND
 *  - the charsets in choices[] are tried in order until one yields a mapping;
 *    if none does, U_INVALID_CHAR_FOUND is set
 *  - SI, a designation escape sequence and SO or ESC N are written in front of
 *    the character bytes whenever the stored G0/G1/G2 state has to change
 * For the ILLEGAL/INVALID errors the offending code point is stored in
 * converter->fromUChar32 and the loop stops.
 *
 * U_BUFFER_OVERFLOW_ERROR is set when input remains but target == targetLimit.
 * When args->flush is set, all input is consumed and no code point is pending,
 * the output is switched back to ASCII in G0 (SI and/or <ESC>(B).
 *
 * @param args  conversion arguments (source, target, offsets, flush, converter)
 * @param err   in/out error code
 */
1343 static void
1344 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1345     UConverterDataISO2022 *converterData;
1346     ISO2022State *pFromU2022State;
1347     uint8_t *target = (uint8_t *) args->target;
1348     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1349     const UChar* source = args->source;
1350     const UChar* sourceLimit = args->sourceLimit;
1351     int32_t* offsets = args->offsets;
1352     UChar32 sourceChar;
         /* staging area: shift/designation bytes plus the character bytes for one code point */
1353     char buffer[8];
1354     int32_t len, outLen;
         /* charsets to try, in order; rebuilt below whenever choiceCount is 0 */
1355     int8_t choices[10];
1356     int32_t choiceCount;
1357     uint32_t targetValue = 0;
1358     UBool useFallback;
1359
1360     int32_t i;
1361     int8_t cs, g;
1362
1363     /* set up the state */
1364     converterData     = (UConverterDataISO2022*)args->converter->extraInfo;
1365     pFromU2022State   = &converterData->fromU2022State;
1366     useFallback       = args->converter->useFallback;
1367
1368     choiceCount = 0;
1369
1370     /* check if the last codepoint of previous buffer was a lead surrogate*/
1371     if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
1372         goto getTrail;
1373     }
1374
1375     while(source < sourceLimit) {
1376         if(target < targetLimit) {
1377
1378             sourceChar  = *(source++);
1379             /*check if the char is a First surrogate*/
1380             if(UTF_IS_SURROGATE(sourceChar)) {
1381                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1382 getTrail:
1383                     /*look ahead to find the trail surrogate*/
1384                     if(source < sourceLimit) {
1385                         /* test the following code unit */
1386                         UChar trail=(UChar) *source;
1387                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1388                             source++;
1389                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1390                             args->converter->fromUChar32=0x00;
1391                             /* convert this supplementary code point */
1392                             /* exit this condition tree */
1393                         } else {
1394                             /* this is an unmatched lead code unit (1st surrogate) */
1395                             /* callback(illegal) */
1396                             *err=U_ILLEGAL_CHAR_FOUND;
1397                             args->converter->fromUChar32=sourceChar;
1398                             break;
1399                         }
1400                     } else {
1401                         /* no more input */
1402                         args->converter->fromUChar32=sourceChar;
1403                         break;
1404                     }
1405                 } else {
1406                     /* this is an unmatched trail code unit (2nd surrogate) */
1407                     /* callback(illegal) */
1408                     *err=U_ILLEGAL_CHAR_FOUND;
1409                     args->converter->fromUChar32=sourceChar;
1410                     break;
1411                 }
1412             }
1413
1414             /* do not convert SO/SI/ESC */
1415             if(IS_2022_CONTROL(sourceChar)) {
1416                 /* callback(illegal) */
1417                 *err=U_ILLEGAL_CHAR_FOUND;
1418                 args->converter->fromUChar32=sourceChar;
1419                 break;
1420             }
1421
1422             /* do the conversion */
1423
                 /* (re)build the ordered list of candidate charsets if it was invalidated */
1424             if(choiceCount == 0) {
1425                 uint16_t csm;
1426
1427                 /*
1428                  * The csm variable keeps track of which charsets are allowed
1429                  * and not used yet while building the choices[].
1430                  */
1431                 csm = jpCharsetMasks[converterData->version];
1432                 choiceCount = 0;
1433
1434                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1435                 if(converterData->version == 3 || converterData->version == 4) {
1436                     choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
1437                     csm &= ~CSM(cs);
1438                 }
1439
1440                 /* try the current G0 charset */
1441                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1442                 csm &= ~CSM(cs);
1443
1444                 /* try the current G2 charset */
1445                 if((cs = pFromU2022State->cs[2]) != 0) {
1446                     choices[choiceCount++] = cs;
1447                     csm &= ~CSM(cs);
1448                 }
1449
1450                 /* try all the other possible charsets */
1451                 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1452                     cs = (int8_t)jpCharsetPref[i];
1453                     if(CSM(cs) & csm) {
1454                         choices[choiceCount++] = cs;
1455                         csm &= ~CSM(cs);
1456                     }
1457                 }
1458             }
1459
                 /*
                  * Try the candidates in order; the loop stops at the first one that
                  * sets len != 0. On exit cs is the charset to designate and g the
                  * graphic set (0, 1 or 2) the character is to be written in.
                  */
1460             cs = g = 0;
1461             len = 0;
1462
1463             for(i = 0; i < choiceCount && len == 0; ++i) {
1464                 cs = choices[i];
1465                 switch(cs) {
1466                 case ASCII:
1467                     if(sourceChar <= 0x7f) {
1468                         targetValue = (uint32_t)sourceChar;
1469                         len = 1;
1470                     }
1471                     break;
1472                 case ISO8859_1:
1473                     if(0x80 <= sourceChar && sourceChar <= 0xff) {
1474                         targetValue = (uint32_t)sourceChar - 0x80;
1475                         len = 1;
1476                         g = 2;
1477                     }
1478                     break;
1479                 case HWKANA_7BIT:
                         /* single unsigned comparison: true only for 0xff61 <= sourceChar <= 0xff9f */
1480                     if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
1481                         targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
1482                         len = 1;
1483
1484                         if(converterData->version==3) {
1485                             /* JIS7: use G1 (SO) */
1486                             pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
1487                             g = 1;
1488                         } else if(converterData->version==4) {
1489                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1490                             int8_t cs0;
1491
1492                             targetValue += 0x80;
1493
1494                             cs0 = pFromU2022State->cs[0];
1495                             if(IS_JP_DBCS(cs0)) {
1496                                 /* switch from a DBCS charset to JISX201 */
1497                                 cs = (int8_t)JISX201;
1498                             } else {
1499                                 /* stay in the current G0 charset */
1500                                 cs = cs0;
1501                             }
1502                         }
1503                     }
1504                     break;
1505                 case JISX201:
1506                     /* G0 SBCS */
1507                     MBCS_SINGLE_FROM_UCHAR32(
1508                         converterData->myConverterArray[cs],
1509                         sourceChar, &targetValue,
1510                         useFallback);
1511                     if(targetValue <= 0x7f) {
1512                         len = 1;
1513                     }
1514                     break;
1515                 case ISO8859_7:
1516                     /* G0 SBCS forced to 7-bit output */
1517                     MBCS_SINGLE_FROM_UCHAR32(
1518                         converterData->myConverterArray[cs],
1519                         sourceChar, &targetValue,
1520                         useFallback);
1521                     if(0x80 <= targetValue && targetValue <= 0xff) {
1522                         targetValue -= 0x80;
1523                         len = 1;
1524                         g = 2;
1525                     }
1526                     break;
1527                 default:
1528                     /* G0 DBCS */
1529                     MBCS_FROM_UCHAR32_ISO2022(
1530                         converterData->myConverterArray[cs],
1531                         sourceChar, &targetValue,
1532                         useFallback, &len, MBCS_OUTPUT_2);
                         /* only a two-byte result is accepted here; any other length counts as "not mapped" */
1533                     if(len != 2) {
1534                         len = 0;
1535                     }
1536                     break;
1537                 }
1538             }
1539
1540             if(len > 0) {
1541                 outLen = 0; /* count output bytes */
1542
1543                 /* write SI if necessary (only for JIS7) */
1544                 if(pFromU2022State->g == 1 && g == 0) {
1545                     buffer[outLen++] = UCNV_SI;
1546                     pFromU2022State->g = 0;
1547                 }
1548
1549                 /* write the designation sequence if necessary */
1550                 if(cs != pFromU2022State->cs[g]) {
1551                     int32_t escLen = escSeqCharsLen[cs];
1552                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1553                     outLen += escLen;
1554                     pFromU2022State->cs[g] = cs;
1555
1556                     /* invalidate the choices[] */
1557                     choiceCount = 0;
1558                 }
1559
1560                 /* write the shift sequence if necessary */
1561                 if(g != pFromU2022State->g) {
1562                     switch(g) {
1563                     /* case 0 handled before writing escapes */
1564                     case 1:
1565                         buffer[outLen++] = UCNV_SO;
1566                         pFromU2022State->g = 1;
1567                         break;
1568                     default: /* case 2 */
                             /* ESC N; the stored g is left unchanged, so this is written again for the next G2 character */
1569                         buffer[outLen++] = 0x1b;
1570                         buffer[outLen++] = 0x4e;
1571                         break;
1572                     /* no case 3: no SS3 in ISO-2022-JP-x */
1573                     }
1574                 }
1575
1576                 /* write the output bytes */
1577                 if(len == 1) {
1578                     buffer[outLen++] = (char)targetValue;
1579                 } else /* len == 2 */ {
1580                     buffer[outLen++] = (char)(targetValue >> 8);
1581                     buffer[outLen++] = (char)targetValue;
1582                 }
1583             } else {
1584                 /*
1585                  * if we cannot find the character after checking all codepages
1586                  * then this is an error
1587                  */
1588                 *err = U_INVALID_CHAR_FOUND;
1589                 args->converter->fromUChar32=sourceChar;
1590                 break;
1591             }
1592
1593             if(sourceChar == CR || sourceChar == LF) {
1594                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1595                 pFromU2022State->cs[2] = 0;
1596                 choiceCount = 0;
1597             }
1598
1599             /* output outLen>0 bytes in buffer[] */
1600             if(outLen == 1) {
                     /* target < targetLimit was checked at the top of this iteration, so one byte fits */
1601                 *target++ = buffer[0];
1602                 if(offsets) {
1603                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1604                 }
1605             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1606                 *target++ = buffer[0];
1607                 *target++ = buffer[1];
1608                 if(offsets) {
1609                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1610                     *offsets++ = sourceIndex;
1611                     *offsets++ = sourceIndex;
1612                 }
1613             } else {
                     /*
                      * NOTE(review): fromUWriteUInt8() is defined outside this section;
                      * presumably it writes what fits into the target, keeps the rest in
                      * the converter's overflow buffer and sets *err - confirm.
                      */
1614                 fromUWriteUInt8(
1615                     args->converter,
1616                     buffer, outLen,
1617                     &target, (const char *)targetLimit,
1618                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1619                     err);
1620                 if(U_FAILURE(*err)) {
1621                     break;
1622                 }
1623             }
1624         } /* end if(target < targetLimit) */
1625         else{
1626             *err =U_BUFFER_OVERFLOW_ERROR;
1627             break;
1628         }
1629
1630     }/* end while(source < sourceLimit) */
1631
1632     /*
1633      * the end of the input stream and detection of truncated input
1634      * are handled by the framework, but for ISO-2022-JP conversion
1635      * we need to be in ASCII mode at the very end
1636      *
1637      * conditions:
1638      *   successful
1639      *   in SO mode or not in ASCII mode
1640      *   end of input and no truncated input
1641      */
1642     if( U_SUCCESS(*err) &&
1643         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1644         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
1645     ) {
1646         int32_t sourceIndex;
1647
1648         outLen = 0;
1649
1650         if(pFromU2022State->g != 0) {
1651             buffer[outLen++] = UCNV_SI;
1652             pFromU2022State->g = 0;
1653         }
1654
1655         if(pFromU2022State->cs[0] != ASCII) {
1656             int32_t escLen = escSeqCharsLen[ASCII];
1657             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1658             outLen += escLen;
1659             pFromU2022State->cs[0] = (int8_t)ASCII;
1660         }
1661
1662         /* get the source index of the last input character */
1663         /*
1664          * TODO this would be simpler and more reliable if we used a pair
1665          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1666          * so that we could simply use the prevSourceIndex here;
1667          * this code gives an incorrect result for the rare case of an unmatched
1668          * trail surrogate that is alone in the last buffer of the text stream
1669          */
1670         sourceIndex=(int32_t)(source-args->source);
1671         if(sourceIndex>0) {
1672             --sourceIndex;
1673             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1674                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1675             ) {
1676                 --sourceIndex;
1677             }
1678         } else {
                 /* nothing was consumed in this call: there is no source index to report */
1679             sourceIndex=-1;
1680         }
1681
1682         fromUWriteUInt8(
1683             args->converter,
1684             buffer, outLen,
1685             &target, (const char *)targetLimit,
1686             &offsets, sourceIndex,
1687             err);
1688     }
1689
1690     /*save the state and return */
1691     args->source = source;
1692     args->target = (char*)target;
1693 }
1694
1695 /*************** to unicode *******************/
1696
/*
 * toUnicode implementation for the ISO-2022-JP variants.
 *
 * Reads bytes from args->source..args->sourceLimit and writes UTF-16 to
 * args->target..args->targetLimit; args->source and args->target are updated
 * on return. If args->offsets is non-NULL, the source index of each output
 * UChar is recorded.
 *
 * State kept between calls:
 *  - myData->toU2022State: designated charsets cs[] and the active set g
 *  - myData->key: non-zero while an escape sequence is only partially read
 *  - converter->toUBytes[0]/toULength: lead byte of a double-byte character
 *    whose trail byte was not yet available
 *
 * Byte handling:
 *  - SI/SO switch between G0 and G1 only for version 3 (JIS7); for other
 *    versions they go to the error callback
 *  - ESC sequences are parsed by changeState_2022(); for version 0 an escape
 *    sequence that ends an empty segment is reported with the temporary code
 *    U_PARSE_ERROR
 *  - CR/LF reset G0 to ASCII (unless it is ASCII or JISX201 already), clear G2
 *    and select G0, then are converted like any other byte
 *  - every other byte is converted according to the active charset; bytes that
 *    do not map go to toUnicodeCallback() and end the loop
 *
 * U_BUFFER_OVERFLOW_ERROR is set when input remains but the target is full.
 *
 * @param args  conversion arguments (source, target, offsets, converter)
 * @param err   in/out error code
 */
1697 static void
1698 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
1699                                                UErrorCode* err){
1700     char tempBuf[3];
1701     const char *mySource = (char *) args->source;
1702     UChar *myTarget = args->target;
1703     const char *mySourceLimit = args->sourceLimit;
1704     uint32_t targetUniChar = 0x0000;
1705     uint32_t mySourceChar = 0x0000;
1706     UConverterDataISO2022* myData;
1707     ISO2022State *pToU2022State;
1708     StateEnum cs;
1709
1710     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
1711     pToU2022State = &myData->toU2022State;
1712
1713     if(myData->key != 0) {
1714         /* continue with a partial escape sequence */
1715         goto escape;
1716     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1717         /* continue with a partial double-byte character */
1718         mySourceChar = args->converter->toUBytes[0];
1719         args->converter->toULength = 0;
1720         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1721         targetUniChar = missingCharMarker;
1722         goto getTrailByte;
1723     }
1724
1725     while(mySource < mySourceLimit){
1726
1727         targetUniChar =missingCharMarker;
1728
1729         if(myTarget < args->targetLimit){
1730
1731             mySourceChar= (unsigned char) *mySource++;
1732
1733             switch(mySourceChar) {
1734             case UCNV_SI:
1735                 if(myData->version==3) {
1736                     pToU2022State->g=0;
1737                     continue;
1738                 } else {
1739                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1740                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
1741                     break;
1742                 }
1743
1744             case UCNV_SO:
1745                 if(myData->version==3) {
1746                     /* JIS7: switch to G1 half-width Katakana */
1747                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
1748                     pToU2022State->g=1;
1749                     continue;
1750                 } else {
1751                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1752                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
1753                     break;
1754                 }
1755
1756             case ESC_2022:
                     /* step back so that changeState_2022() sees the ESC byte itself */
1757                 mySource--;
1758 escape:
1759                 {
1760                     const char * mySourceBefore = mySource;
1761                     int8_t toULengthBefore = args->converter->toULength;
1762
1763                     changeState_2022(args->converter,&(mySource),
1764                         mySourceLimit, ISO_2022_JP,err);
1765
1766                     /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
1767                     if ( myData->version == 0 && myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
1768                         *err = U_PARSE_ERROR;   /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
1769                         args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
1770                     }
1771
1772                 }
1773                 /* invalid or illegal escape sequence */
1774                 if(U_FAILURE(*err)){
1775                     args->target = myTarget;
1776                     args->source = mySource;
1777                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
1778                     return;
1779                 }
1780                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
1781                 if (myData->key == 0) {
1782                     myData->isEmptySegment = TRUE;
1783                 }
1784                 continue;
1785
1786             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1787
1788             case CR:
1789                 /*falls through*/
1790             case LF:
1791                 /* automatically reset to single-byte mode */
1792                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
1793                     pToU2022State->cs[0] = (int8_t)ASCII;
1794                 }
1795                 pToU2022State->cs[2] = 0;
1796                 pToU2022State->g = 0;
1797                 /* falls through */
1798             default:
1799                 /* convert one or two bytes */
1800                 myData->isEmptySegment = FALSE;
1801                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                     /* the uint8_t cast makes this a single range check for bytes 0xa1..0xdf */
1802                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
1803                     !IS_JP_DBCS(cs)
1804                 ) {
1805                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1806                     targetUniChar = mySourceChar + (0xff61 - 0xa1);
1807
1808                     /* return from a single-shift state to the previous one */
1809                     if(pToU2022State->g >= 2) {
1810                         pToU2022State->g=pToU2022State->prevG;
1811                     }
1812                 } else switch(cs) {
1813                 case ASCII:
1814                     if(mySourceChar <= 0x7f) {
1815                         targetUniChar = mySourceChar;
1816                     }
1817                     break;
1818                 case ISO8859_1:
1819                     if(mySourceChar <= 0x7f) {
1820                         targetUniChar = mySourceChar + 0x80;
1821                     }
1822                     /* return from a single-shift state to the previous one */
1823                     pToU2022State->g=pToU2022State->prevG;
1824                     break;
1825                 case ISO8859_7:
1826                     if(mySourceChar <= 0x7f) {
1827                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
1828                         targetUniChar =
1829                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1830                                 myData->myConverterArray[cs],
1831                                 mySourceChar + 0x80);
1832                     }
1833                     /* return from a single-shift state to the previous one */
1834                     pToU2022State->g=pToU2022State->prevG;
1835                     break;
1836                 case JISX201:
1837                     if(mySourceChar <= 0x7f) {
1838                         targetUniChar =
1839                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1840                                 myData->myConverterArray[cs],
1841                                 mySourceChar);
1842                     }
1843                     break;
1844                 case HWKANA_7BIT:
1845                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
1846                         /* 7-bit halfwidth Katakana */
1847                         targetUniChar = mySourceChar + (0xff61 - 0x21);
1848                     }
1849                     break;
1850                 default:
1851                     /* G0 DBCS */
1852                     if(mySource < mySourceLimit) {
1853                         int leadIsOk, trailIsOk;
1854                         uint8_t trailByte;
                             /* also entered from the top of the function with a lead byte saved by the previous call */
1855 getTrailByte:
1856                         trailByte = (uint8_t)*mySource;
1857                         /* old
1858                         tempBuf[0] = (char) (mySourceChar);
1859                         tempBuf[1] = trailByte = *mySource++;
1860                         mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
1861                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
1862                         */
1863                         /*
1864                          * Ticket 5691: consistent illegal sequences:
1865                          * - We include at least the first byte in the illegal sequence.
1866                          * - If any of the non-initial bytes could be the start of a character,
1867                          *   we stop the illegal sequence before the first one of those.
1868                          *
1869                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
1870                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
1871                          * Otherwise we convert or report the pair of bytes.
1872                          */
1873                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
1874                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
1875                         if (leadIsOk && trailIsOk) {
1876                             ++mySource;
1877                             tempBuf[0] = (char) (mySourceChar);
1878                             tempBuf[1] = trailByte;
1879                             mySourceChar = (mySourceChar << 8) | trailByte;
1880                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
1881                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
1882                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
1883                             ++mySource;
1884                             /* add another bit so that the code below writes 2 bytes in case of error */
1885                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
1886                         }
1887                     } else {
                             /* the trail byte is not available yet: save the lead byte for the next call */
1888                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
1889                         args->converter->toULength = 1;
1890                         goto endloop;
1891                     }
1892                 }
1893                 break;
1894             }
                 /*
                  * Three outcomes: a BMP code unit (below 0xfffe), a supplementary code
                  * point (above missingCharMarker), or no mapping (error callback).
                  * The offsets subtract 1 or 2 depending on whether mySourceChar holds
                  * one or two input bytes.
                  */
1895             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
1896                 if(args->offsets){
1897                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1898                 }
1899                 *(myTarget++)=(UChar)targetUniChar;
1900             }
1901             else if(targetUniChar > missingCharMarker){
1902                 /* disassemble the surrogate pair and write to output*/
1903                 targetUniChar-=0x0010000;
1904                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
1905                 if(args->offsets){
1906                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1907                 }
1908                 ++myTarget;
1909                 if(myTarget< args->targetLimit){
1910                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1911                     if(args->offsets){
1912                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1913                     }
1914                     ++myTarget;
1915                 }else{
                         /* no room for the trail surrogate: keep it in the converter's UChar error buffer */
1916                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
1917                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1918                 }
1919
1920             }
1921             else{
1922                 /* Call the callback function*/
1923                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
1924                 break;
1925             }
1926         }
1927         else{
1928             *err =U_BUFFER_OVERFLOW_ERROR;
1929             break;
1930         }
1931     }
1932 endloop:
1933     args->target = myTarget;
1934     args->source = mySource;
1935 }
1936
1937
1938 /***************************************************************
1939 *   Rules for ISO-2022-KR encoding
1940 *   i) The KSC5601 designator sequence should appear only once in a file,
1941 *      at the beginning of a line before any KSC5601 characters. This usually
1942 *      means that it appears by itself on the first line of the file
1943 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
1944 *      and SI to shift into single byte mode
1945 */
1946 static void
1947 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
1948
1949     UConverter* saveConv = args->converter;
1950     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
1951     args->converter=myConverterData->currentConverter;
1952
1953     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
1954     ucnv_MBCSFromUnicodeWithOffsets(args,err);
1955     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
1956
1957     if(*err == U_BUFFER_OVERFLOW_ERROR) {
1958         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
1959             uprv_memcpy(
1960                 saveConv->charErrorBuffer,
1961                 myConverterData->currentConverter->charErrorBuffer,
1962                 myConverterData->currentConverter->charErrorBufferLength);
1963         }
1964         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
1965         myConverterData->currentConverter->charErrorBufferLength = 0;
1966     }
1967     args->converter=saveConv;
1968 }
1969
1970 static void
1971 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
1972
1973     const UChar *source = args->source;
1974     const UChar *sourceLimit = args->sourceLimit;
1975     unsigned char *target = (unsigned char *) args->target;
1976     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
1977     int32_t* offsets = args->offsets;
1978     uint32_t targetByteUnit = 0x0000;
1979     UChar32 sourceChar = 0x0000;
1980     UBool isTargetByteDBCS;
1981     UBool oldIsTargetByteDBCS;
1982     UConverterDataISO2022 *converterData;
1983     UConverterSharedData* sharedData;
1984     UBool useFallback;
1985     int32_t length =0;
1986
1987     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
1988     /* if the version is 1 then the user is requesting
1989      * conversion with ibm-25546 pass the arguments to
1990      * MBCS converter and return
1991      */
1992     if(converterData->version==1){
1993         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
1994         return;
1995     }
1996
1997     /* initialize data */
1998     sharedData = converterData->currentConverter->sharedData;
1999     useFallback = args->converter->useFallback;
2000     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2001     oldIsTargetByteDBCS = isTargetByteDBCS;
2002
2003     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2004     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2005         goto getTrail;
2006     }
2007     while(source < sourceLimit){
2008
2009         targetByteUnit = missingCharMarker;
2010
2011         if(target < (unsigned char*) args->targetLimit){
2012             sourceChar = *source++;
2013
2014             /* do not convert SO/SI/ESC */
2015             if(IS_2022_CONTROL(sourceChar)) {
2016                 /* callback(illegal) */
2017                 *err=U_ILLEGAL_CHAR_FOUND;
2018                 args->converter->fromUChar32=sourceChar;
2019                 break;
2020             }
2021
2022            /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
2023                 sourceChar,&targetByteUnit,args->converter->useFallback);*/
2024             MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
2025             /* only DBCS or SBCS characters are expected*/
2026             /* DB characters with high bit set to 1 are expected */
2027             if( length > 2 || length==0 ||
2028                 (length == 1 && targetByteUnit > 0x7f) ||
2029                 (length == 2 &&
2030                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2031                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2032             ) {
2033                 targetByteUnit=missingCharMarker;
2034             }
2035             if (targetByteUnit != missingCharMarker){
2036
2037                 oldIsTargetByteDBCS = isTargetByteDBCS;
2038                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2039                   /* append the shift sequence */
2040                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2041
2042                     if (isTargetByteDBCS)
2043                         *target++ = UCNV_SO;
2044                     else
2045                         *target++ = UCNV_SI;
2046                     if(offsets)
2047                         *(offsets++) = (int32_t)(source - args->source-1);
2048                 }
2049                 /* write the targetUniChar  to target */
2050                 if(targetByteUnit <= 0x00FF){
2051                     if( target < targetLimit){
2052                         *(target++) = (unsigned char) targetByteUnit;
2053                         if(offsets){
2054                             *(offsets++) = (int32_t)(source - args->source-1);
2055                         }
2056
2057                     }else{
2058                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2059                         *err = U_BUFFER_OVERFLOW_ERROR;
2060                     }
2061                 }else{
2062                     if(target < targetLimit){
2063                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2064                         if(offsets){
2065                             *(offsets++) = (int32_t)(source - args->source-1);
2066                         }
2067                         if(target < targetLimit){
2068                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2069                             if(offsets){
2070                                 *(offsets++) = (int32_t)(source - args->source-1);
2071                             }
2072                         }else{
2073                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2074                             *err = U_BUFFER_OVERFLOW_ERROR;
2075                         }
2076                     }else{
2077                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2078                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2079                         *err = U_BUFFER_OVERFLOW_ERROR;
2080                     }
2081                 }
2082
2083             }
2084             else{
2085                 /* oops.. the code point is unassingned
2086                  * set the error and reason
2087                  */
2088
2089                 /*check if the char is a First surrogate*/
2090                 if(UTF_IS_SURROGATE(sourceChar)) {
2091                     if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2092 getTrail:
2093                         /*look ahead to find the trail surrogate*/
2094                         if(source <  sourceLimit) {
2095                             /* test the following code unit */
2096                             UChar trail=(UChar) *source;
2097                             if(UTF_IS_SECOND_SURROGATE(trail)) {
2098                                 source++;
2099                                 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2100                                 *err = U_INVALID_CHAR_FOUND;
2101                                 /* convert this surrogate code point */
2102                                 /* exit this condition tree */
2103                             } else {
2104                                 /* this is an unmatched lead code unit (1st surrogate) */
2105                                 /* callback(illegal) */
2106                                 *err=U_ILLEGAL_CHAR_FOUND;
2107                             }
2108                         } else {
2109                             /* no more input */
2110                             *err = U_ZERO_ERROR;
2111                         }
2112                     } else {
2113                         /* this is an unmatched trail code unit (2nd surrogate) */
2114                         /* callback(illegal) */
2115                         *err=U_ILLEGAL_CHAR_FOUND;
2116                     }
2117                 } else {
2118                     /* callback(unassigned) for a BMP code point */
2119                     *err = U_INVALID_CHAR_FOUND;
2120                 }
2121
2122                 args->converter->fromUChar32=sourceChar;
2123                 break;
2124             }
2125         } /* end if(myTargetIndex<myTargetLength) */
2126         else{
2127             *err =U_BUFFER_OVERFLOW_ERROR;
2128             break;
2129         }
2130
2131     }/* end while(mySourceIndex<mySourceLength) */
2132
2133     /*
2134      * the end of the input stream and detection of truncated input
2135      * are handled by the framework, but for ISO-2022-KR conversion
2136      * we need to be in ASCII mode at the very end
2137      *
2138      * conditions:
2139      *   successful
2140      *   not in ASCII mode
2141      *   end of input and no truncated input
2142      */
2143     if( U_SUCCESS(*err) &&
2144         isTargetByteDBCS &&
2145         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2146     ) {
2147         int32_t sourceIndex;
2148
2149         /* we are switching to ASCII */
2150         isTargetByteDBCS=FALSE;
2151
2152         /* get the source index of the last input character */
2153         /*
2154          * TODO this would be simpler and more reliable if we used a pair
2155          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2156          * so that we could simply use the prevSourceIndex here;
2157          * this code gives an incorrect result for the rare case of an unmatched
2158          * trail surrogate that is alone in the last buffer of the text stream
2159          */
2160         sourceIndex=(int32_t)(source-args->source);
2161         if(sourceIndex>0) {
2162             --sourceIndex;
2163             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2164                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2165             ) {
2166                 --sourceIndex;
2167             }
2168         } else {
2169             sourceIndex=-1;
2170         }
2171
2172         fromUWriteUInt8(
2173             args->converter,
2174             SHIFT_IN_STR, 1,
2175             &target, (const char *)targetLimit,
2176             &offsets, sourceIndex,
2177             err);
2178     }
2179
2180     /*save the state and return */
2181     args->source = source;
2182     args->target = (char*)target;
2183     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2184 }
2185
2186 /************************ To Unicode ***************************************/
2187
2188 static void
2189 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2190                                                             UErrorCode* err){
2191     char const* sourceStart;
2192     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2193
2194     UConverterToUnicodeArgs subArgs;
2195     int32_t minArgsSize;
2196
2197     /* set up the subconverter arguments */
2198     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2199         minArgsSize = args->size;
2200     } else {
2201         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2202     }
2203
2204     uprv_memcpy(&subArgs, args, minArgsSize);
2205     subArgs.size = (uint16_t)minArgsSize;
2206     subArgs.converter = myData->currentConverter;
2207
2208     /* remember the original start of the input for offsets */
2209     sourceStart = args->source;
2210
2211     if(myData->key != 0) {
2212         /* continue with a partial escape sequence */
2213         goto escape;
2214     }
2215
2216     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2217         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2218         subArgs.source = args->source;
2219         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2220         if(subArgs.source != subArgs.sourceLimit) {
2221             /*
2222              * get the current partial byte sequence
2223              *
2224              * it needs to be moved between the public and the subconverter
2225              * so that the conversion framework, which only sees the public
2226              * converter, can handle truncated and illegal input etc.
2227              */
2228             if(args->converter->toULength > 0) {
2229                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2230             }
2231             subArgs.converter->toULength = args->converter->toULength;
2232
2233             /*
2234              * Convert up to the end of the input, or to before the next escape character.
2235              * Does not handle conversion extensions because the preToU[] state etc.
2236              * is not copied.
2237              */
2238             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2239
2240             if(args->offsets != NULL && sourceStart != args->source) {
2241                 /* update offsets to base them on the actual start of the input */
2242                 int32_t *offsets = args->offsets;
2243                 UChar *target = args->target;
2244                 int32_t delta = (int32_t)(args->source - sourceStart);
2245                 while(target < subArgs.target) {
2246                     if(*offsets >= 0) {
2247                         *offsets += delta;
2248                     }
2249                     ++offsets;
2250                     ++target;
2251                 }
2252             }
2253             args->source = subArgs.source;
2254             args->target = subArgs.target;
2255             args->offsets = subArgs.offsets;
2256
2257             /* copy input/error/overflow buffers */
2258             if(subArgs.converter->toULength > 0) {
2259                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2260             }
2261             args->converter->toULength = subArgs.converter->toULength;
2262
2263             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2264                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2265                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2266                                 subArgs.converter->UCharErrorBufferLength);
2267                 }
2268                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2269                 subArgs.converter->UCharErrorBufferLength = 0;
2270             }
2271         }
2272
2273         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2274             return;
2275         }
2276
2277 escape:
2278         changeState_2022(args->converter,
2279                &(args->source),
2280                args->sourceLimit,
2281                ISO_2022_KR,
2282                err);
2283     }
2284 }
2285
2286 static void
2287 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2288                                                             UErrorCode* err){
2289     char tempBuf[2];
2290     const char *mySource = ( char *) args->source;
2291     UChar *myTarget = args->target;
2292     const char *mySourceLimit = args->sourceLimit;
2293     UChar32 targetUniChar = 0x0000;
2294     UChar mySourceChar = 0x0000;
2295     UConverterDataISO2022* myData;
2296     UConverterSharedData* sharedData ;
2297     UBool useFallback;
2298
2299     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2300     if(myData->version==1){
2301         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2302         return;
2303     }
2304
2305     /* initialize state */
2306     sharedData = myData->currentConverter->sharedData;
2307     useFallback = args->converter->useFallback;
2308
2309     if(myData->key != 0) {
2310         /* continue with a partial escape sequence */
2311         goto escape;
2312     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2313         /* continue with a partial double-byte character */
2314         mySourceChar = args->converter->toUBytes[0];
2315         args->converter->toULength = 0;
2316         goto getTrailByte;
2317     }
2318
2319     while(mySource< mySourceLimit){
2320
2321         if(myTarget < args->targetLimit){
2322
2323             mySourceChar= (unsigned char) *mySource++;
2324
2325             if(mySourceChar==UCNV_SI){
2326                 myData->toU2022State.g = 0;
2327                 if (myData->isEmptySegment) {
2328                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2329                     *err = U_PARSE_ERROR;       /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
2330                     args->converter->toUBytes[0] = mySourceChar;
2331                     args->converter->toULength = 1;
2332                     args->target = myTarget;
2333                     args->source = mySource;
2334                     return;
2335                 }
2336                 /*consume the source */
2337                 continue;
2338             }else if(mySourceChar==UCNV_SO){
2339                 myData->toU2022State.g = 1;
2340                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2341                 /*consume the source */
2342                 continue;
2343             }else if(mySourceChar==ESC_2022){
2344                 mySource--;
2345 escape:
2346                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2347                 changeState_2022(args->converter,&(mySource),
2348                                 mySourceLimit, ISO_2022_KR, err);
2349                 if(U_FAILURE(*err)){
2350                     args->target = myTarget;
2351                     args->source = mySource;
2352                     return;
2353                 }
2354                 continue;
2355             }
2356
2357             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2358             if(myData->toU2022State.g == 1) {
2359                 if(mySource < mySourceLimit) {
2360                     int leadIsOk, trailIsOk;
2361                     uint8_t trailByte;
2362 getTrailByte:
2363                     /* old
2364                     trailByte = *mySource++;
2365                     tempBuf[0] = (char)(mySourceChar + 0x80);
2366                     tempBuf[1] = (char)(trailByte + 0x80);
2367                     mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2368                     if((mySourceChar & 0x8080) == 0) {
2369                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2370                     */
2371                     targetUniChar = missingCharMarker;
2372                     trailByte = (uint8_t)*mySource;
2373                     /*
2374                      * Ticket 5691: consistent illegal sequences:
2375                      * - We include at least the first byte in the illegal sequence.
2376                      * - If any of the non-initial bytes could be the start of a character,
2377                      *   we stop the illegal sequence before the first one of those.
2378                      *
2379                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2380                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2381                      * Otherwise we convert or report the pair of bytes.
2382                      */
2383                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2384                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2385                     if (leadIsOk && trailIsOk) {
2386                         ++mySource;
2387                         tempBuf[0] = (char)(mySourceChar + 0x80);
2388                         tempBuf[1] = (char)(trailByte + 0x80);
2389                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2390                         mySourceChar = (mySourceChar << 8) | trailByte;
2391                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2392                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2393                         ++mySource;
2394                         /* add another bit so that the code below writes 2 bytes in case of error */
2395                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2396                     }
2397                 } else {
2398                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2399                     args->converter->toULength = 1;
2400                     break;
2401                 }
2402             }
2403             else if(mySourceChar <= 0x7f) {
2404                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2405             } else {
2406                 targetUniChar = 0xffff;
2407             }
2408             if(targetUniChar < 0xfffe){
2409                 if(args->offsets) {
2410                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2411                 }
2412                 *(myTarget++)=(UChar)targetUniChar;
2413             }
2414             else {
2415                 /* Call the callback function*/
2416                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2417                 break;
2418             }
2419         }
2420         else{
2421             *err =U_BUFFER_OVERFLOW_ERROR;
2422             break;
2423         }
2424     }
2425     args->target = myTarget;
2426     args->source = mySource;
2427 }
2428
2429 /*************************** END ISO2022-KR *********************************/
2430
2431 /*************************** ISO-2022-CN *********************************
2432 *
2433 * Rules for ISO-2022-CN Encoding:
2434 * i)   The designator sequence must appear once on a line before any instance
2435 *      of character set it designates.
2436 * ii)  If two lines contain characters from the same character set, both lines
2437 *      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the designated character set.
2440 * iv)  All lines start in ASCII and end in ASCII.
2441 * v)   Four shifting sequences are employed for this purpose:
2442 *
*      Sequence    ASCII Eq    Charsets
2444 *      ----------  -------    ---------
2445 *      SI           <SI>        US-ASCII
2446 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2447 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2448 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2449 *
2450 * vi)
2451 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2452 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2453 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2454 *
2455 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2456 *       characters as defined in GB 2312-80, until
2457 *       another SOdesignation appears
2458 *
2459 *
2460 *      ESC $ ) E       Indicates the bytes following SO are as defined
2461 *       in ISO-IR-165 (for details, see section 2.1),
2462 *       until another SOdesignation appears
2463 *
2464 *      ESC $ ) G       Indicates the bytes following SO are as defined
2465 *       in CNS 11643-plane-1, until another
2466 *       SOdesignation appears
2467 *
2468 *      ESC $ * H       Indicates the two bytes immediately following
2469 *       SS2 is a Chinese character as defined in CNS
2470 *       11643-plane-2, until another SS2designation
2471 *       appears
*       (Meaning <ESC>N must precede every 2-byte
*        sequence.)
2474 *
2475 *      ESC $ + I       Indicates the immediate two bytes following SS3
2476 *       is a Chinese character as defined in CNS
2477 *       11643-plane-3, until another SS3designation
2478 *       appears
*       (Meaning <ESC>O must precede every 2-byte
*        sequence.)
2481 *
2482 *      ESC $ + J       Indicates the immediate two bytes following SS3
2483 *       is a Chinese character as defined in CNS
2484 *       11643-plane-4, until another SS3designation
2485 *       appears
*       (In English: <ESC>O must precede every 2-byte
*        sequence.)
2488 *
2489 *      ESC $ + K       Indicates the immediate two bytes following SS3
2490 *       is a Chinese character as defined in CNS
2491 *       11643-plane-5, until another SS3designation
2492 *       appears
2493 *
2494 *      ESC $ + L       Indicates the immediate two bytes following SS3
2495 *       is a Chinese character as defined in CNS
2496 *       11643-plane-6, until another SS3designation
2497 *       appears
2498 *
2499 *      ESC $ + M       Indicates the immediate two bytes following SS3
2500 *       is a Chinese character as defined in CNS
2501 *       11643-plane-7, until another SS3designation
2502 *       appears
2503 *
2504 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2505 *       has its own designation information before any Chinese characters
2506 *       appear
2507 *
2508 */
2509
/*
 * Four-byte designator escape sequences, each followed by a terminating NUL.
 * They are spelled as numeric byte values in constant arrays so that the data
 * is read-only and does not depend on the compiler's execution character set.
 */
static const char GB_2312_80_STR[]             = { 0x1B, 0x24, 0x29, 0x41, 0x00 };
static const char ISO_IR_165_STR[]             = { 0x1B, 0x24, 0x29, 0x45, 0x00 };
static const char CNS_11643_1992_Plane_1_STR[] = { 0x1B, 0x24, 0x29, 0x47, 0x00 };
static const char CNS_11643_1992_Plane_2_STR[] = { 0x1B, 0x24, 0x2A, 0x48, 0x00 };
static const char CNS_11643_1992_Plane_3_STR[] = { 0x1B, 0x24, 0x2B, 0x49, 0x00 };
static const char CNS_11643_1992_Plane_4_STR[] = { 0x1B, 0x24, 0x2B, 0x4A, 0x00 };
static const char CNS_11643_1992_Plane_5_STR[] = { 0x1B, 0x24, 0x2B, 0x4B, 0x00 };
static const char CNS_11643_1992_Plane_6_STR[] = { 0x1B, 0x24, 0x2B, 0x4C, 0x00 };
static const char CNS_11643_1992_Plane_7_STR[] = { 0x1B, 0x24, 0x2B, 0x4D, 0x00 };
2520
/********************** ISO2022-CN Data **************************/
/*
 * Designation sequences written by the ISO-2022-CN fromUnicode code.
 * That code indexes the table with the charset id directly for ids below
 * CNS_11643, and with CNS_11643 + (cs - CNS_11643_1) for the CNS 11643 planes,
 * so the order of the entries must stay in step with those constants
 * (defined elsewhere in this file -- verify before reordering).
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,           /* ASCII */
        GB_2312_80_STR,         /* bytes 1B 24 29 41 */
        ISO_IR_165_STR,         /* bytes 1B 24 29 45 */
        CNS_11643_1992_Plane_1_STR, /* bytes 1B 24 29 47 */
        CNS_11643_1992_Plane_2_STR, /* bytes 1B 24 2A 48 */
        CNS_11643_1992_Plane_3_STR, /* bytes 1B 24 2B 49 */
        CNS_11643_1992_Plane_4_STR, /* bytes 1B 24 2B 4A */
        CNS_11643_1992_Plane_5_STR, /* bytes 1B 24 2B 4B */
        CNS_11643_1992_Plane_6_STR, /* bytes 1B 24 2B 4C */
        CNS_11643_1992_Plane_7_STR  /* bytes 1B 24 2B 4D */
};
2534
2535 static void
2536 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2537
2538     UConverterDataISO2022 *converterData;
2539     ISO2022State *pFromU2022State;
2540     uint8_t *target = (uint8_t *) args->target;
2541     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2542     const UChar* source = args->source;
2543     const UChar* sourceLimit = args->sourceLimit;
2544     int32_t* offsets = args->offsets;
2545     UChar32 sourceChar;
2546     char buffer[8];
2547     int32_t len;
2548     int8_t choices[3];
2549     int32_t choiceCount;
2550     uint32_t targetValue = 0;
2551     UBool useFallback;
2552
2553     /* set up the state */
2554     converterData     = (UConverterDataISO2022*)args->converter->extraInfo;
2555     pFromU2022State   = &converterData->fromU2022State;
2556     useFallback       = args->converter->useFallback;
2557
2558     choiceCount = 0;
2559
2560     /* check if the last codepoint of previous buffer was a lead surrogate*/
2561     if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
2562         goto getTrail;
2563     }
2564
2565     while( source < sourceLimit){
2566         if(target < targetLimit){
2567
2568             sourceChar  = *(source++);
2569             /*check if the char is a First surrogate*/
2570              if(UTF_IS_SURROGATE(sourceChar)) {
2571                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2572 getTrail:
2573                     /*look ahead to find the trail surrogate*/
2574                     if(source < sourceLimit) {
2575                         /* test the following code unit */
2576                         UChar trail=(UChar) *source;
2577                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2578                             source++;
2579                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2580                             args->converter->fromUChar32=0x00;
2581                             /* convert this supplementary code point */
2582                             /* exit this condition tree */
2583                         } else {
2584                             /* this is an unmatched lead code unit (1st surrogate) */
2585                             /* callback(illegal) */
2586                             *err=U_ILLEGAL_CHAR_FOUND;
2587                             args->converter->fromUChar32=sourceChar;
2588                             break;
2589                         }
2590                     } else {
2591                         /* no more input */
2592                         args->converter->fromUChar32=sourceChar;
2593                         break;
2594                     }
2595                 } else {
2596                     /* this is an unmatched trail code unit (2nd surrogate) */
2597                     /* callback(illegal) */
2598                     *err=U_ILLEGAL_CHAR_FOUND;
2599                     args->converter->fromUChar32=sourceChar;
2600                     break;
2601                 }
2602             }
2603
2604             /* do the conversion */
2605             if(sourceChar <= 0x007f ){
2606                 /* do not convert SO/SI/ESC */
2607                 if(IS_2022_CONTROL(sourceChar)) {
2608                     /* callback(illegal) */
2609                     *err=U_ILLEGAL_CHAR_FOUND;
2610                     args->converter->fromUChar32=sourceChar;
2611                     break;
2612                 }
2613
2614                 /* US-ASCII */
2615                 if(pFromU2022State->g == 0) {
2616                     buffer[0] = (char)sourceChar;
2617                     len = 1;
2618                 } else {
2619                     buffer[0] = UCNV_SI;
2620                     buffer[1] = (char)sourceChar;
2621                     len = 2;
2622                     pFromU2022State->g = 0;
2623                     choiceCount = 0;
2624                 }
2625                 if(sourceChar == CR || sourceChar == LF) {
2626                     /* reset the state at the end of a line */
2627                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2628                     choiceCount = 0;
2629                 }
2630             }
2631             else{
2632                 /* convert U+0080..U+10ffff */
2633                 UConverterSharedData *cnv;
2634                 int32_t i;
2635                 int8_t cs, g;
2636
2637                 if(choiceCount == 0) {
2638                     /* try the current SO/G1 converter first */
2639                     choices[0] = pFromU2022State->cs[1];
2640
2641                     /* default to GB2312_1 if none is designated yet */
2642                     if(choices[0] == 0) {
2643                         choices[0] = GB2312_1;
2644                     }
2645
2646                     if(converterData->version == 0) {
2647                         /* ISO-2022-CN */
2648
2649                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2650                         if(choices[0] == GB2312_1) {
2651                             choices[1] = (int8_t)CNS_11643_1;
2652                         } else {
2653                             choices[1] = (int8_t)GB2312_1;
2654                         }
2655
2656                         choiceCount = 2;
2657                     } else {
2658                         /* ISO-2022-CN-EXT */
2659
2660                         /* try one of the other converters */
2661                         switch(choices[0]) {
2662                         case GB2312_1:
2663                             choices[1] = (int8_t)CNS_11643_1;
2664                             choices[2] = (int8_t)ISO_IR_165;
2665                             break;
2666                         case ISO_IR_165:
2667                             choices[1] = (int8_t)GB2312_1;
2668                             choices[2] = (int8_t)CNS_11643_1;
2669                             break;
2670                         default: /* CNS_11643_x */
2671                             choices[1] = (int8_t)GB2312_1;
2672                             choices[2] = (int8_t)ISO_IR_165;
2673                             break;
2674                         }
2675
2676                         choiceCount = 3;
2677                     }
2678                 }
2679
2680                 cs = g = 0;
2681                 len = 0;
2682
2683                 for(i = 0; i < choiceCount && len == 0; ++i) {
2684                     cs = choices[i];
2685                     if(cs > 0) {
2686                         if(cs > CNS_11643_0) {
2687                             cnv = converterData->myConverterArray[CNS_11643];
2688                             MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
2689                             if(len==3) {
2690                                 cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
2691                                 len = 2;
2692                                 if(cs == CNS_11643_1) {
2693                                     g = 1;
2694                                 } else if(cs == CNS_11643_2) {
2695                                     g = 2;
2696                                 } else /* plane 3..7 */ if(converterData->version == 1) {
2697                                     g = 3;
2698                                 } else {
2699                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2700                                     len = 0;
2701                                 }
2702                             }
2703                         } else {
2704                             /* GB2312_1 or ISO-IR-165 */
2705                             cnv = converterData->myConverterArray[cs];
2706                             MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
2707                             g = 1; /* used if len == 2 */
2708                         }
2709                     }
2710                 }
2711
2712                 if(len > 0) {
2713                     len = 0; /* count output bytes; it must have been len == 2 */
2714
2715                     /* write the designation sequence if necessary */
2716                     if(cs != pFromU2022State->cs[g]) {
2717                         if(cs < CNS_11643) {
2718                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2719                         } else {
2720                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2721                         }
2722                         len = 4;
2723                         pFromU2022State->cs[g] = cs;
2724                         if(g == 1) {
2725                             /* changing the SO/G1 charset invalidates the choices[] */
2726                             choiceCount = 0;
2727                         }
2728                     }
2729
2730                     /* write the shift sequence if necessary */
2731                     if(g != pFromU2022State->g) {
2732                         switch(g) {
2733                         case 1:
2734                             buffer[len++] = UCNV_SO;
2735
2736                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2737                             pFromU2022State->g = 1;
2738                             break;
2739                         case 2:
2740                             buffer[len++] = 0x1b;
2741                             buffer[len++] = 0x4e;
2742                             break;
2743                         default: /* case 3 */
2744                             buffer[len++] = 0x1b;
2745                             buffer[len++] = 0x4f;
2746                             break;
2747                         }
2748                     }
2749
2750                     /* write the two output bytes */
2751                     buffer[len++] = (char)(targetValue >> 8);
2752                     buffer[len++] = (char)targetValue;
2753                 } else {
2754                     /* if we cannot find the character after checking all codepages
2755                      * then this is an error
2756                      */
2757                     *err = U_INVALID_CHAR_FOUND;
2758                     args->converter->fromUChar32=sourceChar;
2759                     break;
2760                 }
2761             }
2762
2763             /* output len>0 bytes in buffer[] */
2764             if(len == 1) {
2765                 *target++ = buffer[0];
2766                 if(offsets) {
2767                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
2768                 }
2769             } else if(len == 2 && (target + 2) <= targetLimit) {
2770                 *target++ = buffer[0];
2771                 *target++ = buffer[1];
2772                 if(offsets) {
2773                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2774                     *offsets++ = sourceIndex;
2775                     *offsets++ = sourceIndex;
2776                 }
2777             } else {
2778                 fromUWriteUInt8(
2779                     args->converter,
2780                     buffer, len,
2781                     &target, (const char *)targetLimit,
2782                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2783                     err);
2784                 if(U_FAILURE(*err)) {
2785                     break;
2786                 }
2787             }
2788         } /* end if(myTargetIndex<myTargetLength) */
2789         else{
2790             *err =U_BUFFER_OVERFLOW_ERROR;
2791             break;
2792         }
2793
2794     }/* end while(mySourceIndex<mySourceLength) */
2795
2796     /*
2797      * the end of the input stream and detection of truncated input
2798      * are handled by the framework, but for ISO-2022-CN conversion
2799      * we need to be in ASCII mode at the very end
2800      *
2801      * conditions:
2802      *   successful
2803      *   not in ASCII mode
2804      *   end of input and no truncated input
2805      */
2806     if( U_SUCCESS(*err) &&
2807         pFromU2022State->g!=0 &&
2808         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2809     ) {
2810         int32_t sourceIndex;
2811
2812         /* we are switching to ASCII */
2813         pFromU2022State->g=0;
2814
2815         /* get the source index of the last input character */
2816         /*
2817          * TODO this would be simpler and more reliable if we used a pair
2818          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2819          * so that we could simply use the prevSourceIndex here;
2820          * this code gives an incorrect result for the rare case of an unmatched
2821          * trail surrogate that is alone in the last buffer of the text stream
2822          */
2823         sourceIndex=(int32_t)(source-args->source);
2824         if(sourceIndex>0) {
2825             --sourceIndex;
2826             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2827                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2828             ) {
2829                 --sourceIndex;
2830             }
2831         } else {
2832             sourceIndex=-1;
2833         }
2834
2835         fromUWriteUInt8(
2836             args->converter,
2837             SHIFT_IN_STR, 1,
2838             &target, (const char *)targetLimit,
2839             &offsets, sourceIndex,
2840             err);
2841     }
2842
2843     /*save the state and return */
2844     args->source = source;
2845     args->target = (char*)target;
2846 }
2847
2848
2849 static void
2850 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2851                                                UErrorCode* err){
         /*
          * Converts ISO-2022-CN bytes from args->source into UChars at args->target.
          * When args->offsets is non-NULL, every UChar written also gets the source
          * index of the byte sequence it was produced from.
          *
          * The function is resumable across calls:
          *   - myData->key != 0 means the previous call stopped inside an escape
          *     sequence; parsing re-enters at the "escape" label.
          *   - converter->toULength == 1 means a DBCS lead byte is pending in
          *     converter->toUBytes[0]; conversion re-enters at "getTrailByte".
          *
          * On every exit path args->source and args->target are updated to the
          * current positions. *err is set to U_BUFFER_OVERFLOW_ERROR when the
          * target is full, to U_PARSE_ERROR (a temporary marker, see below) for an
          * empty SO segment, and is handed to changeState_2022() and
          * toUnicodeCallback() for escape-sequence and mapping errors.
          */
2852     char tempBuf[3]; /* bytes handed to the MBCS sub-converter: 2, or 3 with a leading CNS plane byte */
2853     const char *mySource = (char *) args->source;
2854     UChar *myTarget = args->target;
2855     const char *mySourceLimit = args->sourceLimit;
2856     uint32_t targetUniChar = 0x0000;
2857     uint32_t mySourceChar = 0x0000;
2858     UConverterDataISO2022* myData;
2859     ISO2022State *pToU2022State;
2860
2861     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2862     pToU2022State = &myData->toU2022State;
2863
2864     if(myData->key != 0) {
2865         /* continue with a partial escape sequence */
2866         goto escape;
2867     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2868         /* continue with a partial double-byte character */
             /*
              * Only resume when a trail byte can be read and there is room for
              * output; otherwise toULength stays 1 and the lead byte remains pending.
              */
2869         mySourceChar = args->converter->toUBytes[0];
2870         args->converter->toULength = 0;
2871         targetUniChar = missingCharMarker; /* same initial value the loop sets per character */
2872         goto getTrailByte;
2873     }
2874
2875     while(mySource < mySourceLimit){
2876
2877         targetUniChar =missingCharMarker;
2878
2879         if(myTarget < args->targetLimit){
2880
2881             mySourceChar= (unsigned char) *mySource++;
2882
2883             switch(mySourceChar){
2884             case UCNV_SI:
                     /* shift in: select G0 for the following bytes */
2885                 pToU2022State->g=0;
2886                 if (myData->isEmptySegment) {
2887                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2888                     *err = U_PARSE_ERROR;       /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
                         /* report the SI byte itself as the offending input */
2889                     args->converter->toUBytes[0] = mySourceChar;
2890                     args->converter->toULength = 1;
2891                     args->target = myTarget;
2892                     args->source = mySource;
2893                     return;
2894                 }
2895                 continue;
2896
2897             case UCNV_SO:
                     /* shift out: select G1, but only if a charset has been designated to it */
2898                 if(pToU2022State->cs[1] != 0) {
2899                     pToU2022State->g=1;
2900                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
2901                     continue;
2902                 } else {
2903                     /* illegal to have SO before a matching designator */
2904                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
                         /* leaves the switch with targetUniChar == missingCharMarker, so the callback branch below runs */
2905                     break;
2906                 }
2907
2908             case ESC_2022:
                     /* step back onto the ESC byte; presumably so changeState_2022() parses the sequence from its first byte */
2909                 mySource--;
2910 escape:
2911                 {
                         /* remember where parsing started so the consumed byte count can be computed below */
2912                     const char * mySourceBefore = mySource;
2913                     int8_t toULengthBefore = args->converter->toULength;
2914
2915                     changeState_2022(args->converter,&(mySource),
2916                         mySourceLimit, ISO_2022_CN,err);
2917
2918                     /* After SO there must be at least one character before a designator (designator error handled separately) */
2919                     if ( myData->key == 0 && U_SUCCESS(*err) && myData->isEmptySegment ) {
2920                         *err = U_PARSE_ERROR;   /* temporary err to flag empty segment, will be reset to U_ILLEGAL_ESCAPE_SEQUENCE in _toUnicodeWithCallback */
                             /* previous length plus the bytes consumed by changeState_2022() in this call */
2921                         args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
2922                     }
2923                 }
2924
2925                 /* invalid or illegal escape sequence */
2926                 if(U_FAILURE(*err)){
2927                     args->target = myTarget;
2928                     args->source = mySource;
2929                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2930                     return;
2931                 }
2932                 continue;
2933
2934             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2935
2936             case CR:
2937                 /*falls through*/
2938             case LF:
                     /* a line end zeroes the whole to-Unicode ISO2022State; the CR/LF byte itself is then converted below */
2939                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
2940                 /* falls through */
2941             default:
2942                 /* convert one or two bytes */
2943                 myData->isEmptySegment = FALSE;
2944                 if(pToU2022State->g != 0) {
                         /* not in G0: the byte just read is a lead byte and a trail byte is required */
2945                     if(mySource < mySourceLimit) {
2946                         UConverterSharedData *cnv;
2947                         StateEnum tempState;
2948                         int32_t tempBufLen;
2949                         int leadIsOk, trailIsOk;
2950                         uint8_t trailByte;
2951 getTrailByte:
                             /* the block comment that follows is retired code kept for reference; the live code starts after it */
2952                         /* old
2953                         trailByte = *mySource++;
2954                         tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2955                         if(tempState > CNS_11643_0) {
2956                             cnv = myData->myConverterArray[CNS_11643];
2957                             tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2958                             tempBuf[1] = (char) (mySourceChar);
2959                             tempBuf[2] = trailByte;
2960                             tempBufLen = 3;
2961
2962                         }else{
2963                             cnv = myData->myConverterArray[tempState];
2964                             tempBuf[0] = (char) (mySourceChar);
2965                             tempBuf[1] = trailByte;
2966                             tempBufLen = 2;
2967                         */
                             /* peek at the trail byte; it is consumed only in the branches below that do ++mySource */
2968                         trailByte = (uint8_t)*mySource;
2969                         /*
2970                          * Ticket 5691: consistent illegal sequences:
2971                          * - We include at least the first byte in the illegal sequence.
2972                          * - If any of the non-initial bytes could be the start of a character,
2973                          *   we stop the illegal sequence before the first one of those.
2974                          *
2975                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2976                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2977                          * Otherwise we convert or report the pair of bytes.
2978                          */
                             /* range test 0x21..0x7e done as one unsigned comparison after subtracting 0x21 */
2979                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2980                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2981                         if (leadIsOk && trailIsOk) {
2982                             ++mySource;
2983                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2984                             if(tempState >= CNS_11643_0) {
                                     /* CNS 11643: one shared converter, addressed with a leading plane byte 0x80+plane */
2985                                 cnv = myData->myConverterArray[CNS_11643];
2986                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2987                                 tempBuf[1] = (char) (mySourceChar);
2988                                 tempBuf[2] = (char) trailByte;
2989                                 tempBufLen = 3;
2990
2991                             }else{
                                     /* any other charset: its own converter, plain two-byte lookup */
2992                                 cnv = myData->myConverterArray[tempState];
2993                                 tempBuf[0] = (char) (mySourceChar);
2994                                 tempBuf[1] = (char) trailByte;
2995                                 tempBufLen = 2;
2996                             }
2997                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                                 /* combine both bytes; the value is now > 0xff, which the offset computation below uses to subtract 2 */
2998                             mySourceChar = (mySourceChar << 8) | trailByte;
2999                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3000                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3001                             ++mySource;
3002                             /* add another bit so that the code below writes 2 bytes in case of error */
3003                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3004                         }
                             /* otherwise the trail byte is left unread and only the lead byte is reported; targetUniChar is still missingCharMarker */
3005                         if(pToU2022State->g>=2) {
3006                             /* return from a single-shift state to the previous one */
3007                             pToU2022State->g=pToU2022State->prevG;
3008                         }
3009                     } else {
                             /* input ended after the lead byte: park it in toUBytes[] so the next call resumes at getTrailByte */
3010                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3011                         args->converter->toULength = 1;
3012                         goto endloop;
3013                     }
3014                 }
3015                 else{
                         /* G0: bytes up to 0x7f map to themselves; larger values stay unmapped and reach the callback */
3016                     if(mySourceChar <= 0x7f) {
3017                         targetUniChar = (UChar) mySourceChar;
3018                     }
3019                 }
3020                 break;
3021             }
                 /* three outcomes: a single UChar, a supplementary code point (surrogate pair), or an unmapped/illegal value */
3022             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3023                 if(args->offsets){
                         /* offset of the first source byte: one byte back for single-byte input, two for a DBCS pair */
3024                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3025                 }
3026                 *(myTarget++)=(UChar)targetUniChar;
3027             }
3028             else if(targetUniChar > missingCharMarker){
3029                 /* disassemble the surrogate pair and write to output*/
3030                 targetUniChar-=0x0010000;
3031                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3032                 if(args->offsets){
3033                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3034                 }
3035                 ++myTarget;
3036                 if(myTarget< args->targetLimit){
3037                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3038                     if(args->offsets){
3039                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3040                     }
3041                     ++myTarget;
3042                 }else{
                         /* no room for the trail surrogate: queue it in the converter's UChar error buffer */
3043                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3044                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3045                 }
3046
3047             }
3048             else{
3049                 /* Call the callback function*/
                     /* the loop ends here; mySourceChar carries the offending byte(s), with bit 0x10000 marking an illegal pair */
3050                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3051                 break;
3052             }
3053         }
3054         else{
3055             *err =U_BUFFER_OVERFLOW_ERROR;
3056             break;
3057         }
3058     }
3059 endloop:
         /* publish the positions reached so the caller can continue from here */
3060     args->target = myTarget;
3061     args->source = mySource;
3062 }
3063
3064 static void
3065 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
         /*
          * Writes the converter's substitution character for the ISO-2022 variant
          * identified by myConverterData->locale[0] ('j', 'c' or 'k'), preceded by
          * whatever shift or escape bytes are needed so the substitution byte(s)
          * are emitted in a suitable state. The from-Unicode state is updated to
          * match the bytes written.
          *
          * For 'j', 'c' and version 0 of 'k' the bytes are assembled in a local
          * buffer and written with ucnv_cbFromUWriteBytes() using offsetIndex.
          * For other 'k' versions the work is delegated to the sub-converter in
          * myConverterData->currentConverter and the function returns early.
          */
3066     UConverter *cnv = args->converter;
3067     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3068     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3069     char *p, *subchar;
3070     char buffer[8]; /* the longest sequence built below is 5 bytes: SI + 3-byte escape + 1 substitution byte */
3071     int32_t length;
3072
3073     subchar=(char *)cnv->subChars;
3074     length=cnv->subCharLen; /* assume length==1 for most variants */
3075
3076     p = buffer;
3077     switch(myConverterData->locale[0]){
3078     case 'j':
3079         {
3080             int8_t cs;
3081
3082             if(pFromU2022State->g == 1) {
3083                 /* JIS7: switch from G1 to G0 */
3084                 pFromU2022State->g = 0;
3085                 *p++ = UCNV_SI;
3086             }
3087
3088             cs = pFromU2022State->cs[0];
3089             if(cs != ASCII && cs != JISX201) {
3090                 /* not in ASCII or JIS X 0201: switch to ASCII */
3091                 pFromU2022State->cs[0] = (int8_t)ASCII;
                     /* ESC ( B */
3092                 *p++ = '\x1b';
3093                 *p++ = '\x28';
3094                 *p++ = '\x42';
3095             }
3096
                 /* only the first substitution byte is written for this variant */
3097             *p++ = subchar[0];
3098             break;
3099         }
3100     case 'c':
3101         if(pFromU2022State->g != 0) {
3102             /* not in ASCII mode: switch to ASCII */
3103             pFromU2022State->g = 0;
3104             *p++ = UCNV_SI;
3105         }
             /* only the first substitution byte is written for this variant */
3106         *p++ = subchar[0];
3107         break;
3108     case 'k':
3109         if(myConverterData->version == 0) {
                 /* fromUnicodeStatus is used as a flag here: non-zero means DBCS mode, zero means SBCS mode */
3110             if(length == 1) {
3111                 if((UBool)args->converter->fromUnicodeStatus) {
3112                     /* in DBCS mode: switch to SBCS */
3113                     args->converter->fromUnicodeStatus = 0;
3114                     *p++ = UCNV_SI;
3115                 }
3116                 *p++ = subchar[0];
3117             } else /* length == 2*/ {
3118                 if(!(UBool)args->converter->fromUnicodeStatus) {
3119                     /* in SBCS mode: switch to DBCS */
3120                     args->converter->fromUnicodeStatus = 1;
3121                     *p++ = UCNV_SO;
3122                 }
3123                 *p++ = subchar[0];
3124                 *p++ = subchar[1];
3125             }
3126             break;
3127         } else {
3128             /* save the subconverter's substitution string */
3129             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3130             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3131
3132             /* set our substitution string into the subconverter */
3133             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3134             myConverterData->currentConverter->subCharLen = (int8_t)length;
3135
3136             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
                 /* args->converter is pointed at the sub-converter for the duration of the call and restored right after */
3137             args->converter = myConverterData->currentConverter;
3138             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
                 /* NOTE(review): 0 is passed here rather than offsetIndex -- confirm that this is intended */
3139             ucnv_cbFromUWriteSub(args, 0, err);
3140             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3141             args->converter = cnv;
3142
3143             /* restore the subconverter's substitution string */
3144             myConverterData->currentConverter->subChars = currentSubChars;
3145             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3146
                 /* on overflow, move the sub-converter's pending output bytes into this converter's error buffer */
3147             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3148                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3149                     uprv_memcpy(
3150                         cnv->charErrorBuffer,
3151                         myConverterData->currentConverter->charErrorBuffer,
3152                         myConverterData->currentConverter->charErrorBufferLength);
3153                 }
3154                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3155                 myConverterData->currentConverter->charErrorBufferLength = 0;
3156             }
                 /* everything was written by the sub-converter; skip the ucnv_cbFromUWriteBytes() call at the end */
3157             return;
3158         }
3159     default:
3160         /* not expected */
             /* p still equals buffer, so zero bytes are passed to the write call below */
3161         break;
3162     }
3163     ucnv_cbFromUWriteBytes(args,
3164                            buffer, (int32_t)(p - buffer),
3165                            offsetIndex, err);
3166 }
3167
3168 /*
3169  * Structure for cloning an ISO 2022 converter into a single memory block.
3170  * ucnv_safeClone() of the converter will align the entire cloneStruct,
3171  * and then ucnv_safeClone() of the sub-converter may additionally align
3172  * currentConverter inside the cloneStruct, for which we need the deadSpace
3173  * after currentConverter.
3174  * This is because UAlignedMemory may be larger than the actually
3175  * necessary alignment size for the platform.
3176  * The other cloneStruct fields will not be moved around,
3177  * and are aligned properly with cloneStruct's alignment.
3178  */
3179 struct cloneStruct
3180 {
3181     UConverter cnv;               /* the cloned ISO 2022 converter; its address is what _ISO_2022_SafeClone() returns */
3182     UConverter currentConverter;  /* storage passed to ucnv_safeClone() for the clone of the sub-converter */
3183     UAlignedMemory deadSpace;     /* slack after currentConverter in case the sub-converter clone gets re-aligned */
3184     UConverterDataISO2022 mydata; /* copy of the ISO 2022 extra data; cnv.extraInfo is pointed at this member */
3185 };
3186
3187
3188 static UConverter *
3189 _ISO_2022_SafeClone(
3190             const UConverter *cnv,
3191             void *stackBuffer,
3192             int32_t *pBufferSize,
3193             UErrorCode *status)
3194 {
         /*
          * Completes a clone of an ISO 2022 converter inside stackBuffer, which is
          * laid out as a struct cloneStruct.
          *
          * cnv          the converter being cloned (source of the extra data)
          * stackBuffer  memory for the clone; interpreted as struct cloneStruct
          * pBufferSize  in: 0 requests preflighting; out (preflight only): the
          *              required size, sizeof(struct cloneStruct)
          * status       receives any failure from cloning the sub-converter
          *
          * Returns NULL for a preflight request or when cloning the sub-converter
          * fails; otherwise a pointer to the UConverter at the start of the clone.
          *
          * NOTE(review): a non-zero *pBufferSize is not compared against
          * sizeof(struct cloneStruct) here; presumably the caller has already
          * ensured the buffer is large enough -- confirm in ucnv_safeClone().
          */
3195     struct cloneStruct * localClone;
3196     UConverterDataISO2022 *cnvData;
3197     int32_t i, size;
3198
3199     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3200         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3201         return NULL;
3202     }
3203
3204     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3205     localClone = (struct cloneStruct *)stackBuffer;
3206
3207     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3208
         /* shallow copy of the extra data, then re-point the clone at its own copy */
3209     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3210     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3211     localClone->cnv.isExtraLocal = TRUE;
3212
3213     /* share the subconverters */
3214
3215     if(cnvData->currentConverter != NULL) {
             /* the sub-converter is cloned into the space reserved inside the clone struct */
3216         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3217         localClone->mydata.currentConverter =
3218             ucnv_safeClone(cnvData->currentConverter,
3219                             &localClone->currentConverter,
3220                             &size, status);
3221         if(U_FAILURE(*status)) {
3222             return NULL;
3223         }
3224     }
3225
         /* the array entries were copied by the memcpy above; each non-NULL one gains a reference */
3226     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3227         if(cnvData->myConverterArray[i] != NULL) {
3228             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3229         }
3230     }
3231
3232     return &localClone->cnv;
3233 }
3234
3235 static void
3236 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3237                     const USetAdder *sa,
3238                     UConverterUnicodeSet which,
3239                     UErrorCode *pErrorCode)
3240 {
         /*
          * Adds to the caller's set (through the sa callbacks) the code points
          * that this ISO 2022 converter can convert.
          *
          * cnv         the ISO 2022 converter being queried
          * sa          adder callbacks plus the target set (sa->set)
          * which       set selector, forwarded to the sub-converter queries
          * pErrorCode  in/out error code; nothing is done if it already
          *             indicates a failure
          *
          * The set is built in three steps: ranges handled algorithmically for
          * the variant, then the sets of all sub-converters in
          * myConverterArray[], and finally removal of SO, SI and ESC.
          */
3241     int32_t i;
3242     UConverterDataISO2022* cnvData;
3243
3244     if (U_FAILURE(*pErrorCode)) {
3245         return;
3246     }
3247 #ifdef U_ENABLE_GENERIC_ISO_2022
3248     if (cnv->sharedData == &_ISO2022Data) {
3249         /* We use UTF-8 in this case */
             /* all code points except the surrogate range d800..dfff */
3250         sa->addRange(sa->set, 0, 0xd7FF);
3251         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3252         return;
3253     }
3254 #endif
3255
3256     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3257
3258     /* add to the caller's set the code points that are algorithmically round-tripped */
3259     switch(cnvData->locale[0]){
3260     case 'j':
3261         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3262             /* include Latin-1 for some variants of JP */
3263             sa->addRange(sa->set, 0, 0xff);
3264         } else {
3265             /* include ASCII for JP */
3266             sa->addRange(sa->set, 0, 0x7f);
3267         }
3268         if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
3269             /* include half-width Katakana for JP */
3270             sa->addRange(sa->set, 0xff61, 0xff9f);
3271         }
3272         break;
3273     case 'c':
3274     case 'z':
3275         /* include ASCII for CN */
3276         sa->addRange(sa->set, 0, 0x7f);
3277         break;
3278     case 'k':
3279         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3280         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3281                 cnvData->currentConverter, sa, which, pErrorCode);
3282         /* the loop over myConverterArray[] will simply not find another converter */
3283         break;
3284     default:
3285         break;
3286     }
3287
3288     /*
3289      * Version-specific for CN:
3290      * CN version 0 does not map CNS planes 3..7 although
3291      * they are all available in the CNS conversion table;
3292      * CN version 1 does map them all.
3293      * The two versions create different Unicode sets.
3294      */
3295     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3296         if(cnvData->myConverterArray[i]!=NULL) {
3297             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3298                 cnvData->version==0 && i==CNS_11643
3299             ) {
3300                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
                     /*
                      * 0x81 and 0x82 are the plane bytes for planes 1 and 2.
                      * NOTE(review): this call passes UCNV_ROUNDTRIP_SET rather than
                      * the caller's "which" -- confirm that this is intended.
                      */
3301                 ucnv_MBCSGetUnicodeSetForBytes(
3302                         cnvData->myConverterArray[i],
3303                         sa, UCNV_ROUNDTRIP_SET,
3304                         0, 0x81, 0x82,
3305                         pErrorCode);
3306             } else {
3307                 ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
3308             }
3309         }
3310     }
3311
3312     /*
3313      * ISO 2022 converters must not convert SO/SI/ESC despite what
3314      * sub-converters do by themselves.
3315      * Remove these characters from the set.
3316      */
3317     sa->remove(sa->set, 0x0e);
3318     sa->remove(sa->set, 0x0f);
3319     sa->remove(sa->set, 0x1b);
3320 }
3321
3322 static const UConverterImpl _ISO2022Impl={
         /*
          * Function table for the generic ISO_2022 converter.
          * The four conversion entries are only filled in when
          * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
          * NOTE(review): the entries are positional and UConverterImpl is declared
          * elsewhere; the slot labels below are inferred from the names of the
          * functions installed here -- confirm against the struct declaration.
          */
3323     UCNV_ISO_2022,
3324
3325     NULL,
3326     NULL,
3327
3328     _ISO2022Open,
3329     _ISO2022Close,
3330     _ISO2022Reset,
3331
3332 #ifdef U_ENABLE_GENERIC_ISO_2022
         /* to-Unicode (same function in both slots), then from-Unicode via UTF-8 without and with offsets */
3333     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3334     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3335     ucnv_fromUnicode_UTF8,
3336     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3337 #else
3338     NULL,
3339     NULL,
3340     NULL,
3341     NULL,
3342 #endif
3343     NULL,
3344
3345     NULL,
3346     _ISO2022getName,
3347     _ISO_2022_WriteSub,
3348     _ISO_2022_SafeClone,
3349     _ISO_2022_GetUnicodeSet
3350 };
3351 static const UConverterStaticData _ISO2022StaticData={
         /*
          * Static description of the generic ISO_2022 converter.
          * NOTE(review): the initializer is positional and UConverterStaticData is
          * declared elsewhere; unlabelled values must be read against that
          * declaration.
          */
3352     sizeof(UConverterStaticData),
3353     "ISO_2022", /* converter name */
3354     2022,
3355     UCNV_IBM,
3356     UCNV_ISO_2022,
3357     1,
3358     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3359     { 0x1a, 0, 0, 0 }, /* presumably the default substitution byte(s) -- confirm */
3360     1, /* presumably the number of substitution bytes in use -- confirm */
3361     FALSE,
3362     FALSE,
3363     0,
3364     0,
3365     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3366 };
3367 const UConverterSharedData _ISO2022Data={
         /*
          * Shared-data object for the generic ISO_2022 converter: ties together
          * _ISO2022StaticData and _ISO2022Impl. Unlike the JP/KR/CN shared-data
          * objects in this file it is not declared static.
          * _ISO_2022_GetUnicodeSet() compares cnv->sharedData against its address
          * when U_ENABLE_GENERIC_ISO_2022 is defined.
          * NOTE(review): the remaining positional values must be read against the
          * UConverterSharedData declaration, which is not visible here.
          */
3368     sizeof(UConverterSharedData),
3369     ~((uint32_t) 0),
3370     NULL,
3371     NULL,
3372     &_ISO2022StaticData,
3373     FALSE,
3374     &_ISO2022Impl,
3375     0
3376 };
3377
3378 /*************JP****************/
3379 static const UConverterImpl _ISO2022JPImpl={
         /*
          * Function table for the ISO-2022-JP converter.
          * Open/close/reset, getName, WriteSub, SafeClone and GetUnicodeSet are
          * the same functions used by the other ISO 2022 tables in this file;
          * only the conversion functions are JP-specific.
          * NOTE(review): entries are positional; confirm slot meanings against the
          * UConverterImpl declaration, which is not visible here.
          */
3380     UCNV_ISO_2022,
3381
3382     NULL,
3383     NULL,
3384
3385     _ISO2022Open,
3386     _ISO2022Close,
3387     _ISO2022Reset,
3388
         /* one offsets-aware function per direction, installed in two consecutive slots each */
3389     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3390     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3391     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3392     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3393     NULL,
3394
3395     NULL,
3396     _ISO2022getName,
3397     _ISO_2022_WriteSub,
3398     _ISO_2022_SafeClone,
3399     _ISO_2022_GetUnicodeSet
3400 };
3401 static const UConverterStaticData _ISO2022JPStaticData={
         /*
          * Static description of the ISO-2022-JP converter.
          * NOTE(review): the initializer is positional and UConverterStaticData is
          * declared elsewhere; unlabelled values must be read against that
          * declaration.
          */
3402     sizeof(UConverterStaticData),
3403     "ISO_2022_JP", /* converter name */
3404     0,
3405     UCNV_IBM,
3406     UCNV_ISO_2022,
3407     1,
3408     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3409     { 0x1a, 0, 0, 0 }, /* presumably the default substitution byte(s) -- confirm */
3410     1, /* presumably the number of substitution bytes in use -- confirm */
3411     FALSE,
3412     FALSE,
3413     0,
3414     0,
3415     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3416 };
3417 static const UConverterSharedData _ISO2022JPData={
         /*
          * Shared-data object for ISO-2022-JP: ties together _ISO2022JPStaticData
          * and _ISO2022JPImpl. File-local (static).
          * NOTE(review): the remaining positional values must be read against the
          * UConverterSharedData declaration, which is not visible here.
          */
3418     sizeof(UConverterSharedData),
3419     ~((uint32_t) 0),
3420     NULL,
3421     NULL,
3422     &_ISO2022JPStaticData,
3423     FALSE,
3424     &_ISO2022JPImpl,
3425     0
3426 };
3427
3428 /************* KR ***************/
3429 static const UConverterImpl _ISO2022KRImpl={
         /*
          * Function table for the ISO-2022-KR converter.
          * Open/close/reset, getName, WriteSub, SafeClone and GetUnicodeSet are
          * the same functions used by the other ISO 2022 tables in this file;
          * only the conversion functions are KR-specific.
          * NOTE(review): entries are positional; confirm slot meanings against the
          * UConverterImpl declaration, which is not visible here.
          */
3430     UCNV_ISO_2022,
3431
3432     NULL,
3433     NULL,
3434
3435     _ISO2022Open,
3436     _ISO2022Close,
3437     _ISO2022Reset,
3438
         /* one offsets-aware function per direction, installed in two consecutive slots each */
3439     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3440     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3441     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3442     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3443     NULL,
3444
3445     NULL,
3446     _ISO2022getName,
3447     _ISO_2022_WriteSub,
3448     _ISO_2022_SafeClone,
3449     _ISO_2022_GetUnicodeSet
3450 };
3451 static const UConverterStaticData _ISO2022KRStaticData={
         /*
          * Static description of the ISO-2022-KR converter.
          * NOTE(review): the initializer is positional and UConverterStaticData is
          * declared elsewhere; unlabelled values must be read against that
          * declaration.
          */
3452     sizeof(UConverterStaticData),
3453     "ISO_2022_KR", /* converter name */
3454     0,
3455     UCNV_IBM,
3456     UCNV_ISO_2022,
3457     1,
3458     3, /* max 3 bytes per UChar: SO+DBCS */
3459     { 0x1a, 0, 0, 0 }, /* presumably the default substitution byte(s) -- confirm */
3460     1, /* presumably the number of substitution bytes in use -- confirm */
3461     FALSE,
3462     FALSE,
3463     0,
3464     0,
3465     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3466 };
3467 static const UConverterSharedData _ISO2022KRData={
         /*
          * Shared-data object for ISO-2022-KR: ties together _ISO2022KRStaticData
          * and _ISO2022KRImpl. File-local (static).
          * NOTE(review): the remaining positional values must be read against the
          * UConverterSharedData declaration, which is not visible here.
          */
3468     sizeof(UConverterSharedData),
3469     ~((uint32_t) 0),
3470     NULL,
3471     NULL,
3472     &_ISO2022KRStaticData,
3473     FALSE,
3474     &_ISO2022KRImpl,
3475     0
3476 };
3477
3478 /*************** CN ***************/
3479 static const UConverterImpl _ISO2022CNImpl={
         /*
          * Function table for the ISO-2022-CN converter.
          * Open/close/reset, getName, WriteSub, SafeClone and GetUnicodeSet are
          * the same functions used by the other ISO 2022 tables in this file;
          * only the conversion functions are CN-specific.
          * NOTE(review): entries are positional; confirm slot meanings against the
          * UConverterImpl declaration, which is not visible here.
          */
3480
3481     UCNV_ISO_2022,
3482
3483     NULL,
3484     NULL,
3485
3486     _ISO2022Open,
3487     _ISO2022Close,
3488     _ISO2022Reset,
3489
         /* one offsets-aware function per direction, installed in two consecutive slots each */
3490     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3491     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3492     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3493     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3494     NULL,
3495
3496     NULL,
3497     _ISO2022getName,
3498     _ISO_2022_WriteSub,
3499     _ISO_2022_SafeClone,
3500     _ISO_2022_GetUnicodeSet
3501 };
3502 static const UConverterStaticData _ISO2022CNStaticData={
         /*
          * Static description of the ISO-2022-CN converter.
          * NOTE(review): the initializer is positional and UConverterStaticData is
          * declared elsewhere; unlabelled values must be read against that
          * declaration.
          */
3503     sizeof(UConverterStaticData),
3504     "ISO_2022_CN", /* converter name */
3505     0,
3506     UCNV_IBM,
3507     UCNV_ISO_2022,
3508     1,
3509     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3510     { 0x1a, 0, 0, 0 }, /* presumably the default substitution byte(s) -- confirm */
3511     1, /* presumably the number of substitution bytes in use -- confirm */
3512     FALSE,
3513     FALSE,
3514     0,
3515     0,
3516     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3517 };
3518 static const UConverterSharedData _ISO2022CNData={
         /*
          * Shared-data object for ISO-2022-CN: ties together _ISO2022CNStaticData
          * and _ISO2022CNImpl. File-local (static).
          * NOTE(review): the remaining positional values must be read against the
          * UConverterSharedData declaration, which is not visible here.
          */
3519     sizeof(UConverterSharedData),
3520     ~((uint32_t) 0),
3521     NULL,
3522     NULL,
3523     &_ISO2022CNStaticData,
3524     FALSE,
3525     &_ISO2022CNImpl,
3526     0
3527 };
3528
3529
3530
3531 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */