icuSources/common/ucnv2022.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2000-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  ucnv2022.c
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2000feb03
  12 *   created by: Markus W. Scherer
  13 *
  14 *   Change history:
  15 *
  16 *   06/29/2000  helena  Major rewrite of the callback APIs.
  17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
  18 *                       Changed implementation of toUnicode
  19 *                       function
  20 *   08/21/2000  Ram     Added support for ISO-2022-KR
  21 *   08/29/2000  Ram     Seperated implementation of EBCDIC to
  22 *                       ucnvebdc.c
  23 *   09/20/2000  Ram     Added support for ISO-2022-CN
  24 *                       Added implementations for getNextUChar()
  25 *                       for specific 2022 country variants.
  26 *   10/31/2000  Ram     Implemented offsets logic functions
  27 */
  28
  29 #include "unicode/utypes.h"
  30
  31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uset.h"
  35 #include "unicode/ucnv_err.h"
  36 #include "unicode/ucnv_cb.h"
  37 #include "ucnv_imp.h"
  38 #include "ucnv_bld.h"
  39 #include "ucnv_cnv.h"
  40 #include "ucnvmbcs.h"
  41 #include "cstring.h"
  42 #include "cmemory.h"
  43
  44 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  45
  46 #ifdef U_ENABLE_GENERIC_ISO_2022
  47 /*
  48  * I am disabling the generic ISO-2022 converter after proposing to do so on
  49  * the icu mailing list two days ago.
  50  *
  51  * Reasons:
  52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  53  *    its designation sequences, single shifts with return to the previous state,
  54  *    switch-with-no-return to UTF-16BE or similar, etc.
  55  *    This is unlike the language-specific variants like ISO-2022-JP which
  56  *    require a much smaller repertoire of ISO-2022 features.
  57  *    These variants continue to be supported.
  58  * 2. I believe that no one is really using the generic ISO-2022 converter
  59  *    but rather always one of the language-specific variants.
  60  *    Note that ICU's generic ISO-2022 converter has always output one escape
  61  *    sequence followed by UTF-8 for the whole stream.
  62  * 3. Switching between subcharsets is extremely slow, because each time
  63  *    the previous converter is closed and a new one opened,
  64  *    without any kind of caching, least-recently-used list, etc.
  65  * 4. The code is currently buggy, and given the above it does not seem
  66  *    reasonable to spend the time on maintenance.
  67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  68  *    This means, for example, that when ISO-8859-7 is designated, the following
  69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  70  *    The ICU ISO-2022 converter does not handle this - and has no information
  71  *    about which subconverter would have to be shifted vs. which is designed
  72  *    for 7-bit ISO-2022.
  73  *
  74  * Markus Scherer 2003-dec-03
  75  */
  76 #endif
  77
/* One-byte strings holding the shift-in (0x0F) and shift-out (0x0E) control bytes. */
static const char SHIFT_IN_STR[]  = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the ASCII control/whitespace characters named below. */
#define CR      0x0D
#define LF      0x0A
#define H_TAB   0x09
#define V_TAB   0x0B
#define SPACE   0x20
  86
  87 /* for ISO-2022-JP and -CN implementations */
  88 typedef enum  {
  89         /* shared values */
  90         INVALID_STATE=-1,
  91         ASCII = 0,
  92
  93         SS2_STATE=0x10,
  94         SS3_STATE,
  95
  96         /* JP */
  97         ISO8859_1 = 1 ,
  98         ISO8859_7 = 2 ,
  99         JISX201  = 3,
 100         JISX208 = 4,
 101         JISX212 = 5,
 102         GB2312  =6,
 103         KSC5601 =7,
 104         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
 105
 106         /* CN */
 107         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
 108         GB2312_1=1,
 109         ISO_IR_165=2,
 110         CNS_11643=3,
 111
 112         /*
 113          * these are used in StateEnum and ISO2022State variables,
 114          * but CNS_11643 must be used to index into myConverterArray[]
 115          */
 116         CNS_11643_0=0x20,
 117         CNS_11643_1,
 118         CNS_11643_2,
 119         CNS_11643_3,
 120         CNS_11643_4,
 121         CNS_11643_5,
 122         CNS_11643_6,
 123         CNS_11643_7
 124 } StateEnum;
 125
 126 /* is the StateEnum charset value for a DBCS charset? */
 127 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
 128
 129 #define CSM(cs) ((uint16_t)1<<(cs))
 130
 131 /*
 132  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 133  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 134  *
 135  * Note: The converter uses some leniency:
 136  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 137  *   all versions, not just JIS7 and JIS8.
 138  * - ICU does not distinguish between different versions of JIS X 0208.
 139  */
 140 static const uint16_t jpCharsetMasks[5]={
 141     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
 142     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
 143     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 144     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
 145     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
 146 };
 147
 148 typedef enum {
 149         ASCII1=0,
 150         LATIN1,
 151         SBCS,
 152         DBCS,
 153         MBCS,
 154         HWKANA
 155 }Cnv2022Type;
 156
 157 typedef struct ISO2022State {
 158     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
 159     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
 160     int8_t prevG;       /* g before single shift (SS2 or SS3) */
 161 } ISO2022State;
 162
 163 #define UCNV_OPTIONS_VERSION_MASK 0xf
 164 #define UCNV_2022_MAX_CONVERTERS 10
 165
 166 typedef struct{
 167     UConverter *currentConverter;
 168 #ifdef U_ENABLE_GENERIC_ISO_2022
 169     UBool isFirstBuffer;
 170 #endif
 171     Cnv2022Type currentType;
 172     ISO2022State toU2022State, fromU2022State;
 173     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
 174     uint32_t key;
 175     uint32_t version;
 176     char locale[3];
 177     char name[30];
 178 }UConverterDataISO2022;
 179
 180 /* Protos */
 181 /* ISO-2022 ----------------------------------------------------------------- */
 182
 183 /*Forward declaration */
 184 U_CFUNC void
 185 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
 186                       UErrorCode * err);
 187 U_CFUNC void
 188 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
 189                                     UErrorCode * err);
 190
 191 #define ESC_2022 0x1B /*ESC*/
 192
 193 typedef enum
 194 {
 195         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
 196         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
 197         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
 198         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
 199 } UCNV_TableStates_2022;
 200
 201 /*
 202 * The way these state transition arrays work is:
 203 * ex : ESC$B is the sequence for JISX208
 204 *      a) First Iteration: char is ESC
 205 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 206 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 207 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 208 *             value of x is stored at escSeqStateTable_Key_2022[0]
 209 *        iii) Save this index as offset
 210 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 211 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 212 *     b) Switch on this state and continue to next char
 213 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 214 *             which is normalize_esq_chars_2022[36] == 4
 215 *         ii) x is currently 1(from above)
 216 *               x<<=5 -- x is now 32
 217 *               x+=normalize_esq_chars_2022[36]
 218 *               now x is 36
 219 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 220 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 221 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 222 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 223 *     c) Switch on this state and continue to next char
 224 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 225 *        ii) x is currently 36 (from above)
 226 *            x<<=5 -- x is now 1152
 227 *            x+=normalize_esq_chars_2022[66]
 228 *            now x is 1161
 229 *       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
 234 */
 235
 236
 237 /*Below are the 3 arrays depicting a state transition table*/
 238 static const int8_t normalize_esq_chars_2022[256] = {
 239 /*       0      1       2       3       4      5       6        7       8       9           */
 240
 241          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 242         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 243         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
 244         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
 245         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
 246         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 247         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
 248         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
 249         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
 250         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 251         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 252         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 253         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 254         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 255         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 256         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 257         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 258         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 259         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 260         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 261         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 262         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 263         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 264         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 265         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
 266         ,0     ,0      ,0      ,0      ,0      ,0
 267 };
 268
 269 #ifdef U_ENABLE_GENERIC_ISO_2022
 270 /*
 271  * When the generic ISO-2022 converter is completely removed, not just disabled
 272  * per #ifdef, then the following state table and the associated tables that are
 273  * dimensioned with MAX_STATES_2022 should be trimmed.
 274  *
 275  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 276  * the associated escape sequences starting with ESC ( B should be removed.
 277  * This includes the ones with key values 1097 and all of the ones above 1000000.
 278  *
 279  * For the latter, the tables can simply be truncated.
 280  * For the former, since the tables must be kept parallel, it is probably best
 281  * to simply duplicate an adjacent table cell, parallel in all tables.
 282  *
 283  * It may make sense to restructure the tables, especially by using small search
 284  * tables for the variants instead of indexing them parallel to the table here.
 285  */
 286 #endif
 287
 288 #define MAX_STATES_2022 74
 289 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
 290 /*   0           1           2           3           4           5           6           7           8           9           */
 291
 292      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
 293     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
 294     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
 295     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
 296     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
 297     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
 298     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
 299     ,35947631   ,35947635   ,35947636   ,35947638
 300 };
 301
 302 #ifdef U_ENABLE_GENERIC_ISO_2022
 303
 304 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 305  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
 306
 307      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
 308     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
 309     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
 310     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
 311     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
 312     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
 313     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
 314     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
 315 };
 316
 317 #endif
 318
 319 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
 320 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
 321      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 322     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 323     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
 324     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 325     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 326     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 327     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 328     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
 329 };
 330
 331
 332 /* Type def for refactoring changeState_2022 code*/
 333 typedef enum{
 334 #ifdef U_ENABLE_GENERIC_ISO_2022
 335     ISO_2022=0,
 336 #endif
 337     ISO_2022_JP=1,
 338     ISO_2022_KR=2,
 339     ISO_2022_CN=3
 340 } Variant2022;
 341
 342 /*********** ISO 2022 Converter Protos ***********/
 343 static void
 344 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
 345
 346 static void
 347  _ISO2022Close(UConverter *converter);
 348
 349 static void
 350 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
 351
 352 static const char*
 353 _ISO2022getName(const UConverter* cnv);
 354
 355 static void
 356 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
 357
 358 static UConverter *
 359 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
 360
 361 #ifdef U_ENABLE_GENERIC_ISO_2022
 362 static void
 363 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
 364 #endif
 365
 366 /*const UConverterSharedData _ISO2022Data;*/
 367 static const UConverterSharedData _ISO2022JPData;
 368 static const UConverterSharedData _ISO2022KRData;
 369 static const UConverterSharedData _ISO2022CNData;
 370
 371 /*************** Converter implementations ******************/
 372
 373 static void
 374 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
 375     if(myConverterData->version == 1) {
 376         UConverter *cnv = myConverterData->currentConverter;
 377
 378         cnv->toUnicodeStatus=0;     /* offset */
 379         cnv->mode=0;                /* state */
 380         cnv->toULength=0;           /* byteIndex */
 381     }
 382 }
 383
 384 static void
 385 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
 386    /* in ISO-2022-KR the designator sequence appears only once
 387     * in a file so we append it only once
 388     */
 389     if( converter->charErrorBufferLength==0){
 390
 391         converter->charErrorBufferLength = 4;
 392         converter->charErrorBuffer[0] = 0x1b;
 393         converter->charErrorBuffer[1] = 0x24;
 394         converter->charErrorBuffer[2] = 0x29;
 395         converter->charErrorBuffer[3] = 0x43;
 396     }
 397     if(myConverterData->version == 1) {
 398         UConverter *cnv = myConverterData->currentConverter;
 399
 400         cnv->fromUChar32=0;
 401         cnv->fromUnicodeStatus=1;   /* prevLength */
 402     }
 403 }
 404
 405 static void
 406 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
 407
 408     char myLocale[6]={' ',' ',' ',' ',' ',' '};
 409
 410     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
 411     if(cnv->extraInfo != NULL) {
 412         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
 413         uint32_t version;
 414
 415         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
 416         myConverterData->currentConverter = NULL;
 417         myConverterData->currentType = ASCII1;
 418         myConverterData->key =0;
 419 #ifdef U_ENABLE_GENERIC_ISO_2022
 420         myConverterData->isFirstBuffer = TRUE;
 421 #endif
 422         cnv->fromUnicodeStatus =FALSE;
 423         if(locale){
 424             uprv_strncpy(myLocale, locale, sizeof(myLocale));
 425         }
 426         myConverterData->version= 0;
 427         version = options & UCNV_OPTIONS_VERSION_MASK;
 428         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
 429             (myLocale[2]=='_' || myLocale[2]=='\0')){
 430             int len=0;
 431             /* open the required converters and cache them */
 432             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
 433                 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
 434             }
 435             myConverterData->myConverterArray[JISX201]      = ucnv_loadSharedData("JISX0201", NULL, errorCode);
 436             myConverterData->myConverterArray[JISX208]      = ucnv_loadSharedData("jisx-208", NULL, errorCode);
 437             if(jpCharsetMasks[version]&CSM(JISX212)) {
 438                 myConverterData->myConverterArray[JISX212]  = ucnv_loadSharedData("jisx-212", NULL, errorCode);
 439             }
 440             if(jpCharsetMasks[version]&CSM(GB2312)) {
 441                 myConverterData->myConverterArray[GB2312]   = ucnv_loadSharedData("ibm-5478", NULL, errorCode);   /* gb_2312_80-1 */
 442             }
 443             if(jpCharsetMasks[version]&CSM(KSC5601)) {
 444                 myConverterData->myConverterArray[KSC5601]  = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
 445             }
 446
 447             /* set the function pointers to appropriate funtions */
 448             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
 449             uprv_strcpy(myConverterData->locale,"ja");
 450
 451             myConverterData->version = version;
 452             uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
 453             len = uprv_strlen(myConverterData->name);
 454             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
 455             myConverterData->name[len+1]='\0';
 456         }
 457         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
 458             (myLocale[2]=='_' || myLocale[2]=='\0')){
 459
 460             if ((options  & UCNV_OPTIONS_VERSION_MASK)==1){
 461                     myConverterData->version = 1;
 462                     myConverterData->currentConverter=
 463                         ucnv_open("icu-internal-25546",errorCode);
 464
 465                     if (U_FAILURE(*errorCode)) {
 466                         _ISO2022Close(cnv);
 467                         return;
 468                     }
 469
 470                     uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
 471                     uprv_memcpy(cnv->subChar, myConverterData->currentConverter->subChar, 4);
 472                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
 473             }else{
 474                     myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
 475
 476                     if (U_FAILURE(*errorCode)) {
 477                         _ISO2022Close(cnv);
 478                         return;
 479                     }
 480
 481                     myConverterData->version = 0;
 482                     uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
 483             }
 484
 485             /* initialize the state variables */
 486             setInitialStateToUnicodeKR(cnv, myConverterData);
 487             setInitialStateFromUnicodeKR(cnv,myConverterData);
 488
 489             /* set the function pointers to appropriate funtions */
 490             cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
 491             uprv_strcpy(myConverterData->locale,"ko");
 492         }
 493         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
 494             (myLocale[2]=='_' || myLocale[2]=='\0')){
 495
 496             /* open the required converters and cache them */
 497             myConverterData->myConverterArray[GB2312_1]         = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
 498             if(version==1) {
 499                 myConverterData->myConverterArray[ISO_IR_165]   = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
 500             }
 501             myConverterData->myConverterArray[CNS_11643]        = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
 502
 503
 504             /* set the function pointers to appropriate funtions */
 505             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
 506             uprv_strcpy(myConverterData->locale,"cn");
 507
 508             if ((options  & UCNV_OPTIONS_VERSION_MASK)==1){
 509                 myConverterData->version = 1;
 510                 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
 511             }else{
 512                 uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
 513                 myConverterData->version = 0;
 514             }
 515         }
 516         else{
 517 #ifdef U_ENABLE_GENERIC_ISO_2022
 518             /* append the UTF-8 escape sequence */
 519             cnv->charErrorBufferLength = 3;
 520             cnv->charErrorBuffer[0] = 0x1b;
 521             cnv->charErrorBuffer[1] = 0x25;
 522             cnv->charErrorBuffer[2] = 0x42;
 523
 524             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
 525             /* initialize the state variables */
 526             uprv_strcpy(myConverterData->name,"ISO_2022");
 527 #else
 528             *errorCode = U_UNSUPPORTED_ERROR;
 529             return;
 530 #endif
 531         }
 532
 533         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
 534
 535         if(U_FAILURE(*errorCode)) {
 536             _ISO2022Close(cnv);
 537         }
 538     } else {
 539         *errorCode = U_MEMORY_ALLOCATION_ERROR;
 540     }
 541 }
 542
 543
 544 static void
 545 _ISO2022Close(UConverter *converter) {
 546     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
 547     UConverterSharedData **array = myData->myConverterArray;
 548     int32_t i;
 549
 550     if (converter->extraInfo != NULL) {
 551         /*close the array of converter pointers and free the memory*/
 552         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
 553             if(array[i]!=NULL) {
 554                 ucnv_unloadSharedDataIfReady(array[i]);
 555             }
 556         }
 557
 558         ucnv_close(myData->currentConverter);
 559
 560         if(!converter->isExtraLocal){
 561             uprv_free (converter->extraInfo);
 562             converter->extraInfo = NULL;
 563         }
 564     }
 565 }
 566
 567 static void
 568 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
 569     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
 570     if(choice<=UCNV_RESET_TO_UNICODE) {
 571         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
 572         myConverterData->key = 0;
 573     }
 574     if(choice!=UCNV_RESET_TO_UNICODE) {
 575         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
 576     }
 577 #ifdef U_ENABLE_GENERIC_ISO_2022
 578     if(myConverterData->locale[0] == 0){
 579         if(choice<=UCNV_RESET_TO_UNICODE) {
 580             myConverterData->isFirstBuffer = TRUE;
 581             myConverterData->key = 0;
 582             if (converter->mode == UCNV_SO){
 583                 ucnv_close (myConverterData->currentConverter);
 584                 myConverterData->currentConverter=NULL;
 585             }
 586             converter->mode = UCNV_SI;
 587         }
 588         if(choice!=UCNV_RESET_TO_UNICODE) {
 589             /* re-append UTF-8 escape sequence */
 590             converter->charErrorBufferLength = 3;
 591             converter->charErrorBuffer[0] = 0x1b;
 592             converter->charErrorBuffer[1] = 0x28;
 593             converter->charErrorBuffer[2] = 0x42;
 594         }
 595     }
 596     else
 597 #endif
 598     {
 599         /* reset the state variables */
 600         if(myConverterData->locale[0] == 'k'){
 601             if(choice<=UCNV_RESET_TO_UNICODE) {
 602                 setInitialStateToUnicodeKR(converter, myConverterData);
 603             }
 604             if(choice!=UCNV_RESET_TO_UNICODE) {
 605                 setInitialStateFromUnicodeKR(converter, myConverterData);
 606             }
 607         }
 608     }
 609 }
 610
 611 static const char*
 612 _ISO2022getName(const UConverter* cnv){
 613     if(cnv->extraInfo){
 614         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
 615         return myData->name;
 616     }
 617     return NULL;
 618 }
 619
 620
 621 /*************** to unicode *******************/
 622 /****************************************************************************
 623  * Recognized escape sequences are
 624  * <ESC>(B  ASCII
 625  * <ESC>.A  ISO-8859-1
 626  * <ESC>.F  ISO-8859-7
 627  * <ESC>(J  JISX-201
 * <ESC>(I  JISX-201 (half-width Katakana, 7-bit)
 629  * <ESC>$B  JISX-208
 630  * <ESC>$@  JISX-208
 631  * <ESC>$(D JISX-212
 632  * <ESC>$A  GB2312
 633  * <ESC>$(C KSC5601
 634  */
/*
 * ISO-2022-JP toUnicode lookup: changeState_2022() indexes this table with the
 * offset reported by getKey_2022() for a complete escape sequence.
 * INVALID_STATE entries are reported there as U_UNSUPPORTED_ESCAPE_SEQUENCE;
 * SS2_STATE marks the single-shift sequence, all other entries name the
 * charset that the sequence designates.
 */
static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 646
 647 /*************** to unicode *******************/
/*
 * ISO-2022-CN toUnicode lookup: changeState_2022() indexes this table with the
 * offset reported by getKey_2022() for a complete escape sequence.
 * INVALID_STATE entries are reported there as U_UNSUPPORTED_ESCAPE_SEQUENCE;
 * SS2_STATE/SS3_STATE mark the single-shift sequences, all other entries
 * name the charset that the sequence designates.
 */
static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
 659
 660
 661 static UCNV_TableStates_2022
 662 getKey_2022(char c,int32_t* key,int32_t* offset){
 663     int32_t togo;
 664     int32_t low = 0;
 665     int32_t hi = MAX_STATES_2022;
 666     int32_t oldmid=0;
 667
 668     togo = normalize_esq_chars_2022[(uint8_t)c];
 669     if(togo == 0) {
 670         /* not a valid character anywhere in an escape sequence */
 671         *key = 0;
 672         *offset = 0;
 673         return INVALID_2022;
 674     }
 675     togo = (*key << 5) + togo;
 676
 677     while (hi != low)  /*binary search*/{
 678
 679         register int32_t mid = (hi+low) >> 1; /*Finds median*/
 680
 681         if (mid == oldmid)
 682             break;
 683
 684         if (escSeqStateTable_Key_2022[mid] > togo){
 685             hi = mid;
 686         }
 687         else if (escSeqStateTable_Key_2022[mid] < togo){
 688             low = mid;
 689         }
 690         else /*we found it*/{
 691             *key = togo;
 692             *offset = mid;
 693             return escSeqStateTable_Value_2022[mid];
 694         }
 695         oldmid = mid;
 696
 697     }
 698
 699     *key = 0;
 700     *offset = 0;
 701     return INVALID_2022;
 702 }
 703
 704 /*runs through a state machine to determine the escape sequence - codepage correspondance
 705  */
 706 static void
 707 changeState_2022(UConverter* _this,
 708                 const char** source,
 709                 const char* sourceLimit,
 710                 Variant2022 var,
 711                 UErrorCode* err){
 712     UCNV_TableStates_2022 value;
 713     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
 714     uint32_t key = myData2022->key;
 715     int32_t offset;
 716     char c;
 717
 718     value = VALID_NON_TERMINAL_2022;
 719     while (*source < sourceLimit) {
 720         c = *(*source)++;
 721         _this->toUBytes[_this->toULength++]=(uint8_t)c;
 722         value = getKey_2022(c,(int32_t *) &key, &offset);
 723
 724         switch (value){
 725
 726         case VALID_NON_TERMINAL_2022 :
 727             /* continue with the loop */
 728             break;
 729
 730         case VALID_TERMINAL_2022:
 731             key = 0;
 732             goto DONE;
 733
 734         case INVALID_2022:
 735             goto DONE;
 736
 737         case VALID_MAYBE_TERMINAL_2022:
 738 #ifdef U_ENABLE_GENERIC_ISO_2022
 739             /* ESC ( B is ambiguous only for ISO_2022 itself */
 740             if(var == ISO_2022) {
 741                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
 742                 _this->toULength = 0;
 743
 744                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
 745
 746                 /* continue with the loop */
 747                 value = VALID_NON_TERMINAL_2022;
 748                 break;
 749             } else
 750 #endif
 751             {
 752                 /* not ISO_2022 itself, finish here */
 753                 value = VALID_TERMINAL_2022;
 754                 key = 0;
 755                 goto DONE;
 756             }
 757         }
 758     }
 759
 760 DONE:
 761     myData2022->key = key;
 762
 763     if (value == VALID_NON_TERMINAL_2022) {
 764         /* indicate that the escape sequence is incomplete: key!=0 */
 765         return;
 766     } else if (value == INVALID_2022 ) {
 767         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 768         return;
 769     } else /* value == VALID_TERMINAL_2022 */ {
 770         switch(var){
 771 #ifdef U_ENABLE_GENERIC_ISO_2022
 772         case ISO_2022:
 773         {
 774             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
 775             if(chosenConverterName == NULL) {
 776                 /* SS2 or SS3 */
 777                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 778                 return;
 779             }
 780
 781             _this->mode = UCNV_SI;
 782             ucnv_close(myData2022->currentConverter);
 783             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
 784             if(U_SUCCESS(*err)) {
 785                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
 786                 _this->mode = UCNV_SO;
 787             }
 788             break;
 789         }
 790 #endif
 791         case ISO_2022_JP:
 792             {
 793                 StateEnum tempState=nextStateToUnicodeJP[offset];
 794                 switch(tempState) {
 795                 case INVALID_STATE:
 796                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 797                     break;
 798                 case SS2_STATE:
 799                     if(myData2022->toU2022State.cs[2]!=0) {
 800                         if(myData2022->toU2022State.g<2) {
 801                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 802                         }
 803                         myData2022->toU2022State.g=2;
 804                     } else {
 805                         /* illegal to have SS2 before a matching designator */
 806                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 807                     }
 808                     break;
 809                 /* case SS3_STATE: not used in ISO-2022-JP-x */
 810                 case ISO8859_1:
 811                 case ISO8859_7:
 812                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 813                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 814                     } else {
 815                         /* G2 charset for SS2 */
 816                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
 817                     }
 818                     break;
 819                 default:
 820                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
 821                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 822                     } else {
 823                         /* G0 charset */
 824                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
 825                     }
 826                     break;
 827                 }
 828             }
 829             break;
 830         case ISO_2022_CN:
 831             {
 832                 StateEnum tempState=nextStateToUnicodeCN[offset];
 833                 switch(tempState) {
 834                 case INVALID_STATE:
 835                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 836                     break;
 837                 case SS2_STATE:
 838                     if(myData2022->toU2022State.cs[2]!=0) {
 839                         if(myData2022->toU2022State.g<2) {
 840                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 841                         }
 842                         myData2022->toU2022State.g=2;
 843                     } else {
 844                         /* illegal to have SS2 before a matching designator */
 845                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 846                     }
 847                     break;
 848                 case SS3_STATE:
 849                     if(myData2022->toU2022State.cs[3]!=0) {
 850                         if(myData2022->toU2022State.g<2) {
 851                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
 852                         }
 853                         myData2022->toU2022State.g=3;
 854                     } else {
 855                         /* illegal to have SS3 before a matching designator */
 856                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 857                     }
 858                     break;
 859                 case ISO_IR_165:
 860                     if(myData2022->version==0) {
 861                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 862                         break;
 863                     }
 864                 case GB2312_1:
 865                 case CNS_11643_1:
 866                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
 867                     break;
 868                 case CNS_11643_2:
 869                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
 870                     break;
 871                 default:
 872                     /* other CNS 11643 planes */
 873                     if(myData2022->version==0) {
 874                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 875                     } else {
 876                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
 877                     }
 878                     break;
 879                 }
 880             }
 881             break;
 882         case ISO_2022_KR:
 883             if(offset==0x30){
 884                 /* nothing to be done, just accept this one escape sequence */
 885             } else {
 886                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
 887             }
 888             break;
 889
 890         default:
 891             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
 892             break;
 893         }
 894     }
 895     if(U_SUCCESS(*err)) {
 896         _this->toULength = 0;
 897     }
 898 }
 899
 900 /*Checks the characters of the buffer against valid 2022 escape sequences
 901 *if the match we return a pointer to the initial start of the sequence otherwise
 902 *we return sourceLimit
 903 */
 904 /*for 2022 looks ahead in the stream
 905  *to determine the longest possible convertible
 906  *data stream
 907  */
 908 static U_INLINE const char*
 909 getEndOfBuffer_2022(const char** source,
 910                    const char* sourceLimit,
 911                    UBool flush){
 912
 913     const char* mySource = *source;
 914
 915 #ifdef U_ENABLE_GENERIC_ISO_2022
 916     if (*source >= sourceLimit)
 917         return sourceLimit;
 918
 919     do{
 920
 921         if (*mySource == ESC_2022){
 922             int8_t i;
 923             int32_t key = 0;
 924             int32_t offset;
 925             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
 926
 927             /* Kludge: I could not
 928             * figure out the reason for validating an escape sequence
 929             * twice - once here and once in changeState_2022().
 930             * is it possible to have an ESC character in a ISO2022
 931             * byte stream which is valid in a code page? Is it legal?
 932             */
 933             for (i=0;
 934             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
 935             i++) {
 936                 value =  getKey_2022(*(mySource+i), &key, &offset);
 937             }
 938             if (value > 0 || *mySource==ESC_2022)
 939                 return mySource;
 940
 941             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
 942                 return sourceLimit;
 943         }
 944     }while (++mySource < sourceLimit);
 945
 946     return sourceLimit;
 947 #else
 948     while(mySource < sourceLimit && *mySource != ESC_2022) {
 949         ++mySource;
 950     }
 951     return mySource;
 952 #endif
 953 }
 954
 955
 956 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
 957  * any future change in _MBCSFromUChar32() function should be reflected in
 958  * this macro
 959  */
 960 static U_INLINE void
 961 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
 962                                          UChar32 c,
 963                                          uint32_t* value,
 964                                          UBool useFallback,
 965                                          int32_t *length,
 966                                          int outputType)
 967 {
 968     const int32_t *cx;
 969     const uint16_t *table;
 970     uint32_t stage2Entry;
 971     uint32_t myValue;
 972     const uint8_t *p;
 973     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
 974     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
 975         table=sharedData->mbcs.fromUnicodeTable;
 976         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
 977         /* get the bytes and the length for the output */
 978         if(outputType==MBCS_OUTPUT_2){
 979             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
 980             if(myValue<=0xff) {
 981                 *length=1;
 982             } else {
 983                 *length=2;
 984             }
 985         } else /* outputType==MBCS_OUTPUT_3 */ {
 986             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
 987             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
 988             if(myValue<=0xff) {
 989                 *length=1;
 990             } else if(myValue<=0xffff) {
 991                 *length=2;
 992             } else {
 993                 *length=3;
 994             }
 995         }
 996         /* is this code point assigned, or do we use fallbacks? */
 997         if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
 998             (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
 999         ) {
1000             /*
1001              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1002              * There is no way with this data structure for fallback output
1003              * to be a zero byte.
1004              */
1005             /* assigned */
1006             *value=myValue;
1007             return;
1008         }
1009     }
1010
1011     cx=sharedData->mbcs.extIndexes;
1012     if(cx!=NULL) {
1013         *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1014         return;
1015     }
1016
1017     /* unassigned */
1018     *length=0;
1019 }
1020
1021 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1022  * any future change in _MBCSSingleFromUChar32() function should be reflected in
1023  * this macro
1024  */
1025 static U_INLINE void
1026 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1027                                        UChar32 c,
1028                                        uint32_t* retval,
1029                                        UBool useFallback)
1030 {
1031     const uint16_t *table;
1032     int32_t value;
1033     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1034     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1035         *retval=(uint16_t)-1;
1036         return;
1037     }
1038     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1039     table=sharedData->mbcs.fromUnicodeTable;
1040     /* get the byte for the output */
1041     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1042     /* is this code point assigned, or do we use fallbacks? */
1043     if(useFallback ? value>=0x800 : value>=0xc00) {
1044         value &=0xff;
1045     } else {
1046         value= -1;
1047     }
1048     *retval=(uint16_t) value;
1049 }
1050
1051 #ifdef U_ENABLE_GENERIC_ISO_2022
1052
1053 /**********************************************************************************
1054 *  ISO-2022 Converter
1055 *
1056 *
1057 */
1058
/*
 * toUnicode implementation for the generic ISO-2022 converter (only compiled
 * when U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * Alternates between two steps until the input is used up, an error occurs,
 * or control has to go back to the caller:
 * 1. while no escape sequence is pending (key == 0), convert the bytes up to
 *    the next ESC (or the end of the buffer) with the currently selected
 *    sub-converter, opening an "ASCII" converter if none is selected yet
 * 2. let changeState_2022() consume the escape sequence
 *
 * args - conversion arguments; source/target are advanced
 * err  - receives errors from the sub-converter or from changeState_2022()
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no charset selected yet: start out with an ASCII sub-converter */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter in args; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte or inside a pending escape sequence: run the state machine */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1162
1163 #endif
1164
1165 /*
1166  * To Unicode Callback helper function
1167  */
1168 static void
1169 toUnicodeCallback(UConverter *cnv,
1170                   const uint32_t sourceChar, const uint32_t targetUniChar,
1171                   UErrorCode* err){
1172     if(sourceChar>0xff){
1173         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1174         cnv->toUBytes[1] = (uint8_t)sourceChar;
1175         cnv->toULength = 2;
1176     }
1177     else{
1178         cnv->toUBytes[0] =(char) sourceChar;
1179         cnv->toULength = 2;
1180     }
1181
1182     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1183         *err = U_INVALID_CHAR_FOUND;
1184     }
1185     else{
1186         *err = U_ILLEGAL_CHAR_FOUND;
1187     }
1188 }
1189
1190 /**************************************ISO-2022-JP*************************************************/
1191
1192 /************************************** IMPORTANT **************************************************
1193 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1194 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1195 * The converter iterates over each Unicode codepoint
1196 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1197 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1198 * would do as far as possible.
1199 *
1200 * If the implementation of these macros or structure of sharedData struct change in the future, make
1201 * sure that ISO-2022 is also changed.
1202 ***************************************************************************************************
1203 */
1204
1205 /***************************************************************************************************
1206 * Rules for ISO-2022-jp encoding
1207 * (i)   Escape sequences must be fully contained within a line they should not
1208 *       span new lines or CRs
1209 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1210 *       JIS-Roman character escape sequence should follow before the line terminates
1211 * (iii) If the first character on the line is represented by two bytes then a two
1212 *       byte character escape sequence should precede it
1213 * (iv)  If no escape sequence is encountered then the characters are ASCII
1214 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1215 *       and invoked with SS2 (ESC N).
1216 * (vi)  If there is any G0 designation in text, there must be a switch to
1217 *       ASCII or to JIS X 0201-Roman before a space character (but not
1218 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1219 *       characters such as tab or CRLF.
* (vii) Supported encodings:
1221 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1222 *
1223 *  source : RFC-1554
1224 *
1225 *          JISX201, JISX208,JISX212 : new .cnv data files created
1226 *          KSC5601 : alias to ibm-949 mapping table
1227 *          GB2312 : alias to ibm-1386 mapping table
1228 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
1230 */
1231
/*
 * preference order of JP charsets
 *
 * The ISO-2022-JP fromUnicode code walks this list in order when it appends
 * the remaining candidate charsets to its choices[] list, taking only those
 * that the converter version's charset mask allows and that have not been
 * added already.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1244
/*
 * Designator escape sequences for the ISO-2022-JP charsets, one per row.
 * NOTE(review): the row order looks like it follows the StateEnum values so
 * that a charset state can be used as the index - confirm against the enum.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Byte length of each escape sequence in escSeqChars[]; entry i belongs to
 * escSeqChars[i], so the two tables must be kept in the same order.
 */
static  const int32_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1268
1269 /*
1270 * The iteration over various code pages works this way:
1271 * i)   Get the currentState from myConverterData->currentState
1272 * ii)  Check if the character is mapped to a valid character in the currentState
1273 *      Yes ->  a) set the initIterState to currentState
1274 *       b) remain in this state until an invalid character is found
1275 *      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check whether it
*      is equal to the initIterState
1278 *      Yes ->  A character that cannot be represented in any of the supported encodings
1279 *       break and return a U_INVALID_CHARACTER error
1280 *      No  ->  Continue and find the character in next code page
1281 *
1282 *
1283 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1284 */
1285
1286 static void
1287 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1288     UConverterDataISO2022 *converterData;
1289     ISO2022State *pFromU2022State;
1290     uint8_t *target = (uint8_t *) args->target;
1291     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1292     const UChar* source = args->source;
1293     const UChar* sourceLimit = args->sourceLimit;
1294     int32_t* offsets = args->offsets;
1295     UChar32 sourceChar;
1296     char buffer[8];
1297     int32_t len, outLen;
1298     int8_t choices[10];
1299     int32_t choiceCount;
1300     uint32_t targetValue;
1301     UBool useFallback;
1302
1303     int32_t i;
1304     int8_t cs, g;
1305
1306     /* set up the state */
1307     converterData     = (UConverterDataISO2022*)args->converter->extraInfo;
1308     pFromU2022State   = &converterData->fromU2022State;
1309     useFallback       = args->converter->useFallback;
1310
1311     choiceCount = 0;
1312
1313     /* check if the last codepoint of previous buffer was a lead surrogate*/
1314     if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
1315         goto getTrail;
1316     }
1317
1318     while(source < sourceLimit) {
1319         if(target < targetLimit) {
1320
1321             sourceChar  = *(source++);
1322             /*check if the char is a First surrogate*/
1323              if(UTF_IS_SURROGATE(sourceChar)) {
1324                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1325 getTrail:
1326                     /*look ahead to find the trail surrogate*/
1327                     if(source < sourceLimit) {
1328                         /* test the following code unit */
1329                         UChar trail=(UChar) *source;
1330                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1331                             source++;
1332                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1333                             args->converter->fromUChar32=0x00;
1334                             /* convert this supplementary code point */
1335                             /* exit this condition tree */
1336                         } else {
1337                             /* this is an unmatched lead code unit (1st surrogate) */
1338                             /* callback(illegal) */
1339                             *err=U_ILLEGAL_CHAR_FOUND;
1340                             args->converter->fromUChar32=sourceChar;
1341                             break;
1342                         }
1343                     } else {
1344                         /* no more input */
1345                         args->converter->fromUChar32=sourceChar;
1346                         break;
1347                     }
1348                 } else {
1349                     /* this is an unmatched trail code unit (2nd surrogate) */
1350                     /* callback(illegal) */
1351                     *err=U_ILLEGAL_CHAR_FOUND;
1352                     args->converter->fromUChar32=sourceChar;
1353                     break;
1354                 }
1355             }
1356
1357             /* do the conversion */
1358
1359             if(choiceCount == 0) {
1360                 uint16_t csm;
1361
1362                 /*
1363                  * The csm variable keeps track of which charsets are allowed
1364                  * and not used yet while building the choices[].
1365                  */
1366                 csm = jpCharsetMasks[converterData->version];
1367                 choiceCount = 0;
1368
1369                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1370                 if(converterData->version == 3 || converterData->version == 4) {
1371                     choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
1372                     csm &= ~CSM(cs);
1373                 }
1374
1375                 /* try the current G0 charset */
1376                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1377                 csm &= ~CSM(cs);
1378
1379                 /* try the current G2 charset */
1380                 if((cs = pFromU2022State->cs[2]) != 0) {
1381                     choices[choiceCount++] = cs;
1382                     csm &= ~CSM(cs);
1383                 }
1384
1385                 /* try all the other possible charsets */
1386                 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1387                     cs = (int8_t)jpCharsetPref[i];
1388                     if(CSM(cs) & csm) {
1389                         choices[choiceCount++] = cs;
1390                         csm &= ~CSM(cs);
1391                     }
1392                 }
1393             }
1394
1395             cs = g = 0;
1396             len = 0;
1397
1398             for(i = 0; i < choiceCount && len == 0; ++i) {
1399                 cs = choices[i];
1400                 switch(cs) {
1401                 case ASCII:
1402                     if(sourceChar <= 0x7f) {
1403                         targetValue = (uint32_t)sourceChar;
1404                         len = 1;
1405                     }
1406                     break;
1407                 case ISO8859_1:
1408                     if(0x80 <= sourceChar && sourceChar <= 0xff) {
1409                         targetValue = (uint32_t)sourceChar - 0x80;
1410                         len = 1;
1411                         g = 2;
1412                     }
1413                     break;
1414                 case HWKANA_7BIT:
1415                     if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
1416                         targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
1417                         len = 1;
1418
1419                         if(converterData->version==3) {
1420                             /* JIS7: use G1 (SO) */
1421                             pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
1422                             g = 1;
1423                         } else if(converterData->version==4) {
1424                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1425                             int8_t cs0;
1426
1427                             targetValue += 0x80;
1428
1429                             cs0 = pFromU2022State->cs[0];
1430                             if(IS_JP_DBCS(cs0)) {
1431                                 /* switch from a DBCS charset to JISX201 */
1432                                 cs = (int8_t)JISX201;
1433                             } else {
1434                                 /* stay in the current G0 charset */
1435                                 cs = cs0;
1436                             }
1437                         }
1438                     }
1439                     break;
1440                 case JISX201:
1441                     /* G0 SBCS */
1442                     MBCS_SINGLE_FROM_UCHAR32(
1443                         converterData->myConverterArray[cs],
1444                         sourceChar, &targetValue,
1445                         useFallback);
1446                     if(targetValue <= 0x7f) {
1447                         len = 1;
1448                     }
1449                     break;
1450                 case ISO8859_7:
1451                     /* G0 SBCS forced to 7-bit output */
1452                     MBCS_SINGLE_FROM_UCHAR32(
1453                         converterData->myConverterArray[cs],
1454                         sourceChar, &targetValue,
1455                         useFallback);
1456                     if(0x80 <= targetValue && targetValue <= 0xff) {
1457                         targetValue -= 0x80;
1458                         len = 1;
1459                         g = 2;
1460                     }
1461                     break;
1462                 default:
1463                     /* G0 DBCS */
1464                     MBCS_FROM_UCHAR32_ISO2022(
1465                         converterData->myConverterArray[cs],
1466                         sourceChar, &targetValue,
1467                         useFallback, &len, MBCS_OUTPUT_2);
1468                     if(len != 2) {
1469                         len = 0;
1470                     }
1471                     break;
1472                 }
1473             }
1474
1475             if(len > 0) {
1476                 outLen = 0; /* count output bytes */
1477
1478                 /* write SI if necessary (only for JIS7) */
1479                 if(pFromU2022State->g == 1 && g == 0) {
1480                     buffer[outLen++] = UCNV_SI;
1481                     pFromU2022State->g = 0;
1482                 }
1483
1484                 /* write the designation sequence if necessary */
1485                 if(cs != pFromU2022State->cs[g]) {
1486                     int32_t escLen = escSeqCharsLen[cs];
1487                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1488                     outLen += escLen;
1489                     pFromU2022State->cs[g] = cs;
1490
1491                     /* invalidate the choices[] */
1492                     choiceCount = 0;
1493                 }
1494
1495                 /* write the shift sequence if necessary */
1496                 if(g != pFromU2022State->g) {
1497                     switch(g) {
1498                     /* case 0 handled before writing escapes */
1499                     case 1:
1500                         buffer[outLen++] = UCNV_SO;
1501                         pFromU2022State->g = 1;
1502                         break;
1503                     default: /* case 2 */
1504                         buffer[outLen++] = 0x1b;
1505                         buffer[outLen++] = 0x4e;
1506                         break;
1507                     /* no case 3: no SS3 in ISO-2022-JP-x */
1508                     }
1509                 }
1510
1511                 /* write the output bytes */
1512                 if(len == 1) {
1513                     buffer[outLen++] = (char)targetValue;
1514                 } else /* len == 2 */ {
1515                     buffer[outLen++] = (char)(targetValue >> 8);
1516                     buffer[outLen++] = (char)targetValue;
1517                 }
1518             } else {
1519                 /*
1520                  * if we cannot find the character after checking all codepages
1521                  * then this is an error
1522                  */
1523                 *err = U_INVALID_CHAR_FOUND;
1524                 args->converter->fromUChar32=sourceChar;
1525                 break;
1526             }
1527
1528             if(sourceChar == CR || sourceChar == LF) {
1529                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1530                 pFromU2022State->cs[2] = 0;
1531                 choiceCount = 0;
1532             }
1533
1534             /* output outLen>0 bytes in buffer[] */
1535             if(outLen == 1) {
1536                 *target++ = buffer[0];
1537                 if(offsets) {
1538                     *offsets++ = source - args->source - 1; /* -1: known to be ASCII */
1539                 }
1540             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1541                 *target++ = buffer[0];
1542                 *target++ = buffer[1];
1543                 if(offsets) {
1544                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1545                     *offsets++ = sourceIndex;
1546                     *offsets++ = sourceIndex;
1547                 }
1548             } else {
1549                 ucnv_fromUWriteBytes(
1550                     args->converter,
1551                     buffer, outLen,
1552                     (char **)&target, (const char *)targetLimit,
1553                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1554                     err);
1555                 if(U_FAILURE(*err)) {
1556                     break;
1557                 }
1558             }
1559         } /* end if(myTargetIndex<myTargetLength) */
1560         else{
1561             *err =U_BUFFER_OVERFLOW_ERROR;
1562             break;
1563         }
1564
1565     }/* end while(mySourceIndex<mySourceLength) */
1566
1567     /*
1568      * the end of the input stream and detection of truncated input
1569      * are handled by the framework, but for ISO-2022-JP conversion
1570      * we need to be in ASCII mode at the very end
1571      *
1572      * conditions:
1573      *   successful
1574      *   in SO mode or not in ASCII mode
1575      *   end of input and no truncated input
1576      */
1577     if( U_SUCCESS(*err) &&
1578         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1579         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
1580     ) {
1581         int32_t sourceIndex;
1582
1583         outLen = 0;
1584
1585         if(pFromU2022State->g != 0) {
1586             buffer[outLen++] = UCNV_SI;
1587             pFromU2022State->g = 0;
1588         }
1589
1590         if(pFromU2022State->cs[0] != ASCII) {
1591             int32_t escLen = escSeqCharsLen[ASCII];
1592             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1593             outLen += escLen;
1594             pFromU2022State->cs[0] = (int8_t)ASCII;
1595         }
1596
1597         /* get the source index of the last input character */
1598         /*
1599          * TODO this would be simpler and more reliable if we used a pair
1600          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1601          * so that we could simply use the prevSourceIndex here;
1602          * this code gives an incorrect result for the rare case of an unmatched
1603          * trail surrogate that is alone in the last buffer of the text stream
1604          */
1605         sourceIndex=(int32_t)(source-args->source);
1606         if(sourceIndex>0) {
1607             --sourceIndex;
1608             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1609                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1610             ) {
1611                 --sourceIndex;
1612             }
1613         } else {
1614             sourceIndex=-1;
1615         }
1616
1617         ucnv_fromUWriteBytes(
1618             args->converter,
1619             buffer, outLen,
1620             (char **)&target, (const char *)targetLimit,
1621             &offsets, sourceIndex,
1622             err);
1623     }
1624
1625     /*save the state and return */
1626     args->source = source;
1627     args->target = (char*)target;
1628 }
1629
1630 /*************** to unicode *******************/
1631
1632 static void
1633 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
1634                                                UErrorCode* err){
1635     char tempBuf[3];
1636     const char *mySource = (char *) args->source;
1637     UChar *myTarget = args->target;
1638     const char *mySourceLimit = args->sourceLimit;
1639     uint32_t targetUniChar = 0x0000;
1640     uint32_t mySourceChar = 0x0000;
1641     UConverterDataISO2022* myData;
1642     ISO2022State *pToU2022State;
1643     StateEnum cs;
1644
1645     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
1646     pToU2022State = &myData->toU2022State;
1647
1648     if(myData->key != 0) {
1649         /* continue with a partial escape sequence */
1650         goto escape;
1651     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1652         /* continue with a partial double-byte character */
1653         mySourceChar = args->converter->toUBytes[0];
1654         args->converter->toULength = 0;
1655         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1656         goto getTrailByte;
1657     }
1658
1659     while(mySource < mySourceLimit){
1660
1661         targetUniChar =missingCharMarker;
1662
1663         if(myTarget < args->targetLimit){
1664
1665             mySourceChar= (unsigned char) *mySource++;
1666
1667             switch(mySourceChar) {
1668             case UCNV_SI:
1669                 if(myData->version==3) {
1670                     pToU2022State->g=0;
1671                     continue;
1672                 } else {
1673                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1674                     break;
1675                 }
1676
1677             case UCNV_SO:
1678                 if(myData->version==3) {
1679                     /* JIS7: switch to G1 half-width Katakana */
1680                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
1681                     pToU2022State->g=1;
1682                     continue;
1683                 } else {
1684                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1685                     break;
1686                 }
1687
1688             case ESC_2022:
1689                 mySource--;
1690 escape:
1691                 changeState_2022(args->converter,&(mySource),
1692                     mySourceLimit, ISO_2022_JP,err);
1693
1694                 /* invalid or illegal escape sequence */
1695                 if(U_FAILURE(*err)){
1696                     args->target = myTarget;
1697                     args->source = mySource;
1698                     return;
1699                 }
1700                 continue;
1701
1702             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1703
1704             case CR:
1705                 /*falls through*/
1706             case LF:
1707                 /* automatically reset to single-byte mode */
1708                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
1709                     pToU2022State->cs[0] = (int8_t)ASCII;
1710                 }
1711                 pToU2022State->cs[2] = 0;
1712                 pToU2022State->g = 0;
1713                 /* falls through */
1714             default:
1715                 /* convert one or two bytes */
1716                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1717                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
1718                     !IS_JP_DBCS(cs)
1719                 ) {
1720                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1721                     targetUniChar = mySourceChar + (0xff61 - 0xa1);
1722
1723                     /* return from a single-shift state to the previous one */
1724                     if(pToU2022State->g >= 2) {
1725                         pToU2022State->g=pToU2022State->prevG;
1726                     }
1727                 } else switch(cs) {
1728                 case ASCII:
1729                     if(mySourceChar <= 0x7f) {
1730                         targetUniChar = mySourceChar;
1731                     }
1732                     break;
1733                 case ISO8859_1:
1734                     if(mySourceChar <= 0x7f) {
1735                         targetUniChar = mySourceChar + 0x80;
1736                     }
1737                     /* return from a single-shift state to the previous one */
1738                     pToU2022State->g=pToU2022State->prevG;
1739                     break;
1740                 case ISO8859_7:
1741                     if(mySourceChar <= 0x7f) {
1742                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
1743                         targetUniChar =
1744                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1745                                 myData->myConverterArray[cs],
1746                                 mySourceChar + 0x80);
1747                     }
1748                     /* return from a single-shift state to the previous one */
1749                     pToU2022State->g=pToU2022State->prevG;
1750                     break;
1751                 case JISX201:
1752                     if(mySourceChar <= 0x7f) {
1753                         targetUniChar =
1754                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1755                                 myData->myConverterArray[cs],
1756                                 mySourceChar);
1757                     }
1758                     break;
1759                 case HWKANA_7BIT:
1760                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
1761                         /* 7-bit halfwidth Katakana */
1762                         targetUniChar = mySourceChar + (0xff61 - 0x21);
1763                     }
1764                     break;
1765                 default:
1766                     /* G0 DBCS */
1767                     if(mySource < mySourceLimit) {
1768                         char trailByte;
1769 getTrailByte:
1770                         tempBuf[0] = (char) (mySourceChar);
1771                         tempBuf[1] = trailByte = *mySource++;
1772                         mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
1773                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
1774                     } else {
1775                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
1776                         args->converter->toULength = 1;
1777                         goto endloop;
1778                     }
1779                 }
1780                 break;
1781             }
1782             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
1783                 if(args->offsets){
1784                     args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
1785                 }
1786                 *(myTarget++)=(UChar)targetUniChar;
1787             }
1788             else if(targetUniChar > missingCharMarker){
1789                 /* disassemble the surrogate pair and write to output*/
1790                 targetUniChar-=0x0010000;
1791                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
1792                 if(args->offsets){
1793                     args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
1794                 }
1795                 ++myTarget;
1796                 if(myTarget< args->targetLimit){
1797                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1798                     if(args->offsets){
1799                         args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
1800                     }
1801                     ++myTarget;
1802                 }else{
1803                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
1804                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1805                 }
1806
1807             }
1808             else{
1809                 /* Call the callback function*/
1810                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
1811                 break;
1812             }
1813         }
1814         else{
1815             *err =U_BUFFER_OVERFLOW_ERROR;
1816             break;
1817         }
1818     }
1819 endloop:
1820     args->target = myTarget;
1821     args->source = mySource;
1822 }
1823
1824
1825 /***************************************************************
1826 *   Rules for ISO-2022-KR encoding
1827 *   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
1829 *      means that it appears by itself on the first line of the file
1830 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
1831 *      and SI to shift into single byte mode
1832 */
1833 static void
1834 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
1835
1836     UConverter* saveConv = args->converter;
1837     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
1838     args->converter=myConverterData->currentConverter;
1839
1840     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
1841     ucnv_MBCSFromUnicodeWithOffsets(args,err);
1842     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
1843
1844     if(*err == U_BUFFER_OVERFLOW_ERROR) {
1845         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
1846             uprv_memcpy(
1847                 saveConv->charErrorBuffer,
1848                 myConverterData->currentConverter->charErrorBuffer,
1849                 myConverterData->currentConverter->charErrorBufferLength);
1850         }
1851         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
1852         myConverterData->currentConverter->charErrorBufferLength = 0;
1853     }
1854     args->converter=saveConv;
1855 }
1856
1857 static void
1858 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
1859
1860     const UChar *source = args->source;
1861     const UChar *sourceLimit = args->sourceLimit;
1862     unsigned char *target = (unsigned char *) args->target;
1863     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
1864     int32_t* offsets = args->offsets;
1865     uint32_t targetByteUnit = 0x0000;
1866     UChar32 sourceChar = 0x0000;
1867     UBool isTargetByteDBCS;
1868     UBool oldIsTargetByteDBCS;
1869     UConverterDataISO2022 *converterData;
1870     UConverterSharedData* sharedData;
1871     UBool useFallback;
1872     int32_t length =0;
1873
1874     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
1875     /* if the version is 1 then the user is requesting
1876      * conversion with ibm-25546 pass the arguments to
1877      * MBCS converter and return
1878      */
1879     if(converterData->version==1){
1880         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
1881         return;
1882     }
1883
1884     /* initialize data */
1885     sharedData = converterData->currentConverter->sharedData;
1886     useFallback = args->converter->useFallback;
1887     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
1888     oldIsTargetByteDBCS = isTargetByteDBCS;
1889
1890     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
1891     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
1892         goto getTrail;
1893     }
1894     while(source < sourceLimit){
1895
1896         targetByteUnit = missingCharMarker;
1897
1898         if(target < (unsigned char*) args->targetLimit){
1899             sourceChar = *source++;
1900            /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
1901                 sourceChar,&targetByteUnit,args->converter->useFallback);*/
1902             MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
1903             /* only DBCS or SBCS characters are expected*/
1904             /* DB characters with high bit set to 1 are expected */
1905             if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
1906                 targetByteUnit=missingCharMarker;
1907             }
1908             if (targetByteUnit != missingCharMarker){
1909
1910                 oldIsTargetByteDBCS = isTargetByteDBCS;
1911                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
1912                   /* append the shift sequence */
1913                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
1914
1915                     if (isTargetByteDBCS)
1916                         *target++ = UCNV_SO;
1917                     else
1918                         *target++ = UCNV_SI;
1919                     if(offsets)
1920                         *(offsets++)=   source - args->source-1;
1921                 }
1922                 /* write the targetUniChar  to target */
1923                 if(targetByteUnit <= 0x00FF){
1924                     if( target < targetLimit){
1925                         *(target++) = (unsigned char) targetByteUnit;
1926                         if(offsets){
1927                             *(offsets++) = source - args->source-1;
1928                         }
1929
1930                     }else{
1931                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
1932                         *err = U_BUFFER_OVERFLOW_ERROR;
1933                     }
1934                 }else{
1935                     if(target < targetLimit){
1936                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
1937                         if(offsets){
1938                             *(offsets++) = source - args->source-1;
1939                         }
1940                         if(target < targetLimit){
1941                             *(target++) =(unsigned char) (targetByteUnit -0x80);
1942                             if(offsets){
1943                                 *(offsets++) = source - args->source-1;
1944                             }
1945                         }else{
1946                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
1947                             *err = U_BUFFER_OVERFLOW_ERROR;
1948                         }
1949                     }else{
1950                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
1951                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
1952                         *err = U_BUFFER_OVERFLOW_ERROR;
1953                     }
1954                 }
1955
1956             }
1957             else{
1958                 /* oops.. the code point is unassingned
1959                  * set the error and reason
1960                  */
1961
1962                 /*check if the char is a First surrogate*/
1963                 if(UTF_IS_SURROGATE(sourceChar)) {
1964                     if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1965 getTrail:
1966                         /*look ahead to find the trail surrogate*/
1967                         if(source <  sourceLimit) {
1968                             /* test the following code unit */
1969                             UChar trail=(UChar) *source;
1970                             if(UTF_IS_SECOND_SURROGATE(trail)) {
1971                                 source++;
1972                                 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1973                                 *err = U_INVALID_CHAR_FOUND;
1974                                 /* convert this surrogate code point */
1975                                 /* exit this condition tree */
1976                             } else {
1977                                 /* this is an unmatched lead code unit (1st surrogate) */
1978                                 /* callback(illegal) */
1979                                 *err=U_ILLEGAL_CHAR_FOUND;
1980                             }
1981                         } else {
1982                             /* no more input */
1983                             *err = U_ZERO_ERROR;
1984                         }
1985                     } else {
1986                         /* this is an unmatched trail code unit (2nd surrogate) */
1987                         /* callback(illegal) */
1988                         *err=U_ILLEGAL_CHAR_FOUND;
1989                     }
1990                 } else {
1991                     /* callback(unassigned) for a BMP code point */
1992                     *err = U_INVALID_CHAR_FOUND;
1993                 }
1994
1995                 args->converter->fromUChar32=sourceChar;
1996                 args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
1997                 break;
1998             }
1999         } /* end if(myTargetIndex<myTargetLength) */
2000         else{
2001             *err =U_BUFFER_OVERFLOW_ERROR;
2002             break;
2003         }
2004
2005     }/* end while(mySourceIndex<mySourceLength) */
2006
2007     /*
2008      * the end of the input stream and detection of truncated input
2009      * are handled by the framework, but for ISO-2022-KR conversion
2010      * we need to be in ASCII mode at the very end
2011      *
2012      * conditions:
2013      *   successful
2014      *   not in ASCII mode
2015      *   end of input and no truncated input
2016      */
2017     if( U_SUCCESS(*err) &&
2018         isTargetByteDBCS &&
2019         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2020     ) {
2021         int32_t sourceIndex;
2022
2023         /* we are switching to ASCII */
2024         isTargetByteDBCS=FALSE;
2025
2026         /* get the source index of the last input character */
2027         /*
2028          * TODO this would be simpler and more reliable if we used a pair
2029          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2030          * so that we could simply use the prevSourceIndex here;
2031          * this code gives an incorrect result for the rare case of an unmatched
2032          * trail surrogate that is alone in the last buffer of the text stream
2033          */
2034         sourceIndex=(int32_t)(source-args->source);
2035         if(sourceIndex>0) {
2036             --sourceIndex;
2037             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2038                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2039             ) {
2040                 --sourceIndex;
2041             }
2042         } else {
2043             sourceIndex=-1;
2044         }
2045
2046         ucnv_fromUWriteBytes(
2047             args->converter,
2048             SHIFT_IN_STR, 1,
2049             (char **)&target, (const char *)targetLimit,
2050             &offsets, sourceIndex,
2051             err);
2052     }
2053
2054     /*save the state and return */
2055     args->source = source;
2056     args->target = (char*)target;
2057     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2058 }
2059
2060 /************************ To Unicode ***************************************/
2061
/*
 * ISO-2022-KR to Unicode conversion that delegates the actual byte-to-UChar
 * mapping to the subconverter stored in myData->currentConverter.
 * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC calls this function when
 * myData->version==1.
 *
 * The input is processed in runs: the end of each run is determined by
 * getEndOfBuffer_2022(), the run is converted with
 * ucnv_MBCSToUnicodeWithOffsets(), and if input remains after a run without
 * an error, changeState_2022() is called on it.
 *
 * args: in/out conversion arguments; source, target and offsets are advanced.
 * err:  in/out error code; the function returns as soon as it is a failure.
 */
2062 static void
2063 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2064                                                             UErrorCode* err){
2065     char const* sourceStart;
2066     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2067
2068     UConverterToUnicodeArgs subArgs;
2069     int32_t minArgsSize;
2070
2071     /* set up the subconverter arguments */
         /* copy no more bytes than the smaller of args->size and our own struct size */
2072     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2073         minArgsSize = args->size;
2074     } else {
2075         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2076     }
2077
2078     uprv_memcpy(&subArgs, args, minArgsSize);
2079     subArgs.size = (uint16_t)minArgsSize;
2080     subArgs.converter = myData->currentConverter;
2081
2082     /* remember the original start of the input for offsets */
2083     sourceStart = args->source;
2084
2085     if(myData->key != 0) {
2086         /* continue with a partial escape sequence */
2087         goto escape;
2088     }
2089
2090     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2091         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2092         subArgs.source = args->source;
2093         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
             /* skip the subconverter call when the run is empty */
2094         if(subArgs.source != subArgs.sourceLimit) {
2095             /*
2096              * get the current partial byte sequence
2097              *
2098              * it needs to be moved between the public and the subconverter
2099              * so that the conversion framework, which only sees the public
2100              * converter, can handle truncated and illegal input etc.
2101              */
2102             if(args->converter->toULength > 0) {
2103                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2104             }
2105             subArgs.converter->toULength = args->converter->toULength;
2106
2107             /*
2108              * Convert up to the end of the input, or to before the next escape character.
2109              * Does not handle conversion extensions because the preToU[] state etc.
2110              * is not copied.
2111              */
2112             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2113
2114             if(args->offsets != NULL && sourceStart != args->source) {
2115                 /* update offsets to base them on the actual start of the input */
                     /*
                      * walks the offsets for the UChars written by this call
                      * (args->target up to subArgs.target); negative offsets
                      * are left unchanged
                      */
2116                 int32_t *offsets = args->offsets;
2117                 UChar *target = args->target;
2118                 int32_t delta = (int32_t)(args->source - sourceStart);
2119                 while(target < subArgs.target) {
2120                     if(*offsets >= 0) {
2121                         *offsets += delta;
2122                     }
2123                     ++offsets;
2124                     ++target;
2125                 }
2126             }
                 /* publish the subconverter's progress to the caller-visible arguments */
2127             args->source = subArgs.source;
2128             args->target = subArgs.target;
2129             args->offsets = subArgs.offsets;
2130
2131             /* copy input/error/overflow buffers */
2132             if(subArgs.converter->toULength > 0) {
2133                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2134             }
2135             args->converter->toULength = subArgs.converter->toULength;
2136
2137             if(*err == U_BUFFER_OVERFLOW_ERROR) {
                     /* move pending output UChars to the public converter and clear them in the subconverter */
2138                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2139                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2140                                 subArgs.converter->UCharErrorBufferLength);
2141                 }
2142                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2143                 subArgs.converter->UCharErrorBufferLength = 0;
2144             }
2145         }
2146
             /* stop on any failure or once all input has been consumed */
2147         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2148             return;
2149         }
2150
             /* also entered directly from the top when myData->key != 0 */
2151 escape:
2152         changeState_2022(args->converter,
2153                &(args->source),
2154                args->sourceLimit,
2155                ISO_2022_KR,
2156                err);
2157     }
2158 }
2159
/*
 * Converts ISO-2022-KR bytes from args->source to UChars at args->target and,
 * if args->offsets is set, records a source offset for each UChar written.
 *
 * When myData->version==1 the work is handed to
 * UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM instead.
 *
 * State carried across calls:
 *   myData->key != 0                 - a partial escape sequence is pending
 *   args->converter->toULength == 1  - the lead byte of a double-byte
 *                                      character is stored in toUBytes[0]
 *   myData->toU2022State.g           - 0 after SI, 1 after SO
 *
 * On return args->source and args->target are updated. *err is set to
 * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains;
 * changeState_2022() and toUnicodeCallback() receive err as well.
 */
2160 static void
2161 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2162                                                             UErrorCode* err){
2163     char tempBuf[2];
2164     const char *mySource = ( char *) args->source;
2165     UChar *myTarget = args->target;
2166     const char *mySourceLimit = args->sourceLimit;
2167     UChar32 targetUniChar = 0x0000;
2168     UChar mySourceChar = 0x0000;
2169     UConverterDataISO2022* myData;
2170     UConverterSharedData* sharedData ;
2171     UBool useFallback;
2172
2173     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2174     if(myData->version==1){
2175         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2176         return;
2177     }
2178
2179     /* initialize state */
2180     sharedData = myData->currentConverter->sharedData;
2181     useFallback = args->converter->useFallback;
2182
2183     if(myData->key != 0) {
2184         /* continue with a partial escape sequence */
2185         goto escape;
2186     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2187         /* continue with a partial double-byte character */
2188         mySourceChar = args->converter->toUBytes[0];
2189         args->converter->toULength = 0;
2190         goto getTrailByte;
2191     }
2192
2193     while(mySource< mySourceLimit){
2194
2195         if(myTarget < args->targetLimit){
2196
2197             mySourceChar= (unsigned char) *mySource++;
2198
2199             if(mySourceChar==UCNV_SI){
2200                 myData->toU2022State.g = 0;
2201                 /*consume the source */
2202                 continue;
2203             }else if(mySourceChar==UCNV_SO){
2204                 myData->toU2022State.g = 1;
2205                 /*consume the source */
2206                 continue;
2207             }else if(mySourceChar==ESC_2022){
                     /* back up so that changeState_2022() starts at the ESC byte */
2208                 mySource--;
2209 escape:
2210                 changeState_2022(args->converter,&(mySource),
2211                                 mySourceLimit, ISO_2022_KR, err);
2212                 if(U_FAILURE(*err)){
2213                     args->target = myTarget;
2214                     args->source = mySource;
2215                     return;
2216                 }
2217                 continue;
2218             }
2219
2220             if(myData->toU2022State.g == 1) {
                     /* after SO: two input bytes form one character */
2221                 if(mySource < mySourceLimit) {
2222                     char trailByte;
2223 getTrailByte:
2224                     trailByte = *mySource++;
                         /* the lookup uses both bytes with 0x80 added */
2225                     tempBuf[0] = (char)(mySourceChar + 0x80);
2226                     tempBuf[1] = (char)(trailByte + 0x80);
                         /* keep lead and trail together for offsets and the callback */
2227                     mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2228                     if((mySourceChar & 0x8080) == 0) {
2229                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2230                     } else {
2231                         /* illegal bytes > 0x7f */
2232                         targetUniChar = missingCharMarker;
2233                     }
2234                 } else {
                         /* input ends after the lead byte: store it for the next call */
2235                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2236                     args->converter->toULength = 1;
2237                     break;
2238                 }
2239             }
2240             else{
                     /* single-byte lookup of the byte just read */
2241                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2242             }
                 /* results below 0xfffe are written out; anything else goes to the callback */
2243             if(targetUniChar < 0xfffe){
2244                 if(args->offsets) {
                         /* offset of the first input byte of this character: 1 or 2 bytes back */
2245                     args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2246                 }
2247                 *(myTarget++)=(UChar)targetUniChar;
2248             }
2249             else {
2250                 /* Call the callback function*/
2251                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2252                 break;
2253             }
2254         }
2255         else{
                 /* no room in the target while input remains */
2256             *err =U_BUFFER_OVERFLOW_ERROR;
2257             break;
2258         }
2259     }
         /* save the positions reached */
2260     args->target = myTarget;
2261     args->source = mySource;
2262 }
2263
2264 /*************************** END ISO2022-KR *********************************/
2265
2266 /*************************** ISO-2022-CN *********************************
2267 *
2268 * Rules for ISO-2022-CN Encoding:
2269 * i)   The designator sequence must appear once on a line before any instance
2270 *      of character set it designates.
2271 * ii)  If two lines contain characters from the same character set, both lines
2272 *      must include the designator sequence.
2273 * iii) Once the designator sequence is known, a shifting sequence has to be found
2274 *      to invoke the designated character set
2275 * iv)  All lines start in ASCII and end in ASCII.
2276 * v)   Four shifting sequences are employed for this purpose:
2277 *
2278 *      Sequence    ASCII Eq    Charsets
2279 *      ----------  -------    ---------
2280 *      SI           <SI>        US-ASCII
2281 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2282 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2283 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2284 *
2285 * vi)
2286 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2287 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2288 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2289 *
2290 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2291 *       characters as defined in GB 2312-80, until
2292 *       another SOdesignation appears
2293 *
2294 *
2295 *      ESC $ ) E       Indicates the bytes following SO are as defined
2296 *       in ISO-IR-165 (for details, see section 2.1),
2297 *       until another SOdesignation appears
2298 *
2299 *      ESC $ ) G       Indicates the bytes following SO are as defined
2300 *       in CNS 11643-plane-1, until another
2301 *       SOdesignation appears
2302 *
2303 *      ESC $ * H       Indicates the two bytes immediately following
2304 *       SS2 is a Chinese character as defined in CNS
2305 *       11643-plane-2, until another SS2designation
2306 *       appears
2307 *       (Meaning <ESC>N must precede every 2 byte
2308 *        sequence.)
2309 *
2310 *      ESC $ + I       Indicates the immediate two bytes following SS3
2311 *       is a Chinese character as defined in CNS
2312 *       11643-plane-3, until another SS3designation
2313 *       appears
2314 *       (Meaning <ESC>O must precede every 2 byte
2315 *        sequence.)
2316 *
2317 *      ESC $ + J       Indicates the immediate two bytes following SS3
2318 *       is a Chinese character as defined in CNS
2319 *       11643-plane-4, until another SS3designation
2320 *       appears
2321 *       (In English: <ESC>O must precede every 2 byte
2322 *        sequence.)
2323 *
2324 *      ESC $ + K       Indicates the immediate two bytes following SS3
2325 *       is a Chinese character as defined in CNS
2326 *       11643-plane-5, until another SS3designation
2327 *       appears
2328 *
2329 *      ESC $ + L       Indicates the immediate two bytes following SS3
2330 *       is a Chinese character as defined in CNS
2331 *       11643-plane-6, until another SS3designation
2332 *       appears
2333 *
2334 *      ESC $ + M       Indicates the immediate two bytes following SS3
2335 *       is a Chinese character as defined in CNS
2336 *       11643-plane-7, until another SS3designation
2337 *       appears
2338 *
2339 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2340 *       has its own designation information before any Chinese characters
2341 *       appear
2342 *
2343 */
2344
2345 /* The following are defined this way to make the strings truly readonly */
     /*
      * Designator escape sequences, 4 bytes each:
      * ESC (0x1B), '$' (0x24), then ')' (0x29), '*' (0x2A) or '+' (0x2B),
      * then the final character.
      */
2346 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
2347 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
2348 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2349 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2350 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2351 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2352 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2353 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2354 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2355
2356 /********************** ISO2022-CN Data **************************/
     /*
      * Designator lookup table used by
      * UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC:
      * charset ids below CNS_11643 index the table directly, and a CNS plane
      * id cs is looked up at CNS_11643 + (cs - CNS_11643_1).
      */
2357 static const char* const escSeqCharsCN[10] ={
2358         SHIFT_IN_STR,           /* ASCII */
2359         GB_2312_80_STR,
2360         ISO_IR_165_STR,
2361         CNS_11643_1992_Plane_1_STR,
2362         CNS_11643_1992_Plane_2_STR,
2363         CNS_11643_1992_Plane_3_STR,
2364         CNS_11643_1992_Plane_4_STR,
2365         CNS_11643_1992_Plane_5_STR,
2366         CNS_11643_1992_Plane_6_STR,
2367         CNS_11643_1992_Plane_7_STR
2368 };
2369
/*
 * Converts UTF-16 from args->source to ISO-2022-CN (converterData->version==0)
 * or ISO-2022-CN-EXT (any other version) bytes at args->target and, if
 * args->offsets is set, records a source index for each output byte.
 *
 * For every code point above U+007F the candidate charsets in choices[] are
 * tried in order; the first one that maps the code point wins. A designator
 * and/or shift sequence is emitted in front of the two character bytes when
 * the recorded state in converterData->fromU2022State requires it.
 * CR and LF reset that state.
 *
 * State carried across calls:
 *   args->converter->fromUChar32 - a pending lead surrogate, or the code
 *                                  point that caused an error
 *   converterData->fromU2022State - current designations and shift state
 *
 * Errors set in *err by this function:
 *   U_ILLEGAL_CHAR_FOUND    - unmatched lead or trail surrogate
 *   U_INVALID_CHAR_FOUND    - no candidate charset maps the code point
 *   U_BUFFER_OVERFLOW_ERROR - target is full while input remains
 * ucnv_fromUWriteBytes() receives err as well.
 *
 * At the end of the input (args->flush set, nothing pending) an SI is written
 * if the converter is not in ASCII mode.
 */
2370 static void
2371 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2372
2373     UConverterDataISO2022 *converterData;
2374     ISO2022State *pFromU2022State;
2375     uint8_t *target = (uint8_t *) args->target;
2376     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2377     const UChar* source = args->source;
2378     const UChar* sourceLimit = args->sourceLimit;
2379     int32_t* offsets = args->offsets;
2380     UChar32 sourceChar;
2381     char buffer[8]; /* longest output: 4-byte designator + 2-byte shift + 2 character bytes */
2382     int32_t len;
2383     int8_t choices[3]; /* charsets to try, in order; valid while choiceCount != 0 */
2384     int32_t choiceCount;
2385     uint32_t targetValue;
2386     UBool useFallback;
2387
2388     /* set up the state */
2389     converterData     = (UConverterDataISO2022*)args->converter->extraInfo;
2390     pFromU2022State   = &converterData->fromU2022State;
2391     useFallback       = args->converter->useFallback;
2392
2393     choiceCount = 0;
2394
2395     /* check if the last codepoint of previous buffer was a lead surrogate*/
2396     if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
2397         goto getTrail;
2398     }
2399
2400     while( source < sourceLimit){
2401         if(target < targetLimit){
2402
2403             sourceChar  = *(source++);
2404             /*check if the char is a First surrogate*/
2405              if(UTF_IS_SURROGATE(sourceChar)) {
2406                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2407 getTrail:
2408                     /*look ahead to find the trail surrogate*/
2409                     if(source < sourceLimit) {
2410                         /* test the following code unit */
2411                         UChar trail=(UChar) *source;
2412                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2413                             source++;
2414                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2415                             args->converter->fromUChar32=0x00;
2416                             /* convert this supplementary code point */
2417                             /* exit this condition tree */
2418                         } else {
2419                             /* this is an unmatched lead code unit (1st surrogate) */
2420                             /* callback(illegal) */
2421                             *err=U_ILLEGAL_CHAR_FOUND;
2422                             args->converter->fromUChar32=sourceChar;
2423                             break;
2424                         }
2425                     } else {
2426                         /* no more input */
                             /* keep the lead surrogate for the next call; *err is not changed */
2427                         args->converter->fromUChar32=sourceChar;
2428                         break;
2429                     }
2430                 } else {
2431                     /* this is an unmatched trail code unit (2nd surrogate) */
2432                     /* callback(illegal) */
2433                     *err=U_ILLEGAL_CHAR_FOUND;
2434                     args->converter->fromUChar32=sourceChar;
2435                     break;
2436                 }
2437             }
2438
2439             /* do the conversion */
2440             if(sourceChar <= 0x007f ){
2441                 /* US-ASCII */
2442                 if(pFromU2022State->g == 0) {
2443                     buffer[0] = (char)sourceChar;
2444                     len = 1;
2445                 } else {
                         /* not in ASCII mode: write SI first and record the switch */
2446                     buffer[0] = UCNV_SI;
2447                     buffer[1] = (char)sourceChar;
2448                     len = 2;
2449                     pFromU2022State->g = 0;
2450                     choiceCount = 0;
2451                 }
2452                 if(sourceChar == CR || sourceChar == LF) {
2453                     /* reset the state at the end of a line */
2454                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2455                     choiceCount = 0;
2456                 }
2457             }
2458             else{
2459                 /* convert U+0080..U+10ffff */
2460                 UConverterSharedData *cnv;
2461                 int32_t i;
2462                 int8_t cs, g;
2463
                     /* rebuild the candidate list only after it has been invalidated */
2464                 if(choiceCount == 0) {
2465                     /* try the current SO/G1 converter first */
2466                     choices[0] = pFromU2022State->cs[1];
2467
2468                     /* default to GB2312_1 if none is designated yet */
2469                     if(choices[0] == 0) {
2470                         choices[0] = GB2312_1;
2471                     }
2472
2473                     if(converterData->version == 0) {
2474                         /* ISO-2022-CN */
2475
2476                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2477                         if(choices[0] == GB2312_1) {
2478                             choices[1] = (int8_t)CNS_11643_1;
2479                         } else {
2480                             choices[1] = (int8_t)GB2312_1;
2481                         }
2482
2483                         choiceCount = 2;
2484                     } else {
2485                         /* ISO-2022-CN-EXT */
2486
2487                         /* try one of the other converters */
2488                         switch(choices[0]) {
2489                         case GB2312_1:
2490                             choices[1] = (int8_t)CNS_11643_1;
2491                             choices[2] = (int8_t)ISO_IR_165;
2492                             break;
2493                         case ISO_IR_165:
2494                             choices[1] = (int8_t)GB2312_1;
2495                             choices[2] = (int8_t)CNS_11643_1;
2496                             break;
2497                         default: /* CNS_11643_x */
2498                             choices[1] = (int8_t)GB2312_1;
2499                             choices[2] = (int8_t)ISO_IR_165;
2500                             break;
2501                         }
2502
2503                         choiceCount = 3;
2504                     }
2505                 }
2506
2507                 cs = g = 0;
2508                 len = 0;
2509
                     /* stop at the first candidate that produces output (len != 0) */
2510                 for(i = 0; i < choiceCount && len == 0; ++i) {
2511                     cs = choices[i];
2512                     if(cs > 0) {
2513                         if(cs > CNS_11643_0) {
2514                             cnv = converterData->myConverterArray[CNS_11643];
2515                             MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
2516                             if(len==3) {
                                     /*
                                      * the top byte of the 3-byte result, minus 0x80, gives the
                                      * charset id relative to CNS_11643_0; only the two lower
                                      * bytes are written to the output
                                      */
2517                                 cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
2518                                 len = 2;
2519                                 if(cs == CNS_11643_1) {
2520                                     g = 1;
2521                                 } else if(cs == CNS_11643_2) {
2522                                     g = 2;
2523                                 } else /* plane 3..7 */ if(converterData->version == 1) {
2524                                     g = 3;
2525                                 } else {
2526                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2527                                     len = 0;
2528                                 }
2529                             }
2530                         } else {
2531                             /* GB2312_1 or ISO-IR-165 */
2532                             cnv = converterData->myConverterArray[cs];
2533                             MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
2534                             g = 1; /* used if len == 2 */
2535                         }
2536                     }
2537                 }
2538
2539                 if(len > 0) {
2540                     len = 0; /* count output bytes; it must have been len == 2 */
2541
2542                     /* write the designation sequence if necessary */
2543                     if(cs != pFromU2022State->cs[g]) {
2544                         if(cs < CNS_11643) {
2545                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2546                         } else {
2547                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2548                         }
2549                         len = 4;
2550                         pFromU2022State->cs[g] = cs;
2551                         if(g == 1) {
2552                             /* changing the SO/G1 charset invalidates the choices[] */
2553                             choiceCount = 0;
2554                         }
2555                     }
2556
2557                     /* write the shift sequence if necessary */
2558                     if(g != pFromU2022State->g) {
2559                         switch(g) {
2560                         case 1:
2561                             buffer[len++] = UCNV_SO;
2562
2563                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2564                             pFromU2022State->g = 1;
2565                             break;
2566                         case 2:
                                 /* SS2: ESC N */
2567                             buffer[len++] = 0x1b;
2568                             buffer[len++] = 0x4e;
2569                             break;
2570                         default: /* case 3 */
                                 /* SS3: ESC O */
2571                             buffer[len++] = 0x1b;
2572                             buffer[len++] = 0x4f;
2573                             break;
2574                         }
2575                     }
2576
2577                     /* write the two output bytes */
2578                     buffer[len++] = (char)(targetValue >> 8);
2579                     buffer[len++] = (char)targetValue;
2580                 } else {
2581                     /* if we cannot find the character after checking all codepages
2582                      * then this is an error
2583                      */
2584                     *err = U_INVALID_CHAR_FOUND;
2585                     args->converter->fromUChar32=sourceChar;
2586                     break;
2587                 }
2588             }
2589
2590             /* output len>0 bytes in buffer[] */
                 /* one byte always fits: target < targetLimit was checked at the top of the loop body */
2591             if(len == 1) {
2592                 *target++ = buffer[0];
2593                 if(offsets) {
2594                     *offsets++ = source - args->source - 1; /* -1: known to be ASCII */
2595                 }
2596             } else if(len == 2 && (target + 2) <= targetLimit) {
2597                 *target++ = buffer[0];
2598                 *target++ = buffer[1];
2599                 if(offsets) {
2600                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2601                     *offsets++ = sourceIndex;
2602                     *offsets++ = sourceIndex;
2603                 }
2604             } else {
                     /* longer output, or two bytes that do not fit directly into the target */
2605                 ucnv_fromUWriteBytes(
2606                     args->converter,
2607                     buffer, len,
2608                     (char **)&target, (const char *)targetLimit,
2609                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2610                     err);
2611                 if(U_FAILURE(*err)) {
2612                     break;
2613                 }
2614             }
2615         } /* end if(target < targetLimit) */
2616         else{
2617             *err =U_BUFFER_OVERFLOW_ERROR;
2618             break;
2619         }
2620
2621     }/* end while(source < sourceLimit) */
2622
2623     /*
2624      * the end of the input stream and detection of truncated input
2625      * are handled by the framework, but for ISO-2022-CN conversion
2626      * we need to be in ASCII mode at the very end
2627      *
2628      * conditions:
2629      *   successful
2630      *   not in ASCII mode
2631      *   end of input and no truncated input
2632      */
2633     if( U_SUCCESS(*err) &&
2634         pFromU2022State->g!=0 &&
2635         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2636     ) {
2637         int32_t sourceIndex;
2638
2639         /* we are switching to ASCII */
2640         pFromU2022State->g=0;
2641
2642         /* get the source index of the last input character */
2643         /*
2644          * TODO this would be simpler and more reliable if we used a pair
2645          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2646          * so that we could simply use the prevSourceIndex here;
2647          * this code gives an incorrect result for the rare case of an unmatched
2648          * trail surrogate that is alone in the last buffer of the text stream
2649          */
2650         sourceIndex=(int32_t)(source-args->source);
2651         if(sourceIndex>0) {
2652             --sourceIndex;
                 /* step back once more over what looks like the trail half of a surrogate pair */
2653             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2654                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2655             ) {
2656                 --sourceIndex;
2657             }
2658         } else {
                 /* nothing was consumed from this buffer */
2659             sourceIndex=-1;
2660         }
2661
2662         ucnv_fromUWriteBytes(
2663             args->converter,
2664             SHIFT_IN_STR, 1,
2665             (char **)&target, (const char *)targetLimit,
2666             &offsets, sourceIndex,
2667             err);
2668     }
2669
2670     /*save the state and return */
2671     args->source = source;
2672     args->target = (char*)target;
2673 }
2674
2675
/*
 * Converts ISO-2022-CN bytes from args->source to UChars at args->target and,
 * if args->offsets is set, records a source offset for each UChar written.
 *
 * SI and SO switch myData->toU2022State.g, ESC hands control to
 * changeState_2022(), and CR/LF clear the whole toU2022State. All other bytes
 * are converted as one byte when g==0 and as two bytes otherwise.
 *
 * State carried across calls:
 *   myData->key != 0                 - a partial escape sequence is pending
 *   args->converter->toULength == 1  - the lead byte of a double-byte
 *                                      character is stored in toUBytes[0]
 *
 * On return args->source and args->target are updated. *err is set to
 * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains;
 * changeState_2022() and toUnicodeCallback() receive err as well.
 */
2676 static void
2677 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2678                                                UErrorCode* err){
2679     char tempBuf[3];
2680     const char *mySource = (char *) args->source;
2681     UChar *myTarget = args->target;
2682     const char *mySourceLimit = args->sourceLimit;
2683     uint32_t targetUniChar = 0x0000;
2684     uint32_t mySourceChar = 0x0000;
2685     UConverterDataISO2022* myData;
2686     ISO2022State *pToU2022State;
2687
2688     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2689     pToU2022State = &myData->toU2022State;
2690
2691     if(myData->key != 0) {
2692         /* continue with a partial escape sequence */
2693         goto escape;
2694     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2695         /* continue with a partial double-byte character */
2696         mySourceChar = args->converter->toUBytes[0];
2697         args->converter->toULength = 0;
2698         goto getTrailByte;
2699     }
2700
2701     while(mySource < mySourceLimit){
2702
             /* stays at this value unless a conversion below succeeds */
2703         targetUniChar =missingCharMarker;
2704
2705         if(myTarget < args->targetLimit){
2706
2707             mySourceChar= (unsigned char) *mySource++;
2708
2709             switch(mySourceChar){
2710             case UCNV_SI:
2711                 pToU2022State->g=0;
2712                 continue;
2713
2714             case UCNV_SO:
2715                 if(pToU2022State->cs[1] != 0) {
2716                     pToU2022State->g=1;
2717                     continue;
2718                 } else {
2719                     /* illegal to have SO before a matching designator */
                         /* leaves the switch with targetUniChar==missingCharMarker, so the callback runs */
2720                     break;
2721                 }
2722
2723             case ESC_2022:
                     /* back up so that changeState_2022() starts at the ESC byte */
2724                 mySource--;
2725 escape:
2726                 changeState_2022(args->converter,&(mySource),
2727                     mySourceLimit, ISO_2022_CN,err);
2728
2729                 /* invalid or illegal escape sequence */
2730                 if(U_FAILURE(*err)){
2731                     args->target = myTarget;
2732                     args->source = mySource;
2733                     return;
2734                 }
2735                 continue;
2736
2737             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2738
2739             case CR:
2740                 /*falls through*/
2741             case LF:
                     /* end of line: clear all designations and shift state */
2742                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
2743                 /* falls through */
2744             default:
2745                 /* convert one or two bytes */
2746                 if(pToU2022State->g != 0) {
2747                     if(mySource < mySourceLimit) {
2748                         UConverterSharedData *cnv;
2749                         StateEnum tempState;
2750                         int32_t tempBufLen;
2751                         char trailByte;
2752 getTrailByte:
2753                         trailByte = *mySource++;
2754                         tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2755                         if(tempState > CNS_11643_0) {
                                 /* CNS plane: 3-byte lookup with a first byte of 0x80 + (tempState - CNS_11643_0) */
2756                             cnv = myData->myConverterArray[CNS_11643];
2757                             tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2758                             tempBuf[1] = (char) (mySourceChar);
2759                             tempBuf[2] = trailByte;
2760                             tempBufLen = 3;
2761
2762                         }else{
2763                             cnv = myData->myConverterArray[tempState];
2764                             tempBuf[0] = (char) (mySourceChar);
2765                             tempBuf[1] = trailByte;
2766                             tempBufLen = 2;
2767                         }
                             /* keep lead and trail together for offsets and the callback */
2768                         mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2769                         if(pToU2022State->g>=2) {
2770                             /* return from a single-shift state to the previous one */
2771                             pToU2022State->g=pToU2022State->prevG;
2772                         }
                             /*
                              * NOTE(review): the last argument is hard-coded FALSE here, while the
                              * ISO-2022-KR code passes the converter's useFallback in that
                              * position - confirm that this is intended
                              */
2773                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
2774                     } else {
                             /* input ends after the lead byte: store it for the next call */
2775                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2776                         args->converter->toULength = 1;
2777                         goto endloop;
2778                     }
2779                 }
2780                 else{
                         /* g==0: bytes up to 0x7f map to themselves; others keep missingCharMarker */
2781                     if(mySourceChar <= 0x7f) {
2782                         targetUniChar = (UChar) mySourceChar;
2783                     }
2784                 }
2785                 break;
2786             }
2787             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2788                 if(args->offsets){
                         /* offset of the first input byte of this character: 1 or 2 bytes back */
2789                     args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2790                 }
2791                 *(myTarget++)=(UChar)targetUniChar;
2792             }
2793             else if(targetUniChar > missingCharMarker){
2794                 /* disassemble the surrogate pair and write to output*/
2795                 targetUniChar-=0x0010000;
2796                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2797                 if(args->offsets){
2798                     args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2799                 }
2800                 ++myTarget;
2801                 if(myTarget< args->targetLimit){
2802                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2803                     if(args->offsets){
2804                         args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
2805                     }
2806                     ++myTarget;
2807                 }else{
                         /* no room for the trail surrogate: append it to the converter's UCharErrorBuffer */
2808                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2809                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2810                 }
2811
2812             }
2813             else{
2814                 /* Call the callback function*/
2815                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2816                 break;
2817             }
2818         }
2819         else{
                 /* no room in the target while input remains */
2820             *err =U_BUFFER_OVERFLOW_ERROR;
2821             break;
2822         }
2823     }
2824 endloop:
         /* save the positions reached */
2825     args->target = myTarget;
2826     args->source = mySource;
2827 }
2828
2829 /*
2830  * Substitution-character writer shared by the ISO-2022-JP/-CN/-KR converters.
2831  * Picks the variant from the first letter of the converter's locale, puts the
2832  * from-Unicode state back into a mode where the subchar can be written, and
2833  * emits the shift/escape bytes followed by the subchar itself.
2834  *
2835  * args        - callback arguments; args->converter is the ISO-2022 converter
2836  * offsetIndex - forwarded to ucnv_cbFromUWriteBytes() (not used on the
2837  *               KR version != 0 path, which passes 0 to ucnv_cbFromUWriteSub())
2838  * err         - error code filled in by the ucnv_cb* call that does the write
2839  */
2840 static void
2841 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
2842     UConverter *self = args->converter;
2843     UConverterDataISO2022 *data = (UConverterDataISO2022 *)self->extraInfo;
2844     ISO2022State *state = &data->fromU2022State;
2845     char *sub = (char *)self->subChar;
2846     int32_t subLen = self->subCharLen; /* JP and CN always write exactly one byte */
2847     char out[8];
2848     char *w = out;
2849     char variant = data->locale[0];
2850
2851     if(variant == 'k' && data->version != 0) {
2852         /* hand the job to the subconverter, lending it our subchar and fromUChar32 */
2853         UConverter *inner = data->currentConverter;
2854
2855         args->converter = inner;
2856         uprv_memcpy(inner->subChar, sub, 4);
2857         inner->subCharLen = (int8_t)subLen;
2858
2859         inner->fromUChar32 = self->fromUChar32;
2860         ucnv_cbFromUWriteSub(args, 0, err);
2861         self->fromUChar32 = inner->fromUChar32;
2862
2863         if(*err == U_BUFFER_OVERFLOW_ERROR) {
2864             /* move whatever did not fit from the subconverter's overflow buffer to ours */
2865             if(inner->charErrorBufferLength > 0) {
2866                 uprv_memcpy(self->charErrorBuffer, inner->charErrorBuffer,
2867                             inner->charErrorBufferLength);
2868             }
2869             self->charErrorBufferLength = inner->charErrorBufferLength;
2870             inner->charErrorBufferLength = 0;
2871         }
2872         args->converter = self;
2873         return;
2874     }
2875
2876     if(variant == 'j') {
2877         int8_t g0;
2878
2879         if(state->g == 1) {
2880             /* JIS7: shift back from G1 to G0 */
2881             state->g = 0;
2882             *w++ = UCNV_SI;
2883         }
2884         g0 = state->cs[0];
2885         if(!(g0 == ASCII || g0 == JISX201)) {
2886             /* neither ASCII nor JIS X 0201 is current: switch to ASCII with ESC ( B */
2887             state->cs[0] = (int8_t)ASCII;
2888             *w++ = '\x1b';
2889             *w++ = '\x28';
2890             *w++ = '\x42';
2891         }
2892         *w++ = sub[0];
2893     } else if(variant == 'c') {
2894         if(state->g != 0) {
2895             /* leave the shifted state so the subchar is written in ASCII mode */
2896             state->g = 0;
2897             *w++ = UCNV_SI;
2898         }
2899         *w++ = sub[0];
2900     } else if(variant == 'k') {
2901         /* KR version 0: the SBCS/DBCS flag lives in fromUnicodeStatus */
2902         UBool inDBCS = (UBool)self->fromUnicodeStatus;
2903
2904         if(subLen == 1) {
2905             if(inDBCS) {
2906                 self->fromUnicodeStatus = 0;
2907                 *w++ = UCNV_SI;
2908             }
2909             *w++ = sub[0];
2910         } else /* two-byte subchar */ {
2911             if(!inDBCS) {
2912                 self->fromUnicodeStatus = 1;
2913                 *w++ = UCNV_SO;
2914             }
2915             *w++ = sub[0];
2916             *w++ = sub[1];
2917         }
2918     }
2919     /* any other locale letter is unexpected; nothing has been buffered for it */
2920
2921     ucnv_cbFromUWriteBytes(args, out, (int32_t)(w - out), offsetIndex, err);
2922 }
2923
2924 /* structure for SafeClone calculations: layout of the caller's buffer as used by _ISO_2022_SafeClone() */
2925 struct cloneStruct
2926 {
2927     UConverter cnv;               /* the cloned converter itself; its address is what _ISO_2022_SafeClone() returns */
2928     UConverterDataISO2022 mydata; /* copy of the ISO-2022 extra data; cnv.extraInfo is set to point here */
2929     UConverter currentConverter;  /* storage for the clone of mydata.currentConverter, when there is one */
2930 };
2931
2932
2933 /*
2934  * Safe-clone hook for the ISO-2022 converters.
2935  * A call with *pBufferSize==0 is a preflight: sizeof(struct cloneStruct) is
2936  * stored there and NULL is returned. Otherwise stackBuffer is treated as a
2937  * struct cloneStruct whose cnv member ucnv_safeClone() has already copied;
2938  * this adds the ISO-2022 data, a clone of currentConverter (if set) and one
2939  * more reference on each non-NULL myConverterArray[] entry.
2940  * Returns the clone, or NULL if cloning currentConverter fails.
2941  */
2942 static UConverter *
2943 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer,
2944                     int32_t *pBufferSize, UErrorCode *status)
2945 {
2946     struct cloneStruct *clone;
2947     UConverterDataISO2022 *srcData, *dstData;
2948     int32_t idx, subSize;
2949
2950     if(0 == *pBufferSize) {
2951         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
2952         return NULL;
2953     }
2954
2955     clone = (struct cloneStruct *)stackBuffer;
2956     srcData = (UConverterDataISO2022 *)cnv->extraInfo;
2957     dstData = &clone->mydata;
2958     uprv_memcpy(dstData, srcData, sizeof(UConverterDataISO2022));
2959
2960     /* currentConverter is cloned into the slot reserved for it in cloneStruct */
2961     if(NULL != srcData->currentConverter) {
2962         subSize = (int32_t)sizeof(UConverter);
2963         dstData->currentConverter = ucnv_safeClone(srcData->currentConverter,
2964                 &clone->currentConverter, &subSize, status);
2965         if(U_FAILURE(*status)) {
2966             return NULL;
2967         }
2968     }
2969     /* myConverterArray[] entries are shared: take one more reference on each */
2970     for(idx = 0; idx < UCNV_2022_MAX_CONVERTERS; ++idx) {
2971         if(NULL != srcData->myConverterArray[idx]) {
2972             ucnv_incrementRefCount(srcData->myConverterArray[idx]);
2973         }
2974     }
2975     clone->cnv.extraInfo = dstData; /* point the clone at its own extra data */
2976     clone->cnv.isExtraLocal = TRUE;
2977     return &clone->cnv;
2978 }
2979
2980 /*
2981  * Adds to the set behind sa the code points handled by this ISO-2022 variant.
2982  * KR defers entirely to currentConverter. JP and CN first add the ranges that
2983  * are round-tripped algorithmically and then the sets of every subconverter
2984  * present in myConverterArray[].
2985  * Does nothing if *pErrorCode already indicates a failure.
2986  */
2987 static void
2988 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
2989                     USetAdder *sa,
2990                     UConverterUnicodeSet which,
2991                     UErrorCode *pErrorCode)
2992 {
2993     UConverterDataISO2022 *data;
2994     UConverter *sub;
2995     int32_t idx;
2996     char variant;
2997     UBool isCN;
2998
2999     if (U_FAILURE(*pErrorCode)) {
3000         return;
3001     }
3002 #ifdef U_ENABLE_GENERIC_ISO_2022
3003     if (cnv->sharedData == &_ISO2022Data) {
3004         /* the generic converter uses UTF-8: everything except surrogates */
3005         sa->addRange(sa->set, 0, 0xd7FF);
3006         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3007         return;
3008     }
3009 #endif
3010     data = (UConverterDataISO2022*)cnv->extraInfo;
3011     variant = data->locale[0];
3012     isCN = (UBool)(variant=='c' || variant=='z');
3013     if(variant=='k') {
3014         /* KR has a single converter, kept in currentConverter rather than myConverterArray[] */
3015         sub = data->currentConverter;
3016         sub->sharedData->impl->getUnicodeSet(sub, sa, which, pErrorCode);
3017         return;
3018     }
3019     /* code points that are round-tripped algorithmically */
3020     if(variant=='j') {
3021         /* Latin-1 for the JP versions that include ISO 8859-1, plain ASCII otherwise */
3022         if(jpCharsetMasks[data->version]&CSM(ISO8859_1)) {
3023             sa->addRange(sa->set, 0, 0xff);
3024         } else {
3025             sa->addRange(sa->set, 0, 0x7f);
3026         }
3027         /* half-width Katakana where the version supports it */
3028         if(jpCharsetMasks[data->version]&CSM(HWKANA_7BIT)) {
3029             sa->addRange(sa->set, 0xff61, 0xff9f);
3030         }
3031     } else if(isCN) {
3032         /* ASCII for CN */
3033         sa->addRange(sa->set, 0, 0x7f);
3034     }
3035
3036     /*
3037      * TODO: need to make this version-specific for CN.
3038      * CN version 0 does not map CNS planes 3..7 although
3039      * they are all available in the CNS conversion table;
3040      * CN version 1 does map them all.
3041      * The two versions need to create different Unicode sets.
3042      */
3043     for (idx=0; idx<UCNV_2022_MAX_CONVERTERS; idx++) {
3044         if(data->myConverterArray[idx]==NULL) {
3045             continue;
3046         }
3047         if(isCN && data->version==0 && idx==CNS_11643) {
3048             /* non-EXT ISO-2022-CN: only the code points for CNS planes 1 and 2 */
3049             ucnv_MBCSGetUnicodeSetForBytes(data->myConverterArray[idx],
3050                     sa, UCNV_ROUNDTRIP_SET, 0, 0x81, 0x82, pErrorCode);
3051         } else {
3052             ucnv_MBCSGetUnicodeSetForUnicode(data->myConverterArray[idx], sa, which, pErrorCode);
3053         }
3054     }
3055 }
3056
3057 static const UConverterImpl _ISO2022Impl={ /* implementation table for the generic ISO-2022 converter */
3058     UCNV_ISO_2022, /* converter type */
3059     /* NOTE(review): slot meanings below are inferred from the functions plugged in - confirm against UConverterImpl */
3060     NULL, /* nothing installed in these two slots */
3061     NULL,
3062
3063     _ISO2022Open, /* open/close/reset are the same functions in all four tables of this file */
3064     _ISO2022Close,
3065     _ISO2022Reset,
3066
3067 #ifdef U_ENABLE_GENERIC_ISO_2022
3068     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* to-Unicode: the same routine fills both slots */
3069     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3070     ucnv_fromUnicode_UTF8, /* from-Unicode goes through the UTF-8 routines */
3071     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3072 #else
3073     NULL, /* generic converter not enabled: no conversion functions installed */
3074     NULL,
3075     NULL,
3076     NULL,
3077 #endif
3078     NULL,
3079
3080     NULL,
3081     _ISO2022getName,
3082     _ISO_2022_WriteSub, /* substitution-character writer */
3083     _ISO_2022_SafeClone,
3084     _ISO_2022_GetUnicodeSet
3085 };
3086 static const UConverterStaticData _ISO2022StaticData={ /* static description of the generic ISO-2022 converter */
3087     sizeof(UConverterStaticData), /* size of this structure */
3088     "ISO_2022", /* converter name */
3089     2022, /* presumably the code page number - confirm against UConverterStaticData */
3090     UCNV_IBM, /* platform */
3091     UCNV_ISO_2022, /* conversion type; same constant as in the Impl table */
3092     1, /* presumably min bytes per char - confirm */
3093     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3094     { 0x1a, 0, 0, 0 }, /* presumably the substitution character bytes (0x1a) - confirm */
3095     1, /* presumably the substitution character length - confirm */
3096     FALSE,
3097     FALSE,
3098     0,
3099     0,
3100     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3101 };
3102 const UConverterSharedData _ISO2022Data={ /* shared data for the generic ISO-2022 converter; not static, unlike the JP/KR/CN ones */
3103     sizeof(UConverterSharedData), /* size of this structure */
3104     ~((uint32_t) 0), /* all bits set; presumably the reference-count slot, marking static data - confirm */
3105     NULL,
3106     NULL,
3107     &_ISO2022StaticData, /* static description defined just above */
3108     FALSE,
3109     &_ISO2022Impl, /* function table defined above */
3110     0
3111 };
3112
3113 /*************JP****************/
3114 static const UConverterImpl _ISO2022JPImpl={ /* implementation table for ISO-2022-JP */
3115     UCNV_ISO_2022, /* converter type */
3116     /* NOTE(review): slot meanings below are inferred from the functions plugged in - confirm against UConverterImpl */
3117     NULL, /* nothing installed in these two slots */
3118     NULL,
3119
3120     _ISO2022Open, /* open/close/reset are the same functions in all four tables of this file */
3121     _ISO2022Close,
3122     _ISO2022Reset,
3123
3124     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* to-Unicode: the same routine fills both slots */
3125     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3126     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* from-Unicode: the same routine fills both slots */
3127     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3128     NULL,
3129
3130     NULL,
3131     _ISO2022getName,
3132     _ISO_2022_WriteSub, /* substitution-character writer; handles JP in its 'j' branch */
3133     _ISO_2022_SafeClone,
3134     _ISO_2022_GetUnicodeSet
3135 };
3136 static const UConverterStaticData _ISO2022JPStaticData={ /* static description of the ISO-2022-JP converter */
3137     sizeof(UConverterStaticData), /* size of this structure */
3138     "ISO_2022_JP", /* converter name */
3139     0,
3140     UCNV_IBM, /* platform */
3141     UCNV_ISO_2022, /* conversion type; same constant as in the Impl table */
3142     1, /* presumably min bytes per char - confirm against UConverterStaticData */
3143     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3144     { 0x1a, 0, 0, 0 }, /* presumably the substitution character bytes (0x1a) - confirm */
3145     1, /* presumably the substitution character length - confirm */
3146     FALSE,
3147     FALSE,
3148     0,
3149     0,
3150     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3151 };
3152 static const UConverterSharedData _ISO2022JPData={ /* shared data tying the JP static description to the JP function table */
3153     sizeof(UConverterSharedData), /* size of this structure */
3154     ~((uint32_t) 0), /* all bits set; presumably the reference-count slot, marking static data - confirm */
3155     NULL,
3156     NULL,
3157     &_ISO2022JPStaticData, /* static description defined just above */
3158     FALSE,
3159     &_ISO2022JPImpl, /* function table defined above */
3160     0
3161 };
3162
3163 /************* KR ***************/
3164 static const UConverterImpl _ISO2022KRImpl={ /* implementation table for ISO-2022-KR */
3165     UCNV_ISO_2022, /* converter type */
3166     /* NOTE(review): slot meanings below are inferred from the functions plugged in - confirm against UConverterImpl */
3167     NULL, /* nothing installed in these two slots */
3168     NULL,
3169
3170     _ISO2022Open, /* open/close/reset are the same functions in all four tables of this file */
3171     _ISO2022Close,
3172     _ISO2022Reset,
3173
3174     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* to-Unicode: the same routine fills both slots */
3175     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3176     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* from-Unicode: the same routine fills both slots */
3177     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3178     NULL,
3179
3180     NULL,
3181     _ISO2022getName,
3182     _ISO_2022_WriteSub, /* substitution-character writer; handles KR in its 'k' branch */
3183     _ISO_2022_SafeClone,
3184     _ISO_2022_GetUnicodeSet
3185 };
3186 static const UConverterStaticData _ISO2022KRStaticData={ /* static description of the ISO-2022-KR converter */
3187     sizeof(UConverterStaticData), /* size of this structure */
3188     "ISO_2022_KR", /* converter name */
3189     0,
3190     UCNV_IBM, /* platform */
3191     UCNV_ISO_2022, /* conversion type; same constant as in the Impl table */
3192     1, /* presumably min bytes per char - confirm against UConverterStaticData */
3193     3, /* max 3 bytes per UChar: SO+DBCS */
3194     { 0x1a, 0, 0, 0 }, /* presumably the substitution character bytes (0x1a) - confirm */
3195     1, /* presumably the substitution character length - confirm */
3196     FALSE,
3197     FALSE,
3198     0,
3199     0,
3200     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3201 };
3202 static const UConverterSharedData _ISO2022KRData={ /* shared data tying the KR static description to the KR function table */
3203     sizeof(UConverterSharedData), /* size of this structure */
3204     ~((uint32_t) 0), /* all bits set; presumably the reference-count slot, marking static data - confirm */
3205     NULL,
3206     NULL,
3207     &_ISO2022KRStaticData, /* static description defined just above */
3208     FALSE,
3209     &_ISO2022KRImpl, /* function table defined above */
3210     0
3211 };
3212
3213 /*************** CN ***************/
3214 static const UConverterImpl _ISO2022CNImpl={ /* implementation table for ISO-2022-CN */
3215     /* NOTE(review): slot meanings below are inferred from the functions plugged in - confirm against UConverterImpl */
3216     UCNV_ISO_2022, /* converter type */
3217
3218     NULL, /* nothing installed in these two slots */
3219     NULL,
3220
3221     _ISO2022Open, /* open/close/reset are the same functions in all four tables of this file */
3222     _ISO2022Close,
3223     _ISO2022Reset,
3224
3225     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* to-Unicode: the same routine fills both slots */
3226     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3227     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* from-Unicode: the same routine fills both slots */
3228     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3229     NULL,
3230
3231     NULL,
3232     _ISO2022getName,
3233     _ISO_2022_WriteSub, /* substitution-character writer; handles CN in its 'c' branch */
3234     _ISO_2022_SafeClone,
3235     _ISO_2022_GetUnicodeSet
3236 };
3237 static const UConverterStaticData _ISO2022CNStaticData={ /* static description of the ISO-2022-CN converter */
3238     sizeof(UConverterStaticData), /* size of this structure */
3239     "ISO_2022_CN", /* converter name */
3240     0,
3241     UCNV_IBM, /* platform */
3242     UCNV_ISO_2022, /* conversion type; same constant as in the Impl table */
3243     1, /* min 1 byte per char: ASCII and the subchar are written as single bytes; NOTE(review): field taken to be minBytesPerChar per UConverterStaticData - confirm */
3244     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3245     { 0x1a, 0, 0, 0 }, /* presumably the substitution character bytes (0x1a) - confirm */
3246     1, /* presumably the substitution character length - confirm */
3247     FALSE,
3248     FALSE,
3249     0,
3250     0,
3251     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3252 };
3253 static const UConverterSharedData _ISO2022CNData={ /* shared data tying the CN static description to the CN function table */
3254     sizeof(UConverterSharedData), /* size of this structure */
3255     ~((uint32_t) 0), /* all bits set; presumably the reference-count slot, marking static data - confirm */
3256     NULL,
3257     NULL,
3258     &_ISO2022CNStaticData, /* static description defined just above */
3259     FALSE,
3260     &_ISO2022CNImpl, /* function table defined above */
3261     0
3262 };
3263
3264
3265
3266 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */