icuSources/common/ucnvmbcs.c

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2000-2003, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  ucnvmbcs.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2000jul03
  14 *   created by: Markus W. Scherer
  15 *
  16 *   The current code in this file replaces the previous implementation
  17 *   of conversion code from multi-byte codepages to Unicode and back.
  18 *   This implementation supports the following:
  19 *   - legacy variable-length codepages with up to 4 bytes per character
  20 *   - all Unicode code points (up to 0x10ffff)
  21 *   - efficient distinction of unassigned vs. illegal byte sequences
  22 *   - it is possible in fromUnicode() to directly deal with simple
  23 *     stateful encodings (used for EBCDIC_STATEFUL)
  24 *   - it is possible to convert Unicode code points other than U+0000
  25 *     to a single zero byte (but not as a fallback except for SBCS)
  26 *
  27 *   Remaining limitations in fromUnicode:
  28 *   - byte sequences must not have leading zero bytes
  29 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
  30 *   - limitation to up to 4 bytes per character
  31 *
  32 *   Change history:
  33 *
  34 *    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
  35 *                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
  36 *                             macros to ucnvmbcs.h file
  37 */
  38
  39 #include "unicode/utypes.h"
  40
  41 #if !UCONFIG_NO_LEGACY_CONVERSION
  42
  43 #include "unicode/ucnv.h"
  44 #include "unicode/ucnv_cb.h"
  45 #include "unicode/udata.h"
  46 #include "unicode/uset.h"
  47 #include "ucnv_bld.h"
  48 #include "ucnvmbcs.h"
  49 #include "ucnv_cnv.h"
  50 #include "umutex.h"
  51 #include "cmemory.h"
  52 #include "cstring.h"
  53
  54 /* control optimizations according to the platform */
  55 #define MBCS_UNROLL_SINGLE_TO_BMP 1
  56 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
  57
  58 /*
  59  * _MBCSHeader version 4.1
  60  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
  61  *
  62  * Change from version 4.0:
  63  * - Replace header.reserved with header.fromUBytesLength so that all
  64  *   fields in the data have length.
  65  *
  66  * Changes from version 3 (for performance improvements):
  67  * - new bit distribution for state table entries
  68  * - reordered action codes
  69  * - new data structure for single-byte fromUnicode
  70  *   + stage 2 only contains indexes
  71  *   + stage 3 stores 16 bits per character with classification bits 15..8
  72  * - no multiplier for stage 1 entries
  73  * - stage 2 for non-single-byte codepages contains the index and the flags in
  74  *   one 32-bit value
  75  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
  76  *
  77  * For more details about old versions of the MBCS data structure, see
  78  * the corresponding versions of this file.
  79  *
  80  * Converting stateless codepage data ---------------------------------------***
  81  * (or codepage data with simple states) to Unicode.
  82  *
  83  * Data structure and algorithm for converting from complex legacy codepages
  84  * to Unicode. (Designed before 2000-may-22.)
  85  *
  86  * The basic idea is that the structure of legacy codepages can be described
  87  * with state tables.
  88  * When reading a byte stream, each input byte causes a state transition.
  89  * Some transitions result in the output of a code point, some result in
  90  * "unassigned" or "illegal" output.
  91  * This is used here for character conversion.
  92  *
  93  * The data structure begins with a state table consisting of a row
  94  * per state, with 256 entries (columns) per row for each possible input
  95  * byte value.
  96  * Each entry is 32 bits wide, with two formats distinguished by
  97  * the sign bit (bit 31):
  98  *
  99  * One format for transitional entries (bit 31 not set) for non-final bytes, and
 100  * one format for final entries (bit 31 set).
 101  * Both formats contain the number of the next state in the same bit
 102  * positions.
 103  * State 0 is the initial state.
 104  *
 105  * Most of the time, the offset values of subsequent states are added
 106  * up to a scalar value. This value will eventually be the index of
 107  * the Unicode code point in a table that follows the state table.
 108  * The effect is that the code points for final state table rows
 109  * are contiguous. The code points of final state rows follow each other
 110  * in the order of the references to those final states by previous
 111  * states, etc.
 112  *
 113  * For some terminal states, the offset is itself the output Unicode
 114  * code point (16 bits for a BMP code point or 20 bits for a supplementary
 115  * code point, stored as code point minus 0x10000 so that 20 bits are enough).
 116  * For others, the code point in the Unicode table is stored with either
 117  * one or two code units: one for BMP code points, two for a pair of
 118  * surrogates.
 119  * All code points for a final state entry take up the same number of code
 120  * units, regardless of whether they all actually _use_ the same number
 121  * of code units. This is necessary for simple array access.
 122  *
 123  * An additional feature comes in with what in ICU is called "fallback"
 124  * mappings:
 125  *
 126  * In addition to round-trippable, precise, 1:1 mappings, there are often
 127  * mappings defined between similar, though not the same, characters.
 128  * Typically, such mappings occur only in fromUnicode mapping tables because
 129  * Unicode has a superset repertoire of most other codepages. However, it
 130  * is possible to provide such mappings in the toUnicode tables, too.
 131  * In this case, the fallback mappings are partly integrated into the
 132  * general state tables because the structure of the encoding includes their
 133  * byte sequences.
 134  * For final entries in an initial state, fallback mappings are stored in
 135  * the entry itself like with roundtrip mappings.
 136  * For other final entries, they are stored in the code units table if
 137  * the entry is for a pair of code units.
 138  * For single-unit results in the code units table, there is no space to
 139  * alternatively hold a fallback mapping; in this case, the code unit
 140  * is stored as U+fffe (unassigned), and the fallback mapping needs to
 141  * be looked up by the scalar offset value in a separate table.
 142  *
 143  * "Unassigned" state entries really mean "structurally unassigned",
 144  * i.e., such a byte sequence will never have a mapping result.
 145  *
 146  * The interpretation of the bits in each entry is as follows:
 147  *
 148  * Bit 31 not set, not a terminal entry ("transitional"):
 149  * 30..24 next state
 150  * 23..0  offset delta, to be added up
 151  *
 152  * Bit 31 set, terminal ("final") entry:
 153  * 30..24 next state (regardless of action code)
 154  * 23..20 action code:
 155  *        action codes 0 and 1 result in precise-mapping Unicode code points
 156  *        0  valid byte sequence
 157  *           19..16 not used, 0
 158  *           15..0  16-bit Unicode BMP code point
 159  *                  never U+fffe or U+ffff
 160  *        1  valid byte sequence
 161  *           19..0  20-bit Unicode supplementary code point
 162  *                  never U+fffe or U+ffff
 163  *
 164  *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
 165  *        2  valid byte sequence (fallback)
 166  *           19..16 not used, 0
 167  *           15..0  16-bit Unicode BMP code point as fallback result
 168  *        3  valid byte sequence (fallback)
 169  *           19..0  20-bit Unicode supplementary code point as fallback result
 170  *
 171  *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
 172  *        depending on the code units they result in
 173  *        4  valid byte sequence
 174  *           19..9  not used, 0
 175  *            8..0  final offset delta
 176  *                  pointing to one 16-bit code unit which may be
 177  *                  fffe  unassigned -- look for a fallback for this offset
 178  *                  ffff  illegal
 179  *        5  valid byte sequence
 180  *           19..9  not used, 0
 181  *            8..0  final offset delta
 182  *                  pointing to two 16-bit code units
 183  *                  (typically UTF-16 surrogates)
 184  *                  the result depends on the first code unit as follows:
 185  *                  0000..d7ff  roundtrip BMP code point (1st alone)
 186  *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
 187  *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
 188  *                  e000        roundtrip BMP code point (2nd alone)
 189  *                  e001        fallback BMP code point (2nd alone)
 190  *                  fffe        unassigned
 191  *                  ffff        illegal
 192  *           (the final offset deltas are at most 255 * 2,
 193  *            times 2 because of storing code unit pairs)
 194  *
 195  *        6  unassigned byte sequence
 196  *           19..16 not used, 0
 197  *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
 198  *                  this does not contain a final offset delta because the main
 199  *                  purpose of this action code is to save scalar offset values;
 200  *                  therefore, fallback values cannot be assigned to byte
 201  *                  sequences that result in this action code
 202  *        7  illegal byte sequence
 203  *           19..16 not used, 0
 204  *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
 205  *        8  state change only
 206  *           19..0  not used, 0
 207  *           useful for state changes in simple stateful encodings,
 208  *           at Shift-In/Shift-Out codes
 209  *
 210  *
 211  *        9..15 reserved for future use
 212  *           current implementations will only perform a state change
 213  *           and ignore bits 19..0
 214  *
 215  * An encoding with contiguous ranges of unassigned byte sequences, like
 216  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
 217  * at least two states for the trail bytes:
 218  * One trail byte state that results in code points, and one that only
 219  * has "unassigned" and "illegal" terminal states.
 220  *
 221  * Note: partly by accident, this data structure supports simple stateless
 222  * encodings without any additional logic.
 223  * Currently, only simple Shift-In/Shift-Out schemes are handled with
 224  * appropriate state tables (especially EBCDIC_STATEFUL!).
 225  *
 226  * MBCS version 2 added:
 227  * unassigned and illegal action codes have U+fffe and U+ffff
 228  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
 229  *
 230  * Converting from Unicode to codepage bytes --------------------------------***
 231  *
 232  * The conversion data structure for fromUnicode is designed for the known
 233  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
 234  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
 235  * a roundtrip mapping.
 236  *
 237  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
 238  * like in the character properties table.
 239  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
 240  * with the resulting bytes is at offsetFromUBytes.
 241  *
 242  * Beginning with version 4, single-byte codepages have a significantly different
 243  * trie compared to other codepages.
 244  * In all cases, the entry in stage 1 is directly the index of the block of
 245  * 64 entries in stage 2.
 246  *
 247  * Single-byte lookup:
 248  *
 249  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
 250  * Stage 3 contains one 16-bit word per result:
 251  * Bits 15..8 indicate the kind of result:
 252  *    f  roundtrip result
 253  *    c  fallback result from private-use code point
 254  *    8  fallback result from other code points
 255  *    0  unassigned
 256  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
 257  *
 258  * Multi-byte lookup:
 259  *
 260  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
 261  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
 262  *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
 263  *             If this test is false, then a non-zero result will be interpreted as
 264  *             a fallback mapping.
 265  * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
 266  *
 267  * Stage 3 contains 2, 3, or 4 bytes per result.
 268  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
 269  * while 3 bytes are stored as bytes in big-endian order.
 270  * Leading zero bytes are ignored, and the number of bytes is counted.
 271  * A zero byte mapping result is possible as a roundtrip result.
 272  * For some output types, the actual result is processed from this;
 273  * see _MBCSFromUnicodeWithOffsets().
 274  *
 275  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
 276  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
 277  *
 278  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
 279  * for compaction.
 280  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
 281  * may overlap by any number of entries.
 282  *
 283  * MBCS version 2 added:
 284  * the converter checks for known output types, which allows
 285  * adding new ones without crashing an unaware converter
 286  */
 287
 288 /* prototypes --------------------------------------------------------------- */
 289
 290 static void
 291 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 292                                 UErrorCode *pErrorCode);
 293
 294 static void
 295 _MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
 296                             UErrorCode *pErrorCode);
 297
 298 static UChar32
 299 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
 300                   UErrorCode *pErrorCode);
 301
 302 static UChar32
 303 _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
 304                         UErrorCode *pErrorCode);
 305
 306 static void
 307 _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 308                                   UErrorCode *pErrorCode);
 309
 310 static void
 311 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 312                                   UErrorCode *pErrorCode);
 313
 314 static void
 315 _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
 316                               UErrorCode *pErrorCode);
 317
 318 static void
 319 fromUCallback(UConverter *cnv,
 320               const void *context, UConverterFromUnicodeArgs *pArgs,
 321               UChar32 codePoint,
 322               UConverterCallbackReason reason, UErrorCode *pErrorCode);
 323
 324 static void
 325 toUCallback(UConverter *cnv,
 326             const void *context, UConverterToUnicodeArgs *pArgs,
 327             const char *codeUnits, int32_t length,
 328             UConverterCallbackReason reason, UErrorCode *pErrorCode);
 329
 330 /* GB 18030 data ------------------------------------------------------------ */
 331
 332 /* helper macros for linear values for GB 18030 four-byte sequences */
 333 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
 334
 335 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
 336
 337 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
 338
 339 /*
 340  * Some ranges of GB 18030 where both the Unicode code points and the
 341  * GB four-byte sequences are contiguous and are handled algorithmically by
 342  * the special callback functions below.
 343  * The values are start & end of Unicode & GB codes.
 344  *
 345  * Note that single surrogates are not mapped by GB 18030
 346  * as of the re-released mapping tables from 2000-nov-30.
 347  */
 348 static const uint32_t
 349 gb18030Ranges[13][4]={
 350     {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
 351     {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
 352     {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
 353     {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
 354     {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
 355     {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
 356     {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
 357     {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
 358     {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
 359     {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
 360     {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
 361     {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
 362     {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
 363 };
 364
 365 /* bit flag for UConverter.options indicating GB 18030 special handling */
 366 #define _MBCS_OPTION_GB18030 0x8000
 367
 368 /* Miscellaneous ------------------------------------------------------------ */
 369
 370 static uint32_t
 371 _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) {
 372     const uint16_t *table;
 373
 374     uint32_t st3, maxStage3;
 375     uint16_t st1, maxStage1, st2;
 376
 377     if(mbcsTable->fromUBytesLength>0) {
 378         /*
 379          * We _know_ the number of bytes in the fromUnicodeBytes array
 380          * starting with header.version 4.1.
 381          * Otherwise, below, we need to enumerate the fromUnicode
 382          * trie and find the highest entry.
 383          */
 384         return mbcsTable->fromUBytesLength;
 385     }
 386
 387     /* Enumerate the from-Unicode trie table to find the highest stage 3 index. */
 388     table=mbcsTable->fromUnicodeTable;
 389     maxStage3=0;
 390     if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
 391         maxStage1=0x440;
 392     } else {
 393         maxStage1=0x40;
 394     }
 395
 396
 397     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 398         const uint16_t *stage2;
 399
 400         for(st1=0; st1<maxStage1; ++st1) {
 401             st2=table[st1];
 402             if(st2>maxStage1) {
 403                 stage2=table+st2;
 404                 for(st2=0; st2<64; ++st2) {
 405                     st3=stage2[st2];
 406                     if(st3>maxStage3) {
 407                         maxStage3=st3;
 408                     }
 409                 }
 410             }
 411         }
 412
 413         /*
 414          * add 16 to get the limit not start index of the last stage 3 block,
 415          * times 2 for number of bytes
 416          */
 417         return (maxStage3+16)*2;
 418     } else {
 419         const uint32_t *stage2;
 420
 421         for(st1=0; st1<maxStage1; ++st1) {
 422             st2=table[st1];
 423             if(st2>(maxStage1>>1)) {
 424                 stage2=(const uint32_t *)table+st2;
 425                 for(st2=0; st2<64; ++st2) {
 426                     st3=stage2[st2]&0xffff;
 427                     if(st3>maxStage3) {
 428                         maxStage3=st3;
 429                     }
 430                 }
 431             }
 432         }
 433
 434         /*
 435          * add 16 to get the limit not start index of the last stage 3 block,
 436          * times 2..4 for number of bytes
 437          */
 438         maxStage3=16*maxStage3+16;
 439         switch(mbcsTable->outputType) {
 440         case MBCS_OUTPUT_3:
 441         case MBCS_OUTPUT_4_EUC:
 442             maxStage3*=3;
 443             break;
 444         case MBCS_OUTPUT_4:
 445             maxStage3*=4;
 446             break;
 447         default:
 448             /* MBCS_OUTPUT_2... and MBCS_OUTPUT_3_EUC */
 449             maxStage3*=2;
 450             break;
 451         }
 452         return maxStage3;
 453     }
 454 }
 455
 456 static void
 457 _MBCSGetUnicodeSet(const UConverter *cnv,
 458                    USet *set,
 459                    UConverterUnicodeSet which,
 460                    UErrorCode *pErrorCode) {
 461     UConverterMBCSTable *mbcsTable;
 462     const uint16_t *table;
 463
 464     uint32_t st3;
 465     uint16_t st1, maxStage1, st2;
 466
 467     UChar32 c;
 468
 469     if(cnv->options&_MBCS_OPTION_GB18030) {
 470         uset_addRange(set, 0, 0xd7ff);
 471         uset_addRange(set, 0xe000, 0x10ffff);
 472         return;
 473     }
 474
 475     /* enumerate the from-Unicode trie table */
 476     mbcsTable=&cnv->sharedData->table->mbcs;
 477     table=mbcsTable->fromUnicodeTable;
 478     if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
 479         maxStage1=0x440;
 480     } else {
 481         maxStage1=0x40;
 482     }
 483
 484     c=0; /* keep track of the current code point while enumerating */
 485
 486     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 487         const uint16_t *stage2, *stage3, *results;
 488
 489         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
 490
 491         for(st1=0; st1<maxStage1; ++st1) {
 492             st2=table[st1];
 493             if(st2>maxStage1) {
 494                 stage2=table+st2;
 495                 for(st2=0; st2<64; ++st2) {
 496                     if((st3=stage2[st2])!=0) {
 497                         /* read the stage 3 block */
 498                         stage3=results+st3;
 499
 500                         /*
 501                          * Add code points for which the roundtrip flag is set.
 502                          * Once we get a set for fallback mappings, we have to use
 503                          * a threshold variable with a value of 0x800.
 504                          * See _MBCSSingleFromBMPWithOffsets() and
 505                          * MBCS_SINGLE_RESULT_FROM_U() for details.
 506                          */
 507                         do {
 508                             if(*stage3++>=0xf00) {
 509                                 uset_add(set, c);
 510                             }
 511                         } while((++c&0xf)!=0);
 512                     } else {
 513                         c+=16; /* empty stage 3 block */
 514                     }
 515                 }
 516             } else {
 517                 c+=1024; /* empty stage 2 block */
 518             }
 519         }
 520     } else {
 521         const uint32_t *stage2;
 522
 523         for(st1=0; st1<maxStage1; ++st1) {
 524             st2=table[st1];
 525             if(st2>(maxStage1>>1)) {
 526                 stage2=(const uint32_t *)table+st2;
 527                 for(st2=0; st2<64; ++st2) {
 528                     if((st3=stage2[st2])!=0) {
 529                         /* get the roundtrip flags for the stage 3 block */
 530                         st3>>=16;
 531
 532                         /*
 533                          * Add code points for which the roundtrip flag is set.
 534                          * Once we get a set for fallback mappings, we have to check
 535                          * non-roundtrip stage 3 results for whether they are 0.
 536                          * See _MBCSFromUnicodeWithOffsets() for details.
 537                          */
 538                         do {
 539                             if(st3&1) {
 540                                 uset_add(set, c);
 541                             }
 542                             st3>>=1;
 543                         } while((++c&0xf)!=0);
 544                     } else {
 545                         c+=16; /* empty stage 3 block */
 546                     }
 547                 }
 548             } else {
 549                 c+=1024; /* empty stage 2 block */
 550             }
 551         }
 552     }
 553 }
 554
 555 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
 556
 557 /*
 558  * This code modifies a standard EBCDIC<->Unicode mapping table for
 559  * OS/390 (z/OS) Unix System Services (Open Edition).
 560  * The difference is in the mapping of Line Feed and New Line control codes:
 561  * Standard EBCDIC maps
 562  *
 563  *   <U000A> \x25 |0
 564  *   <U0085> \x15 |0
 565  *
 566  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 567  * mapping
 568  *
 569  *   <U000A> \x15 |0
 570  *   <U0085> \x25 |0
 571  *
 572  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
 573  * by copying it into allocated memory and swapping the LF and NL values.
 574  * This makes it possible to support the same EBCDIC charset in both versions without
 575  * duplicating the entire installed table.
 576  */
 577
 578 /* standard EBCDIC codes */
 579 #define EBCDIC_LF 0x25
 580 #define EBCDIC_NL 0x15
 581
 582 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
 583 #define EBCDIC_RT_LF 0xf25
 584 #define EBCDIC_RT_NL 0xf15
 585
 586 /* Unicode code points */
 587 #define U_LF 0x0a
 588 #define U_NL 0x85
 589
 590 static UBool
 591 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
 592     UConverterMBCSTable *mbcsTable;
 593
 594     const uint16_t *table, *results;
 595     const uint8_t *bytes;
 596
 597     int32_t (*newStateTable)[256];
 598     uint16_t *newResults;
 599     uint8_t *p;
 600     char *name;
 601
 602     uint32_t stage2Entry;
 603     uint32_t size, sizeofFromUBytes;
 604
 605     mbcsTable=&sharedData->table->mbcs;
 606
 607     table=mbcsTable->fromUnicodeTable;
 608     bytes=mbcsTable->fromUnicodeBytes;
 609     results=(const uint16_t *)bytes;
 610
 611     /*
 612      * Check that this is an EBCDIC table with SBCS portion -
 613      * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
 614      *
 615      * If not, ignore the option. Options are always ignored if they do not apply.
 616      */
 617     if(!(
 618          (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
 619          mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
 620          mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
 621     )) {
 622         return FALSE;
 623     }
 624
 625     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 626         if(!(
 627              EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
 628              EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
 629         )) {
 630             return FALSE;
 631         }
 632     } else /* MBCS_OUTPUT_2_SISO */ {
 633         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
 634         if(!(
 635              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
 636              EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
 637         )) {
 638             return FALSE;
 639         }
 640
 641         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
 642         if(!(
 643              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
 644              EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
 645         )) {
 646             return FALSE;
 647         }
 648     }
 649
 650     /*
 651      * The table has an appropriate format.
 652      * Allocate and build
 653      * - a modified to-Unicode state table
 654      * - a modified from-Unicode output array
 655      * - a converter name string with the swap option appended
 656      */
 657     sizeofFromUBytes=_MBCSSizeofFromUBytes(mbcsTable);
 658     size=
 659         mbcsTable->countStates*1024+
 660         sizeofFromUBytes+
 661         UCNV_MAX_CONVERTER_NAME_LENGTH+20;
 662     p=(uint8_t *)uprv_malloc(size);
 663     if(p==NULL) {
 664         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 665         return FALSE;
 666     }
 667
 668     /* copy and modify the to-Unicode state table */
 669     newStateTable=(int32_t (*)[256])p;
 670     uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
 671
 672     newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
 673     newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
 674
 675     /* copy and modify the from-Unicode result table */
 676     newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
 677     uprv_memcpy(newResults, bytes, sizeofFromUBytes);
 678
 679     /* conveniently, the table access macros work on the left side of expressions */
 680     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 681         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
 682         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
 683     } else /* MBCS_OUTPUT_2_SISO */ {
 684         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
 685         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
 686
 687         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
 688         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
 689     }
 690
 691     /* set the canonical converter name */
 692     name=(char *)newResults+sizeofFromUBytes;
 693     uprv_strcpy(name, sharedData->staticData->name);
 694     uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
 695
 696     /* set the pointers */
 697     umtx_lock(NULL);
 698     if(mbcsTable->swapLFNLStateTable==NULL) {
 699         mbcsTable->swapLFNLStateTable=newStateTable;
 700         mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
 701         mbcsTable->swapLFNLName=name;
 702
 703         newStateTable=NULL;
 704     }
 705     umtx_unlock(NULL);
 706
 707     /* release the allocated memory if another thread beat us to it */
 708     if(newStateTable!=NULL) {
 709         uprv_free(newStateTable);
 710     }
 711     return TRUE;
 712 }
 713
 714 /* MBCS setup functions ----------------------------------------------------- */
 715
 716 static void
 717 _MBCSLoad(UConverterSharedData *sharedData,
 718           const uint8_t *raw,
 719           UErrorCode *pErrorCode) {
 720     UDataInfo info;
 721     UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
 722     _MBCSHeader *header=(_MBCSHeader *)raw;
 723
 724     if(header->version[0]!=4) {
 725         *pErrorCode=U_INVALID_TABLE_FORMAT;
 726         return;
 727     }
 728
 729     mbcsTable->countStates=(uint8_t)header->countStates;
 730     mbcsTable->countToUFallbacks=header->countToUFallbacks;
 731     mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader));
 732     mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
 733     mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
 734
 735     mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
 736     mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
 737     mbcsTable->fromUBytesLength=header->fromUBytesLength;
 738     mbcsTable->outputType=(uint8_t)header->flags;
 739
 740     /* make sure that the output type is known */
 741     switch(mbcsTable->outputType) {
 742     case MBCS_OUTPUT_1:
 743     case MBCS_OUTPUT_2:
 744     case MBCS_OUTPUT_3:
 745     case MBCS_OUTPUT_4:
 746     case MBCS_OUTPUT_3_EUC:
 747     case MBCS_OUTPUT_4_EUC:
 748     case MBCS_OUTPUT_2_SISO:
 749         /* OK */
 750         break;
 751     default:
 752         *pErrorCode=U_INVALID_TABLE_FORMAT;
 753         return;
 754     }
 755
 756     /*
 757      * converter versions 6.1 and up contain a unicodeMask that is
 758      * used here to select the most efficient function implementations
 759      */
 760     info.size=sizeof(UDataInfo);
 761     udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
 762     if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
 763         /* mask off possible future extensions to be safe */
 764         mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
 765     } else {
 766         /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
 767         mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
 768     }
 769 }
 770
 771 static void
 772 _MBCSUnload(UConverterSharedData *sharedData) {
 773     UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
 774
 775     if(mbcsTable->swapLFNLStateTable!=NULL) {
 776         uprv_free(mbcsTable->swapLFNLStateTable);
 777     }
 778 }
 779
 780 static void
 781 _MBCSReset(UConverter *cnv, UConverterResetChoice choice) {
 782     if(choice<=UCNV_RESET_TO_UNICODE) {
 783         /* toUnicode */
 784         cnv->toUnicodeStatus=0;     /* offset */
 785         cnv->mode=0;                /* state */
 786         cnv->toULength=0;           /* byteIndex */
 787     }
 788     if(choice!=UCNV_RESET_TO_UNICODE) {
 789         /* fromUnicode */
 790         cnv->fromUSurrogateLead=0;
 791         cnv->fromUnicodeStatus=1;   /* prevLength */
 792     }
 793 }
 794
 795 static void
 796 _MBCSOpen(UConverter *cnv,
 797           const char *name,
 798           const char *locale,
 799           uint32_t options,
 800           UErrorCode *pErrorCode) {
 801     if((options&UCNV_OPTION_SWAP_LFNL)!=0) {
 802         /* do this because double-checked locking is broken */
 803         UBool isCached;
 804
 805         umtx_lock(NULL);
 806         isCached=cnv->sharedData->table->mbcs.swapLFNLStateTable!=NULL;
 807         umtx_unlock(NULL);
 808
 809         if(!isCached) {
 810             if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
 811                 /* the option does not apply, remove it */
 812                 cnv->options&=~UCNV_OPTION_SWAP_LFNL;
 813             }
 814         }
 815     }
 816
 817
 818     if(uprv_strstr(name, "18030")!=NULL) {
 819         if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) {
 820             /* set a flag for GB 18030 mode, which changes the callback behavior */
 821             cnv->options|=_MBCS_OPTION_GB18030;
 822         }
 823     }
 824
 825     _MBCSReset(cnv, UCNV_RESET_BOTH);
 826 }
 827
 828 static const char *
 829 _MBCSGetName(const UConverter *cnv) {
 830     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->table->mbcs.swapLFNLName!=NULL) {
 831         return cnv->sharedData->table->mbcs.swapLFNLName;
 832     } else {
 833         return cnv->sharedData->staticData->name;
 834     }
 835 }
 836
 837 /* MBCS-to-Unicode conversion functions ------------------------------------- */
 838
 839 static UChar32
 840 _MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
 841     const _MBCSToUFallback *toUFallbacks;
 842     uint32_t i, start, limit;
 843
 844     limit=mbcsTable->countToUFallbacks;
 845     if(limit>0) {
 846         /* do a binary search for the fallback mapping */
 847         toUFallbacks=mbcsTable->toUFallbacks;
 848         start=0;
 849         while(start<limit-1) {
 850             i=(start+limit)/2;
 851             if(offset<toUFallbacks[i].offset) {
 852                 limit=i;
 853             } else {
 854                 start=i;
 855             }
 856         }
 857
 858         /* did we really find it? */
 859         if(offset==toUFallbacks[start].offset) {
 860             return toUFallbacks[start].codePoint;
 861         }
 862     }
 863
 864     return 0xfffe;
 865 }
 866
 /*
  * Convert a multi-byte (MBCS) byte stream to UTF-16, optionally recording for
  * each output UChar the source index of the character that produced it.
  *
  * Input bytes are read from pArgs->source up to pArgs->sourceLimit, output
  * goes to pArgs->target up to pArgs->targetLimit, and offsets are written to
  * pArgs->offsets if that pointer is not NULL. All three pointers are
  * written back to pArgs before returning.
  *
  * Converters whose table has exactly one state are handed off to
  * _MBCSSingleToBMPWithOffsets() or _MBCSSingleToUnicodeWithOffsets(),
  * depending on whether unicodeMask has UCNV_HAS_SUPPLEMENTARY set.
  *
  * The state of a partially consumed character is kept in the converter
  * between calls: cnv->toUnicodeStatus (offset), cnv->mode (state),
  * cnv->toULength (byteIndex) and cnv->toUBytes (the bytes seen so far).
  *
  * *pErrorCode values set here:
  * - U_BUFFER_OVERFLOW_ERROR when the target fills up; the second code unit
  *   of a surrogate pair that does not fit is saved in cnv->UCharErrorBuffer
  * - U_INVALID_CHAR_FOUND / U_ILLEGAL_CHAR_FOUND right before toUCallback()
  *   is called for an unassigned / illegal byte sequence
  * - U_TRUNCATED_CHAR_FOUND when pArgs->flush is set, all input was consumed,
  *   no other error is pending and a byte sequence is still incomplete
  */
 867 U_CFUNC void
 868 _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 869                           UErrorCode *pErrorCode) {
 870     UConverter *cnv;
 871     const uint8_t *source, *sourceLimit;
 872     UChar *target;
 873     const UChar *targetLimit;
 874     int32_t *offsets;
 875
 876     const int32_t (*stateTable)[256];
 877     const uint16_t *unicodeCodeUnits;
 878
         /* conversion state of the current character; loaded from and stored back into cnv */
 879     uint32_t offset;
 880     uint8_t state;
 881     int8_t byteIndex;
 882     uint8_t *bytes;
 883
 884     int32_t sourceIndex, nextSourceIndex;
 885
 886     int32_t entry;
 887     UChar c;
 888     uint8_t action;
 889     UConverterCallbackReason reason;
 890
 891     /* use optimized function if possible */
 892     cnv=pArgs->converter;
 893     if(cnv->sharedData->table->mbcs.countStates==1) {
 894         if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
 895             _MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
 896         } else {
 897             _MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
 898         }
 899         return;
 900     }
 901
 902     /* set up the local pointers */
 903     source=(const uint8_t *)pArgs->source;
 904     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 905     target=pArgs->target;
 906     targetLimit=pArgs->targetLimit;
 907     offsets=pArgs->offsets;
 908
         /* with the swap-LF/NL option, use the alternate state table from the shared data */
 909     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
 910         stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
 911     } else {
 912         stateTable=cnv->sharedData->table->mbcs.stateTable;
 913     }
 914     unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
 915
 916     /* get the converter state from UConverter */
 917     offset=cnv->toUnicodeStatus;
 918     state=(uint8_t)(cnv->mode);
 919     byteIndex=cnv->toULength;
 920     bytes=cnv->toUBytes;
 921
 922     /* sourceIndex=-1 if the current character began in the previous buffer */
 923     sourceIndex=byteIndex==0 ? 0 : -1;
 924     nextSourceIndex=0;
 925
 926     /* conversion loop */
 927     while(source<sourceLimit) {
 928         /*
 929          * This following test is to see if available input would overflow the output.
 930          * It does not catch output of more than one code unit that
 931          * overflows as a result of a surrogate pair or callback output
 932          * from the last source byte.
 933          * Therefore, those situations also test for overflows and will
 934          * then break the loop, too.
 935          */
 936         if(target<targetLimit) {
 937             ++nextSourceIndex;
                 /* consume one byte, keep a copy in bytes[] for a possible callback, and look up its entry */
 938             entry=stateTable[state][bytes[byteIndex++]=*source++];
 939             if(MBCS_ENTRY_IS_TRANSITION(entry)) {
                     /* not the last byte of a character yet: switch state and accumulate the offset */
 940                 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
 941                 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
 942             } else {
 943                 /* set the next state early so that we can reuse the entry variable */
 944                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
 945
 946                 /*
 947                  * An if-else-if chain provides more reliable performance for
 948                  * the most common cases compared to a switch.
 949                  */
 950                 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
 951                 if(action==MBCS_STATE_VALID_16) {
 952                     offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
 953                     c=unicodeCodeUnits[offset];
 954                     if(c<0xfffe) {
 955                         /* output BMP code point */
 956                         *target++=c;
 957                         if(offsets!=NULL) {
 958                             *offsets++=sourceIndex;
 959                         }
 960                     } else if(c==0xfffe) {
                             /* 0xfffe in the table: try the fallback list (if fallbacks are enabled), else unassigned */
 961                         if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
 962                             /* output fallback BMP code point */
 963                             *target++=(UChar)entry;
 964                             if(offsets!=NULL) {
 965                                 *offsets++=sourceIndex;
 966                             }
 967                         } else {
 968                             /* callback(unassigned) */
 969                             goto unassigned;
 970                         }
 971                     } else {
 972                         /* callback(illegal) */
 973                         goto illegal;
 974                     }
 975                 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
 976                     /* output BMP code point */
 977                     *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
 978                     if(offsets!=NULL) {
 979                         *offsets++=sourceIndex;
 980                     }
 981                 } else if(action==MBCS_STATE_VALID_16_PAIR) {
 982                     offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
                         /* read the first of two table units; offset then points at the second one */
 983                     c=unicodeCodeUnits[offset++];
 984                     if(c<0xd800) {
 985                         /* output BMP code point below 0xd800 */
 986                         *target++=c;
 987                         if(offsets!=NULL) {
 988                             *offsets++=sourceIndex;
 989                         }
 990                     } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
 991                         /* output roundtrip or fallback surrogate pair */
                             /* for c in dc00..dfff (accepted above only with fallbacks on) the mask clears bit 0x400, yielding d800..dbff */
 992                         *target++=(UChar)(c&0xdbff);
 993                         if(offsets!=NULL) {
 994                             *offsets++=sourceIndex;
 995                         }
 996                         if(target<targetLimit) {
 997                             *target++=unicodeCodeUnits[offset];
 998                             if(offsets!=NULL) {
 999                                 *offsets++=sourceIndex;
1000                             }
1001                         } else {
1002                             /* target overflow */
1003                             cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
1004                             cnv->UCharErrorBufferLength=1;
1005                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1006
                                 /* the character itself is complete: clear its state before leaving the loop */
1007                             offset=0;
1008                             byteIndex=0;
1009                             break;
1010                         }
1011                     } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
1012                         /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1013                         *target++=unicodeCodeUnits[offset];
1014                         if(offsets!=NULL) {
1015                             *offsets++=sourceIndex;
1016                         }
1017                     } else if(c==0xffff) {
1018                         /* callback(illegal) */
1019                         goto illegal;
1020                     } else {
1021                         /* callback(unassigned) */
1022                         goto unassigned;
1023                     }
1024                 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
1025 valid20:
                         /* also reached by goto from the MBCS_STATE_FALLBACK_DIRECT_20 branch below */
1026                     entry=MBCS_ENTRY_FINAL_VALUE(entry);
1027                     /* output surrogate pair */
1028                     *target++=(UChar)(0xd800|(UChar)(entry>>10));
1029                     if(offsets!=NULL) {
1030                         *offsets++=sourceIndex;
1031                     }
1032                     c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1033                     if(target<targetLimit) {
1034                         *target++=c;
1035                         if(offsets!=NULL) {
1036                             *offsets++=sourceIndex;
1037                         }
1038                     } else {
1039                         /* target overflow */
1040                         cnv->UCharErrorBuffer[0]=c;
1041                         cnv->UCharErrorBufferLength=1;
1042                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1043
1044                         offset=0;
1045                         byteIndex=0;
1046                         break;
1047                     }
1048                 } else if(action==MBCS_STATE_CHANGE_ONLY) {
1049                     /*
1050                      * This serves as a state change without any output.
1051                      * It is useful for reading simple stateful encodings,
1052                      * for example using just Shift-In/Shift-Out codes.
1053                      * The 21 unused bits may later be used for more sophisticated
1054                      * state transitions.
1055                      */
1056                 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1057                     if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1058                         /* callback(unassigned) */
1059                         goto unassigned;
1060                     }
1061                     /* output BMP code point */
1062                     *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1063                     if(offsets!=NULL) {
1064                         *offsets++=sourceIndex;
1065                     }
1066                 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1067                     if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1068                         /* callback(unassigned) */
1069                         goto unassigned;
1070                     }
1071                     goto valid20;
1072                 } else if(action==MBCS_STATE_UNASSIGNED) {
1073                     /* callback(unassigned) */
1074                     goto unassigned;
1075                 } else if(action==MBCS_STATE_ILLEGAL) {
1076                     /* callback(illegal) */
1077                     goto illegal;
1078                 } else {
1079                     /* reserved, must never occur */
1080                 }
1081
1082                 /* normal end of action codes: prepare for a new character */
1083                 offset=0;
1084                 byteIndex=0;
1085                 sourceIndex=nextSourceIndex;
1086                 continue;
1087
                     /* the labels below are reached only via goto; the normal path has already continued above */
1088 illegal:
1089                 reason=UCNV_ILLEGAL;
1090                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1091                 goto callback;
1092 unassigned:
1093                 reason=UCNV_UNASSIGNED;
1094                 *pErrorCode=U_INVALID_CHAR_FOUND;
1095 callback:
1096                 /* call the callback function with all the preparations and post-processing */
1097                 /* update the arguments structure */
1098                 pArgs->source=(const char *)source;
1099                 pArgs->target=target;
1100                 pArgs->offsets=offsets;
1101
1102                 /* set the converter state in UConverter to deal with the next character */
1103                 cnv->toUnicodeStatus=0;
1104                 cnv->mode=state;
1105                 cnv->toULength=0;
1106
1107                 /* call the callback function */
1108                 toUCallback(cnv, cnv->toUContext, pArgs, (const char *)bytes, byteIndex, reason, pErrorCode);
1109
1110                 /* get the converter state from UConverter */
1111                 offset=cnv->toUnicodeStatus;
1112                 state=(uint8_t)cnv->mode;
1113                 byteIndex=cnv->toULength;
1114
1115                 /* update target and deal with offsets if necessary */
1116                 offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
1117                 target=pArgs->target;
1118
1119                 /* update the source pointer and index */
1120                 sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
1121                 source=(const uint8_t *)pArgs->source;
1122
1123                 /*
1124                  * If the callback overflowed the target, then we need to
1125                  * stop here with an overflow indication.
1126                  */
1127                 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1128                     break;
1129                 } else if(U_FAILURE(*pErrorCode)) {
1130                     /* break on error */
1131                     offset=0;
1132                     state=0;
1133                     byteIndex=0;
1134                     break;
1135                 } else if(cnv->UCharErrorBufferLength>0) {
1136                     /* target is full */
1137                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1138                     break;
1139                 }
1140
1141                 /*
1142                  * We do not need to repeat the statements from the normal
1143                  * end of the action codes because we already updated all the
1144                  * necessary variables.
1145                  */
1146             }
1147         } else {
1148             /* target is full */
1149             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1150             break;
1151         }
1152     }
1153
1154     if(pArgs->flush && source>=sourceLimit) {
1155         /* reset the state for the next conversion */
1156         if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
1157             /* a character byte sequence remains incomplete */
1158             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1159         }
1160         cnv->toUnicodeStatus=0;
1161         cnv->mode=0;
1162         cnv->toULength=0;
1163     } else {
1164         /* set the converter state back into UConverter */
1165         cnv->toUnicodeStatus=offset;
1166         cnv->mode=state;
1167         cnv->toULength=byteIndex;
1168     }
1169
1170     /* write back the updated pointers */
1171     pArgs->source=(const char *)source;
1172     pArgs->target=target;
1173     pArgs->offsets=offsets;
1174 }
1175
1176 /* This version of _MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
     /*
      * Only row 0 of the state table is used and every entry is treated as a
      * final entry, so exactly one source byte is consumed per loop iteration
      * and no partial-character state is read from or stored in the converter.
      *
      * Input, output and offsets are taken from and written back to pArgs in the
      * same way as in _MBCSToUnicodeWithOffsets(). Unassigned and illegal bytes
      * are passed to toUCallback() one byte at a time with *pErrorCode set to
      * U_INVALID_CHAR_FOUND or U_ILLEGAL_CHAR_FOUND; a full target results in
      * U_BUFFER_OVERFLOW_ERROR, with a trail surrogate that does not fit saved
      * in cnv->UCharErrorBuffer.
      */
1177 static void
1178 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1179                                 UErrorCode *pErrorCode) {
1180     UConverter *cnv;
1181     const uint8_t *source, *sourceLimit;
1182     UChar *target;
1183     const UChar *targetLimit;
1184     int32_t *offsets;
1185
1186     const int32_t (*stateTable)[256];
1187
1188     int32_t sourceIndex, nextSourceIndex;
1189
1190     int32_t entry;
1191     UChar c;
1192     uint8_t action;
1193     UConverterCallbackReason reason;
1194
1195     /* set up the local pointers */
1196     cnv=pArgs->converter;
1197     source=(const uint8_t *)pArgs->source;
1198     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1199     target=pArgs->target;
1200     targetLimit=pArgs->targetLimit;
1201     offsets=pArgs->offsets;
1202
         /* with the swap-LF/NL option, use the alternate state table from the shared data */
1203     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1204         stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1205     } else {
1206         stateTable=cnv->sharedData->table->mbcs.stateTable;
1207     }
1208
1209     /* every character is a single byte here, so indexing always starts at 0 in the current buffer */
1210     sourceIndex=0;
1211     nextSourceIndex=0;
1212
1213     /* conversion loop */
1214     while(source<sourceLimit) {
1215         /*
1216          * This following test is to see if available input would overflow the output.
1217          * It does not catch output of more than one code unit that
1218          * overflows as a result of a surrogate pair or callback output
1219          * from the last source byte.
1220          * Therefore, those situations also test for overflows and will
1221          * then break the loop, too.
1222          */
1223         if(target<targetLimit) {
1224             ++nextSourceIndex;
1225             entry=stateTable[0][*source++];
1226             /* MBCS_ENTRY_IS_FINAL(entry) */
1227
1228             /* test the most common case first */
1229             if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1230                 /* output BMP code point */
1231                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1232                 if(offsets!=NULL) {
1233                     *offsets++=sourceIndex;
1234                 }
1235
1236                 /* normal end of action codes: prepare for a new character */
1237                 sourceIndex=nextSourceIndex;
1238                 continue;
1239             }
1240
1241             /*
1242              * An if-else-if chain provides more reliable performance for
1243              * the most common cases compared to a switch.
1244              */
1245             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1246             if(action==MBCS_STATE_VALID_DIRECT_20) {
1247 valid20:
                     /* also reached by goto from the MBCS_STATE_FALLBACK_DIRECT_20 branch below */
1248                 entry=MBCS_ENTRY_FINAL_VALUE(entry);
1249                 /* output surrogate pair */
1250                 *target++=(UChar)(0xd800|(UChar)(entry>>10));
1251                 if(offsets!=NULL) {
1252                     *offsets++=sourceIndex;
1253                 }
1254                 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1255                 if(target<targetLimit) {
1256                     *target++=c;
1257                     if(offsets!=NULL) {
1258                         *offsets++=sourceIndex;
1259                     }
1260                 } else {
1261                     /* target overflow */
1262                     cnv->UCharErrorBuffer[0]=c;
1263                     cnv->UCharErrorBufferLength=1;
1264                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1265                     break;
1266                 }
1267             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1268                 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1269                     /* callback(unassigned) */
1270                     goto unassigned;
1271                 }
1272                 /* output BMP code point */
1273                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1274                 if(offsets!=NULL) {
1275                     *offsets++=sourceIndex;
1276                 }
1277             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1278                 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1279                     /* callback(unassigned) */
1280                     goto unassigned;
1281                 }
1282                 goto valid20;
1283             } else if(action==MBCS_STATE_UNASSIGNED) {
1284                 /* callback(unassigned) */
1285                 goto unassigned;
1286             } else if(action==MBCS_STATE_ILLEGAL) {
1287                 /* callback(illegal) */
1288                 reason=UCNV_ILLEGAL;
1289                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1290                 goto callback;
1291             } else {
1292                 /* reserved, must never occur */
1293             }
1294
1295             /* normal end of action codes: prepare for a new character */
1296             sourceIndex=nextSourceIndex;
1297             continue;
1298
                 /* the labels below are reached only via goto; the normal path has already continued above */
1299 unassigned:
1300             reason=UCNV_UNASSIGNED;
1301             *pErrorCode=U_INVALID_CHAR_FOUND;
1302 callback:
1303             /* call the callback function with all the preparations and post-processing */
1304             /* update the arguments structure */
1305             pArgs->source=(const char *)source;
1306             pArgs->target=target;
1307             pArgs->offsets=offsets;
1308
1309             /* call the callback function with the single byte that was just consumed */
1310             toUCallback(cnv, cnv->toUContext, pArgs, (const char *)(source-1), 1, reason, pErrorCode);
1311
1312             /* update target and deal with offsets if necessary */
1313             offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
1314             target=pArgs->target;
1315
1316             /* update the source pointer and index */
1317             sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
1318             source=(const uint8_t *)pArgs->source;
1319
1320             /*
1321              * If the callback overflowed the target, then we need to
1322              * stop here with an overflow indication.
1323              */
1324             if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1325                 break;
1326             } else if(U_FAILURE(*pErrorCode)) {
1327                 /* break on error */
1328                 break;
1329             } else if(cnv->UCharErrorBufferLength>0) {
1330                 /* target is full */
1331                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1332                 break;
1333             }
1334
1335             /*
1336              * We do not need to repeat the statements from the normal
1337              * end of the action codes because we already updated all the
1338              * necessary variables.
1339              */
1340         } else {
1341             /* target is full */
1342             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1343             break;
1344         }
1345     }
1346
1347     /* write back the updated pointers */
1348     pArgs->source=(const char *)source;
1349     pArgs->target=target;
1350     pArgs->offsets=offsets;
1351 }
1352
1353 /*
1354  * This version of _MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1355  * that only map to and from the BMP.
1356  * In addition to single-byte optimizations, the offset calculations
1357  * become much easier.
1358  */
1359 static void
1360 _MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
1361                             UErrorCode *pErrorCode) {
1362     UConverter *cnv;
1363     const uint8_t *source, *sourceLimit, *lastSource;
1364     UChar *target;
1365     int32_t targetCapacity, length;
1366     int32_t *offsets;
1367
1368     const int32_t (*stateTable)[256];
1369
1370     int32_t sourceIndex;
1371
1372     int32_t entry;
1373     uint8_t action;
1374     UConverterCallbackReason reason;
1375
1376     /* set up the local pointers */
1377     cnv=pArgs->converter;
1378     source=(const uint8_t *)pArgs->source;
1379     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1380     target=pArgs->target;
1381     targetCapacity=pArgs->targetLimit-pArgs->target;
1382     offsets=pArgs->offsets;
1383
1384     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1385         stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1386     } else {
1387         stateTable=cnv->sharedData->table->mbcs.stateTable;
1388     }
1389
1390     /* sourceIndex=-1 if the current character began in the previous buffer */
1391     sourceIndex=0;
1392     lastSource=source;
1393
1394     /*
1395      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1396      * for the minimum of the sourceLength and targetCapacity
1397      */
1398     length=sourceLimit-source;
1399     if(length<targetCapacity) {
1400         targetCapacity=length;
1401     }
1402
1403 #if MBCS_UNROLL_SINGLE_TO_BMP
1404     /* unrolling makes it faster on Pentium III/Windows 2000 */
1405     /* unroll the loop with the most common case */
1406 unrolled:
1407     if(targetCapacity>=16) {
1408         int32_t count, loops, oredEntries;
1409
1410         loops=count=targetCapacity>>4;
1411         do {
1412             oredEntries=entry=stateTable[0][*source++];
1413             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1414             oredEntries|=entry=stateTable[0][*source++];
1415             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1416             oredEntries|=entry=stateTable[0][*source++];
1417             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1418             oredEntries|=entry=stateTable[0][*source++];
1419             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1420             oredEntries|=entry=stateTable[0][*source++];
1421             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1422             oredEntries|=entry=stateTable[0][*source++];
1423             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1424             oredEntries|=entry=stateTable[0][*source++];
1425             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1426             oredEntries|=entry=stateTable[0][*source++];
1427             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1428             oredEntries|=entry=stateTable[0][*source++];
1429             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1430             oredEntries|=entry=stateTable[0][*source++];
1431             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1432             oredEntries|=entry=stateTable[0][*source++];
1433             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1434             oredEntries|=entry=stateTable[0][*source++];
1435             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1436             oredEntries|=entry=stateTable[0][*source++];
1437             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1438             oredEntries|=entry=stateTable[0][*source++];
1439             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1440             oredEntries|=entry=stateTable[0][*source++];
1441             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1442             oredEntries|=entry=stateTable[0][*source++];
1443             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1444
1445             /* were all 16 entries really valid? */
1446             if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
1447                 /* no, return to the first of these 16 */
1448                 source-=16;
1449                 target-=16;
1450                 break;
1451             }
1452         } while(--count>0);
1453         count=loops-count;
1454         targetCapacity-=16*count;
1455
1456         if(offsets!=NULL) {
1457             lastSource+=16*count;
1458             while(count>0) {
1459                 *offsets++=sourceIndex++;
1460                 *offsets++=sourceIndex++;
1461                 *offsets++=sourceIndex++;
1462                 *offsets++=sourceIndex++;
1463                 *offsets++=sourceIndex++;
1464                 *offsets++=sourceIndex++;
1465                 *offsets++=sourceIndex++;
1466                 *offsets++=sourceIndex++;
1467                 *offsets++=sourceIndex++;
1468                 *offsets++=sourceIndex++;
1469                 *offsets++=sourceIndex++;
1470                 *offsets++=sourceIndex++;
1471                 *offsets++=sourceIndex++;
1472                 *offsets++=sourceIndex++;
1473                 *offsets++=sourceIndex++;
1474                 *offsets++=sourceIndex++;
1475                 --count;
1476             }
1477         }
1478     }
1479 #endif
1480
1481     /* conversion loop */
1482     while(targetCapacity>0) {
1483         entry=stateTable[0][*source++];
1484         /* MBCS_ENTRY_IS_FINAL(entry) */
1485
1486         /* test the most common case first */
1487         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1488             /* output BMP code point */
1489             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1490             --targetCapacity;
1491             continue;
1492         }
1493
1494         /*
1495          * An if-else-if chain provides more reliable performance for
1496          * the most common cases compared to a switch.
1497          */
1498         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1499         if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1500             if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1501                 /* callback(unassigned) */
1502                 reason=UCNV_UNASSIGNED;
1503                 *pErrorCode=U_INVALID_CHAR_FOUND;
1504             }
1505             /* output BMP code point */
1506             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1507             --targetCapacity;
1508             continue;
1509         } else if(action==MBCS_STATE_UNASSIGNED) {
1510             /* callback(unassigned) */
1511             reason=UCNV_UNASSIGNED;
1512             *pErrorCode=U_INVALID_CHAR_FOUND;
1513         } else if(action==MBCS_STATE_ILLEGAL) {
1514             /* callback(illegal) */
1515             reason=UCNV_ILLEGAL;
1516             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1517         } else {
1518             /* reserved, must never occur */
1519             continue;
1520         }
1521
1522         /* call the callback function with all the preparations and post-processing */
1523         /* set offsets since the start or the last callback */
1524         if(offsets!=NULL) {
1525             int32_t count=(int32_t)(source-lastSource);
1526
1527             /* predecrement: do not set the offset for the callback-causing character */
1528             while(--count>0) {
1529                 *offsets++=sourceIndex++;
1530             }
1531             /* offset and sourceIndex are now set for the current character */
1532         }
1533
1534         /* update the arguments structure */
1535         pArgs->source=(const char *)source;
1536         pArgs->target=target;
1537         pArgs->offsets=offsets;
1538
1539         /* call the callback function */
1540         toUCallback(cnv, cnv->toUContext, pArgs, (const char *)(source-1), 1, reason, pErrorCode);
1541
1542         /* update target and deal with offsets if necessary */
1543         offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
1544         target=pArgs->target;
1545
1546         /* update the source pointer and index */
1547         sourceIndex+=1+((const uint8_t *)pArgs->source-source);
1548         source=lastSource=(const uint8_t *)pArgs->source;
1549         targetCapacity=pArgs->targetLimit-target;
1550         length=sourceLimit-source;
1551         if(length<targetCapacity) {
1552             targetCapacity=length;
1553         }
1554
1555         /*
1556          * If the callback overflowed the target, then we need to
1557          * stop here with an overflow indication.
1558          */
1559         if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1560             break;
1561         } else if(U_FAILURE(*pErrorCode)) {
1562             /* break on error */
1563             break;
1564         } else if(cnv->UCharErrorBufferLength>0) {
1565             /* target is full */
1566             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1567             break;
1568         }
1569
1570 #if MBCS_UNROLL_SINGLE_TO_BMP
1571         /* unrolling makes it faster on Pentium III/Windows 2000 */
1572         goto unrolled;
1573 #endif
1574     }
1575
1576     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
1577         /* target is full */
1578         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1579     }
1580
1581     /* set offsets since the start or the last callback */
1582     if(offsets!=NULL) {
1583         size_t count=source-lastSource;
1584         while(count>0) {
1585             *offsets++=sourceIndex++;
1586             --count;
1587         }
1588     }
1589
1590     /* write back the updated pointers */
1591     pArgs->source=(const char *)source;
1592     pArgs->target=target;
1593     pArgs->offsets=offsets;
1594 }
1595
1596 static UChar32
1597 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
1598                   UErrorCode *pErrorCode) {
1599     UChar buffer[UTF_MAX_CHAR_LENGTH];
1600
1601     UConverter *cnv;
1602     const uint8_t *source, *sourceLimit;
1603
1604     const int32_t (*stateTable)[256];
1605     const uint16_t *unicodeCodeUnits;
1606
1607     uint32_t offset;
1608     uint8_t state;
1609     int8_t byteIndex;
1610     uint8_t *bytes;
1611
1612     int32_t entry;
1613     UChar32 c;
1614     uint8_t action;
1615     UConverterCallbackReason reason;
1616
1617     /* use optimized function if possible */
1618     cnv=pArgs->converter;
1619     if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
1620         /*
1621          * Calling the inefficient, generic getNextUChar() lets us deal correctly
1622          * with the rare case of a codepage that maps single surrogates
1623          * without adding the complexity to this already complicated function here.
1624          */
1625         return ucnv_getNextUCharFromToUImpl(pArgs, _MBCSToUnicodeWithOffsets, TRUE, pErrorCode);
1626     } else if(cnv->sharedData->table->mbcs.countStates==1) {
1627         return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
1628     }
1629
1630     /* set up the local pointers */
1631     source=(const uint8_t *)pArgs->source;
1632     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1633
1634     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1635         stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1636     } else {
1637         stateTable=cnv->sharedData->table->mbcs.stateTable;
1638     }
1639     unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
1640
1641     /* get the converter state from UConverter */
1642     offset=cnv->toUnicodeStatus;
1643     state=(uint8_t)(cnv->mode);
1644     byteIndex=cnv->toULength;
1645     bytes=cnv->toUBytes;
1646
1647     /* conversion loop */
1648     while(source<sourceLimit) {
1649         entry=stateTable[state][bytes[byteIndex++]=*source++];
1650         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1651             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1652             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1653         } else {
1654             /* set the next state early so that we can reuse the entry variable */
1655             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1656
1657             /*
1658              * An if-else-if chain provides more reliable performance for
1659              * the most common cases compared to a switch.
1660              */
1661             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1662             if(action==MBCS_STATE_VALID_16) {
1663                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1664                 c=unicodeCodeUnits[offset];
1665                 if(c<0xfffe) {
1666                     /* output BMP code point */
1667                     goto finish;
1668                 } else if(c==0xfffe) {
1669                     if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
1670                         goto finish;
1671                     }
1672                     /* callback(unassigned) */
1673                     goto unassigned;
1674                 } else {
1675                     /* callback(illegal) */
1676                     goto illegal;
1677                 }
1678             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
1679                 /* output BMP code point */
1680                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1681                 goto finish;
1682             } else if(action==MBCS_STATE_VALID_16_PAIR) {
1683                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1684                 c=unicodeCodeUnits[offset++];
1685                 if(c<0xd800) {
1686                     /* output BMP code point below 0xd800 */
1687                     goto finish;
1688                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
1689                     /* output roundtrip or fallback supplementary code point */
1690                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
1691                     goto finish;
1692                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
1693                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1694                     c=unicodeCodeUnits[offset];
1695                     goto finish;
1696                 } else if(c==0xffff) {
1697                     /* callback(illegal) */
1698                     goto illegal;
1699                 } else {
1700                     /* callback(unassigned) */
1701                     goto unassigned;
1702                 }
1703             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
1704                 /* output supplementary code point */
1705                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1706                 goto finish;
1707             } else if(action==MBCS_STATE_CHANGE_ONLY) {
1708                 /*
1709                  * This serves as a state change without any output.
1710                  * It is useful for reading simple stateful encodings,
1711                  * for example using just Shift-In/Shift-Out codes.
1712                  * The 21 unused bits may later be used for more sophisticated
1713                  * state transitions.
1714                  */
1715             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1716                 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1717                     /* callback(unassigned) */
1718                     goto unassigned;
1719                 }
1720                 /* output BMP code point */
1721                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1722                 goto finish;
1723             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1724                 if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1725                     /* callback(unassigned) */
1726                     goto unassigned;
1727                 }
1728                 /* output supplementary code point */
1729                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1730                 goto finish;
1731             } else if(action==MBCS_STATE_UNASSIGNED) {
1732                 /* callback(unassigned) */
1733                 goto unassigned;
1734             } else if(action==MBCS_STATE_ILLEGAL) {
1735                 /* callback(illegal) */
1736                 goto illegal;
1737             } else {
1738                 /* reserved, must never occur */
1739             }
1740
1741             /* normal end of action codes: prepare for a new character */
1742             offset=0;
1743             byteIndex=0;
1744             continue;
1745
1746 illegal:
1747             reason=UCNV_ILLEGAL;
1748             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1749             goto callback;
1750 unassigned:
1751             reason=UCNV_UNASSIGNED;
1752             *pErrorCode=U_INVALID_CHAR_FOUND;
1753 callback:
1754             /* call the callback function with all the preparations and post-processing */
1755             /* update the arguments structure */
1756             pArgs->source=(const char *)source;
1757             pArgs->target=buffer;
1758             pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
1759
1760             /* set the converter state in UConverter to deal with the next character */
1761             cnv->toUnicodeStatus=0;
1762             cnv->mode=state;
1763             cnv->toULength=0;
1764
1765             /* call the callback function */
1766             toUCallback(cnv, cnv->toUContext, pArgs, (const char *)bytes, byteIndex, reason, pErrorCode);
1767
1768             /* get the converter state from UConverter */
1769             offset=cnv->toUnicodeStatus;
1770             state=(uint8_t)cnv->mode;
1771             byteIndex=cnv->toULength;
1772
1773             /* update the source pointer */
1774             source=(const uint8_t *)pArgs->source;
1775
1776             /*
1777              * return the first character if the callback wrote some
1778              * we do not need to goto finish because the converter state is already set
1779              */
1780             if(U_SUCCESS(*pErrorCode)) {
1781                 entry=pArgs->target-buffer;
1782                 if(entry>0) {
1783                     return ucnv_getUChar32KeepOverflow(cnv, buffer, entry);
1784                 }
1785                 /* else (callback did not write anything) continue */
1786             } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1787                 *pErrorCode=U_ZERO_ERROR;
1788                 return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
1789             } else {
1790                 /* break on error */
1791                 /* ### what if a callback set an error but _also_ generated output?! */
1792                 state=0;
1793                 c=0xffff;
1794                 goto finish;
1795             }
1796
1797             /*
1798              * We do not need to repeat the statements from the normal
1799              * end of the action codes because we already updated all the
1800              * necessary variables.
1801              */
1802         }
1803     }
1804
1805     if(byteIndex>0) {
1806         /* incomplete character byte sequence */
1807         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1808         state=0;
1809     } else {
1810         /* no output because of empty input or only state changes and skipping callbacks */
1811         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1812     }
1813     c=0xffff;
1814
1815 finish:
1816     /* set the converter state back into UConverter, ready for a new character */
1817     cnv->toUnicodeStatus=0;
1818     cnv->mode=state;
1819     cnv->toULength=0;
1820
1821     /* write back the updated pointer */
1822     pArgs->source=(const char *)source;
1823     return c;
1824 }
1825
1826 /*
1827  * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
1828  * We still need a conversion loop in case a skipping callback is called.
1829  */
1830 static UChar32
1831 _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
1832                         UErrorCode *pErrorCode) {
1833     UChar buffer[UTF_MAX_CHAR_LENGTH];
1834
1835     UConverter *cnv;
1836     const int32_t (*stateTable)[256];
1837     const uint8_t *source, *sourceLimit;
1838
1839     int32_t entry;
1840     uint8_t action;
1841     UConverterCallbackReason reason;
1842
1843     /* set up the local pointers */
1844     cnv=pArgs->converter;
1845     source=(const uint8_t *)pArgs->source;
1846     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1847     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1848         stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
1849     } else {
1850         stateTable=cnv->sharedData->table->mbcs.stateTable;
1851     }
1852
1853     /* conversion loop */
1854     while(source<sourceLimit) {
1855         entry=stateTable[0][*source++];
1856         /* MBCS_ENTRY_IS_FINAL(entry) */
1857
1858         /* write back the updated pointer early so that we can return directly */
1859         pArgs->source=(const char *)source;
1860
1861         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1862             /* output BMP code point */
1863             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1864         }
1865
1866         /*
1867          * An if-else-if chain provides more reliable performance for
1868          * the most common cases compared to a switch.
1869          */
1870         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1871         if(action==MBCS_STATE_VALID_DIRECT_20) {
1872             /* output supplementary code point */
1873             return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1874         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1875             if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1876                 /* callback(unassigned) */
1877                 reason=UCNV_UNASSIGNED;
1878                 *pErrorCode=U_INVALID_CHAR_FOUND;
1879             } else {
1880                 /* output BMP code point */
1881                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1882             }
1883         } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
1884             if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
1885                 /* callback(unassigned) */
1886                 reason=UCNV_UNASSIGNED;
1887                 *pErrorCode=U_INVALID_CHAR_FOUND;
1888             } else {
1889                 /* output supplementary code point */
1890                 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
1891             }
1892         } else if(action==MBCS_STATE_UNASSIGNED) {
1893             /* callback(unassigned) */
1894             reason=UCNV_UNASSIGNED;
1895             *pErrorCode=U_INVALID_CHAR_FOUND;
1896         } else if(action==MBCS_STATE_ILLEGAL) {
1897             /* callback(illegal) */
1898             reason=UCNV_ILLEGAL;
1899             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1900         } else {
1901             /* reserved, must never occur */
1902             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1903             return 0xffff;
1904         }
1905
1906         /* call the callback function with all the preparations and post-processing */
1907         /* update the arguments structure */
1908         pArgs->target=buffer;
1909         pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
1910
1911         /* call the callback function */
1912         toUCallback(cnv, cnv->toUContext, pArgs, (const char *)(source-1), 1, reason, pErrorCode);
1913
1914         /* update the source pointer */
1915         source=(const uint8_t *)pArgs->source;
1916
1917         /*
1918          * return the first character if the callback wrote some
1919          * we do not need to goto finish because the converter state is already set
1920          */
1921         if(U_SUCCESS(*pErrorCode)) {
1922             entry=pArgs->target-buffer;
1923             if(entry>0) {
1924                 return ucnv_getUChar32KeepOverflow(cnv, buffer, entry);
1925             }
1926             /* else (callback did not write anything) continue */
1927         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1928             *pErrorCode=U_ZERO_ERROR;
1929             return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
1930         } else {
1931             /* break on error */
1932             /* ### what if a callback set an error but _also_ generated output?! */
1933             return 0xffff;
1934         }
1935     }
1936
1937     /* no output because of empty input or only state changes and skipping callbacks */
1938     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1939     return 0xffff;
1940 }
1941
1942 /*
1943  * This is a simple version of getNextUChar() that is used
1944  * by other converter implementations.
1945  * It does not use state from the converter, nor error codes.
1946  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
1947  *
1948  * Return value:
1949  * U+fffe   unassigned
1950  * U+ffff   illegal
1951  * otherwise the Unicode code point
1952  */
1953 U_CFUNC UChar32
1954 _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
1955                         const char **pSource, const char *sourceLimit,
1956                         UBool useFallback) {
1957     const uint8_t *source;
1958
1959     const int32_t (*stateTable)[256];
1960     const uint16_t *unicodeCodeUnits;
1961
1962     uint32_t offset;
1963     uint8_t state, action;
1964
1965     int32_t entry;
1966
1967     /* set up the local pointers */
1968     source=(const uint8_t *)*pSource;
1969     if(source>=(const uint8_t *)sourceLimit) {
1970         /* no input at all: "illegal" */
1971         return 0xffff;
1972     }
1973
1974 #if 0
1975 /*
1976  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
1977  * TODO In future releases, verify that this function is never called for SBCS
1978  * conversions, i.e., that sharedData->table->mbcs.countStates==1 is still true.
1979  * Removal improves code coverage.
1980  */
1981     /* use optimized function if possible */
1982     if(sharedData->table->mbcs.countStates==1) {
1983         return _MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)(*(*pSource)++), useFallback);
1984     }
1985 #endif
1986
1987     stateTable=sharedData->table->mbcs.stateTable;
1988     unicodeCodeUnits=sharedData->table->mbcs.unicodeCodeUnits;
1989
1990     /* converter state */
1991     offset=0;
1992     state=0;
1993
1994     /* conversion loop */
1995     do {
1996         entry=stateTable[state][*source++];
1997         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1998             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1999             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2000         } else {
2001             *pSource=(const char *)source;
2002
2003             /*
2004              * An if-else-if chain provides more reliable performance for
2005              * the most common cases compared to a switch.
2006              */
2007             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2008             if(action==MBCS_STATE_VALID_16) {
2009                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2010                 entry=unicodeCodeUnits[offset];
2011                 if(entry!=0xfffe) {
2012                     return (UChar32)entry;
2013                 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2014                     return _MBCSGetFallback(&sharedData->table->mbcs, offset);
2015                 } else {
2016                     return 0xfffe;
2017                 }
2018             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2019                 /* output BMP code point */
2020                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2021             } else if(action==MBCS_STATE_VALID_16_PAIR) {
2022                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2023                 entry=unicodeCodeUnits[offset++];
2024                 if(entry<0xd800) {
2025                     /* output BMP code point below 0xd800 */
2026                     return (UChar32)entry;
2027                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? entry<=0xdfff : entry<=0xdbff) {
2028                     /* output roundtrip or fallback supplementary code point */
2029                     return (UChar32)(((entry&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
2030                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (entry&0xfffe)==0xe000 : entry==0xe000) {
2031                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2032                     return unicodeCodeUnits[offset];
2033                 } else if(entry==0xffff) {
2034                     return 0xffff;
2035                 } else {
2036                     return 0xfffe;
2037                 }
2038             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
2039                 /* output supplementary code point */
2040                 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2041             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2042                 if(!TO_U_USE_FALLBACK(useFallback)) {
2043                     return 0xfffe;
2044                 }
2045                 /* output BMP code point */
2046                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2047             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
2048                 if(!TO_U_USE_FALLBACK(useFallback)) {
2049                     return 0xfffe;
2050                 }
2051                 /* output supplementary code point */
2052                 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2053             } else if(action==MBCS_STATE_CHANGE_ONLY) {
2054                 /*
2055                  * This serves as a state change without any output.
2056                  * It is useful for reading simple stateful encodings,
2057                  * for example using just Shift-In/Shift-Out codes.
2058                  * The 21 unused bits may later be used for more sophisticated
2059                  * state transitions.
2060                  */
2061                 if(source==(const uint8_t *)sourceLimit) {
2062                     /* if there are only state changes, then return "unassigned" */
2063                     return 0xfffe;
2064                 }
2065             } else if(action==MBCS_STATE_UNASSIGNED) {
2066                 return 0xfffe;
2067             } else if(action==MBCS_STATE_ILLEGAL) {
2068                 return 0xffff;
2069             } else {
2070                 /* reserved, must never occur */
2071             }
2072
2073             /* state change only - prepare for a new character */
2074             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2075             offset=0;
2076         }
2077     } while(source<(const uint8_t *)sourceLimit);
2078
2079     *pSource=(const char *)source;
2080     return 0xffff;
2081 }
2082
2083 #if 0
2084 /*
2085  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2086  * Removal improves code coverage.
2087  */
2088 /**
2089  * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
2090  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2091  */
2092 U_CFUNC UChar32
2093 _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
2094                               uint8_t b, UBool useFallback) {
2095     int32_t entry;
2096     uint8_t action;
2097
2098     entry=sharedData->table->mbcs.stateTable[0][b];
2099     /* MBCS_ENTRY_IS_FINAL(entry) */
2100
2101     if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2102         /* output BMP code point */
2103         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2104     }
2105
2106     /*
2107      * An if-else-if chain provides more reliable performance for
2108      * the most common cases compared to a switch.
2109      */
2110     action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2111     if(action==MBCS_STATE_VALID_DIRECT_20) {
2112         /* output supplementary code point */
2113         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2114     } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2115         if(!TO_U_USE_FALLBACK(useFallback)) {
2116             return 0xfffe;
2117         }
2118         /* output BMP code point */
2119         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2120     } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
2121         if(!TO_U_USE_FALLBACK(useFallback)) {
2122             return 0xfffe;
2123         }
2124         /* output supplementary code point */
2125         return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2126     } else if(action==MBCS_STATE_UNASSIGNED) {
2127         return 0xfffe;
2128     } else if(action==MBCS_STATE_ILLEGAL) {
2129         return 0xffff;
2130     } else {
2131         /* reserved, must never occur */
2132         return 0xffff;
2133     }
2134 }
2135 #endif
2136
2137 /* MBCS-from-Unicode conversion functions ----------------------------------- */
2138
2139 U_CFUNC void
2140 _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2141                             UErrorCode *pErrorCode) {
2142     UConverter *cnv;
2143     const UChar *source, *sourceLimit;
2144     uint8_t *target;
2145     int32_t targetCapacity;
2146     int32_t *offsets;
2147
2148     const uint16_t *table;
2149     const uint8_t *p, *bytes;
2150     uint8_t outputType;
2151
2152     UChar32 c;
2153
2154     int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
2155
2156     UConverterCallbackReason reason;
2157     uint32_t stage2Entry;
2158     uint32_t value;
2159     int32_t length, prevLength;
2160     uint8_t unicodeMask;
2161
2162     /* use optimized function if possible */
2163     cnv=pArgs->converter;
2164     outputType=cnv->sharedData->table->mbcs.outputType;
2165     unicodeMask=cnv->sharedData->table->mbcs.unicodeMask;
2166     if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2167         if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2168             _MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
2169         } else {
2170             _MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
2171         }
2172         return;
2173     } else if(outputType==MBCS_OUTPUT_2) {
2174         _MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
2175         return;
2176     }
2177
2178     /* set up the local pointers */
2179     source=pArgs->source;
2180     sourceLimit=pArgs->sourceLimit;
2181     target=(uint8_t *)pArgs->target;
2182     targetCapacity=pArgs->targetLimit-pArgs->target;
2183     offsets=pArgs->offsets;
2184
2185     table=cnv->sharedData->table->mbcs.fromUnicodeTable;
2186     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2187         bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
2188     } else {
2189         bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
2190     }
2191
2192     /* get the converter state from UConverter */
2193     c=cnv->fromUSurrogateLead;
2194     prevLength=cnv->fromUnicodeStatus;
2195
2196     /* sourceIndex=-1 if the current character began in the previous buffer */
2197     prevSourceIndex=-1;
2198     sourceIndex= c==0 ? 0 : -1;
2199     nextSourceIndex=0;
2200
2201     /* conversion loop */
2202     /*
2203      * This is another piece of ugly code:
2204      * A goto into the loop if the converter state contains a first surrogate
2205      * from the previous function call.
2206      * It saves me to check in each loop iteration a check of if(c==0)
2207      * and duplicating the trail-surrogate-handling code in the else
2208      * branch of that check.
2209      * I could not find any other way to get around this other than
2210      * using a function call for the conversion and callback, which would
2211      * be even more inefficient.
2212      *
2213      * Markus Scherer 2000-jul-19
2214      */
2215     if(c!=0 && targetCapacity>0) {
2216         goto getTrail;
2217     }
2218
2219     while(source<sourceLimit) {
2220         /*
2221          * This following test is to see if available input would overflow the output.
2222          * It does not catch output of more than one byte that
2223          * overflows as a result of a multi-byte character or callback output
2224          * from the last source character.
2225          * Therefore, those situations also test for overflows and will
2226          * then break the loop, too.
2227          */
2228         if(targetCapacity>0) {
2229             /*
2230              * Get a correct Unicode code point:
2231              * a single UChar for a BMP code point or
2232              * a matched surrogate pair for a "supplementary code point".
2233              */
2234             c=*source++;
2235             ++nextSourceIndex;
2236             /*
2237              * This also tests if the codepage maps single surrogates.
2238              * If it does, then surrogates are not paired but mapped separately.
2239              * Note that in this case unmatched surrogates are not detected.
2240              */
2241             if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2242                 if(UTF_IS_SURROGATE_FIRST(c)) {
2243 getTrail:
2244                     if(source<sourceLimit) {
2245                         /* test the following code unit */
2246                         UChar trail=*source;
2247                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2248                             ++source;
2249                             ++nextSourceIndex;
2250                             c=UTF16_GET_PAIR_VALUE(c, trail);
2251                             if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2252                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2253                                 /* callback(unassigned) */
2254                                 goto unassigned;
2255                             }
2256                             /* convert this supplementary code point */
2257                             /* exit this condition tree */
2258                         } else {
2259                             /* this is an unmatched lead code unit (1st surrogate) */
2260                             /* callback(illegal) */
2261                             reason=UCNV_ILLEGAL;
2262                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2263                             goto callback;
2264                         }
2265                     } else {
2266                         /* no more input */
2267                         break;
2268                     }
2269                 } else {
2270                     /* this is an unmatched trail code unit (2nd surrogate) */
2271                     /* callback(illegal) */
2272                     reason=UCNV_ILLEGAL;
2273                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2274                     goto callback;
2275                 }
2276             }
2277
2278             /* convert the Unicode code point in c into codepage bytes */
2279
2280             /*
2281              * The basic lookup is a triple-stage compact array (trie) lookup.
2282              * For details see the beginning of this file.
2283              *
2284              * Single-byte codepages are handled with a different data structure
2285              * by _MBCSSingle... functions.
2286              *
2287              * The result consists of a 32-bit value from stage 2 and
2288              * a pointer to as many bytes as are stored per character.
2289              * The pointer points to the character's bytes in stage 3.
2290              * Bits 15..0 of the stage 2 entry contain the stage 3 index
2291              * for that pointer, while bits 31..16 are flags for which of
2292              * the 16 characters in the block are roundtrip-assigned.
2293              *
2294              * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
2295              * respectively as uint32_t, in the platform encoding.
2296              * For 3-byte codepages, the bytes are always stored in big-endian order.
2297              *
2298              * For EUC encodings that use only either 0x8e or 0x8f as the first
2299              * byte of their longest byte sequences, the first two bytes in
2300              * this third stage indicate with their 7th bits whether these bytes
2301              * are to be written directly or actually need to be preceded by
2302              * one of the two Single-Shift codes. With this, the third stage
2303              * stores one byte fewer per character than the actual maximum length of
2304              * EUC byte sequences.
2305              *
2306              * Other than that, leading zero bytes are removed and the other
2307              * bytes output. A single zero byte may be output if the "assigned"
2308              * bit in stage 2 was on or also if the Unicode code point is U+0000.
2309              * The data structure does not support zero byte output as a fallback
2310              * for other code points, and also does not allow output of leading zeros.
2311              */
2312             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
2313
2314             /* get the bytes and the length for the output */
2315             switch(outputType) {
2316             case MBCS_OUTPUT_2:
2317                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2318                 if(value<=0xff) {
2319                     length=1;
2320                 } else {
2321                     length=2;
2322                 }
2323                 break;
2324             case MBCS_OUTPUT_2_SISO:
2325                 /* 1/2-byte stateful with Shift-In/Shift-Out */
2326                 /*
2327                  * Save the old state in the converter object
2328                  * right here, then change the local prevLength state variable if necessary.
2329                  * Then, if this character turns out to be unassigned or a fallback that
2330                  * is not taken, the callback code must not save the new state in the converter
2331                  * because the new state is for a character that is not output.
2332                  * However, the callback must still restore the state from the converter
2333                  * in case the callback function changed it for its output.
2334                  */
2335                 cnv->fromUnicodeStatus=prevLength; /* save the old state */
2336                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2337                 if(value<=0xff) {
2338                     if(prevLength==1) {
2339                         length=1;
2340                     } else {
2341                         /* change from double-byte mode to single-byte */
2342                         value|=(uint32_t)UCNV_SI<<8;
2343                         length=2;
2344                         prevLength=1;
2345                     }
2346                 } else {
2347                     if(prevLength==2) {
2348                         length=2;
2349                     } else {
2350                         /* change from single-byte mode to double-byte */
2351                         value|=(uint32_t)UCNV_SO<<16;
2352                         length=3;
2353                         prevLength=2;
2354                     }
2355                 }
2356                 break;
2357             case MBCS_OUTPUT_3:
2358                 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
2359                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
2360                 if(value<=0xff) {
2361                     length=1;
2362                 } else if(value<=0xffff) {
2363                     length=2;
2364                 } else {
2365                     length=3;
2366                 }
2367                 break;
2368             case MBCS_OUTPUT_4:
2369                 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
2370                 if(value<=0xff) {
2371                     length=1;
2372                 } else if(value<=0xffff) {
2373                     length=2;
2374                 } else if(value<=0xffffff) {
2375                     length=3;
2376                 } else {
2377                     length=4;
2378                 }
2379                 break;
2380             case MBCS_OUTPUT_3_EUC:
2381                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2382                 /* EUC 16-bit fixed-length representation */
2383                 if(value<=0xff) {
2384                     length=1;
2385                 } else if((value&0x8000)==0) {
2386                     value|=0x8e8000;
2387                     length=3;
2388                 } else if((value&0x80)==0) {
2389                     value|=0x8f0080;
2390                     length=3;
2391                 } else {
2392                     length=2;
2393                 }
2394                 break;
2395             case MBCS_OUTPUT_4_EUC:
2396                 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
2397                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
2398                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
2399                 if(value<=0xff) {
2400                     length=1;
2401                 } else if(value<=0xffff) {
2402                     length=2;
2403                 } else if((value&0x800000)==0) {
2404                     value|=0x8e800000;
2405                     length=4;
2406                 } else if((value&0x8000)==0) {
2407                     value|=0x8f008000;
2408                     length=4;
2409                 } else {
2410                     length=3;
2411                 }
2412                 break;
2413             default:
2414                 /* must not occur */
2415                 /*
2416                  * To avoid compiler warnings that value & length may be
2417                  * used without having been initialized, we set them here.
2418                  * In reality, this is unreachable code.
2419                  * Not having a default branch also causes warnings with
2420                  * some compilers.
2421                  */
2422                 value=0;
2423                 length=0;
2424                 break;
2425             }
2426
2427             /* is this code point assigned, or do we use fallbacks? */
2428             if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
2429                  (UCNV_FROM_U_USE_FALLBACK(cnv, c) && (value!=0 || c==0)))
2430             ) {
2431                 /*
2432                  * We allow a 0 byte output if the Unicode code point is
2433                  * U+0000 and also if the "assigned" bit is set for this entry.
2434                  * There is no way with this data structure for fallback output
2435                  * for other than U+0000 to be a zero byte.
2436                  */
2437                 /* callback(unassigned) */
2438                 goto unassigned;
2439             }
2440
2441             /* write the output character bytes from value and length */
2442             /* from the first if in the loop we know that targetCapacity>0 */
2443             if(length<=targetCapacity) {
2444                 if(offsets==NULL) {
2445                     switch(length) {
2446                         /* each branch falls through to the next one */
2447                     case 4:
2448                         *target++=(uint8_t)(value>>24);
2449                     case 3:
2450                         *target++=(uint8_t)(value>>16);
2451                     case 2:
2452                         *target++=(uint8_t)(value>>8);
2453                     case 1:
2454                         *target++=(uint8_t)value;
2455                     default:
2456                         /* will never occur */
2457                         break;
2458                     }
2459                 } else {
2460                     switch(length) {
2461                         /* each branch falls through to the next one */
2462                     case 4:
2463                         *target++=(uint8_t)(value>>24);
2464                         *offsets++=sourceIndex;
2465                     case 3:
2466                         *target++=(uint8_t)(value>>16);
2467                         *offsets++=sourceIndex;
2468                     case 2:
2469                         *target++=(uint8_t)(value>>8);
2470                         *offsets++=sourceIndex;
2471                     case 1:
2472                         *target++=(uint8_t)value;
2473                         *offsets++=sourceIndex;
2474                     default:
2475                         /* will never occur */
2476                         break;
2477                     }
2478                 }
2479                 targetCapacity-=length;
2480             } else {
2481                 uint8_t *charErrorBuffer;
2482
2483                 /*
2484                  * We actually do this backwards here:
2485                  * In order to save an intermediate variable, we output
2486                  * first to the overflow buffer what does not fit into the
2487                  * regular target.
2488                  */
2489                 /* we know that 1<=targetCapacity<length<=4 */
2490                 length-=targetCapacity;
2491                 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
2492                 switch(length) {
2493                     /* each branch falls through to the next one */
2494                 case 3:
2495                     *charErrorBuffer++=(uint8_t)(value>>16);
2496                 case 2:
2497                     *charErrorBuffer++=(uint8_t)(value>>8);
2498                 case 1:
2499                     *charErrorBuffer=(uint8_t)value;
2500                 default:
2501                     /* will never occur */
2502                     break;
2503                 }
2504                 cnv->charErrorBufferLength=(int8_t)length;
2505
2506                 /* now output what fits into the regular target */
2507                 value>>=8*length; /* length was reduced by targetCapacity */
2508                 switch(targetCapacity) {
2509                     /* each branch falls through to the next one */
2510                 case 3:
2511                     *target++=(uint8_t)(value>>16);
2512                     if(offsets!=NULL) {
2513                         *offsets++=sourceIndex;
2514                     }
2515                 case 2:
2516                     *target++=(uint8_t)(value>>8);
2517                     if(offsets!=NULL) {
2518                         *offsets++=sourceIndex;
2519                     }
2520                 case 1:
2521                     *target++=(uint8_t)value;
2522                     if(offsets!=NULL) {
2523                         *offsets++=sourceIndex;
2524                     }
2525                 default:
2526                     /* will never occur */
2527                     break;
2528                 }
2529
2530                 /* target overflow */
2531                 targetCapacity=0;
2532                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2533                 c=0;
2534                 break;
2535             }
2536
2537             /* normal end of conversion: prepare for a new character */
2538             c=0;
2539             if(offsets!=NULL) {
2540                 prevSourceIndex=sourceIndex;
2541                 sourceIndex=nextSourceIndex;
2542             }
2543             continue;
2544
2545             /*
2546              * This is the same ugly trick as in ToUnicode(), for the
2547              * same reasons...
2548              */
2549 unassigned:
2550             reason=UCNV_UNASSIGNED;
2551             *pErrorCode=U_INVALID_CHAR_FOUND;
2552 callback:
2553             /* call the callback function with all the preparations and post-processing */
2554             /* update the arguments structure */
2555             pArgs->source=source;
2556             pArgs->target=(char *)target;
2557             pArgs->offsets=offsets;
2558
2559             /* set the converter state in UConverter to deal with the next character */
2560             cnv->fromUSurrogateLead=0;
2561             /*
2562              * Do not save the prevLength SISO state because prevLength is set for
2563              * the character that is now not output because it is unassigned or it is
2564              * a fallback that is not taken.
2565              * The above branch for MBCS_OUTPUT_2_SISO has saved the previous state already.
2566              * See comments there.
2567              */
2568             prevSourceIndex=sourceIndex;
2569
2570             /* call the callback function */
2571             fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
2572
2573             /* get the converter state from UConverter */
2574             c=cnv->fromUSurrogateLead;
2575             prevLength=cnv->fromUnicodeStatus;
2576
2577             /* update target and deal with offsets if necessary */
2578             offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
2579             target=(uint8_t *)pArgs->target;
2580
2581             /* update the source pointer and index */
2582             sourceIndex=nextSourceIndex+(pArgs->source-source);
2583             source=pArgs->source;
2584             targetCapacity=(uint8_t *)pArgs->targetLimit-target;
2585
2586             /*
2587              * If the callback overflowed the target, then we need to
2588              * stop here with an overflow indication.
2589              */
2590             if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
2591                 break;
2592             } else if(U_FAILURE(*pErrorCode)) {
2593                 /* break on error */
2594                 c=0;
2595                 break;
2596             } else if(cnv->charErrorBufferLength>0) {
2597                 /* target is full */
2598                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2599                 break;
2600             }
2601
2602             /*
2603              * We do not need to repeat the statements from the normal
2604              * end of the conversion because we already updated all the
2605              * necessary variables.
2606              */
2607         } else {
2608             /* target is full */
2609             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2610             break;
2611         }
2612     }
2613
2614     if(pArgs->flush && source>=sourceLimit && U_SUCCESS(*pErrorCode)) {
2615         /* end of input stream */
2616         if(c!=0) {
2617             /* a Unicode code point remains incomplete (only a first surrogate) */
2618             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2619             /* the following may change with Jitterbug 2449: would prepare for callback instead of resetting */
2620             c=0;
2621             prevLength=1;
2622         } else if(outputType==MBCS_OUTPUT_2_SISO && prevLength==2) {
2623             /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
2624             if(targetCapacity>0) {
2625                 *target++=(uint8_t)UCNV_SI;
2626                 if(offsets!=NULL) {
2627                     /* set the last source character's index (sourceIndex points at sourceLimit now) */
2628                     *offsets++=prevSourceIndex;
2629                 }
2630             } else {
2631                 /* target is full */
2632                 cnv->charErrorBuffer[0]=(char)UCNV_SI;
2633                 cnv->charErrorBufferLength=1;
2634                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2635             }
2636             prevLength=1; /* we switched into SBCS */
2637         }
2638
2639         /* reset the state for the next conversion */
2640         if(U_SUCCESS(*pErrorCode)) {
2641             c=0;
2642             prevLength=1;
2643         }
2644     }
2645
2646     /* set the converter state back into UConverter */
2647     cnv->fromUSurrogateLead=(UChar)c;
2648     cnv->fromUnicodeStatus=prevLength;
2649
2650     /* write back the updated pointers */
2651     pArgs->source=source;
2652     pArgs->target=(char *)target;
2653     pArgs->offsets=offsets;
2654 }
2655
2656 /*
      * This version of _MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages.
      *
      * Converts the UTF-16 text in [pArgs->source, pArgs->sourceLimit) into codepage
      * bytes written to [pArgs->target, pArgs->targetLimit). Each code point yields
      * one byte if its result value is <=0xff, otherwise two bytes (high byte first).
      * If pArgs->offsets is not NULL, one source index is written per output byte.
      *
      * Unless the codepage's unicodeMask has UCNV_HAS_SURROGATES set, surrogates are
      * paired into supplementary code points. In that pairing mode, unmatched
      * surrogates are passed to fromUCallback() with UCNV_ILLEGAL and
      * U_ILLEGAL_CHAR_FOUND. Code points without a usable mapping are passed to
      * fromUCallback() with UCNV_UNASSIGNED and U_INVALID_CHAR_FOUND.
      *
      * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full (a second
      * byte that does not fit is stored in cnv->charErrorBuffer), and to
      * U_TRUNCATED_CHAR_FOUND when pArgs->flush is set and the input ends with
      * only a lead surrogate.
      *
      * On return, pArgs->source, pArgs->target and pArgs->offsets are updated, and
      * the converter state (fromUSurrogateLead, fromUnicodeStatus) is either saved
      * or, at the end of flushed input, reset.
      *
      * NOTE(review): *pErrorCode is not tested on entry here; presumably the caller
      * guarantees that it holds a success code - confirm.
      */
2657 static void
2658 _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2659                                   UErrorCode *pErrorCode) {
2660     UConverter *cnv;
2661     const UChar *source, *sourceLimit;
2662     uint8_t *target;
2663     int32_t targetCapacity; /* number of bytes still free between target and pArgs->targetLimit */
2664     int32_t *offsets;
2665
2666     const uint16_t *table; /* the converter's fromUnicodeTable, used by MBCS_STAGE_2_FROM_U() */
2667     const uint8_t *bytes; /* result bytes, used by MBCS_VALUE_2_FROM_STAGE_2() */
2668
2669     UChar32 c; /* the current code point, or a lead surrogate that still waits for its trail */
2670
2671     int32_t sourceIndex, nextSourceIndex; /* offset written for the current character; count of units read in this call */
2672
2673     UConverterCallbackReason reason;
2674     uint32_t stage2Entry;
2675     uint32_t value;
2676     int32_t length, prevLength; /* prevLength is only carried through fromUnicodeStatus here */
2677     uint8_t unicodeMask;
2678
2679     /* get the converter and the flags that are tested below against UCNV_HAS_SURROGATES and UCNV_HAS_SUPPLEMENTARY */
2680     cnv=pArgs->converter;
2681     unicodeMask=cnv->sharedData->table->mbcs.unicodeMask;
2682
2683     /* set up the local pointers */
2684     source=pArgs->source;
2685     sourceLimit=pArgs->sourceLimit;
2686     target=(uint8_t *)pArgs->target;
2687     targetCapacity=pArgs->targetLimit-pArgs->target;
2688     offsets=pArgs->offsets;
2689
2690     table=cnv->sharedData->table->mbcs.fromUnicodeTable;
2691     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { /* this option selects the alternate set of result bytes */
2692         bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
2693     } else {
2694         bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
2695     }
2696
2697     /* get the converter state from UConverter */
2698     c=cnv->fromUSurrogateLead;
2699     prevLength=cnv->fromUnicodeStatus;
2700
2701     /* sourceIndex=-1 if the current character began in the previous buffer */
2702     sourceIndex= c==0 ? 0 : -1;
2703     nextSourceIndex=0;
2704
2705     /* conversion loop; a lead surrogate pending from the previous call resumes at getTrail if there is output space */
2706     if(c!=0 && targetCapacity>0) {
2707         goto getTrail;
2708     }
2709
2710     while(source<sourceLimit) {
2711         /*
2712          * The following test checks whether any output space remains before reading more input.
2713          * It does not catch output of more than one byte that
2714          * overflows as a result of a multi-byte character or callback output
2715          * from the last source character.
2716          * Therefore, those situations also test for overflows and will
2717          * then break the loop, too.
2718          */
2719         if(targetCapacity>0) {
2720             /*
2721              * Get a correct Unicode code point:
2722              * a single UChar for a BMP code point or
2723              * a matched surrogate pair for a "supplementary code point".
2724              */
2725             c=*source++;
2726             ++nextSourceIndex;
2727             /*
2728              * This also tests if the codepage maps single surrogates.
2729              * If it does, then surrogates are not paired but mapped separately.
2730              * Note that in this case unmatched surrogates are not detected.
2731              */
2732             if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2733                 if(UTF_IS_SURROGATE_FIRST(c)) {
2734 getTrail:
2735                     if(source<sourceLimit) {
2736                         /* test the following code unit */
2737                         UChar trail=*source;
2738                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2739                             ++source;
2740                             ++nextSourceIndex;
2741                             c=UTF16_GET_PAIR_VALUE(c, trail);
2742                             if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2743                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2744                                 /* callback(unassigned) */
2745                                 goto unassigned;
2746                             }
2747                             /* convert this supplementary code point */
2748                             /* exit this condition tree */
2749                         } else {
2750                             /* this is an unmatched lead code unit (1st surrogate) */
2751                             /* callback(illegal) */
2752                             reason=UCNV_ILLEGAL;
2753                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2754                             goto callback;
2755                         }
2756                     } else {
2757                         /* no more input; c keeps the lead surrogate for the state handling after the loop */
2758                         break;
2759                     }
2760                 } else {
2761                     /* this is an unmatched trail code unit (2nd surrogate) */
2762                     /* callback(illegal) */
2763                     reason=UCNV_ILLEGAL;
2764                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2765                     goto callback;
2766                 }
2767             }
2768
2769             /* convert the Unicode code point in c into codepage bytes */
2770             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
2771
2772             /* get the bytes and the length for the output */
2773             /* MBCS_OUTPUT_2 */
2774             value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2775             if(value<=0xff) {
2776                 length=1;
2777             } else {
2778                 length=2;
2779             }
2780
2781             /* is this code point assigned, or do we use fallbacks? */
2782             if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
2783                  (UCNV_FROM_U_USE_FALLBACK(cnv, c) && (value!=0 || c==0)))
2784             ) {
2785                 /*
2786                  * We allow a 0 byte output if the Unicode code point is
2787                  * U+0000 and also if the "assigned" bit is set for this entry.
2788                  * There is no way with this data structure for fallback output
2789                  * for other than U+0000 to be a zero byte.
2790                  */
2791                 /* callback(unassigned) */
2792                 goto unassigned;
2793             }
2794
2795             /* write the output character bytes from value and length */
2796             /* from the first if in the loop we know that targetCapacity>0 */
2797             if(length==1) {
2798                 /* this is easy because we know that there is enough space */
2799                 *target++=(uint8_t)value;
2800                 if(offsets!=NULL) {
2801                     *offsets++=sourceIndex;
2802                 }
2803                 --targetCapacity;
2804             } else /* length==2 */ {
2805                 *target++=(uint8_t)(value>>8); /* the first byte always fits because targetCapacity>0 */
2806                 if(2<=targetCapacity) {
2807                     *target++=(uint8_t)value;
2808                     if(offsets!=NULL) {
2809                         *offsets++=sourceIndex;
2810                         *offsets++=sourceIndex;
2811                     }
2812                     targetCapacity-=2;
2813                 } else {
2814                     if(offsets!=NULL) {
2815                         *offsets++=sourceIndex; /* only the first byte was written to the target */
2816                     }
2817                     cnv->charErrorBuffer[0]=(char)value; /* the second byte does not fit: keep it in the converter */
2818                     cnv->charErrorBufferLength=1;
2819
2820                     /* target overflow */
2821                     targetCapacity=0;
2822                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2823                     c=0; /* the character is complete, so no lead surrogate is pending */
2824                     break;
2825                 }
2826             }
2827
2828             /* normal end of conversion: prepare for a new character */
2829             c=0;
2830             sourceIndex=nextSourceIndex;
2831             continue;
2832
2833             /*
2834              * This is the same ugly trick as in ToUnicode(), for the
2835              * same reasons...
                  * The labels below are only reached via goto because of the
                  * continue statement above.
2836              */
2837 unassigned:
2838             reason=UCNV_UNASSIGNED;
2839             *pErrorCode=U_INVALID_CHAR_FOUND;
2840 callback:
2841             /* call the callback function with all the preparations and post-processing */
2842             /* update the arguments structure */
2843             pArgs->source=source;
2844             pArgs->target=(char *)target;
2845             pArgs->offsets=offsets;
2846
2847             /* set the converter state in UConverter to deal with the next character */
2848             cnv->fromUSurrogateLead=0;
2849             cnv->fromUnicodeStatus=prevLength;
2850
2851             /* call the callback function */
2852             fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
2853
2854             /* get the converter state from UConverter */
2855             c=cnv->fromUSurrogateLead;
2856             prevLength=cnv->fromUnicodeStatus;
2857
2858             /* update target and deal with offsets if necessary */
2859             offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
2860             target=(uint8_t *)pArgs->target;
2861
2862             /*
                  * update the source pointer and index; pArgs->source-source is the
                  * number of source units by which the callback moved pArgs->source.
                  * NOTE(review): nextSourceIndex is not advanced by that amount here -
                  * confirm whether a callback may move pArgs->source.
                  */
2863             sourceIndex=nextSourceIndex+(pArgs->source-source);
2864             source=pArgs->source;
2865             targetCapacity=(uint8_t *)pArgs->targetLimit-target;
2866
2867             /*
2868              * If the callback overflowed the target, then we need to
2869              * stop here with an overflow indication.
2870              */
2871             if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
2872                 break;
2873             } else if(U_FAILURE(*pErrorCode)) {
2874                 /* break on error */
2875                 c=0;
2876                 break;
2877             } else if(cnv->charErrorBufferLength>0) {
2878                 /* target is full */
2879                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2880                 break;
2881             }
2882
2883             /*
2884              * We do not need to repeat the statements from the normal
2885              * end of the conversion because we already updated all the
2886              * necessary variables.
2887              */
2888         } else {
2889             /* target is full */
2890             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2891             break;
2892         }
2893     }
2894
2895     if(pArgs->flush && source>=sourceLimit) {
2896         /* end of flushed input: reset the state for the next conversion, even if an error code is already set */
2897         if(c!=0 && U_SUCCESS(*pErrorCode)) {
2898             /* a Unicode code point remains incomplete (only a first surrogate) */
2899             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2900         }
2901         cnv->fromUSurrogateLead=0;
2902         cnv->fromUnicodeStatus=1;
2903     } else {
2904         /* set the converter state back into UConverter */
2905         cnv->fromUSurrogateLead=(UChar)c;
2906         cnv->fromUnicodeStatus=prevLength;
2907     }
2908
2909     /* write back the updated pointers */
2910     pArgs->source=source;
2911     pArgs->target=(char *)target;
2912     pArgs->offsets=offsets;
2913 }
2914
2915 /* This version of _MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
2916 static void
2917 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2918                                   UErrorCode *pErrorCode) {
2919     UConverter *cnv;
2920     const UChar *source, *sourceLimit;
2921     uint8_t *target;
2922     int32_t targetCapacity;
2923     int32_t *offsets;
2924
2925     const uint16_t *table;
2926     const uint16_t *results;
2927
2928     UChar32 c;
2929
2930     int32_t sourceIndex, nextSourceIndex;
2931
2932     UConverterCallbackReason reason;
2933     uint16_t value, minValue;
2934     UBool hasSupplementary;
2935
2936     /* set up the local pointers */
2937     cnv=pArgs->converter;
2938     source=pArgs->source;
2939     sourceLimit=pArgs->sourceLimit;
2940     target=(uint8_t *)pArgs->target;
2941     targetCapacity=pArgs->targetLimit-pArgs->target;
2942     offsets=pArgs->offsets;
2943
2944     table=cnv->sharedData->table->mbcs.fromUnicodeTable;
2945     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2946         results=(uint16_t *)cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
2947     } else {
2948         results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes;
2949     }
2950
2951     if(cnv->useFallback) {
2952         /* use all roundtrip and fallback results */
2953         minValue=0x800;
2954     } else {
2955         /* use only roundtrips and fallbacks from private-use characters */
2956         minValue=0xc00;
2957     }
2958     hasSupplementary=(UBool)(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
2959
2960     /* get the converter state from UConverter */
2961     c=cnv->fromUSurrogateLead;
2962
2963     /* sourceIndex=-1 if the current character began in the previous buffer */
2964     sourceIndex= c==0 ? 0 : -1;
2965     nextSourceIndex=0;
2966
2967     /* conversion loop */
2968     if(c!=0 && targetCapacity>0) {
2969         goto getTrail;
2970     }
2971
2972     while(source<sourceLimit) {
2973         /*
2974          * This following test is to see if available input would overflow the output.
2975          * It does not catch output of more than one byte that
2976          * overflows as a result of a multi-byte character or callback output
2977          * from the last source character.
2978          * Therefore, those situations also test for overflows and will
2979          * then break the loop, too.
2980          */
2981         if(targetCapacity>0) {
2982             /*
2983              * Get a correct Unicode code point:
2984              * a single UChar for a BMP code point or
2985              * a matched surrogate pair for a "supplementary code point".
2986              */
2987             c=*source++;
2988             ++nextSourceIndex;
2989             if(UTF_IS_SURROGATE(c)) {
2990                 if(UTF_IS_SURROGATE_FIRST(c)) {
2991 getTrail:
2992                     if(source<sourceLimit) {
2993                         /* test the following code unit */
2994                         UChar trail=*source;
2995                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2996                             ++source;
2997                             ++nextSourceIndex;
2998                             c=UTF16_GET_PAIR_VALUE(c, trail);
2999                             if(!hasSupplementary) {
3000                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3001                                 /* callback(unassigned) */
3002                                 goto unassigned;
3003                             }
3004                             /* convert this supplementary code point */
3005                             /* exit this condition tree */
3006                         } else {
3007                             /* this is an unmatched lead code unit (1st surrogate) */
3008                             /* callback(illegal) */
3009                             reason=UCNV_ILLEGAL;
3010                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3011                             goto callback;
3012                         }
3013                     } else {
3014                         /* no more input */
3015                         break;
3016                     }
3017                 } else {
3018                     /* this is an unmatched trail code unit (2nd surrogate) */
3019                     /* callback(illegal) */
3020                     reason=UCNV_ILLEGAL;
3021                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3022                     goto callback;
3023                 }
3024             }
3025
3026             /* convert the Unicode code point in c into codepage bytes */
3027             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3028
3029             /* is this code point assigned, or do we use fallbacks? */
3030             if(value>=minValue) {
3031                 /* assigned, write the output character bytes from value and length */
3032                 /* length==1 */
3033                 /* this is easy because we know that there is enough space */
3034                 *target++=(uint8_t)value;
3035                 if(offsets!=NULL) {
3036                     *offsets++=sourceIndex;
3037                 }
3038                 --targetCapacity;
3039
3040                 /* normal end of conversion: prepare for a new character */
3041                 c=0;
3042                 sourceIndex=nextSourceIndex;
3043                 continue;
3044             } else { /* unassigned */
3045                 /*
3046                  * We allow a 0 byte output if the Unicode code point is
3047                  * U+0000 and also if the "assigned" bit is set for this entry.
3048                  * There is no way with this data structure for fallback output
3049                  * for other than U+0000 to be a zero byte.
3050                  */
3051                 /* callback(unassigned) */
3052             }
3053 unassigned:
3054             reason=UCNV_UNASSIGNED;
3055             *pErrorCode=U_INVALID_CHAR_FOUND;
3056 callback:
3057             /* call the callback function with all the preparations and post-processing */
3058             /* update the arguments structure */
3059             pArgs->source=source;
3060             pArgs->target=(char *)target;
3061             pArgs->offsets=offsets;
3062
3063             /* set the converter state in UConverter to deal with the next character */
3064             cnv->fromUSurrogateLead=0;
3065
3066             /* call the callback function */
3067             fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
3068
3069             /* get the converter state from UConverter */
3070             c=cnv->fromUSurrogateLead;
3071
3072             /* update target and deal with offsets if necessary */
3073             offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
3074             target=(uint8_t *)pArgs->target;
3075
3076             /* update the source pointer and index */
3077             sourceIndex=nextSourceIndex+(pArgs->source-source);
3078             source=pArgs->source;
3079             targetCapacity=(uint8_t *)pArgs->targetLimit-target;
3080
3081             /*
3082              * If the callback overflowed the target, then we need to
3083              * stop here with an overflow indication.
3084              */
3085             if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3086                 break;
3087             } else if(U_FAILURE(*pErrorCode)) {
3088                 /* break on error */
3089                 c=0;
3090                 break;
3091             } else if(cnv->charErrorBufferLength>0) {
3092                 /* target is full */
3093                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3094                 break;
3095             }
3096
3097             /*
3098              * We do not need to repeat the statements from the normal
3099              * end of the conversion because we already updated all the
3100              * necessary variables.
3101              */
3102         } else {
3103             /* target is full */
3104             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3105             break;
3106         }
3107     }
3108
3109     if(pArgs->flush && source>=sourceLimit) {
3110         /* reset the state for the next conversion */
3111         if(c!=0 && U_SUCCESS(*pErrorCode)) {
3112             /* a Unicode code point remains incomplete (only a first surrogate) */
3113             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3114         }
3115         cnv->fromUSurrogateLead=0;
3116     } else {
3117         /* set the converter state back into UConverter */
3118         cnv->fromUSurrogateLead=(UChar)c;
3119     }
3120
3121     /* write back the updated pointers */
3122     pArgs->source=source;
3123     pArgs->target=(char *)target;
3124     pArgs->offsets=offsets;
3125 }
3126
3127 /*
3128  * This version of _MBCSFromUnicode() is optimized for single-byte codepages
3129  * that map only to and from the BMP.
3130  * In addition to single-byte/state optimizations, the offset calculations
3131  * become much easier.
3132  */
3133 static void
3134 _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
3135                               UErrorCode *pErrorCode) {
3136     UConverter *cnv;
3137     const UChar *source, *sourceLimit, *lastSource;
3138     uint8_t *target;
3139     int32_t targetCapacity, length;
3140     int32_t *offsets;
3141
3142     const uint16_t *table;
3143     const uint16_t *results;
3144
3145     UChar32 c;
3146
3147     int32_t sourceIndex;
3148
3149     UConverterCallbackReason reason;
3150     uint16_t value, minValue;
3151
3152     /* set up the local pointers */
3153     cnv=pArgs->converter;
3154     source=pArgs->source;
3155     sourceLimit=pArgs->sourceLimit;
3156     target=(uint8_t *)pArgs->target;
3157     targetCapacity=pArgs->targetLimit-pArgs->target;
3158     offsets=pArgs->offsets;
3159
3160     table=cnv->sharedData->table->mbcs.fromUnicodeTable;
3161     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3162         results=(uint16_t *)cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
3163     } else {
3164         results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes;
3165     }
3166
3167     if(cnv->useFallback) {
3168         /* use all roundtrip and fallback results */
3169         minValue=0x800;
3170     } else {
3171         /* use only roundtrips and fallbacks from private-use characters */
3172         minValue=0xc00;
3173     }
3174
3175     /* get the converter state from UConverter */
3176     c=cnv->fromUSurrogateLead;
3177
3178     /* sourceIndex=-1 if the current character began in the previous buffer */
3179     sourceIndex= c==0 ? 0 : -1;
3180     lastSource=source;
3181
3182     /*
3183      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3184      * for the minimum of the sourceLength and targetCapacity
3185      */
3186     length=sourceLimit-source;
3187     if(length<targetCapacity) {
3188         targetCapacity=length;
3189     }
3190
3191     /* conversion loop */
3192     if(c!=0 && targetCapacity>0) {
3193         goto getTrail;
3194     }
3195
3196 #if MBCS_UNROLL_SINGLE_FROM_BMP
3197     /* unrolling makes it slower on Pentium III/Windows 2000?! */
3198     /* unroll the loop with the most common case */
3199 unrolled:
3200     if(targetCapacity>=4) {
3201         int32_t count, loops;
3202         uint16_t andedValues;
3203
3204         loops=count=targetCapacity>>2;
3205         do {
3206             c=*source++;
3207             andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3208             *target++=(uint8_t)value;
3209             c=*source++;
3210             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3211             *target++=(uint8_t)value;
3212             c=*source++;
3213             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3214             *target++=(uint8_t)value;
3215             c=*source++;
3216             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3217             *target++=(uint8_t)value;
3218
3219             /* were all 4 entries really valid? */
3220             if(andedValues<minValue) {
3221                 /* no, return to the first of these 4 */
3222                 source-=4;
3223                 target-=4;
3224                 break;
3225             }
3226         } while(--count>0);
3227         count=loops-count;
3228         targetCapacity-=4*count;
3229
3230         if(offsets!=NULL) {
3231             lastSource+=4*count;
3232             while(count>0) {
3233                 *offsets++=sourceIndex++;
3234                 *offsets++=sourceIndex++;
3235                 *offsets++=sourceIndex++;
3236                 *offsets++=sourceIndex++;
3237                 --count;
3238             }
3239         }
3240
3241         c=0;
3242     }
3243 #endif
3244
3245     while(targetCapacity>0) {
3246         /*
3247          * Get a correct Unicode code point:
3248          * a single UChar for a BMP code point or
3249          * a matched surrogate pair for a "supplementary code point".
3250          */
3251         c=*source++;
3252         /*
3253          * Do not immediately check for single surrogates:
3254          * Assume that they are unassigned and check for them in that case.
3255          * This speeds up the conversion of assigned characters.
3256          */
3257         /* convert the Unicode code point in c into codepage bytes */
3258         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3259
3260         /* is this code point assigned, or do we use fallbacks? */
3261         if(value>=minValue) {
3262             /* assigned, write the output character bytes from value and length */
3263             /* length==1 */
3264             /* this is easy because we know that there is enough space */
3265             *target++=(uint8_t)value;
3266             --targetCapacity;
3267
3268             /* normal end of conversion: prepare for a new character */
3269             c=0;
3270             continue;
3271         } else if(!UTF_IS_SURROGATE(c)) {
3272             /* normal, unassigned BMP character */
3273             /*
3274              * We allow a 0 byte output if the Unicode code point is
3275              * U+0000 and also if the "assigned" bit is set for this entry.
3276              * There is no way with this data structure for fallback output
3277              * for other than U+0000 to be a zero byte.
3278              */
3279             /* callback(unassigned) */
3280             reason=UCNV_UNASSIGNED;
3281             *pErrorCode=U_INVALID_CHAR_FOUND;
3282         } else if(UTF_IS_SURROGATE_FIRST(c)) {
3283 getTrail:
3284             if(source<sourceLimit) {
3285                 /* test the following code unit */
3286                 UChar trail=*source;
3287                 if(UTF_IS_SECOND_SURROGATE(trail)) {
3288                     ++source;
3289                     c=UTF16_GET_PAIR_VALUE(c, trail);
3290                     /* this codepage does not map supplementary code points */
3291                     /* callback(unassigned) */
3292                     reason=UCNV_UNASSIGNED;
3293                     *pErrorCode=U_INVALID_CHAR_FOUND;
3294                 } else {
3295                     /* this is an unmatched lead code unit (1st surrogate) */
3296                     /* callback(illegal) */
3297                     reason=UCNV_ILLEGAL;
3298                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3299                 }
3300             } else {
3301                 /* no more input */
3302                 break;
3303             }
3304         } else {
3305             /* this is an unmatched trail code unit (2nd surrogate) */
3306             /* callback(illegal) */
3307             reason=UCNV_ILLEGAL;
3308             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3309         }
3310
3311         /* call the callback function with all the preparations and post-processing */
3312         /* get the number of code units for c to correctly advance sourceIndex after the callback call */
3313         length=UTF_CHAR_LENGTH(c);
3314
3315         /* set offsets since the start or the last callback */
3316         if(offsets!=NULL) {
3317             int32_t count=(int32_t)(source-lastSource);
3318
3319             /* do not set the offset for the callback-causing character */
3320             count-=length;
3321
3322             while(count>0) {
3323                 *offsets++=sourceIndex++;
3324                 --count;
3325             }
3326             /* offset and sourceIndex are now set for the current character */
3327         }
3328
3329         /* update the arguments structure */
3330         pArgs->source=source;
3331         pArgs->target=(char *)target;
3332         pArgs->offsets=offsets;
3333
3334         /* set the converter state in UConverter to deal with the next character */
3335         cnv->fromUSurrogateLead=0;
3336
3337         /* call the callback function */
3338         fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
3339
3340         /* get the converter state from UConverter */
3341         c=cnv->fromUSurrogateLead;
3342
3343         /* update target and deal with offsets if necessary */
3344         offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
3345         target=(uint8_t *)pArgs->target;
3346
3347         /* update the source pointer and index */
3348         sourceIndex+=length+(pArgs->source-source);
3349         source=lastSource=pArgs->source;
3350         targetCapacity=(uint8_t *)pArgs->targetLimit-target;
3351         length=sourceLimit-source;
3352         if(length<targetCapacity) {
3353             targetCapacity=length;
3354         }
3355
3356         /*
3357          * If the callback overflowed the target, then we need to
3358          * stop here with an overflow indication.
3359          */
3360         if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3361             break;
3362         } else if(U_FAILURE(*pErrorCode)) {
3363             /* break on error */
3364             c=0;
3365             break;
3366         } else if(cnv->charErrorBufferLength>0) {
3367             /* target is full */
3368             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3369             break;
3370         }
3371
3372 #if MBCS_UNROLL_SINGLE_FROM_BMP
3373         /* unrolling makes it slower on Pentium III/Windows 2000?! */
3374         goto unrolled;
3375 #endif
3376     }
3377
3378     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3379         /* target is full */
3380         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3381     }
3382
3383     /* set offsets since the start or the last callback */
3384     if(offsets!=NULL) {
3385         size_t count=source-lastSource;
3386         while(count>0) {
3387             *offsets++=sourceIndex++;
3388             --count;
3389         }
3390     }
3391
3392     if(pArgs->flush && source>=sourceLimit) {
3393         /* reset the state for the next conversion */
3394         if(c!=0 && U_SUCCESS(*pErrorCode)) {
3395             /* a Unicode code point remains incomplete (only a first surrogate) */
3396             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3397         }
3398         cnv->fromUSurrogateLead=0;
3399     } else {
3400         /* set the converter state back into UConverter */
3401         cnv->fromUSurrogateLead=(UChar)c;
3402     }
3403
3404     /* write back the updated pointers */
3405     pArgs->source=source;
3406     pArgs->target=(char *)target;
3407     pArgs->offsets=offsets;
3408 }
3409
3410 /*
3411  * This is another simple conversion function for internal use by other
3412  * conversion implementations.
3413  * It does not use the converter state nor call callbacks.
3414  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3415  *
3416  * It converts one single Unicode code point into codepage bytes, encoded
3417  * as one 32-bit value. The function returns the number of bytes in *pValue:
3418  * 1..4 the number of bytes in *pValue
3419  * 0    unassigned (*pValue undefined)
3420  * -1   illegal (currently not used, *pValue undefined)
3421  *
3422  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
3423  * the second to last byte in bits 15..8, etc.
3424  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
3425  */
3426 U_CFUNC int32_t
3427 _MBCSFromUChar32(UConverterSharedData *sharedData,
3428                  UChar32 c, uint32_t *pValue,
3429                  UBool useFallback) {
3430     const uint16_t *table=sharedData->table->mbcs.fromUnicodeTable;
3431     const uint8_t *p;
3432     uint32_t stage2Entry;
3433     uint32_t value;
3434     int32_t length;
3435
3436     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3437     if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3438         return 0;
3439     }
3440
3441     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3442     if(sharedData->table->mbcs.outputType==MBCS_OUTPUT_1) {
3443         value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->table->mbcs.fromUnicodeBytes, c);
3444         /* is this code point assigned, or do we use fallbacks? */
3445         if(useFallback ? value>=0x800 : value>=0xc00) {
3446             *pValue=value&0xff;
3447             return 1;
3448         } else {
3449             return 0;
3450         }
3451     }
3452
3453     stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3454
3455     /* get the bytes and the length for the output */
3456     switch(sharedData->table->mbcs.outputType) {
3457     case MBCS_OUTPUT_2:
3458         value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3459         if(value<=0xff) {
3460             length=1;
3461         } else {
3462             length=2;
3463         }
3464         break;
3465     case MBCS_OUTPUT_3:
3466         p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3467         value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3468         if(value<=0xff) {
3469             length=1;
3470         } else if(value<=0xffff) {
3471             length=2;
3472         } else {
3473             length=3;
3474         }
3475         break;
3476     case MBCS_OUTPUT_4:
3477         value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3478         if(value<=0xff) {
3479             length=1;
3480         } else if(value<=0xffff) {
3481             length=2;
3482         } else if(value<=0xffffff) {
3483             length=3;
3484         } else {
3485             length=4;
3486         }
3487         break;
3488     case MBCS_OUTPUT_3_EUC:
3489         value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3490         /* EUC 16-bit fixed-length representation */
3491         if(value<=0xff) {
3492             length=1;
3493         } else if((value&0x8000)==0) {
3494             value|=0x8e8000;
3495             length=3;
3496         } else if((value&0x80)==0) {
3497             value|=0x8f0080;
3498             length=3;
3499         } else {
3500             length=2;
3501         }
3502         break;
3503     case MBCS_OUTPUT_4_EUC:
3504         p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
3505         value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3506         /* EUC 16-bit fixed-length representation applied to the first two bytes */
3507         if(value<=0xff) {
3508             length=1;
3509         } else if(value<=0xffff) {
3510             length=2;
3511         } else if((value&0x800000)==0) {
3512             value|=0x8e800000;
3513             length=4;
3514         } else if((value&0x8000)==0) {
3515             value|=0x8f008000;
3516             length=4;
3517         } else {
3518             length=3;
3519         }
3520         break;
3521     default:
3522         /* must not occur */
3523         return -1;
3524     }
3525
3526     /* is this code point assigned, or do we use fallbacks? */
3527     if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3528         (FROM_U_USE_FALLBACK(useFallback, c) && (value!=0 || c==0))
3529     ) {
3530         /*
3531          * We allow a 0 byte output if the Unicode code point is
3532          * U+0000 and also if the "assigned" bit is set for this entry.
3533          * There is no way with this data structure for fallback output
3534          * for other than U+0000 to be a zero byte.
3535          */
3536         /* assigned */
3537         *pValue=value;
3538         return length;
3539     } else {
3540         return 0;
3541     }
3542 }
3543
3544
3545 #if 0
3546 /**
3547  * ################################################################
3548  * #
3549  * # This function has been moved to ucnv2022.c for inlining.
3550  * # This implementation is here only for documentation purposes
3551  * #
3552  * ################################################################
3553  */
3554
3555 /**
3556  * This version of _MBCSFromUChar32() is optimized for single-byte codepages.
3557  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3558  *
3559  * It returns the codepage byte for the code point, or -1 if it is unassigned.
3560  */
3561 U_CFUNC int32_t
3562 _MBCSSingleFromUChar32(UConverterSharedData *sharedData,
3563                        UChar32 c,
3564                        UBool useFallback) {
3565     const uint16_t *table;
3566     int32_t value;
3567
3568     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3569     if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3570         return -1;
3571     }
3572
3573     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3574     table=sharedData->table->mbcs.fromUnicodeTable;
3575
3576     /* get the byte for the output */
3577     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->table->mbcs.fromUnicodeBytes, c);
3578     /* is this code point assigned, or do we use fallbacks? */
3579     if(useFallback ? value>=0x800 : value>=0xc00) {
3580         return value&0xff;
3581     } else {
3582         return -1;
3583     }
3584 }
3585 #endif
3586
3587 /* miscellaneous ------------------------------------------------------------ */
3588
3589 static void
3590 _MBCSGetStarters(const UConverter* cnv,
3591                  UBool starters[256],
3592                  UErrorCode *pErrorCode) {
3593     const int32_t *state0=cnv->sharedData->table->mbcs.stateTable[0];
3594     int i;
3595
3596     for(i=0; i<256; ++i) {
3597         /* all bytes that cause a state transition from state 0 are lead bytes */
3598         starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
3599     }
3600 }
3601
3602 /*
3603  * This is an internal function that allows other converter implementations
3604  * to check whether a byte is a lead byte.
3605  */
3606 U_CFUNC UBool
3607 _MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
3608     return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->table->mbcs.stateTable[0][(uint8_t)byte]);
3609 }
3610
3611 static void
3612 _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
3613               int32_t offsetIndex,
3614               UErrorCode *pErrorCode) {
3615     UConverter *cnv=pArgs->converter;
3616     char *p, *subchar;
3617     char buffer[4];
3618     int32_t length;
3619
3620     /* first, select between subChar and subChar1 */
3621     if(cnv->subChar1!=0 && cnv->invalidUCharBuffer[0]<=0xff) {
3622         /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
3623         subchar=(char *)&cnv->subChar1;
3624         length=1;
3625     } else {
3626         /* select subChar in all other cases */
3627         subchar=(char *)cnv->subChar;
3628         length=cnv->subCharLen;
3629     }
3630
3631     switch(cnv->sharedData->table->mbcs.outputType) {
3632     case MBCS_OUTPUT_2_SISO:
3633         p=buffer;
3634
3635         /* fromUnicodeStatus contains prevLength */
3636         switch(length) {
3637         case 1:
3638             if(cnv->fromUnicodeStatus==2) {
3639                 /* DBCS mode and SBCS sub char: change to SBCS */
3640                 cnv->fromUnicodeStatus=1;
3641                 *p++=UCNV_SI;
3642             }
3643             *p++=subchar[0];
3644             break;
3645         case 2:
3646             if(cnv->fromUnicodeStatus==1) {
3647                 /* SBCS mode and DBCS sub char: change to DBCS */
3648                 cnv->fromUnicodeStatus=2;
3649                 *p++=UCNV_SO;
3650             }
3651             *p++=subchar[0];
3652             *p++=subchar[1];
3653             break;
3654         default:
3655             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3656             return;
3657         }
3658         ucnv_cbFromUWriteBytes(pArgs,
3659                                buffer, (int32_t)(p-buffer),
3660                                offsetIndex, pErrorCode);
3661         break;
3662     default:
3663         ucnv_cbFromUWriteBytes(pArgs,
3664                                subchar, length,
3665                                offsetIndex, pErrorCode);
3666         break;
3667     }
3668 }
3669
3670 U_CFUNC UConverterType
3671 _MBCSGetType(const UConverter* converter) {
3672     /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
3673     if(converter->sharedData->table->mbcs.countStates==1) {
3674         return (UConverterType)UCNV_SBCS;
3675     } else if((converter->sharedData->table->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
3676         return (UConverterType)UCNV_EBCDIC_STATEFUL;
3677     } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
3678         return (UConverterType)UCNV_DBCS;
3679     }
3680     return (UConverterType)UCNV_MBCS;
3681 }
3682
/*
 * Implementation table for MBCS converters: the converter type UCNV_MBCS
 * followed by the functions from this file, in positional order.
 * It is referenced by _MBCSData.
 * NOTE(review): the meaning of each slot (including the two NULL slots) is
 * determined by the UConverterImpl declaration, which is not visible here;
 * the entries appear to correspond to the operations named by the functions
 * stored in them -- confirm against the struct before reordering anything.
 */
static const UConverterImpl _MBCSImpl={
    UCNV_MBCS,

    _MBCSLoad,
    _MBCSUnload,

    _MBCSOpen,
    NULL,
    _MBCSReset,

    /* the same WithOffsets function is used for both slots of each direction */
    _MBCSToUnicodeWithOffsets,
    _MBCSToUnicodeWithOffsets,
    _MBCSFromUnicodeWithOffsets,
    _MBCSFromUnicodeWithOffsets,
    _MBCSGetNextUChar,

    _MBCSGetStarters,
    _MBCSGetName,
    _MBCSWriteSub,
    NULL,
    _MBCSGetUnicodeSet
};
3705
3706
/* Static data is in tools/makeconv/ucnvstat.c for data-based
 * converters. Be sure to update it as well.
 */

/*
 * Shared data object for MBCS converters. It carries its own structure size
 * and points to the _MBCSImpl function table; the data pointers are NULL here.
 * NOTE(review): the meaning of the remaining positional values (1, FALSE, 0)
 * depends on the UConverterSharedData declaration, which is not visible
 * here -- confirm against the struct.
 */
const UConverterSharedData _MBCSData={
    sizeof(UConverterSharedData), 1,
    NULL, NULL, NULL, FALSE, &_MBCSImpl,
    0
};
3716
3717 /* GB 18030 special handling ------------------------------------------------ */
3718
/* for the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file */
3720
3721 /* the callback functions handle GB 18030 specially */
3722 static void
3723 fromUCallback(UConverter *cnv,
3724               const void *context, UConverterFromUnicodeArgs *pArgs,
3725               UChar32 codePoint,
3726               UConverterCallbackReason reason, UErrorCode *pErrorCode) {
3727     int32_t i;
3728
3729     if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED) {
3730         const uint32_t *range;
3731
3732         range=gb18030Ranges[0];
3733         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
3734             if(range[0]<=(uint32_t)codePoint && (uint32_t)codePoint<=range[1]) {
3735                 uint32_t linear;
3736                 char bytes[4];
3737
3738                 /* found the Unicode code point, output the four-byte sequence for it */
3739                 *pErrorCode=U_ZERO_ERROR;
3740
3741                 /* get the linear value of the first GB 18030 code in this range */
3742                 linear=range[2]-LINEAR_18030_BASE;
3743
3744                 /* add the offset from the beginning of the range */
3745                 linear+=((uint32_t)codePoint-range[0]);
3746
3747                 /* turn this into a four-byte sequence */
3748                 bytes[3]=(char)(0x30+linear%10); linear/=10;
3749                 bytes[2]=(char)(0x81+linear%126); linear/=126;
3750                 bytes[1]=(char)(0x30+linear%10); linear/=10;
3751                 bytes[0]=(char)(0x81+linear);
3752
3753                 /* output this sequence */
3754                 ucnv_cbFromUWriteBytes(pArgs, bytes, 4, 0, pErrorCode);
3755                 return;
3756             }
3757         }
3758     }
3759
3760     /* write the code point as code units */
3761     i=0;
3762     UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, codePoint);
3763     cnv->invalidUCharLength=(int8_t)i;
3764
3765     /* call the normal callback function */
3766     cnv->fromUCharErrorBehaviour(context, pArgs, cnv->invalidUCharBuffer, i, codePoint, reason, pErrorCode);
3767 }
3768
3769 static void
3770 toUCallback(UConverter *cnv,
3771             const void *context, UConverterToUnicodeArgs *pArgs,
3772             const char *codeUnits, int32_t length,
3773             UConverterCallbackReason reason, UErrorCode *pErrorCode) {
3774     int32_t i;
3775
3776     if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED && length==4) {
3777         const uint32_t *range;
3778         uint32_t linear;
3779
3780         linear=LINEAR_18030((uint8_t)codeUnits[0], (uint8_t)codeUnits[1], (uint8_t)codeUnits[2], (uint8_t)codeUnits[3]);
3781         range=gb18030Ranges[0];
3782         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
3783             if(range[2]<=linear && linear<=range[3]) {
3784                 UChar u[UTF_MAX_CHAR_LENGTH];
3785
3786                 /* found the sequence, output the Unicode code point for it */
3787                 *pErrorCode=U_ZERO_ERROR;
3788
3789                 /* add the linear difference between the input and start sequences to the start code point */
3790                 linear=range[0]+(linear-range[2]);
3791
3792                 /* write the result as UChars and output */
3793                 i=0;
3794                 UTF_APPEND_CHAR_UNSAFE(u, i, linear);
3795                 ucnv_cbToUWriteUChars(pArgs, u, i, 0, pErrorCode);
3796                 return;
3797             }
3798         }
3799     }
3800
3801     /* copy the current bytes to invalidCharBuffer */
3802     for(i=0; i<length; ++i) {
3803         cnv->invalidCharBuffer[i]=codeUnits[i];
3804     }
3805     cnv->invalidCharLength=(int8_t)length;
3806
3807     /* call the normal callback function */
3808     cnv->fromCharErrorBehaviour(context, pArgs, codeUnits, length, reason, pErrorCode);
3809 }
3810
3811 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */