icuSources/common/ucnvmbcs.c

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2000-2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  ucnvmbcs.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2000jul03
  14 *   created by: Markus W. Scherer
  15 *
  16 *   The current code in this file replaces the previous implementation
  17 *   of conversion code from multi-byte codepages to Unicode and back.
  18 *   This implementation supports the following:
  19 *   - legacy variable-length codepages with up to 4 bytes per character
  20 *   - all Unicode code points (up to 0x10ffff)
  21 *   - efficient distinction of unassigned vs. illegal byte sequences
  22 *   - it is possible in fromUnicode() to directly deal with simple
  23 *     stateful encodings (used for EBCDIC_STATEFUL)
  24 *   - it is possible to convert Unicode code points
  25 *     to a single zero byte (but not as a fallback except for SBCS)
  26 *
  27 *   Remaining limitations in fromUnicode:
  28 *   - byte sequences must not have leading zero bytes
  29 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
  30 *   - limitation to up to 4 bytes per character
  31 *
  32 *   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
  33 *   limitations and adds m:n character mappings and other features.
  34 *   See ucnv_ext.h for details.
  35 *
  36 *   Change history:
  37 *
  38 *    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
  39 *                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
  40 *                             macros to ucnvmbcs.h file
  41 */
  42
  43 #include "unicode/utypes.h"
  44
  45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  46
  47 #include "unicode/ucnv.h"
  48 #include "unicode/ucnv_cb.h"
  49 #include "unicode/udata.h"
  50 #include "unicode/uset.h"
  51 #include "ucnv_bld.h"
  52 #include "ucnvmbcs.h"
  53 #include "ucnv_ext.h"
  54 #include "ucnv_cnv.h"
  55 #include "umutex.h"
  56 #include "cmemory.h"
  57 #include "cstring.h"
  58
  59 /* control optimizations according to the platform */
  60 #define MBCS_UNROLL_SINGLE_TO_BMP 1
  61 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
  62
  63 /*
  64  * _MBCSHeader versions 4.2
  65  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
  66  *
  67  * Change from version 4.1:
  68  * - Added an optional extension table structure at the end of the .cnv file.
  69  *   It is present if the upper bits of the header flags field contains a non-zero
  70  *   byte offset to it.
  71  *   Files that contain only a conversion table and no base table
  72  *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
  73  *   These contain the base table name between the MBCS header and the extension
  74  *   data.
  75  *
  76  * Change from version 4.0:
  77  * - Replace header.reserved with header.fromUBytesLength so that all
  78  *   fields in the data have length.
  79  *
  80  * Changes from version 3 (for performance improvements):
  81  * - new bit distribution for state table entries
  82  * - reordered action codes
  83  * - new data structure for single-byte fromUnicode
  84  *   + stage 2 only contains indexes
  85  *   + stage 3 stores 16 bits per character with classification bits 15..8
  86  * - no multiplier for stage 1 entries
  87  * - stage 2 for non-single-byte codepages contains the index and the flags in
  88  *   one 32-bit value
  89  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
  90  *
  91  * For more details about old versions of the MBCS data structure, see
  92  * the corresponding versions of this file.
  93  *
  94  * Converting stateless codepage data ---------------------------------------***
  95  * (or codepage data with simple states) to Unicode.
  96  *
  97  * Data structure and algorithm for converting from complex legacy codepages
  98  * to Unicode. (Designed before 2000-may-22.)
  99  *
 100  * The basic idea is that the structure of legacy codepages can be described
 101  * with state tables.
 102  * When reading a byte stream, each input byte causes a state transition.
 103  * Some transitions result in the output of a code point, some result in
 104  * "unassigned" or "illegal" output.
 105  * This is used here for character conversion.
 106  *
 107  * The data structure begins with a state table consisting of a row
 108  * per state, with 256 entries (columns) per row for each possible input
 109  * byte value.
 110  * Each entry is 32 bits wide, with two formats distinguished by
 111  * the sign bit (bit 31):
 112  *
 113  * One format for transitional entries (bit 31 not set) for non-final bytes, and
 114  * one format for final entries (bit 31 set).
 115  * Both formats contain the number of the next state in the same bit
 116  * positions.
 117  * State 0 is the initial state.
 118  *
 119  * Most of the time, the offset values of subsequent states are added
 120  * up to a scalar value. This value will eventually be the index of
 121  * the Unicode code point in a table that follows the state table.
 122  * The effect is that the code points for final state table rows
 123  * are contiguous. The code points of final state rows follow each other
 124  * in the order of the references to those final states by previous
 125  * states, etc.
 126  *
 127  * For some terminal states, the offset is itself the output Unicode
  128  * code point (16 bits for a BMP code point or 20 bits for a supplementary
  129  * code point, stored as code point minus 0x10000 so that 20 bits are enough).
 130  * For others, the code point in the Unicode table is stored with either
 131  * one or two code units: one for BMP code points, two for a pair of
 132  * surrogates.
 133  * All code points for a final state entry take up the same number of code
 134  * units, regardless of whether they all actually _use_ the same number
 135  * of code units. This is necessary for simple array access.
 136  *
 137  * An additional feature comes in with what in ICU is called "fallback"
 138  * mappings:
 139  *
 140  * In addition to round-trippable, precise, 1:1 mappings, there are often
 141  * mappings defined between similar, though not the same, characters.
 142  * Typically, such mappings occur only in fromUnicode mapping tables because
 143  * Unicode has a superset repertoire of most other codepages. However, it
 144  * is possible to provide such mappings in the toUnicode tables, too.
 145  * In this case, the fallback mappings are partly integrated into the
 146  * general state tables because the structure of the encoding includes their
 147  * byte sequences.
 148  * For final entries in an initial state, fallback mappings are stored in
 149  * the entry itself like with roundtrip mappings.
 150  * For other final entries, they are stored in the code units table if
 151  * the entry is for a pair of code units.
 152  * For single-unit results in the code units table, there is no space to
 153  * alternatively hold a fallback mapping; in this case, the code unit
 154  * is stored as U+fffe (unassigned), and the fallback mapping needs to
 155  * be looked up by the scalar offset value in a separate table.
 156  *
 157  * "Unassigned" state entries really mean "structurally unassigned",
 158  * i.e., such a byte sequence will never have a mapping result.
 159  *
 160  * The interpretation of the bits in each entry is as follows:
 161  *
 162  * Bit 31 not set, not a terminal entry ("transitional"):
 163  * 30..24 next state
 164  * 23..0  offset delta, to be added up
 165  *
 166  * Bit 31 set, terminal ("final") entry:
 167  * 30..24 next state (regardless of action code)
 168  * 23..20 action code:
 169  *        action codes 0 and 1 result in precise-mapping Unicode code points
 170  *        0  valid byte sequence
 171  *           19..16 not used, 0
 172  *           15..0  16-bit Unicode BMP code point
 173  *                  never U+fffe or U+ffff
 174  *        1  valid byte sequence
 175  *           19..0  20-bit Unicode supplementary code point
 176  *                  never U+fffe or U+ffff
 177  *
 178  *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
 179  *        2  valid byte sequence (fallback)
 180  *           19..16 not used, 0
 181  *           15..0  16-bit Unicode BMP code point as fallback result
 182  *        3  valid byte sequence (fallback)
 183  *           19..0  20-bit Unicode supplementary code point as fallback result
 184  *
 185  *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
 186  *        depending on the code units they result in
 187  *        4  valid byte sequence
 188  *           19..9  not used, 0
 189  *            8..0  final offset delta
 190  *                  pointing to one 16-bit code unit which may be
 191  *                  fffe  unassigned -- look for a fallback for this offset
 192  *                  ffff  illegal
 193  *        5  valid byte sequence
 194  *           19..9  not used, 0
 195  *            8..0  final offset delta
 196  *                  pointing to two 16-bit code units
 197  *                  (typically UTF-16 surrogates)
 198  *                  the result depends on the first code unit as follows:
 199  *                  0000..d7ff  roundtrip BMP code point (1st alone)
 200  *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
 201  *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
 202  *                  e000        roundtrip BMP code point (2nd alone)
 203  *                  e001        fallback BMP code point (2nd alone)
 204  *                  fffe        unassigned
 205  *                  ffff        illegal
 206  *           (the final offset deltas are at most 255 * 2,
 207  *            times 2 because of storing code unit pairs)
 208  *
 209  *        6  unassigned byte sequence
 210  *           19..16 not used, 0
 211  *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
 212  *                  this does not contain a final offset delta because the main
 213  *                  purpose of this action code is to save scalar offset values;
 214  *                  therefore, fallback values cannot be assigned to byte
 215  *                  sequences that result in this action code
 216  *        7  illegal byte sequence
 217  *           19..16 not used, 0
 218  *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
 219  *        8  state change only
 220  *           19..0  not used, 0
 221  *           useful for state changes in simple stateful encodings,
 222  *           at Shift-In/Shift-Out codes
 223  *
 224  *
 225  *        9..15 reserved for future use
 226  *           current implementations will only perform a state change
 227  *           and ignore bits 19..0
 228  *
 229  * An encoding with contiguous ranges of unassigned byte sequences, like
 230  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
 231  * at least two states for the trail bytes:
 232  * One trail byte state that results in code points, and one that only
 233  * has "unassigned" and "illegal" terminal states.
 234  *
  235  * Note: partly by accident, this data structure supports simple stateful
 236  * encodings without any additional logic.
 237  * Currently, only simple Shift-In/Shift-Out schemes are handled with
 238  * appropriate state tables (especially EBCDIC_STATEFUL!).
 239  *
 240  * MBCS version 2 added:
 241  * unassigned and illegal action codes have U+fffe and U+ffff
 242  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
 243  *
 244  * Converting from Unicode to codepage bytes --------------------------------***
 245  *
 246  * The conversion data structure for fromUnicode is designed for the known
 247  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
 248  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
 249  * a roundtrip mapping.
 250  *
 251  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
 252  * like in the character properties table.
 253  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
 254  * with the resulting bytes is at offsetFromUBytes.
 255  *
 256  * Beginning with version 4, single-byte codepages have a significantly different
 257  * trie compared to other codepages.
 258  * In all cases, the entry in stage 1 is directly the index of the block of
 259  * 64 entries in stage 2.
 260  *
 261  * Single-byte lookup:
 262  *
 263  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
 264  * Stage 3 contains one 16-bit word per result:
 265  * Bits 15..8 indicate the kind of result:
 266  *    f  roundtrip result
 267  *    c  fallback result from private-use code point
 268  *    8  fallback result from other code points
 269  *    0  unassigned
 270  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
 271  *
 272  * Multi-byte lookup:
 273  *
 274  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
 275  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
 276  *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
 277  *             If this test is false, then a non-zero result will be interpreted as
 278  *             a fallback mapping.
 279  * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
 280  *
 281  * Stage 3 contains 2, 3, or 4 bytes per result.
 282  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
 283  * while 3 bytes are stored as bytes in big-endian order.
 284  * Leading zero bytes are ignored, and the number of bytes is counted.
 285  * A zero byte mapping result is possible as a roundtrip result.
 286  * For some output types, the actual result is processed from this;
 287  * see ucnv_MBCSFromUnicodeWithOffsets().
 288  *
 289  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
 290  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
 291  *
 292  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
 293  * for compaction.
 294  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
 295  * may overlap by any number of entries.
 296  *
 297  * MBCS version 2 added:
 298  * the converter checks for known output types, which allows
 299  * adding new ones without crashing an unaware converter
 300  */
 301
 302
 303 /* GB 18030 data ------------------------------------------------------------ */
 304
 305 /* helper macros for linear values for GB 18030 four-byte sequences */
 306 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
 307
 308 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
 309
 310 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
 311
 312 /*
 313  * Some ranges of GB 18030 where both the Unicode code points and the
 314  * GB four-byte sequences are contiguous and are handled algorithmically by
 315  * the special callback functions below.
 316  * The values are start & end of Unicode & GB codes.
 317  *
 318  * Note that single surrogates are not mapped by GB 18030
 319  * as of the re-released mapping tables from 2000-nov-30.
 320  */
 321 static const uint32_t
 322 gb18030Ranges[13][4]={
 323     {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
 324     {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
 325     {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
 326     {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
 327     {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
 328     {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
 329     {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
 330     {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
 331     {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
 332     {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
 333     {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
 334     {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
 335     {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
 336 };
 337
 338 /* bit flag for UConverter.options indicating GB 18030 special handling */
 339 #define _MBCS_OPTION_GB18030 0x8000
 340
 341 /* Miscellaneous ------------------------------------------------------------ */
 342
 343 /* similar to ucnv_MBCSGetNextUChar() but recursive */
 344 static void
 345 _getUnicodeSetForBytes(const UConverterSharedData *sharedData,
 346                        const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
 347                        USetAdder *sa,
 348                        UConverterUnicodeSet which,
 349                        uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
 350
 351                        UErrorCode *pErrorCode) {
 352     int32_t b, entry;
 353
 354     for(b=lowByte; b<=highByte; ++b) {
 355         entry=stateTable[state][b];
 356         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
 357             _getUnicodeSetForBytes(
 358                 sharedData, stateTable, unicodeCodeUnits,
 359                 sa, which,
 360                 (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry),
 361                 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
 362                 0, 0xff,
 363                 pErrorCode);
 364         } else {
 365             UChar32 c;
 366             int32_t rowOffset=offset;
 367             uint8_t action;
 368
 369             c=U_SENTINEL;
 370
 371             /*
 372              * An if-else-if chain provides more reliable performance for
 373              * the most common cases compared to a switch.
 374              */
 375             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
 376             if(action==MBCS_STATE_VALID_DIRECT_16) {
 377                 /* output BMP code point */
 378                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
 379             } else if(action==MBCS_STATE_VALID_16) {
 380                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
 381                 c=unicodeCodeUnits[offset];
 382                 if(c<0xfffe) {
 383                     /* output BMP code point */
 384                 } else {
 385                     c=U_SENTINEL;
 386                 }
 387             } else if(action==MBCS_STATE_VALID_16_PAIR) {
 388                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
 389                 c=unicodeCodeUnits[offset++];
 390                 if(c<0xd800) {
 391                     /* output BMP code point below 0xd800 */
 392                 } else if(c<=0xdbff) {
 393                     /* output roundtrip or fallback supplementary code point */
 394                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
 395                 } else if(c==0xe000) {
 396                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
 397                     c=unicodeCodeUnits[offset];
 398                 } else {
 399                     c=U_SENTINEL;
 400                 }
 401             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
 402                 /* output supplementary code point */
 403                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
 404             }
 405
 406             if(c>=0) {
 407                 sa->add(sa->set, c);
 408             }
 409             offset=rowOffset;
 410         }
 411     }
 412 }
 413
 414 /*
 415  * Internal function returning a UnicodeSet for toUnicode() conversion.
 416  * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
 417  * In the future, if we add support for reverse-fallback sets, this function
 418  * needs to be updated, and called for each initial state.
 419  * Does not currently handle extensions.
 420  * Does not empty the set first.
 421  */
 422 U_CFUNC void
 423 ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
 424                            USetAdder *sa,
 425                            UConverterUnicodeSet which,
 426                            uint8_t state, int32_t lowByte, int32_t highByte,
 427                            UErrorCode *pErrorCode) {
 428     _getUnicodeSetForBytes(
 429         sharedData, sharedData->mbcs.stateTable, sharedData->mbcs.unicodeCodeUnits,
 430         sa, which,
 431         state, 0, lowByte, highByte,
 432         pErrorCode);
 433 }
 434
 435 U_CFUNC void
 436 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
 437                              USetAdder *sa,
 438                              UConverterUnicodeSet which,
 439                              UErrorCode *pErrorCode) {
 440     const UConverterMBCSTable *mbcsTable;
 441     const uint16_t *table;
 442
 443     uint32_t st3;
 444     uint16_t st1, maxStage1, st2;
 445
 446     UChar32 c;
 447
 448     /* enumerate the from-Unicode trie table */
 449     mbcsTable=&sharedData->mbcs;
 450     table=mbcsTable->fromUnicodeTable;
 451     if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
 452         maxStage1=0x440;
 453     } else {
 454         maxStage1=0x40;
 455     }
 456
 457     c=0; /* keep track of the current code point while enumerating */
 458
 459     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 460         const uint16_t *stage2, *stage3, *results;
 461
 462         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
 463
 464         for(st1=0; st1<maxStage1; ++st1) {
 465             st2=table[st1];
 466             if(st2>maxStage1) {
 467                 stage2=table+st2;
 468                 for(st2=0; st2<64; ++st2) {
 469                     if((st3=stage2[st2])!=0) {
 470                         /* read the stage 3 block */
 471                         stage3=results+st3;
 472
 473                         /*
 474                          * Add code points for which the roundtrip flag is set.
 475                          * Once we get a set for fallback mappings, we have to use
 476                          * a threshold variable with a value of 0x800.
 477                          * See ucnv_MBCSSingleFromBMPWithOffsets() and
 478                          * MBCS_SINGLE_RESULT_FROM_U() for details.
 479                          */
 480                         do {
 481                             if(*stage3++>=0xf00) {
 482                                 sa->add(sa->set, c);
 483                             }
 484                         } while((++c&0xf)!=0);
 485                     } else {
 486                         c+=16; /* empty stage 3 block */
 487                     }
 488                 }
 489             } else {
 490                 c+=1024; /* empty stage 2 block */
 491             }
 492         }
 493     } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) {
 494         /* ignore single-byte results */
 495         const uint32_t *stage2;
 496         const uint16_t *stage3, *results;
 497
 498         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
 499
 500         for(st1=0; st1<maxStage1; ++st1) {
 501             st2=table[st1];
 502             if(st2>(maxStage1>>1)) {
 503                 stage2=(const uint32_t *)table+st2;
 504                 for(st2=0; st2<64; ++st2) {
 505                     if((st3=stage2[st2])!=0) {
 506                         /* read the stage 3 block */
 507                         stage3=results+16*(uint32_t)(uint16_t)st3;
 508
 509                         /* get the roundtrip flags for the stage 3 block */
 510                         st3>>=16;
 511
 512                         /*
 513                          * Add code points for which the roundtrip flag is set.
 514                          * Once we get a set for fallback mappings, we have to check
 515                          * non-roundtrip stage 3 results for whether they are 0.
 516                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
 517                          *
 518                          * Ignore single-byte results (<0x100).
 519                          */
 520                         do {
 521                             if((st3&1)!=0 && *stage3>=0x100) {
 522                                 sa->add(sa->set, c);
 523                             }
 524                             st3>>=1;
 525                             ++stage3;
 526                         } while((++c&0xf)!=0);
 527                     } else {
 528                         c+=16; /* empty stage 3 block */
 529                     }
 530                 }
 531             } else {
 532                 c+=1024; /* empty stage 2 block */
 533             }
 534         }
 535     } else {
 536         const uint32_t *stage2;
 537
 538         for(st1=0; st1<maxStage1; ++st1) {
 539             st2=table[st1];
 540             if(st2>(maxStage1>>1)) {
 541                 stage2=(const uint32_t *)table+st2;
 542                 for(st2=0; st2<64; ++st2) {
 543                     if((st3=stage2[st2])!=0) {
 544                         /* get the roundtrip flags for the stage 3 block */
 545                         st3>>=16;
 546
 547                         /*
 548                          * Add code points for which the roundtrip flag is set.
 549                          * Once we get a set for fallback mappings, we have to check
 550                          * non-roundtrip stage 3 results for whether they are 0.
 551                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
 552                          */
 553                         do {
 554                             if(st3&1) {
 555                                 sa->add(sa->set, c);
 556                             }
 557                             st3>>=1;
 558                         } while((++c&0xf)!=0);
 559                     } else {
 560                         c+=16; /* empty stage 3 block */
 561                     }
 562                 }
 563             } else {
 564                 c+=1024; /* empty stage 2 block */
 565             }
 566         }
 567     }
 568
 569     ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
 570 }
 571
 572 static void
 573 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
 574                    USetAdder *sa,
 575                    UConverterUnicodeSet which,
 576                    UErrorCode *pErrorCode) {
 577     if(cnv->options&_MBCS_OPTION_GB18030) {
 578         sa->addRange(sa->set, 0, 0xd7ff);
 579         sa->addRange(sa->set, 0xe000, 0x10ffff);
 580     } else {
 581         ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
 582     }
 583 }
 584
 585 /* conversion extensions for input not in the main table -------------------- */
 586
 587 /*
 588  * Hardcoded extension handling for GB 18030.
 589  * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
 590  *
 591  * In the future, conversion extensions may handle m:n mappings and delta tables,
 592  * see http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/conversion/conversion_extensions.html
 593  *
 594  * If an input character cannot be mapped, then these functions set an error
 595  * code. The framework will then call the callback function.
 596  */
 597
 598 /*
 599  * @return if(U_FAILURE) return the code point for cnv->fromUChar32
 600  *         else return 0 after output has been written to the target
 601  */
 602 static UChar32
 603 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
 604           UChar32 cp,
 605           const UChar **source, const UChar *sourceLimit,
 606           char **target, const char *targetLimit,
 607           int32_t **offsets, int32_t sourceIndex,
 608           UBool flush,
 609           UErrorCode *pErrorCode) {
 610     const int32_t *cx;
 611
 612     cnv->useSubChar1=FALSE;
 613
 614     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
 615         ucnv_extInitialMatchFromU(
 616             cnv, cx,
 617             cp, source, sourceLimit,
 618             target, targetLimit,
 619             offsets, sourceIndex,
 620             flush,
 621             pErrorCode)
 622     ) {
 623         return 0; /* an extension mapping handled the input */
 624     }
 625
 626     /* GB 18030 */
 627     if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
 628         const uint32_t *range;
 629         int32_t i;
 630
 631         range=gb18030Ranges[0];
 632         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
 633             if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
 634                 /* found the Unicode code point, output the four-byte sequence for it */
 635                 uint32_t linear;
 636                 char bytes[4];
 637
 638                 /* get the linear value of the first GB 18030 code in this range */
 639                 linear=range[2]-LINEAR_18030_BASE;
 640
 641                 /* add the offset from the beginning of the range */
 642                 linear+=((uint32_t)cp-range[0]);
 643
 644                 /* turn this into a four-byte sequence */
 645                 bytes[3]=(char)(0x30+linear%10); linear/=10;
 646                 bytes[2]=(char)(0x81+linear%126); linear/=126;
 647                 bytes[1]=(char)(0x30+linear%10); linear/=10;
 648                 bytes[0]=(char)(0x81+linear);
 649
 650                 /* output this sequence */
 651                 ucnv_fromUWriteBytes(cnv,
 652                                      bytes, 4, target, targetLimit,
 653                                      offsets, sourceIndex, pErrorCode);
 654                 return 0;
 655             }
 656         }
 657     }
 658
 659     /* no mapping */
 660     *pErrorCode=U_INVALID_CHAR_FOUND;
 661     return cp;
 662 }
 663
 664 /*
 665  * Input sequence: cnv->toUBytes[0..length[
 666  * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
 667  *         else return 0 after output has been written to the target
 668  */
 669 static int8_t
 670 _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
 671         int8_t length,
 672         const char **source, const char *sourceLimit,
 673         UChar **target, const UChar *targetLimit,
 674         int32_t **offsets, int32_t sourceIndex,
 675         UBool flush,
 676         UErrorCode *pErrorCode) {
 677     const int32_t *cx;
 678
 679     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
 680         ucnv_extInitialMatchToU(
 681             cnv, cx,
 682             length, source, sourceLimit,
 683             target, targetLimit,
 684             offsets, sourceIndex,
 685             flush,
 686             pErrorCode)
 687     ) {
 688         return 0; /* an extension mapping handled the input */
 689     }
 690
 691     /* GB 18030 */
 692     if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
 693         const uint32_t *range;
 694         uint32_t linear;
 695         int32_t i;
 696
 697         linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
 698         range=gb18030Ranges[0];
 699         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
 700             if(range[2]<=linear && linear<=range[3]) {
 701                 /* found the sequence, output the Unicode code point for it */
 702                 *pErrorCode=U_ZERO_ERROR;
 703
 704                 /* add the linear difference between the input and start sequences to the start code point */
 705                 linear=range[0]+(linear-range[2]);
 706
 707                 /* output this code point */
 708                 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
 709
 710                 return 0;
 711             }
 712         }
 713     }
 714
 715     /* no mapping */
 716     *pErrorCode=U_INVALID_CHAR_FOUND;
 717     return length;
 718 }
 719
 720 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
 721
 722 /*
 723  * This code modifies a standard EBCDIC<->Unicode mapping table for
 724  * OS/390 (z/OS) Unix System Services (Open Edition).
 725  * The difference is in the mapping of Line Feed and New Line control codes:
 726  * Standard EBCDIC maps
 727  *
 728  *   <U000A> \x25 |0
 729  *   <U0085> \x15 |0
 730  *
 731  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 732  * mapping
 733  *
 734  *   <U000A> \x15 |0
 735  *   <U0085> \x25 |0
 736  *
 737  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
 738  * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support both versions of the same EBCDIC charset
 * without duplicating the entire installed table.
 741  */
 742
/*
 * standard EBCDIC codes
 * (byte values for Line Feed and New Line in the unmodified table;
 * they index row 0 of the to-Unicode state table in _EBCDICSwapLFNL())
 */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/*
 * standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables
 * (same byte values as above in the low byte; the 0xf00 part is presumably
 * the roundtrip flag - confirm against the MBCS_OUTPUT_1 result format)
 */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* Unicode code points (U+000A Line Feed, U+0085 New Line) */
#define U_LF 0x0a
#define U_NL 0x85
 754
 755 static UBool
 756 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
 757     UConverterMBCSTable *mbcsTable;
 758
 759     const uint16_t *table, *results;
 760     const uint8_t *bytes;
 761
 762     int32_t (*newStateTable)[256];
 763     uint16_t *newResults;
 764     uint8_t *p;
 765     char *name;
 766
 767     uint32_t stage2Entry;
 768     uint32_t size, sizeofFromUBytes;
 769
 770     mbcsTable=&sharedData->mbcs;
 771
 772     table=mbcsTable->fromUnicodeTable;
 773     bytes=mbcsTable->fromUnicodeBytes;
 774     results=(const uint16_t *)bytes;
 775
 776     /*
 777      * Check that this is an EBCDIC table with SBCS portion -
 778      * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
 779      *
 780      * If not, ignore the option. Options are always ignored if they do not apply.
 781      */
 782     if(!(
 783          (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
 784          mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
 785          mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
 786     )) {
 787         return FALSE;
 788     }
 789
 790     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 791         if(!(
 792              EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
 793              EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
 794         )) {
 795             return FALSE;
 796         }
 797     } else /* MBCS_OUTPUT_2_SISO */ {
 798         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
 799         if(!(
 800              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
 801              EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
 802         )) {
 803             return FALSE;
 804         }
 805
 806         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
 807         if(!(
 808              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
 809              EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
 810         )) {
 811             return FALSE;
 812         }
 813     }
 814
 815     if(mbcsTable->fromUBytesLength>0) {
 816         /*
 817          * We _know_ the number of bytes in the fromUnicodeBytes array
 818          * starting with header.version 4.1.
 819          */
 820         sizeofFromUBytes=mbcsTable->fromUBytesLength;
 821     } else {
 822         /*
 823          * Otherwise:
 824          * There used to be code to enumerate the fromUnicode
 825          * trie and find the highest entry, but it was removed in ICU 3.2
 826          * because it was not tested and caused a low code coverage number.
 827          * See Jitterbug 3674.
 828          * This affects only some .cnv file formats with a header.version
 829          * below 4.1, and only when swaplfnl is requested.
 830          *
 831          * ucnvmbcs.c revision 1.99 is the last one with the
 832          * ucnv_MBCSSizeofFromUBytes() function.
 833          */
 834         *pErrorCode=U_INVALID_FORMAT_ERROR;
 835         return FALSE;
 836     }
 837
 838     /*
 839      * The table has an appropriate format.
 840      * Allocate and build
 841      * - a modified to-Unicode state table
 842      * - a modified from-Unicode output array
 843      * - a converter name string with the swap option appended
 844      */
 845     size=
 846         mbcsTable->countStates*1024+
 847         sizeofFromUBytes+
 848         UCNV_MAX_CONVERTER_NAME_LENGTH+20;
 849     p=(uint8_t *)uprv_malloc(size);
 850     if(p==NULL) {
 851         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 852         return FALSE;
 853     }
 854
 855     /* copy and modify the to-Unicode state table */
 856     newStateTable=(int32_t (*)[256])p;
 857     uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
 858
 859     newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
 860     newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
 861
 862     /* copy and modify the from-Unicode result table */
 863     newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
 864     uprv_memcpy(newResults, bytes, sizeofFromUBytes);
 865
 866     /* conveniently, the table access macros work on the left side of expressions */
 867     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 868         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
 869         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
 870     } else /* MBCS_OUTPUT_2_SISO */ {
 871         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
 872         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
 873
 874         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
 875         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
 876     }
 877
 878     /* set the canonical converter name */
 879     name=(char *)newResults+sizeofFromUBytes;
 880     uprv_strcpy(name, sharedData->staticData->name);
 881     uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
 882
 883     /* set the pointers */
 884     umtx_lock(NULL);
 885     if(mbcsTable->swapLFNLStateTable==NULL) {
 886         mbcsTable->swapLFNLStateTable=newStateTable;
 887         mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
 888         mbcsTable->swapLFNLName=name;
 889
 890         newStateTable=NULL;
 891     }
 892     umtx_unlock(NULL);
 893
 894     /* release the allocated memory if another thread beat us to it */
 895     if(newStateTable!=NULL) {
 896         uprv_free(newStateTable);
 897     }
 898     return TRUE;
 899 }
 900
 901 /* MBCS setup functions ----------------------------------------------------- */
 902
 903 static void
 904 ucnv_MBCSLoad(UConverterSharedData *sharedData,
 905           UConverterLoadArgs *pArgs,
 906           const uint8_t *raw,
 907           UErrorCode *pErrorCode) {
 908     UDataInfo info;
 909     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
 910     _MBCSHeader *header=(_MBCSHeader *)raw;
 911     uint32_t offset;
 912
 913     if(header->version[0]!=4) {
 914         *pErrorCode=U_INVALID_TABLE_FORMAT;
 915         return;
 916     }
 917
 918     mbcsTable->outputType=(uint8_t)header->flags;
 919
 920     /* extension data, header version 4.2 and higher */
 921     offset=header->flags>>8;
 922     if(offset!=0) {
 923         mbcsTable->extIndexes=(const int32_t *)(raw+offset);
 924     }
 925
 926     if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
 927         UConverterLoadArgs args={ 0 };
 928         UConverterSharedData *baseSharedData;
 929         const int32_t *extIndexes;
 930         const char *baseName;
 931
 932         /* extension-only file, load the base table and set values appropriately */
 933         if((extIndexes=mbcsTable->extIndexes)==NULL) {
 934             /* extension-only file without extension */
 935             *pErrorCode=U_INVALID_TABLE_FORMAT;
 936             return;
 937         }
 938
 939         if(pArgs->nestedLoads!=1) {
 940             /* an extension table must not be loaded as a base table */
 941             *pErrorCode=U_INVALID_TABLE_FILE;
 942             return;
 943         }
 944
 945         /* load the base table */
 946         baseName=(const char *)(header+1);
 947         if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
 948             /* forbid loading this same extension-only file */
 949             *pErrorCode=U_INVALID_TABLE_FORMAT;
 950             return;
 951         }
 952
 953         /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
 954         args.size=sizeof(UConverterLoadArgs);
 955         args.nestedLoads=2;
 956         args.reserved=pArgs->reserved;
 957         args.options=pArgs->options;
 958         args.pkg=pArgs->pkg;
 959         args.name=baseName;
 960         baseSharedData=ucnv_load(&args, pErrorCode);
 961         if(U_FAILURE(*pErrorCode)) {
 962             return;
 963         }
 964         if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
 965             baseSharedData->mbcs.baseSharedData!=NULL
 966         ) {
 967             ucnv_unload(baseSharedData);
 968             *pErrorCode=U_INVALID_TABLE_FORMAT;
 969             return;
 970         }
 971
 972         /* copy the base table data */
 973         uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
 974
 975         /* overwrite values with relevant ones for the extension converter */
 976         mbcsTable->baseSharedData=baseSharedData;
 977         mbcsTable->extIndexes=extIndexes;
 978
 979         /*
 980          * It would be possible to share the swapLFNL data with a base converter,
 981          * but the generated name would have to be different, and the memory
 982          * would have to be free'd only once.
 983          * It is easier to just create the data for the extension converter
 984          * separately when it is requested.
 985          */
 986         mbcsTable->swapLFNLStateTable=NULL;
 987         mbcsTable->swapLFNLFromUnicodeBytes=NULL;
 988         mbcsTable->swapLFNLName=NULL;
 989
 990         /*
 991          * Set a special, runtime-only outputType if the extension converter
 992          * is a DBCS version of a base converter that also maps single bytes.
 993          */
 994         if( sharedData->staticData->conversionType==UCNV_DBCS ||
 995                 (sharedData->staticData->conversionType==UCNV_MBCS &&
 996                  sharedData->staticData->minBytesPerChar>=2)
 997         ) {
 998             if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
 999                 /* the base converter is SI/SO-stateful */
1000                 int32_t entry;
1001
1002                 /* get the dbcs state from the state table entry for SO=0x0e */
1003                 entry=mbcsTable->stateTable[0][0xe];
1004                 if( MBCS_ENTRY_IS_FINAL(entry) &&
1005                     MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1006                     MBCS_ENTRY_FINAL_STATE(entry)!=0
1007                 ) {
1008                     mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1009
1010                     mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1011                 }
1012             } else if(
1013                 baseSharedData->staticData->conversionType==UCNV_MBCS &&
1014                 baseSharedData->staticData->minBytesPerChar==1 &&
1015                 baseSharedData->staticData->maxBytesPerChar==2 &&
1016                 mbcsTable->countStates<=127
1017             ) {
1018                 /* non-stateful base converter, need to modify the state table */
1019                 int32_t (*newStateTable)[256];
1020                 int32_t *state;
1021                 int32_t i, count;
1022
1023                 /* allocate a new state table and copy the base state table contents */
1024                 count=mbcsTable->countStates;
1025                 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1026                 if(newStateTable==NULL) {
1027                     ucnv_unload(baseSharedData);
1028                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1029                     return;
1030                 }
1031
1032                 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1033
1034                 /* change all final single-byte entries to go to a new all-illegal state */
1035                 state=newStateTable[0];
1036                 for(i=0; i<256; ++i) {
1037                     if(MBCS_ENTRY_IS_FINAL(state[i])) {
1038                         state[i]=MBCS_ENTRY_TRANSITION(count, 0);
1039                     }
1040                 }
1041
1042                 /* build the new all-illegal state */
1043                 state=newStateTable[count];
1044                 for(i=0; i<256; ++i) {
1045                     state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
1046                 }
1047                 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1048                 mbcsTable->countStates=(uint8_t)(count+1);
1049                 mbcsTable->stateTableOwned=TRUE;
1050
1051                 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1052             }
1053         }
1054
1055         /*
1056          * unlike below for files with base tables, do not get the unicodeMask
1057          * from the sharedData; instead, use the base table's unicodeMask,
1058          * which we copied in the memcpy above;
1059          * this is necessary because the static data unicodeMask, especially
1060          * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1061          */
1062     } else {
1063         /* conversion file with a base table; an additional extension table is optional */
1064         /* make sure that the output type is known */
1065         switch(mbcsTable->outputType) {
1066         case MBCS_OUTPUT_1:
1067         case MBCS_OUTPUT_2:
1068         case MBCS_OUTPUT_3:
1069         case MBCS_OUTPUT_4:
1070         case MBCS_OUTPUT_3_EUC:
1071         case MBCS_OUTPUT_4_EUC:
1072         case MBCS_OUTPUT_2_SISO:
1073             /* OK */
1074             break;
1075         default:
1076             *pErrorCode=U_INVALID_TABLE_FORMAT;
1077             return;
1078         }
1079
1080         mbcsTable->countStates=(uint8_t)header->countStates;
1081         mbcsTable->countToUFallbacks=header->countToUFallbacks;
1082         mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader));
1083         mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1084         mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1085
1086         mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1087         mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1088         mbcsTable->fromUBytesLength=header->fromUBytesLength;
1089
1090         /*
1091          * converter versions 6.1 and up contain a unicodeMask that is
1092          * used here to select the most efficient function implementations
1093          */
1094         info.size=sizeof(UDataInfo);
1095         udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1096         if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1097             /* mask off possible future extensions to be safe */
1098             mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1099         } else {
1100             /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1101             mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
1102         }
1103     }
1104 }
1105
1106 static void
1107 ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1108     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1109
1110     if(mbcsTable->swapLFNLStateTable!=NULL) {
1111         uprv_free(mbcsTable->swapLFNLStateTable);
1112     }
1113     if(mbcsTable->stateTableOwned) {
1114         uprv_free((void *)mbcsTable->stateTable);
1115     }
1116     if(mbcsTable->baseSharedData!=NULL) {
1117         ucnv_unload(mbcsTable->baseSharedData);
1118     }
1119 }
1120
1121 static void
1122 ucnv_MBCSOpen(UConverter *cnv,
1123           const char *name,
1124           const char *locale,
1125           uint32_t options,
1126           UErrorCode *pErrorCode) {
1127     UConverterMBCSTable *mbcsTable;
1128     const int32_t *extIndexes;
1129     uint8_t outputType;
1130     int8_t maxBytesPerUChar;
1131
1132     mbcsTable=&cnv->sharedData->mbcs;
1133     outputType=mbcsTable->outputType;
1134
1135     if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1136         /* the swaplfnl option does not apply, remove it */
1137         cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
1138     }
1139
1140     if((options&UCNV_OPTION_SWAP_LFNL)!=0) {
1141         /* do this because double-checked locking is broken */
1142         UBool isCached;
1143
1144         umtx_lock(NULL);
1145         isCached=mbcsTable->swapLFNLStateTable!=NULL;
1146         umtx_unlock(NULL);
1147
1148         if(!isCached) {
1149             if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
1150                 if(U_FAILURE(*pErrorCode)) {
1151                     return; /* something went wrong */
1152                 }
1153
1154                 /* the option does not apply, remove it */
1155                 cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
1156             }
1157         }
1158     }
1159
1160     if(uprv_strstr(name, "18030")!=NULL) {
1161         if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) {
1162             /* set a flag for GB 18030 mode, which changes the callback behavior */
1163             cnv->options|=_MBCS_OPTION_GB18030;
1164         }
1165     }
1166
1167     /* fix maxBytesPerUChar depending on outputType and options etc. */
1168     if(outputType==MBCS_OUTPUT_2_SISO) {
1169         cnv->maxBytesPerUChar=3; /* SO+DBCS */
1170     }
1171
1172     extIndexes=mbcsTable->extIndexes;
1173     if(extIndexes!=NULL) {
1174         maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1175         if(outputType==MBCS_OUTPUT_2_SISO) {
1176             ++maxBytesPerUChar; /* SO + multiple DBCS */
1177         }
1178
1179         if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1180             cnv->maxBytesPerUChar=maxBytesPerUChar;
1181         }
1182     }
1183
1184 #if 0
1185     /*
1186      * documentation of UConverter fields used for status
1187      * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1188      */
1189
1190     /* toUnicode */
1191     cnv->toUnicodeStatus=0;     /* offset */
1192     cnv->mode=0;                /* state */
1193     cnv->toULength=0;           /* byteIndex */
1194
1195     /* fromUnicode */
1196     cnv->fromUChar32=0;
1197     cnv->fromUnicodeStatus=1;   /* prevLength */
1198 #endif
1199 }
1200
1201 static const char *
1202 ucnv_MBCSGetName(const UConverter *cnv) {
1203     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
1204         return cnv->sharedData->mbcs.swapLFNLName;
1205     } else {
1206         return cnv->sharedData->staticData->name;
1207     }
1208 }
1209
1210 /* MBCS-to-Unicode conversion functions ------------------------------------- */
1211
1212 static UChar32
1213 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
1214     const _MBCSToUFallback *toUFallbacks;
1215     uint32_t i, start, limit;
1216
1217     limit=mbcsTable->countToUFallbacks;
1218     if(limit>0) {
1219         /* do a binary search for the fallback mapping */
1220         toUFallbacks=mbcsTable->toUFallbacks;
1221         start=0;
1222         while(start<limit-1) {
1223             i=(start+limit)/2;
1224             if(offset<toUFallbacks[i].offset) {
1225                 limit=i;
1226             } else {
1227                 start=i;
1228             }
1229         }
1230
1231         /* did we really find it? */
1232         if(offset==toUFallbacks[start].offset) {
1233             return toUFallbacks[start].codePoint;
1234         }
1235     }
1236
1237     return 0xfffe;
1238 }
1239
1240 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
1241 static void
1242 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1243                                 UErrorCode *pErrorCode) {
1244     UConverter *cnv;
1245     const uint8_t *source, *sourceLimit;
1246     UChar *target;
1247     const UChar *targetLimit;
1248     int32_t *offsets;
1249
1250     const int32_t (*stateTable)[256];
1251
1252     int32_t sourceIndex;
1253
1254     int32_t entry;
1255     UChar c;
1256     uint8_t action;
1257
1258     /* set up the local pointers */
1259     cnv=pArgs->converter;
1260     source=(const uint8_t *)pArgs->source;
1261     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1262     target=pArgs->target;
1263     targetLimit=pArgs->targetLimit;
1264     offsets=pArgs->offsets;
1265
1266     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1267         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1268     } else {
1269         stateTable=cnv->sharedData->mbcs.stateTable;
1270     }
1271
1272     /* sourceIndex=-1 if the current character began in the previous buffer */
1273     sourceIndex=0;
1274
1275     /* conversion loop */
1276     while(source<sourceLimit) {
1277         /*
1278          * This following test is to see if available input would overflow the output.
1279          * It does not catch output of more than one code unit that
1280          * overflows as a result of a surrogate pair or callback output
1281          * from the last source byte.
1282          * Therefore, those situations also test for overflows and will
1283          * then break the loop, too.
1284          */
1285         if(target>=targetLimit) {
1286             /* target is full */
1287             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1288             break;
1289         }
1290
1291         entry=stateTable[0][*source++];
1292         /* MBCS_ENTRY_IS_FINAL(entry) */
1293
1294         /* test the most common case first */
1295         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1296             /* output BMP code point */
1297             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1298             if(offsets!=NULL) {
1299                 *offsets++=sourceIndex;
1300             }
1301
1302             /* normal end of action codes: prepare for a new character */
1303             ++sourceIndex;
1304             continue;
1305         }
1306
1307         /*
1308          * An if-else-if chain provides more reliable performance for
1309          * the most common cases compared to a switch.
1310          */
1311         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1312         if(action==MBCS_STATE_VALID_DIRECT_20 ||
1313            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1314         ) {
1315             entry=MBCS_ENTRY_FINAL_VALUE(entry);
1316             /* output surrogate pair */
1317             *target++=(UChar)(0xd800|(UChar)(entry>>10));
1318             if(offsets!=NULL) {
1319                 *offsets++=sourceIndex;
1320             }
1321             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1322             if(target<targetLimit) {
1323                 *target++=c;
1324                 if(offsets!=NULL) {
1325                     *offsets++=sourceIndex;
1326                 }
1327             } else {
1328                 /* target overflow */
1329                 cnv->UCharErrorBuffer[0]=c;
1330                 cnv->UCharErrorBufferLength=1;
1331                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1332                 break;
1333             }
1334
1335             ++sourceIndex;
1336             continue;
1337         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1338             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1339                 /* output BMP code point */
1340                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1341                 if(offsets!=NULL) {
1342                     *offsets++=sourceIndex;
1343                 }
1344
1345                 ++sourceIndex;
1346                 continue;
1347             }
1348         } else if(action==MBCS_STATE_UNASSIGNED) {
1349             /* just fall through */
1350         } else if(action==MBCS_STATE_ILLEGAL) {
1351             /* callback(illegal) */
1352             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1353         } else {
1354             /* reserved, must never occur */
1355             ++sourceIndex;
1356             continue;
1357         }
1358
1359         if(U_FAILURE(*pErrorCode)) {
1360             /* callback(illegal) */
1361             break;
1362         } else /* unassigned sequences indicated with byteIndex>0 */ {
1363             /* try an extension mapping */
1364             pArgs->source=(const char *)source;
1365             cnv->toUBytes[0]=*(source-1);
1366             cnv->toULength=_extToU(cnv, cnv->sharedData,
1367                                     1, (const char **)&source, (const char *)sourceLimit,
1368                                     &target, targetLimit,
1369                                     &offsets, sourceIndex,
1370                                     pArgs->flush,
1371                                     pErrorCode);
1372             sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
1373
1374             if(U_FAILURE(*pErrorCode)) {
1375                 /* not mappable or buffer overflow */
1376                 break;
1377             }
1378         }
1379     }
1380
1381     /* write back the updated pointers */
1382     pArgs->source=(const char *)source;
1383     pArgs->target=target;
1384     pArgs->offsets=offsets;
1385 }
1386
1387 /*
1388  * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1389  * that only map to and from the BMP.
1390  * In addition to single-byte optimizations, the offset calculations
1391  * become much easier.
1392  */
1393 static void
1394 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
1395                             UErrorCode *pErrorCode) {
1396     UConverter *cnv;
1397     const uint8_t *source, *sourceLimit, *lastSource;
1398     UChar *target;
1399     int32_t targetCapacity, length;
1400     int32_t *offsets;
1401
1402     const int32_t (*stateTable)[256];
1403
1404     int32_t sourceIndex;
1405
1406     int32_t entry;
1407     uint8_t action;
1408
1409     /* set up the local pointers */
1410     cnv=pArgs->converter;
1411     source=(const uint8_t *)pArgs->source;
1412     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1413     target=pArgs->target;
1414     targetCapacity=pArgs->targetLimit-pArgs->target;
1415     offsets=pArgs->offsets;
1416
1417     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1418         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1419     } else {
1420         stateTable=cnv->sharedData->mbcs.stateTable;
1421     }
1422
1423     /* sourceIndex=-1 if the current character began in the previous buffer */
1424     sourceIndex=0;
1425     lastSource=source;
1426
1427     /*
1428      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1429      * for the minimum of the sourceLength and targetCapacity
1430      */
1431     length=sourceLimit-source;
1432     if(length<targetCapacity) {
1433         targetCapacity=length;
1434     }
1435
1436 #if MBCS_UNROLL_SINGLE_TO_BMP
1437     /* unrolling makes it faster on Pentium III/Windows 2000 */
1438     /* unroll the loop with the most common case */
1439 unrolled:
1440     if(targetCapacity>=16) {
1441         int32_t count, loops, oredEntries;
1442
1443         loops=count=targetCapacity>>4;
1444         do {
1445             oredEntries=entry=stateTable[0][*source++];
1446             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1447             oredEntries|=entry=stateTable[0][*source++];
1448             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1449             oredEntries|=entry=stateTable[0][*source++];
1450             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1451             oredEntries|=entry=stateTable[0][*source++];
1452             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1453             oredEntries|=entry=stateTable[0][*source++];
1454             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1455             oredEntries|=entry=stateTable[0][*source++];
1456             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1457             oredEntries|=entry=stateTable[0][*source++];
1458             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1459             oredEntries|=entry=stateTable[0][*source++];
1460             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1461             oredEntries|=entry=stateTable[0][*source++];
1462             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1463             oredEntries|=entry=stateTable[0][*source++];
1464             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1465             oredEntries|=entry=stateTable[0][*source++];
1466             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1467             oredEntries|=entry=stateTable[0][*source++];
1468             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1469             oredEntries|=entry=stateTable[0][*source++];
1470             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1471             oredEntries|=entry=stateTable[0][*source++];
1472             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1473             oredEntries|=entry=stateTable[0][*source++];
1474             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1475             oredEntries|=entry=stateTable[0][*source++];
1476             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1477
1478             /* were all 16 entries really valid? */
1479             if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
1480                 /* no, return to the first of these 16 */
1481                 source-=16;
1482                 target-=16;
1483                 break;
1484             }
1485         } while(--count>0);
1486         count=loops-count;
1487         targetCapacity-=16*count;
1488
1489         if(offsets!=NULL) {
1490             lastSource+=16*count;
1491             while(count>0) {
1492                 *offsets++=sourceIndex++;
1493                 *offsets++=sourceIndex++;
1494                 *offsets++=sourceIndex++;
1495                 *offsets++=sourceIndex++;
1496                 *offsets++=sourceIndex++;
1497                 *offsets++=sourceIndex++;
1498                 *offsets++=sourceIndex++;
1499                 *offsets++=sourceIndex++;
1500                 *offsets++=sourceIndex++;
1501                 *offsets++=sourceIndex++;
1502                 *offsets++=sourceIndex++;
1503                 *offsets++=sourceIndex++;
1504                 *offsets++=sourceIndex++;
1505                 *offsets++=sourceIndex++;
1506                 *offsets++=sourceIndex++;
1507                 *offsets++=sourceIndex++;
1508                 --count;
1509             }
1510         }
1511     }
1512 #endif
1513
1514     /* conversion loop */
1515     while(targetCapacity>0) {
1516         entry=stateTable[0][*source++];
1517         /* MBCS_ENTRY_IS_FINAL(entry) */
1518
1519         /* test the most common case first */
1520         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1521             /* output BMP code point */
1522             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1523             --targetCapacity;
1524             continue;
1525         }
1526
1527         /*
1528          * An if-else-if chain provides more reliable performance for
1529          * the most common cases compared to a switch.
1530          */
1531         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1532         if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1533             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1534                 /* output BMP code point */
1535                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1536                 --targetCapacity;
1537                 continue;
1538             }
1539         } else if(action==MBCS_STATE_UNASSIGNED) {
1540             /* just fall through */
1541         } else if(action==MBCS_STATE_ILLEGAL) {
1542             /* callback(illegal) */
1543             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1544         } else {
1545             /* reserved, must never occur */
1546             continue;
1547         }
1548
1549         /* set offsets since the start or the last extension */
1550         if(offsets!=NULL) {
1551             int32_t count=(int32_t)(source-lastSource);
1552
1553             /* predecrement: do not set the offset for the callback-causing character */
1554             while(--count>0) {
1555                 *offsets++=sourceIndex++;
1556             }
1557             /* offset and sourceIndex are now set for the current character */
1558         }
1559
1560         if(U_FAILURE(*pErrorCode)) {
1561             /* callback(illegal) */
1562             break;
1563         } else /* unassigned sequences indicated with byteIndex>0 */ {
1564             /* try an extension mapping */
1565             lastSource=source;
1566             cnv->toUBytes[0]=*(source-1);
1567             cnv->toULength=_extToU(cnv, cnv->sharedData,
1568                                     1, (const char **)&source, (const char *)sourceLimit,
1569                                     &target, target+targetCapacity,
1570                                     &offsets, sourceIndex,
1571                                     pArgs->flush,
1572                                     pErrorCode);
1573             sourceIndex+=1+(int32_t)(source-lastSource);
1574
1575             if(U_FAILURE(*pErrorCode)) {
1576                 /* not mappable or buffer overflow */
1577                 break;
1578             }
1579
1580             /* recalculate the targetCapacity after an extension mapping */
1581             targetCapacity=pArgs->targetLimit-target;
1582             length=sourceLimit-source;
1583             if(length<targetCapacity) {
1584                 targetCapacity=length;
1585             }
1586         }
1587
1588 #if MBCS_UNROLL_SINGLE_TO_BMP
1589         /* unrolling makes it faster on Pentium III/Windows 2000 */
1590         goto unrolled;
1591 #endif
1592     }
1593
1594     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
1595         /* target is full */
1596         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1597     }
1598
1599     /* set offsets since the start or the last callback */
1600     if(offsets!=NULL) {
1601         size_t count=source-lastSource;
1602         while(count>0) {
1603             *offsets++=sourceIndex++;
1604             --count;
1605         }
1606     }
1607
1608     /* write back the updated pointers */
1609     pArgs->source=(const char *)source;
1610     pArgs->target=target;
1611     pArgs->offsets=offsets;
1612 }
1613
1614 U_CFUNC void
1615 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1616                           UErrorCode *pErrorCode) {
1617     UConverter *cnv;
1618     const uint8_t *source, *sourceLimit;
1619     UChar *target;
1620     const UChar *targetLimit;
1621     int32_t *offsets;
1622
1623     const int32_t (*stateTable)[256];
1624     const uint16_t *unicodeCodeUnits;
1625
1626     uint32_t offset;
1627     uint8_t state;
1628     int8_t byteIndex;
1629     uint8_t *bytes;
1630
1631     int32_t sourceIndex, nextSourceIndex;
1632
1633     int32_t entry;
1634     UChar c;
1635     uint8_t action;
1636
1637     /* use optimized function if possible */
1638     cnv=pArgs->converter;
1639
1640     if(cnv->preToULength>0) {
1641         /*
1642          * pass sourceIndex=-1 because we continue from an earlier buffer
1643          * in the future, this may change with continuous offsets
1644          */
1645         ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
1646
1647         if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
1648             return;
1649         }
1650     }
1651
1652     if(cnv->sharedData->mbcs.countStates==1) {
1653         if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1654             ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
1655         } else {
1656             ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
1657         }
1658         return;
1659     }
1660
1661     /* set up the local pointers */
1662     source=(const uint8_t *)pArgs->source;
1663     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1664     target=pArgs->target;
1665     targetLimit=pArgs->targetLimit;
1666     offsets=pArgs->offsets;
1667
1668     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1669         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1670     } else {
1671         stateTable=cnv->sharedData->mbcs.stateTable;
1672     }
1673     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
1674
1675     /* get the converter state from UConverter */
1676     offset=cnv->toUnicodeStatus;
1677     byteIndex=cnv->toULength;
1678     bytes=cnv->toUBytes;
1679
1680     /*
1681      * if we are in the SBCS state for a DBCS-only converter,
1682      * then load the DBCS state from the MBCS data
1683      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
1684      */
1685     if((state=(uint8_t)(cnv->mode))==0) {
1686         state=cnv->sharedData->mbcs.dbcsOnlyState;
1687     }
1688
1689     /* sourceIndex=-1 if the current character began in the previous buffer */
1690     sourceIndex=byteIndex==0 ? 0 : -1;
1691     nextSourceIndex=0;
1692
1693     /* conversion loop */
1694     while(source<sourceLimit) {
1695         /*
1696          * This following test is to see if available input would overflow the output.
1697          * It does not catch output of more than one code unit that
1698          * overflows as a result of a surrogate pair or callback output
1699          * from the last source byte.
1700          * Therefore, those situations also test for overflows and will
1701          * then break the loop, too.
1702          */
1703         if(target>=targetLimit) {
1704             /* target is full */
1705             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1706             break;
1707         }
1708
1709         if(byteIndex==0) {
1710             /* optimized loop for 1/2-byte input and BMP output */
1711             if(offsets==NULL) {
1712                 do {
1713                     entry=stateTable[state][*source];
1714                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1715                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1716                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1717
1718                         ++source;
1719                         if( source<sourceLimit &&
1720                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
1721                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
1722                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
1723                         ) {
1724                             ++source;
1725                             *target++=c;
1726                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1727                             offset=0;
1728                         } else {
1729                             /* set the state and leave the optimized loop */
1730                             bytes[0]=*(source-1);
1731                             byteIndex=1;
1732                             break;
1733                         }
1734                     } else {
1735                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1736                             /* output BMP code point */
1737                             ++source;
1738                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1739                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1740                         } else {
1741                             /* leave the optimized loop */
1742                             break;
1743                         }
1744                     }
1745                 } while(source<sourceLimit && target<targetLimit);
1746             } else /* offsets!=NULL */ {
1747                 do {
1748                     entry=stateTable[state][*source];
1749                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1750                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1751                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1752
1753                         ++source;
1754                         if( source<sourceLimit &&
1755                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
1756                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
1757                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
1758                         ) {
1759                             ++source;
1760                             *target++=c;
1761                             if(offsets!=NULL) {
1762                                 *offsets++=sourceIndex;
1763                                 sourceIndex=(nextSourceIndex+=2);
1764                             }
1765                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1766                             offset=0;
1767                         } else {
1768                             /* set the state and leave the optimized loop */
1769                             ++nextSourceIndex;
1770                             bytes[0]=*(source-1);
1771                             byteIndex=1;
1772                             break;
1773                         }
1774                     } else {
1775                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1776                             /* output BMP code point */
1777                             ++source;
1778                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1779                             if(offsets!=NULL) {
1780                                 *offsets++=sourceIndex;
1781                                 sourceIndex=++nextSourceIndex;
1782                             }
1783                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1784                         } else {
1785                             /* leave the optimized loop */
1786                             break;
1787                         }
1788                     }
1789                 } while(source<sourceLimit && target<targetLimit);
1790             }
1791
1792             /*
1793              * these tests and break statements could be put inside the loop
1794              * if C had "break outerLoop" like Java
1795              */
1796             if(source>=sourceLimit) {
1797                 break;
1798             }
1799             if(target>=targetLimit) {
1800                 /* target is full */
1801                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1802                 break;
1803             }
1804
1805             ++nextSourceIndex;
1806             bytes[byteIndex++]=*source++;
1807         } else /* byteIndex>0 */ {
1808             ++nextSourceIndex;
1809             entry=stateTable[state][bytes[byteIndex++]=*source++];
1810         }
1811
1812         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1813             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1814             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1815             continue;
1816         }
1817
1818         /* save the previous state for proper extension mapping with SI/SO-stateful converters */
1819         cnv->mode=state;
1820
1821         /* set the next state early so that we can reuse the entry variable */
1822         state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
1823
1824         /*
1825          * An if-else-if chain provides more reliable performance for
1826          * the most common cases compared to a switch.
1827          */
1828         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1829         if(action==MBCS_STATE_VALID_16) {
1830             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1831             c=unicodeCodeUnits[offset];
1832             if(c<0xfffe) {
1833                 /* output BMP code point */
1834                 *target++=c;
1835                 if(offsets!=NULL) {
1836                     *offsets++=sourceIndex;
1837                 }
1838                 byteIndex=0;
1839             } else if(c==0xfffe) {
1840                 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
1841                     /* output fallback BMP code point */
1842                     *target++=(UChar)entry;
1843                     if(offsets!=NULL) {
1844                         *offsets++=sourceIndex;
1845                     }
1846                     byteIndex=0;
1847                 }
1848             } else {
1849                 /* callback(illegal) */
1850                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1851             }
1852         } else if(action==MBCS_STATE_VALID_DIRECT_16) {
1853             /* output BMP code point */
1854             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1855             if(offsets!=NULL) {
1856                 *offsets++=sourceIndex;
1857             }
1858             byteIndex=0;
1859         } else if(action==MBCS_STATE_VALID_16_PAIR) {
1860             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
1861             c=unicodeCodeUnits[offset++];
1862             if(c<0xd800) {
1863                 /* output BMP code point below 0xd800 */
1864                 *target++=c;
1865                 if(offsets!=NULL) {
1866                     *offsets++=sourceIndex;
1867                 }
1868                 byteIndex=0;
1869             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
1870                 /* output roundtrip or fallback surrogate pair */
1871                 *target++=(UChar)(c&0xdbff);
1872                 if(offsets!=NULL) {
1873                     *offsets++=sourceIndex;
1874                 }
1875                 byteIndex=0;
1876                 if(target<targetLimit) {
1877                     *target++=unicodeCodeUnits[offset];
1878                     if(offsets!=NULL) {
1879                         *offsets++=sourceIndex;
1880                     }
1881                 } else {
1882                     /* target overflow */
1883                     cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
1884                     cnv->UCharErrorBufferLength=1;
1885                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1886
1887                     offset=0;
1888                     break;
1889                 }
1890             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
1891                 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
1892                 *target++=unicodeCodeUnits[offset];
1893                 if(offsets!=NULL) {
1894                     *offsets++=sourceIndex;
1895                 }
1896                 byteIndex=0;
1897             } else if(c==0xffff) {
1898                 /* callback(illegal) */
1899                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1900             }
1901         } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
1902                   (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1903         ) {
1904             entry=MBCS_ENTRY_FINAL_VALUE(entry);
1905             /* output surrogate pair */
1906             *target++=(UChar)(0xd800|(UChar)(entry>>10));
1907             if(offsets!=NULL) {
1908                 *offsets++=sourceIndex;
1909             }
1910             byteIndex=0;
1911             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1912             if(target<targetLimit) {
1913                 *target++=c;
1914                 if(offsets!=NULL) {
1915                     *offsets++=sourceIndex;
1916                 }
1917             } else {
1918                 /* target overflow */
1919                 cnv->UCharErrorBuffer[0]=c;
1920                 cnv->UCharErrorBufferLength=1;
1921                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1922
1923                 offset=0;
1924                 break;
1925             }
1926         } else if(action==MBCS_STATE_CHANGE_ONLY) {
1927             /*
1928              * This serves as a state change without any output.
1929              * It is useful for reading simple stateful encodings,
1930              * for example using just Shift-In/Shift-Out codes.
1931              * The 21 unused bits may later be used for more sophisticated
1932              * state transitions.
1933              */
1934             if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
1935                 byteIndex=0;
1936             } else {
1937                 /* SI/SO are illegal for DBCS-only conversion */
1938                 state=(uint8_t)(cnv->mode); /* restore the previous state */
1939
1940                 /* callback(illegal) */
1941                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1942             }
1943         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1944             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1945                 /* output BMP code point */
1946                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1947                 if(offsets!=NULL) {
1948                     *offsets++=sourceIndex;
1949                 }
1950                 byteIndex=0;
1951             }
1952         } else if(action==MBCS_STATE_UNASSIGNED) {
1953             /* just fall through */
1954         } else if(action==MBCS_STATE_ILLEGAL) {
1955             /* callback(illegal) */
1956             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1957         } else {
1958             /* reserved, must never occur */
1959             byteIndex=0;
1960         }
1961
1962         /* end of action codes: prepare for a new character */
1963         offset=0;
1964
1965         if(byteIndex==0) {
1966             sourceIndex=nextSourceIndex;
1967         } else if(U_FAILURE(*pErrorCode)) {
1968             /* callback(illegal) */
1969             break;
1970         } else /* unassigned sequences indicated with byteIndex>0 */ {
1971             /* try an extension mapping */
1972             pArgs->source=(const char *)source;
1973             byteIndex=_extToU(cnv, cnv->sharedData,
1974                               byteIndex, (const char **)&source, (const char *)sourceLimit,
1975                               &target, targetLimit,
1976                               &offsets, sourceIndex,
1977                               pArgs->flush,
1978                               pErrorCode);
1979             sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
1980
1981             if(U_FAILURE(*pErrorCode)) {
1982                 /* not mappable or buffer overflow */
1983                 break;
1984             }
1985         }
1986     }
1987
1988     /* set the converter state back into UConverter */
1989     cnv->toUnicodeStatus=offset;
1990     cnv->mode=state;
1991     cnv->toULength=byteIndex;
1992
1993     /* write back the updated pointers */
1994     pArgs->source=(const char *)source;
1995     pArgs->target=target;
1996     pArgs->offsets=offsets;
1997 }
1998
1999 /*
2000  * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2001  * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
2002  */
2003 static UChar32
2004 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
2005                         UErrorCode *pErrorCode) {
2006     UConverter *cnv;
2007     const int32_t (*stateTable)[256];
2008     const uint8_t *source, *sourceLimit;
2009
2010     int32_t entry;
2011     uint8_t action;
2012
2013     /* set up the local pointers */
2014     cnv=pArgs->converter;
2015     source=(const uint8_t *)pArgs->source;
2016     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2017     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2018         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2019     } else {
2020         stateTable=cnv->sharedData->mbcs.stateTable;
2021     }
2022
2023     /* conversion loop */
2024     while(source<sourceLimit) {
2025         entry=stateTable[0][*source++];
2026         /* MBCS_ENTRY_IS_FINAL(entry) */
2027
2028         /* write back the updated pointer early so that we can return directly */
2029         pArgs->source=(const char *)source;
2030
2031         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2032             /* output BMP code point */
2033             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2034         }
2035
2036         /*
2037          * An if-else-if chain provides more reliable performance for
2038          * the most common cases compared to a switch.
2039          */
2040         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2041         if( action==MBCS_STATE_VALID_DIRECT_20 ||
2042             (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2043         ) {
2044             /* output supplementary code point */
2045             return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2046         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2047             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2048                 /* output BMP code point */
2049                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2050             }
2051         } else if(action==MBCS_STATE_UNASSIGNED) {
2052             /* just fall through */
2053         } else if(action==MBCS_STATE_ILLEGAL) {
2054             /* callback(illegal) */
2055             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2056         } else {
2057             /* reserved, must never occur */
2058             continue;
2059         }
2060
2061         if(U_FAILURE(*pErrorCode)) {
2062             /* callback(illegal) */
2063             break;
2064         } else /* unassigned sequence */ {
2065             /* defer to the generic implementation */
2066             pArgs->source=(const char *)source-1;
2067             return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2068         }
2069     }
2070
2071     /* no output because of empty input or only state changes */
2072     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2073     return 0xffff;
2074 }
2075
2076 /*
2077  * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2078  * conversion without offset handling.
2079  *
2080  * When a character does not have a mapping to Unicode, then we return to the
2081  * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2082  * handling.
2083  * We also defer to the generic code in other complicated cases and have them
2084  * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2085  *
2086  * All normal mappings and errors are handled here.
2087  */
2088 static UChar32
2089 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2090                   UErrorCode *pErrorCode) {
2091     UConverter *cnv;
2092     const uint8_t *source, *sourceLimit, *lastSource;
2093
2094     const int32_t (*stateTable)[256];
2095     const uint16_t *unicodeCodeUnits;
2096
2097     uint32_t offset;
2098     uint8_t state;
2099
2100     int32_t entry;
2101     UChar32 c;
2102     uint8_t action;
2103
2104     /* use optimized function if possible */
2105     cnv=pArgs->converter;
2106
2107     if(cnv->preToULength>0) {
2108         /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2109         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2110     }
2111
2112     if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2113         /*
2114          * Using the generic ucnv_getNextUChar() code lets us deal correctly
2115          * with the rare case of a codepage that maps single surrogates
2116          * without adding the complexity to this already complicated function here.
2117          */
2118         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2119     } else if(cnv->sharedData->mbcs.countStates==1) {
2120         return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
2121     }
2122
2123     /* set up the local pointers */
2124     source=lastSource=(const uint8_t *)pArgs->source;
2125     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2126
2127     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2128         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2129     } else {
2130         stateTable=cnv->sharedData->mbcs.stateTable;
2131     }
2132     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2133
2134     /* get the converter state from UConverter */
2135     offset=cnv->toUnicodeStatus;
2136
2137     /*
2138      * if we are in the SBCS state for a DBCS-only converter,
2139      * then load the DBCS state from the MBCS data
2140      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2141      */
2142     if((state=(uint8_t)(cnv->mode))==0) {
2143         state=cnv->sharedData->mbcs.dbcsOnlyState;
2144     }
2145
2146     /* conversion loop */
2147     c=U_SENTINEL;
2148     while(source<sourceLimit) {
2149         entry=stateTable[state][*source++];
2150         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2151             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2152             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2153
2154             /* optimization for 1/2-byte input and BMP output */
2155             if( source<sourceLimit &&
2156                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2157                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2158                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2159             ) {
2160                 ++source;
2161                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2162                 /* output BMP code point */
2163                 break;
2164             }
2165         } else {
2166             /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2167             cnv->mode=state;
2168
2169             /* set the next state early so that we can reuse the entry variable */
2170             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2171
2172             /*
2173              * An if-else-if chain provides more reliable performance for
2174              * the most common cases compared to a switch.
2175              */
2176             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2177             if(action==MBCS_STATE_VALID_DIRECT_16) {
2178                 /* output BMP code point */
2179                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2180                 break;
2181             } else if(action==MBCS_STATE_VALID_16) {
2182                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2183                 c=unicodeCodeUnits[offset];
2184                 if(c<0xfffe) {
2185                     /* output BMP code point */
2186                     break;
2187                 } else if(c==0xfffe) {
2188                     if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2189                         break;
2190                     }
2191                 } else {
2192                     /* callback(illegal) */
2193                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2194                 }
2195             } else if(action==MBCS_STATE_VALID_16_PAIR) {
2196                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2197                 c=unicodeCodeUnits[offset++];
2198                 if(c<0xd800) {
2199                     /* output BMP code point below 0xd800 */
2200                     break;
2201                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2202                     /* output roundtrip or fallback supplementary code point */
2203                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
2204                     break;
2205                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2206                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2207                     c=unicodeCodeUnits[offset];
2208                     break;
2209                 } else if(c==0xffff) {
2210                     /* callback(illegal) */
2211                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2212                 }
2213             } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2214                       (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2215             ) {
2216                 /* output supplementary code point */
2217                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2218                 break;
2219             } else if(action==MBCS_STATE_CHANGE_ONLY) {
2220                 /*
2221                  * This serves as a state change without any output.
2222                  * It is useful for reading simple stateful encodings,
2223                  * for example using just Shift-In/Shift-Out codes.
2224                  * The 21 unused bits may later be used for more sophisticated
2225                  * state transitions.
2226                  */
2227                 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
2228                     /* SI/SO are illegal for DBCS-only conversion */
2229                     state=(uint8_t)(cnv->mode); /* restore the previous state */
2230
2231                     /* callback(illegal) */
2232                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2233                 }
2234             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2235                 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2236                     /* output BMP code point */
2237                     c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2238                     break;
2239                 }
2240             } else if(action==MBCS_STATE_UNASSIGNED) {
2241                 /* just fall through */
2242             } else if(action==MBCS_STATE_ILLEGAL) {
2243                 /* callback(illegal) */
2244                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2245             } else {
2246                 /* reserved (must never occur), or only state change */
2247                 offset=0;
2248                 lastSource=source;
2249                 continue;
2250             }
2251
2252             /* end of action codes: prepare for a new character */
2253             offset=0;
2254
2255             if(U_FAILURE(*pErrorCode)) {
2256                 /* callback(illegal) */
2257                 break;
2258             } else /* unassigned sequence */ {
2259                 /* defer to the generic implementation */
2260                 cnv->toUnicodeStatus=0;
2261                 cnv->mode=state;
2262                 pArgs->source=(const char *)lastSource;
2263                 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2264             }
2265         }
2266     }
2267
2268     if(c<0) {
2269         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
2270             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2271         }
2272         if(U_FAILURE(*pErrorCode)) {
2273             /* incomplete character byte sequence */
2274             uint8_t *bytes=cnv->toUBytes;
2275             cnv->toULength=(int8_t)(source-lastSource);
2276             do {
2277                 *bytes++=*lastSource++;
2278             } while(lastSource<source);
2279         } else {
2280             /* no output because of empty input or only state changes */
2281             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2282         }
2283         c=0xffff;
2284     }
2285
2286     /* set the converter state back into UConverter, ready for a new character */
2287     cnv->toUnicodeStatus=0;
2288     cnv->mode=state;
2289
2290     /* write back the updated pointer */
2291     pArgs->source=(const char *)source;
2292     return c;
2293 }
2294
#if 0
/*
 * Disabled on 2002dec09 (ICU 2.4): nothing in ICU calls this function. markus
 * Leaving it out of the build improves code coverage.
 */
/**
 * Variant of ucnv_MBCSSimpleGetNextUChar() for single-byte, single-state codepages:
 * the byte b is looked up once in row 0 of the state table.
 * The EBCDIC swaplfnl option (set in UConverter) is not handled.
 * Conversion extensions (_extToU()) are not handled.
 *
 * Returns the code point for b, or 0xfffe if b is unassigned (or is only
 * a fallback while fallbacks are not used), or 0xffff if b is illegal.
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                              uint8_t b, UBool useFallback) {
    /* in a single-state table every entry is final: MBCS_ENTRY_IS_FINAL(stateEntry) */
    const int32_t stateEntry=sharedData->mbcs.stateTable[0][b];
    uint8_t kind;

    /* most common case first: a roundtrip BMP code point */
    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(stateEntry)) {
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
    }

    /*
     * Plain sequential comparisons are kept on purpose: they give more
     * predictable performance for the common cases than a switch.
     */
    kind=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(stateEntry));

    if(kind==MBCS_STATE_VALID_DIRECT_20) {
        /* roundtrip supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
    }

    if(kind==MBCS_STATE_FALLBACK_DIRECT_16) {
        if(TO_U_USE_FALLBACK(useFallback)) {
            /* fallback BMP code point */
            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
        }
        return 0xfffe;
    }

    if(kind==MBCS_STATE_FALLBACK_DIRECT_20) {
        if(TO_U_USE_FALLBACK(useFallback)) {
            /* fallback supplementary code point */
            return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
        }
        return 0xfffe;
    }

    if(kind==MBCS_STATE_UNASSIGNED) {
        return 0xfffe;
    }

    /* MBCS_STATE_ILLEGAL, or a reserved action code that must never occur */
    return 0xffff;
}
#endif
2349
2350 /*
2351  * This is a simple version of _MBCSGetNextUChar() that is used
2352  * by other converter implementations.
2353  * It only returns an "assigned" result if it consumes the entire input.
2354  * It does not use state from the converter, nor error codes.
2355  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2356  * It handles conversion extensions but not GB 18030.
2357  *
2358  * Return value:
2359  * U+fffe   unassigned
2360  * U+ffff   illegal
2361  * otherwise the Unicode code point
2362  */
2363 U_CFUNC UChar32
2364 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
2365                         const char *source, int32_t length,
2366                         UBool useFallback) {
2367     const int32_t (*stateTable)[256];
2368     const uint16_t *unicodeCodeUnits;
2369
2370     uint32_t offset;
2371     uint8_t state, action;
2372
2373     UChar32 c;
2374     int32_t i, entry;
2375
2376     if(length<=0) {
2377         /* no input at all: "illegal" */
2378         return 0xffff;
2379     }
2380
2381 #if 0
2382 /*
2383  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2384  * TODO In future releases, verify that this function is never called for SBCS
2385  * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
2386  * Removal improves code coverage.
2387  */
2388     /* use optimized function if possible */
2389     if(sharedData->mbcs.countStates==1) {
2390         if(length==1) {
2391             return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
2392         } else {
2393             return 0xffff; /* illegal: more than a single byte for an SBCS converter */
2394         }
2395     }
2396 #endif
2397
2398     /* set up the local pointers */
2399     stateTable=sharedData->mbcs.stateTable;
2400     unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
2401
2402     /* converter state */
2403     offset=0;
2404     state=sharedData->mbcs.dbcsOnlyState;
2405
2406     /* conversion loop */
2407     for(i=0;;) {
2408         entry=stateTable[state][(uint8_t)source[i++]];
2409         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2410             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2411             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2412
2413             if(i==length) {
2414                 return 0xffff; /* truncated character */
2415             }
2416         } else {
2417             /*
2418              * An if-else-if chain provides more reliable performance for
2419              * the most common cases compared to a switch.
2420              */
2421             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2422             if(action==MBCS_STATE_VALID_16) {
2423                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2424                 c=unicodeCodeUnits[offset];
2425                 if(c!=0xfffe) {
2426                     /* done */
2427                 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2428                     c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
2429                 /* else done with 0xfffe */
2430                 }
2431                 break;
2432             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2433                 /* output BMP code point */
2434                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2435                 break;
2436             } else if(action==MBCS_STATE_VALID_16_PAIR) {
2437                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2438                 c=unicodeCodeUnits[offset++];
2439                 if(c<0xd800) {
2440                     /* output BMP code point below 0xd800 */
2441                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2442                     /* output roundtrip or fallback supplementary code point */
2443                     c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
2444                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2445                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2446                     c=unicodeCodeUnits[offset];
2447                 } else if(c==0xffff) {
2448                     return 0xffff;
2449                 } else {
2450                     c=0xfffe;
2451                 }
2452                 break;
2453             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
2454                 /* output supplementary code point */
2455                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2456                 break;
2457             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2458                 if(!TO_U_USE_FALLBACK(useFallback)) {
2459                     c=0xfffe;
2460                     break;
2461                 }
2462                 /* output BMP code point */
2463                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2464                 break;
2465             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
2466                 if(!TO_U_USE_FALLBACK(useFallback)) {
2467                     c=0xfffe;
2468                     break;
2469                 }
2470                 /* output supplementary code point */
2471                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
2472                 break;
2473             } else if(action==MBCS_STATE_UNASSIGNED) {
2474                 c=0xfffe;
2475                 break;
2476             }
2477
2478             /*
2479              * forbid MBCS_STATE_CHANGE_ONLY for this function,
2480              * and MBCS_STATE_ILLEGAL and reserved action codes
2481              */
2482             return 0xffff;
2483         }
2484     }
2485
2486     if(i!=length) {
2487         /* illegal for this function: not all input consumed */
2488         return 0xffff;
2489     }
2490
2491     if(c==0xfffe) {
2492         /* try an extension mapping */
2493         const int32_t *cx=sharedData->mbcs.extIndexes;
2494         if(cx!=NULL) {
2495             return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
2496         }
2497     }
2498
2499     return c;
2500 }
2501
2502 /* MBCS-from-Unicode conversion functions ----------------------------------- */
2503
2504 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
2505 static void
2506 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2507                                   UErrorCode *pErrorCode) {
2508     UConverter *cnv;
2509     const UChar *source, *sourceLimit;
2510     uint8_t *target;
2511     int32_t targetCapacity;
2512     int32_t *offsets;
2513
2514     const uint16_t *table;
2515     const uint8_t *bytes;
2516
2517     UChar32 c;
2518
2519     int32_t sourceIndex, nextSourceIndex;
2520
2521     uint32_t stage2Entry;
2522     uint32_t value;
2523     int32_t length;
2524     uint8_t unicodeMask;
2525
2526     /* use optimized function if possible */
2527     cnv=pArgs->converter;
2528     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
2529
2530     /* set up the local pointers */
2531     source=pArgs->source;
2532     sourceLimit=pArgs->sourceLimit;
2533     target=(uint8_t *)pArgs->target;
2534     targetCapacity=pArgs->targetLimit-pArgs->target;
2535     offsets=pArgs->offsets;
2536
2537     table=cnv->sharedData->mbcs.fromUnicodeTable;
2538     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2539         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
2540     } else {
2541         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
2542     }
2543
2544     /* get the converter state from UConverter */
2545     c=cnv->fromUChar32;
2546
2547     /* sourceIndex=-1 if the current character began in the previous buffer */
2548     sourceIndex= c==0 ? 0 : -1;
2549     nextSourceIndex=0;
2550
2551     /* conversion loop */
2552     if(c!=0 && targetCapacity>0) {
2553         goto getTrail;
2554     }
2555
2556     while(source<sourceLimit) {
2557         /*
2558          * This following test is to see if available input would overflow the output.
2559          * It does not catch output of more than one byte that
2560          * overflows as a result of a multi-byte character or callback output
2561          * from the last source character.
2562          * Therefore, those situations also test for overflows and will
2563          * then break the loop, too.
2564          */
2565         if(targetCapacity>0) {
2566             /*
2567              * Get a correct Unicode code point:
2568              * a single UChar for a BMP code point or
2569              * a matched surrogate pair for a "supplementary code point".
2570              */
2571             c=*source++;
2572             ++nextSourceIndex;
2573             /*
2574              * This also tests if the codepage maps single surrogates.
2575              * If it does, then surrogates are not paired but mapped separately.
2576              * Note that in this case unmatched surrogates are not detected.
2577              */
2578             if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
2579                 if(UTF_IS_SURROGATE_FIRST(c)) {
2580 getTrail:
2581                     if(source<sourceLimit) {
2582                         /* test the following code unit */
2583                         UChar trail=*source;
2584                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2585                             ++source;
2586                             ++nextSourceIndex;
2587                             c=UTF16_GET_PAIR_VALUE(c, trail);
2588                             if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2589                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2590                                 /* callback(unassigned) */
2591                                 goto unassigned;
2592                             }
2593                             /* convert this supplementary code point */
2594                             /* exit this condition tree */
2595                         } else {
2596                             /* this is an unmatched lead code unit (1st surrogate) */
2597                             /* callback(illegal) */
2598                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2599                             break;
2600                         }
2601                     } else {
2602                         /* no more input */
2603                         break;
2604                     }
2605                 } else {
2606                     /* this is an unmatched trail code unit (2nd surrogate) */
2607                     /* callback(illegal) */
2608                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2609                     break;
2610                 }
2611             }
2612
2613             /* convert the Unicode code point in c into codepage bytes */
2614             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
2615
2616             /* get the bytes and the length for the output */
2617             /* MBCS_OUTPUT_2 */
2618             value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
2619             if(value<=0xff) {
2620                 length=1;
2621             } else {
2622                 length=2;
2623             }
2624
2625             /* is this code point assigned, or do we use fallbacks? */
2626             if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
2627                  (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
2628             ) {
2629                 /*
2630                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
2631                  * There is no way with this data structure for fallback output
2632                  * to be a zero byte.
2633                  */
2634
2635 unassigned:
2636                 /* try an extension mapping */
2637                 pArgs->source=source;
2638                 c=_extFromU(cnv, cnv->sharedData,
2639                             c, &source, sourceLimit,
2640                             (char **)&target, (char *)target+targetCapacity,
2641                             &offsets, sourceIndex,
2642                             pArgs->flush,
2643                             pErrorCode);
2644                 nextSourceIndex+=(int32_t)(source-pArgs->source);
2645
2646                 if(U_FAILURE(*pErrorCode)) {
2647                     /* not mappable or buffer overflow */
2648                     break;
2649                 } else {
2650                     /* a mapping was written to the target, continue */
2651
2652                     /* recalculate the targetCapacity after an extension mapping */
2653                     targetCapacity=pArgs->targetLimit-(char *)target;
2654
2655                     /* normal end of conversion: prepare for a new character */
2656                     sourceIndex=nextSourceIndex;
2657                     continue;
2658                 }
2659             }
2660
2661             /* write the output character bytes from value and length */
2662             /* from the first if in the loop we know that targetCapacity>0 */
2663             if(length==1) {
2664                 /* this is easy because we know that there is enough space */
2665                 *target++=(uint8_t)value;
2666                 if(offsets!=NULL) {
2667                     *offsets++=sourceIndex;
2668                 }
2669                 --targetCapacity;
2670             } else /* length==2 */ {
2671                 *target++=(uint8_t)(value>>8);
2672                 if(2<=targetCapacity) {
2673                     *target++=(uint8_t)value;
2674                     if(offsets!=NULL) {
2675                         *offsets++=sourceIndex;
2676                         *offsets++=sourceIndex;
2677                     }
2678                     targetCapacity-=2;
2679                 } else {
2680                     if(offsets!=NULL) {
2681                         *offsets++=sourceIndex;
2682                     }
2683                     cnv->charErrorBuffer[0]=(char)value;
2684                     cnv->charErrorBufferLength=1;
2685
2686                     /* target overflow */
2687                     targetCapacity=0;
2688                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2689                     c=0;
2690                     break;
2691                 }
2692             }
2693
2694             /* normal end of conversion: prepare for a new character */
2695             c=0;
2696             sourceIndex=nextSourceIndex;
2697             continue;
2698         } else {
2699             /* target is full */
2700             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2701             break;
2702         }
2703     }
2704
2705     /* set the converter state back into UConverter */
2706     cnv->fromUChar32=c;
2707
2708     /* write back the updated pointers */
2709     pArgs->source=source;
2710     pArgs->target=(char *)target;
2711     pArgs->offsets=offsets;
2712 }
2713
2714 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
2715 static void
2716 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
2717                                   UErrorCode *pErrorCode) {
2718     UConverter *cnv;
2719     const UChar *source, *sourceLimit;
2720     uint8_t *target;
2721     int32_t targetCapacity;
2722     int32_t *offsets;
2723
2724     const uint16_t *table;
2725     const uint16_t *results;
2726
2727     UChar32 c;
2728
2729     int32_t sourceIndex, nextSourceIndex;
2730
2731     uint16_t value, minValue;
2732     UBool hasSupplementary;
2733
2734     /* set up the local pointers */
2735     cnv=pArgs->converter;
2736     source=pArgs->source;
2737     sourceLimit=pArgs->sourceLimit;
2738     target=(uint8_t *)pArgs->target;
2739     targetCapacity=pArgs->targetLimit-pArgs->target;
2740     offsets=pArgs->offsets;
2741
2742     table=cnv->sharedData->mbcs.fromUnicodeTable;
2743     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2744         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
2745     } else {
2746         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
2747     }
2748
2749     if(cnv->useFallback) {
2750         /* use all roundtrip and fallback results */
2751         minValue=0x800;
2752     } else {
2753         /* use only roundtrips and fallbacks from private-use characters */
2754         minValue=0xc00;
2755     }
2756     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
2757
2758     /* get the converter state from UConverter */
2759     c=cnv->fromUChar32;
2760
2761     /* sourceIndex=-1 if the current character began in the previous buffer */
2762     sourceIndex= c==0 ? 0 : -1;
2763     nextSourceIndex=0;
2764
2765     /* conversion loop */
2766     if(c!=0 && targetCapacity>0) {
2767         goto getTrail;
2768     }
2769
2770     while(source<sourceLimit) {
2771         /*
2772          * This following test is to see if available input would overflow the output.
2773          * It does not catch output of more than one byte that
2774          * overflows as a result of a multi-byte character or callback output
2775          * from the last source character.
2776          * Therefore, those situations also test for overflows and will
2777          * then break the loop, too.
2778          */
2779         if(targetCapacity>0) {
2780             /*
2781              * Get a correct Unicode code point:
2782              * a single UChar for a BMP code point or
2783              * a matched surrogate pair for a "supplementary code point".
2784              */
2785             c=*source++;
2786             ++nextSourceIndex;
2787             if(UTF_IS_SURROGATE(c)) {
2788                 if(UTF_IS_SURROGATE_FIRST(c)) {
2789 getTrail:
2790                     if(source<sourceLimit) {
2791                         /* test the following code unit */
2792                         UChar trail=*source;
2793                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2794                             ++source;
2795                             ++nextSourceIndex;
2796                             c=UTF16_GET_PAIR_VALUE(c, trail);
2797                             if(!hasSupplementary) {
2798                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
2799                                 /* callback(unassigned) */
2800                                 goto unassigned;
2801                             }
2802                             /* convert this supplementary code point */
2803                             /* exit this condition tree */
2804                         } else {
2805                             /* this is an unmatched lead code unit (1st surrogate) */
2806                             /* callback(illegal) */
2807                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2808                             break;
2809                         }
2810                     } else {
2811                         /* no more input */
2812                         break;
2813                     }
2814                 } else {
2815                     /* this is an unmatched trail code unit (2nd surrogate) */
2816                     /* callback(illegal) */
2817                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2818                     break;
2819                 }
2820             }
2821
2822             /* convert the Unicode code point in c into codepage bytes */
2823             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
2824
2825             /* is this code point assigned, or do we use fallbacks? */
2826             if(value>=minValue) {
2827                 /* assigned, write the output character bytes from value and length */
2828                 /* length==1 */
2829                 /* this is easy because we know that there is enough space */
2830                 *target++=(uint8_t)value;
2831                 if(offsets!=NULL) {
2832                     *offsets++=sourceIndex;
2833                 }
2834                 --targetCapacity;
2835
2836                 /* normal end of conversion: prepare for a new character */
2837                 c=0;
2838                 sourceIndex=nextSourceIndex;
2839             } else { /* unassigned */
2840 unassigned:
2841                 /* try an extension mapping */
2842                 pArgs->source=source;
2843                 c=_extFromU(cnv, cnv->sharedData,
2844                             c, &source, sourceLimit,
2845                             (char **)&target, (char *)target+targetCapacity,
2846                             &offsets, sourceIndex,
2847                             pArgs->flush,
2848                             pErrorCode);
2849                 nextSourceIndex+=(int32_t)(source-pArgs->source);
2850
2851                 if(U_FAILURE(*pErrorCode)) {
2852                     /* not mappable or buffer overflow */
2853                     break;
2854                 } else {
2855                     /* a mapping was written to the target, continue */
2856
2857                     /* recalculate the targetCapacity after an extension mapping */
2858                     targetCapacity=pArgs->targetLimit-(char *)target;
2859
2860                     /* normal end of conversion: prepare for a new character */
2861                     sourceIndex=nextSourceIndex;
2862                 }
2863             }
2864         } else {
2865             /* target is full */
2866             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2867             break;
2868         }
2869     }
2870
2871     /* set the converter state back into UConverter */
2872     cnv->fromUChar32=c;
2873
2874     /* write back the updated pointers */
2875     pArgs->source=source;
2876     pArgs->target=(char *)target;
2877     pArgs->offsets=offsets;
2878 }
2879
2880 /*
2881  * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
2882  * that map only to and from the BMP.
2883  * In addition to single-byte/state optimizations, the offset calculations
2884  * become much easier.
2885  */
2886 static void
2887 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
2888                               UErrorCode *pErrorCode) {
2889     UConverter *cnv;
2890     const UChar *source, *sourceLimit, *lastSource;
2891     uint8_t *target;
2892     int32_t targetCapacity, length;
2893     int32_t *offsets;
2894
2895     const uint16_t *table;
2896     const uint16_t *results;
2897
2898     UChar32 c;
2899
2900     int32_t sourceIndex;
2901
2902     uint16_t value, minValue;
2903
2904     /* set up the local pointers */
2905     cnv=pArgs->converter;
2906     source=pArgs->source;
2907     sourceLimit=pArgs->sourceLimit;
2908     target=(uint8_t *)pArgs->target;
2909     targetCapacity=pArgs->targetLimit-pArgs->target;
2910     offsets=pArgs->offsets;
2911
2912     table=cnv->sharedData->mbcs.fromUnicodeTable;
2913     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2914         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
2915     } else {
2916         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
2917     }
2918
2919     if(cnv->useFallback) {
2920         /* use all roundtrip and fallback results */
2921         minValue=0x800;
2922     } else {
2923         /* use only roundtrips and fallbacks from private-use characters */
2924         minValue=0xc00;
2925     }
2926
2927     /* get the converter state from UConverter */
2928     c=cnv->fromUChar32;
2929
2930     /* sourceIndex=-1 if the current character began in the previous buffer */
2931     sourceIndex= c==0 ? 0 : -1;
2932     lastSource=source;
2933
2934     /*
2935      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
2936      * for the minimum of the sourceLength and targetCapacity
2937      */
2938     length=sourceLimit-source;
2939     if(length<targetCapacity) {
2940         targetCapacity=length;
2941     }
2942
2943     /* conversion loop */
2944     if(c!=0 && targetCapacity>0) {
2945         goto getTrail;
2946     }
2947
2948 #if MBCS_UNROLL_SINGLE_FROM_BMP
2949     /* unrolling makes it slower on Pentium III/Windows 2000?! */
2950     /* unroll the loop with the most common case */
2951 unrolled:
2952     if(targetCapacity>=4) {
2953         int32_t count, loops;
2954         uint16_t andedValues;
2955
2956         loops=count=targetCapacity>>2;
2957         do {
2958             c=*source++;
2959             andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
2960             *target++=(uint8_t)value;
2961             c=*source++;
2962             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
2963             *target++=(uint8_t)value;
2964             c=*source++;
2965             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
2966             *target++=(uint8_t)value;
2967             c=*source++;
2968             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
2969             *target++=(uint8_t)value;
2970
2971             /* were all 4 entries really valid? */
2972             if(andedValues<minValue) {
2973                 /* no, return to the first of these 4 */
2974                 source-=4;
2975                 target-=4;
2976                 break;
2977             }
2978         } while(--count>0);
2979         count=loops-count;
2980         targetCapacity-=4*count;
2981
2982         if(offsets!=NULL) {
2983             lastSource+=4*count;
2984             while(count>0) {
2985                 *offsets++=sourceIndex++;
2986                 *offsets++=sourceIndex++;
2987                 *offsets++=sourceIndex++;
2988                 *offsets++=sourceIndex++;
2989                 --count;
2990             }
2991         }
2992
2993         c=0;
2994     }
2995 #endif
2996
2997     while(targetCapacity>0) {
2998         /*
2999          * Get a correct Unicode code point:
3000          * a single UChar for a BMP code point or
3001          * a matched surrogate pair for a "supplementary code point".
3002          */
3003         c=*source++;
3004         /*
3005          * Do not immediately check for single surrogates:
3006          * Assume that they are unassigned and check for them in that case.
3007          * This speeds up the conversion of assigned characters.
3008          */
3009         /* convert the Unicode code point in c into codepage bytes */
3010         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3011
3012         /* is this code point assigned, or do we use fallbacks? */
3013         if(value>=minValue) {
3014             /* assigned, write the output character bytes from value and length */
3015             /* length==1 */
3016             /* this is easy because we know that there is enough space */
3017             *target++=(uint8_t)value;
3018             --targetCapacity;
3019
3020             /* normal end of conversion: prepare for a new character */
3021             c=0;
3022             continue;
3023         } else if(!UTF_IS_SURROGATE(c)) {
3024             /* normal, unassigned BMP character */
3025         } else if(UTF_IS_SURROGATE_FIRST(c)) {
3026 getTrail:
3027             if(source<sourceLimit) {
3028                 /* test the following code unit */
3029                 UChar trail=*source;
3030                 if(UTF_IS_SECOND_SURROGATE(trail)) {
3031                     ++source;
3032                     c=UTF16_GET_PAIR_VALUE(c, trail);
3033                     /* this codepage does not map supplementary code points */
3034                     /* callback(unassigned) */
3035                 } else {
3036                     /* this is an unmatched lead code unit (1st surrogate) */
3037                     /* callback(illegal) */
3038                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3039                     break;
3040                 }
3041             } else {
3042                 /* no more input */
3043                 break;
3044             }
3045         } else {
3046             /* this is an unmatched trail code unit (2nd surrogate) */
3047             /* callback(illegal) */
3048             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3049             break;
3050         }
3051
3052         /* c does not have a mapping */
3053
3054         /* get the number of code units for c to correctly advance sourceIndex */
3055         length=U16_LENGTH(c);
3056
3057         /* set offsets since the start or the last extension */
3058         if(offsets!=NULL) {
3059             int32_t count=(int32_t)(source-lastSource);
3060
3061             /* do not set the offset for this character */
3062             count-=length;
3063
3064             while(count>0) {
3065                 *offsets++=sourceIndex++;
3066                 --count;
3067             }
3068             /* offsets and sourceIndex are now set for the current character */
3069         }
3070
3071         /* try an extension mapping */
3072         lastSource=source;
3073         c=_extFromU(cnv, cnv->sharedData,
3074                     c, &source, sourceLimit,
3075                     (char **)&target, (char *)target+targetCapacity,
3076                     &offsets, sourceIndex,
3077                     pArgs->flush,
3078                     pErrorCode);
3079         sourceIndex+=length+(int32_t)(source-lastSource);
3080         lastSource=source;
3081
3082         if(U_FAILURE(*pErrorCode)) {
3083             /* not mappable or buffer overflow */
3084             break;
3085         } else {
3086             /* a mapping was written to the target, continue */
3087
3088             /* recalculate the targetCapacity after an extension mapping */
3089             targetCapacity=pArgs->targetLimit-(char *)target;
3090             length=sourceLimit-source;
3091             if(length<targetCapacity) {
3092                 targetCapacity=length;
3093             }
3094         }
3095
3096 #if MBCS_UNROLL_SINGLE_FROM_BMP
3097         /* unrolling makes it slower on Pentium III/Windows 2000?! */
3098         goto unrolled;
3099 #endif
3100     }
3101
3102     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3103         /* target is full */
3104         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3105     }
3106
3107     /* set offsets since the start or the last callback */
3108     if(offsets!=NULL) {
3109         size_t count=source-lastSource;
3110         while(count>0) {
3111             *offsets++=sourceIndex++;
3112             --count;
3113         }
3114     }
3115
3116     /* set the converter state back into UConverter */
3117     cnv->fromUChar32=c;
3118
3119     /* write back the updated pointers */
3120     pArgs->source=source;
3121     pArgs->target=(char *)target;
3122     pArgs->offsets=offsets;
3123 }
3124
/*
 * General MBCS fromUnicode conversion, with optional offsets.
 *
 * Converts UTF-16 code units from pArgs->source (up to pArgs->sourceLimit)
 * into codepage bytes at pArgs->target (up to pArgs->targetLimit), using the
 * from-Unicode tables in pArgs->converter->sharedData->mbcs.
 * If pArgs->offsets is not NULL, a source index is stored there for each
 * byte that this function writes to the target.
 *
 * Processing order:
 * 1. If cnv->preFromUFirstCP>=0, ucnv_extContinueMatchFromU() is called
 *    first; the function returns right away if that call fails or leaves
 *    cnv->preFromULength<0.
 * 2. MBCS_OUTPUT_1 tables without UCNV_HAS_SURROGATES, and MBCS_OUTPUT_2
 *    tables, are handed to the specialized ucnv_MBCSSingle...() and
 *    ucnv_MBCSDouble...() functions, which then do all of the work.
 * 3. Everything else runs through the general loop below. Code points
 *    without a usable table mapping are passed to _extFromU().
 *
 * Error codes set directly by the general loop:
 * - U_ILLEGAL_CHAR_FOUND for an unmatched lead or trail surrogate
 *   (surrogates are only checked if the table lacks UCNV_HAS_SURROGATES)
 * - U_BUFFER_OVERFLOW_ERROR when the target is full; bytes of the current
 *   character that do not fit are stored in cnv->charErrorBuffer
 * _extFromU() receives pErrorCode as well and may set it.
 *
 * State kept in the converter between calls (general loop only):
 * - cnv->fromUChar32: the value of c when the loop stops, for example a
 *   lead surrogate seen at the very end of the source buffer
 * - cnv->fromUnicodeStatus: the SI/SO state (prevLength) that is used for
 *   MBCS_OUTPUT_2_SISO
 * At the end of the general loop, pArgs->source, pArgs->target and
 * pArgs->offsets are written back.
 */
3125 U_CFUNC void
3126 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3127                             UErrorCode *pErrorCode) {
3128     UConverter *cnv;
3129     const UChar *source, *sourceLimit;
3130     uint8_t *target;
3131     int32_t targetCapacity;
3132     int32_t *offsets;
3133
3134     const uint16_t *table;
3135     const uint8_t *p, *bytes;
3136     uint8_t outputType;
3137
3138     UChar32 c;
3139
3140     int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
3141
3142     uint32_t stage2Entry;
3143     uint32_t value;
3144     int32_t length, prevLength;
3145     uint8_t unicodeMask;
3146
3147     cnv=pArgs->converter;
3148
3149     if(cnv->preFromUFirstCP>=0) {
3150         /*
3151          * pass sourceIndex=-1 because we continue from an earlier buffer
3152          * in the future, this may change with continuous offsets
3153          */
3154         ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
3155
3156         if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
3157             return;
3158         }
3159     }
3160
3161     /* use optimized function if possible */
3162     outputType=cnv->sharedData->mbcs.outputType;
3163     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3164     if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3165         if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3166             ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
3167         } else {
3168             ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
3169         }
3170         return;
3171     } else if(outputType==MBCS_OUTPUT_2) {
3172         ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
3173         return;
3174     }
3175
3176     /* set up the local pointers */
3177     source=pArgs->source;
3178     sourceLimit=pArgs->sourceLimit;
3179     target=(uint8_t *)pArgs->target;
3180     targetCapacity=pArgs->targetLimit-pArgs->target;
3181     offsets=pArgs->offsets;
3182
3183     table=cnv->sharedData->mbcs.fromUnicodeTable;
3184
         /* with the swap-LF/NL option, look up the bytes in the alternate stage 3 array */
3185     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3186         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3187     } else {
3188         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3189     }
3190
3191     /* get the converter state from UConverter */
3192     c=cnv->fromUChar32;
3193
3194     if(outputType==MBCS_OUTPUT_2_SISO) {
3195         prevLength=cnv->fromUnicodeStatus;
3196         if(prevLength==0) {
3197             /* set the real value */
3198             prevLength=1;
3199         }
3200     } else {
3201         /* prevent fromUnicodeStatus from being set to something non-0 */
3202         prevLength=0;
3203     }
3204
3205     /* sourceIndex=-1 if the current character began in the previous buffer */
         /*
          * prevSourceIndex: index of the character converted before the current one,
          *                  used below as the offset for a final SI byte
          * sourceIndex:     index that is stored in offsets for the current character
          * nextSourceIndex: number of UChars consumed from this buffer so far
          */
3206     prevSourceIndex=-1;
3207     sourceIndex= c==0 ? 0 : -1;
3208     nextSourceIndex=0;
3209
3210     /* conversion loop */
3211     /*
3212      * This is another piece of ugly code:
3213      * A goto into the loop if the converter state contains a first surrogate
3214      * from the previous function call.
3215      * It saves me from checking if(c==0) in each loop iteration
3216      * and from duplicating the trail-surrogate-handling code in the else
3217      * branch of that check.
3218      * I could not find any other way to get around this other than
3219      * using a function call for the conversion and callback, which would
3220      * be even more inefficient.
3221      *
3222      * Markus Scherer 2000-jul-19
          *
          * The jump is only taken if there is room in the target; otherwise
          * the loop is entered normally and its targetCapacity test handles
          * the full target.
3223      */
3224     if(c!=0 && targetCapacity>0) {
3225         goto getTrail;
3226     }
3227
3228     while(source<sourceLimit) {
3229         /*
3230          * This following test is to see if available input would overflow the output.
3231          * It does not catch output of more than one byte that
3232          * overflows as a result of a multi-byte character or callback output
3233          * from the last source character.
3234          * Therefore, those situations also test for overflows and will
3235          * then break the loop, too.
3236          */
3237         if(targetCapacity>0) {
3238             /*
3239              * Get a correct Unicode code point:
3240              * a single UChar for a BMP code point or
3241              * a matched surrogate pair for a "supplementary code point".
3242              */
3243             c=*source++;
3244             ++nextSourceIndex;
3245             /*
3246              * This also tests if the codepage maps single surrogates.
3247              * If it does, then surrogates are not paired but mapped separately.
3248              * Note that in this case unmatched surrogates are not detected.
3249              */
3250             if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3251                 if(UTF_IS_SURROGATE_FIRST(c)) {
3252 getTrail:
3253                     if(source<sourceLimit) {
3254                         /* test the following code unit */
3255                         UChar trail=*source;
3256                         if(UTF_IS_SECOND_SURROGATE(trail)) {
3257                             ++source;
3258                             ++nextSourceIndex;
3259                             c=UTF16_GET_PAIR_VALUE(c, trail);
3260                             if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3261                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3262                                 cnv->fromUnicodeStatus=prevLength; /* save the old state */
3263                                 /* callback(unassigned) */
3264                                 goto unassigned;
3265                             }
3266                             /* convert this supplementary code point */
3267                             /* exit this condition tree */
3268                         } else {
3269                             /* this is an unmatched lead code unit (1st surrogate) */
3270                             /* callback(illegal) */
3271                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3272                             break;
3273                         }
3274                     } else {
3275                         /* no more input */
                             /* c keeps the lead surrogate and is stored in cnv->fromUChar32 after the loop */
3276                         break;
3277                     }
3278                 } else {
3279                     /* this is an unmatched trail code unit (2nd surrogate) */
3280                     /* callback(illegal) */
3281                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3282                     break;
3283                 }
3284             }
3285
3286             /* convert the Unicode code point in c into codepage bytes */
3287
3288             /*
3289              * The basic lookup is a triple-stage compact array (trie) lookup.
3290              * For details see the beginning of this file.
3291              *
3292              * Single-byte codepages are handled with a different data structure
3293              * by ucnv_MBCSSingle... functions.
3294              *
3295              * The result consists of a 32-bit value from stage 2 and
3296              * a pointer to as many bytes as are stored per character.
3297              * The pointer points to the character's bytes in stage 3.
3298              * Bits 15..0 of the stage 2 entry contain the stage 3 index
3299              * for that pointer, while bits 31..16 are flags for which of
3300              * the 16 characters in the block are roundtrip-assigned.
3301              *
3302              * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
3303              * respectively as uint32_t, in the platform encoding.
3304              * For 3-byte codepages, the bytes are always stored in big-endian order.
3305              *
3306              * For EUC encodings that use only either 0x8e or 0x8f as the first
3307              * byte of their longest byte sequences, the first two bytes in
3308              * this third stage indicate with their 7th bits whether these bytes
3309              * are to be written directly or actually need to be preceded by
3310              * one of the two Single-Shift codes. With this, the third stage
3311              * stores one byte fewer per character than the actual maximum length of
3312              * EUC byte sequences.
3313              *
3314              * Other than that, leading zero bytes are removed and the other
3315              * bytes output. A single zero byte may be output if the "assigned"
3316              * bit in stage 2 was on.
3317              * The data structure does not support zero byte output as a fallback,
3318              * and also does not allow output of leading zeros.
3319              */
3320             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3321
3322             /* get the bytes and the length for the output */
3323             switch(outputType) {
3324             case MBCS_OUTPUT_2:
                     /* not reached in this function: MBCS_OUTPUT_2 was dispatched to ucnv_MBCSDoubleFromUnicodeWithOffsets() above */
3325                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3326                 if(value<=0xff) {
3327                     length=1;
3328                 } else {
3329                     length=2;
3330                 }
3331                 break;
3332             case MBCS_OUTPUT_2_SISO:
3333                 /* 1/2-byte stateful with Shift-In/Shift-Out */
3334                 /*
3335                  * Save the old state in the converter object
3336                  * right here, then change the local prevLength state variable if necessary.
3337                  * Then, if this character turns out to be unassigned or a fallback that
3338                  * is not taken, the callback code must not save the new state in the converter
3339                  * because the new state is for a character that is not output.
3340                  * However, the callback must still restore the state from the converter
3341                  * in case the callback function changed it for its output.
3342                  */
3343                 cnv->fromUnicodeStatus=prevLength; /* save the old state */
3344                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3345                 if(value<=0xff) {
3346                     if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
3347                         /* no mapping, leave value==0 */
3348                         length=0;
3349                     } else if(prevLength<=1) {
3350                         length=1;
3351                     } else {
3352                         /* change from double-byte mode to single-byte */
                             /* the SI byte is placed above the character byte so that it is written first */
3353                         value|=(uint32_t)UCNV_SI<<8;
3354                         length=2;
3355                         prevLength=1;
3356                     }
3357                 } else {
3358                     if(prevLength==2) {
3359                         length=2;
3360                     } else {
3361                         /* change from single-byte mode to double-byte */
                             /* the SO byte is placed above the two character bytes so that it is written first */
3362                         value|=(uint32_t)UCNV_SO<<16;
3363                         length=3;
3364                         prevLength=2;
3365                     }
3366                 }
3367                 break;
3368             case MBCS_OUTPUT_DBCS_ONLY:
3369                 /* table with single-byte results, but only DBCS mappings used */
3370                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3371                 if(value<=0xff) {
3372                     /* no mapping or SBCS result, not taken for DBCS-only */
3373                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
3374                     length=0;
3375                 } else {
3376                     length=2;
3377                 }
3378                 break;
3379             case MBCS_OUTPUT_3:
3380                 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                     /* assemble the three stored bytes, first byte in the highest position */
3381                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3382                 if(value<=0xff) {
3383                     length=1;
3384                 } else if(value<=0xffff) {
3385                     length=2;
3386                 } else {
3387                     length=3;
3388                 }
3389                 break;
3390             case MBCS_OUTPUT_4:
3391                 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
3392                 if(value<=0xff) {
3393                     length=1;
3394                 } else if(value<=0xffff) {
3395                     length=2;
3396                 } else if(value<=0xffffff) {
3397                     length=3;
3398                 } else {
3399                     length=4;
3400                 }
3401                 break;
3402             case MBCS_OUTPUT_3_EUC:
3403                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3404                 /* EUC 16-bit fixed-length representation */
3405                 if(value<=0xff) {
3406                     length=1;
3407                 } else if((value&0x8000)==0) {
                         /* first stored byte has its top bit clear: prepend 0x8e and set that bit */
3408                     value|=0x8e8000;
3409                     length=3;
3410                 } else if((value&0x80)==0) {
                         /* second stored byte has its top bit clear: prepend 0x8f and set that bit */
3411                     value|=0x8f0080;
3412                     length=3;
3413                 } else {
3414                     length=2;
3415                 }
3416                 break;
3417             case MBCS_OUTPUT_4_EUC:
3418                 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
3419                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3420                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
3421                 if(value<=0xff) {
3422                     length=1;
3423                 } else if(value<=0xffff) {
3424                     length=2;
3425                 } else if((value&0x800000)==0) {
3426                     value|=0x8e800000;
3427                     length=4;
3428                 } else if((value&0x8000)==0) {
3429                     value|=0x8f008000;
3430                     length=4;
3431                 } else {
3432                     length=3;
3433                 }
3434                 break;
3435             default:
3436                 /* must not occur */
3437                 /*
3438                  * To avoid compiler warnings that value & length may be
3439                  * used without having been initialized, we set them here.
3440                  * In reality, this is unreachable code.
3441                  * Not having a default branch also causes warnings with
3442                  * some compilers.
3443                  */
3444                 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
3445                 length=0;
3446                 break;
3447             }
3448
3449             /* is this code point assigned, or do we use fallbacks? */
                 /*
                  * The table result is used if the roundtrip flag is set, or if
                  * fallbacks are allowed for c and value is not 0;
                  * in all other cases the extension table is tried.
                  */
3450             if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
3451                  (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3452             ) {
3453                 /*
3454                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
3455                  * There is no way with this data structure for fallback output
3456                  * to be a zero byte.
3457                  */
3458
3459 unassigned:
3460                 /* try an extension mapping */
                     /* pArgs->source marks the position before the call so that the additional input consumed by _extFromU() can be counted */
3461                 pArgs->source=source;
3462                 c=_extFromU(cnv, cnv->sharedData,
3463                             c, &source, sourceLimit,
3464                             (char **)&target, (char *)target+targetCapacity,
3465                             &offsets, sourceIndex,
3466                             pArgs->flush,
3467                             pErrorCode);
3468                 nextSourceIndex+=(int32_t)(source-pArgs->source);
3469                 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
3470
3471                 if(U_FAILURE(*pErrorCode)) {
3472                     /* not mappable or buffer overflow */
3473                     break;
3474                 } else {
3475                     /* a mapping was written to the target, continue */
3476
3477                     /* recalculate the targetCapacity after an extension mapping */
3478                     targetCapacity=pArgs->targetLimit-(char *)target;
3479
3480                     /* normal end of conversion: prepare for a new character */
3481                     if(offsets!=NULL) {
3482                         prevSourceIndex=sourceIndex;
3483                         sourceIndex=nextSourceIndex;
3484                     }
3485                     continue;
3486                 }
3487             }
3488
3489             /* write the output character bytes from value and length */
3490             /* from the first if in the loop we know that targetCapacity>0 */
3491             if(length<=targetCapacity) {
3492                 if(offsets==NULL) {
3493                     switch(length) {
3494                         /* each branch falls through to the next one */
3495                     case 4:
3496                         *target++=(uint8_t)(value>>24);
3497                     case 3:
3498                         *target++=(uint8_t)(value>>16);
3499                     case 2:
3500                         *target++=(uint8_t)(value>>8);
3501                     case 1:
3502                         *target++=(uint8_t)value;
3503                     default:
3504                         /* will never occur */
3505                         break;
3506                     }
3507                 } else {
                         /* same as above, and every output byte gets the index of the current character */
3508                     switch(length) {
3509                         /* each branch falls through to the next one */
3510                     case 4:
3511                         *target++=(uint8_t)(value>>24);
3512                         *offsets++=sourceIndex;
3513                     case 3:
3514                         *target++=(uint8_t)(value>>16);
3515                         *offsets++=sourceIndex;
3516                     case 2:
3517                         *target++=(uint8_t)(value>>8);
3518                         *offsets++=sourceIndex;
3519                     case 1:
3520                         *target++=(uint8_t)value;
3521                         *offsets++=sourceIndex;
3522                     default:
3523                         /* will never occur */
3524                         break;
3525                     }
3526                 }
3527                 targetCapacity-=length;
3528             } else {
3529                 uint8_t *charErrorBuffer;
3530
3531                 /*
3532                  * We actually do this backwards here:
3533                  * In order to save an intermediate variable, we output
3534                  * first to the overflow buffer what does not fit into the
3535                  * regular target.
3536                  */
3537                 /* we know that 1<=targetCapacity<length<=4 */
                     /* from here on, length is the number of bytes that go into the overflow buffer */
3538                 length-=targetCapacity;
3539                 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
3540                 switch(length) {
3541                     /* each branch falls through to the next one */
3542                 case 3:
3543                     *charErrorBuffer++=(uint8_t)(value>>16);
3544                 case 2:
3545                     *charErrorBuffer++=(uint8_t)(value>>8);
3546                 case 1:
3547                     *charErrorBuffer=(uint8_t)value;
3548                 default:
3549                     /* will never occur */
3550                     break;
3551                 }
3552                 cnv->charErrorBufferLength=(int8_t)length;
3553
3554                 /* now output what fits into the regular target */
3555                 value>>=8*length; /* length was reduced by targetCapacity */
3556                 switch(targetCapacity) {
3557                     /* each branch falls through to the next one */
3558                 case 3:
3559                     *target++=(uint8_t)(value>>16);
3560                     if(offsets!=NULL) {
3561                         *offsets++=sourceIndex;
3562                     }
3563                 case 2:
3564                     *target++=(uint8_t)(value>>8);
3565                     if(offsets!=NULL) {
3566                         *offsets++=sourceIndex;
3567                     }
3568                 case 1:
3569                     *target++=(uint8_t)value;
3570                     if(offsets!=NULL) {
3571                         *offsets++=sourceIndex;
3572                     }
3573                 default:
3574                     /* will never occur */
3575                     break;
3576                 }
3577
3578                 /* target overflow */
                     /* the character was output completely (target plus overflow buffer), so no code point is pending: c=0 */
3579                 targetCapacity=0;
3580                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3581                 c=0;
3582                 break;
3583             }
3584
3585             /* normal end of conversion: prepare for a new character */
3586             c=0;
3587             if(offsets!=NULL) {
3588                 prevSourceIndex=sourceIndex;
3589                 sourceIndex=nextSourceIndex;
3590             }
3591             continue;
3592         } else {
3593             /* target is full */
3594             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3595             break;
3596         }
3597     }
3598
3599     /*
3600      * the end of the input stream and detection of truncated input
3601      * are handled by the framework, but for EBCDIC_STATEFUL conversion
3602      * we need to emit an SI at the very end
3603      *
3604      * conditions:
3605      *   successful
3606      *   EBCDIC_STATEFUL in DBCS mode
3607      *   end of input and no truncated input
3608      */
3609     if( U_SUCCESS(*pErrorCode) &&
3610         outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
3611         pArgs->flush && source>=sourceLimit && c==0
3612     ) {
3613         /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
3614         if(targetCapacity>0) {
3615             *target++=(uint8_t)UCNV_SI;
3616             if(offsets!=NULL) {
3617                 /* set the last source character's index (sourceIndex points at sourceLimit now) */
3618                 *offsets++=prevSourceIndex;
3619             }
3620         } else {
3621             /* target is full */
                 /* the SI byte goes into the overflow buffer instead */
3622             cnv->charErrorBuffer[0]=(char)UCNV_SI;
3623             cnv->charErrorBufferLength=1;
3624             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3625         }
3626         prevLength=1; /* we switched into SBCS */
3627     }
3628
3629     /* set the converter state back into UConverter */
3630     cnv->fromUChar32=c;
3631     cnv->fromUnicodeStatus=prevLength;
3632
3633     /* write back the updated pointers */
3634     pArgs->source=source;
3635     pArgs->target=(char *)target;
3636     pArgs->offsets=offsets;
3637 }
3638
3639 /*
3640  * This is another simple conversion function for internal use by other
3641  * conversion implementations.
3642  * It does not use the converter state nor call callbacks.
3643  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3644  * It handles conversion extensions but not GB 18030.
3645  *
3646  * It converts one single Unicode code point into codepage bytes, encoded
3647  * as one 32-bit value. The function returns the number of bytes in *pValue:
3648  * 1..4 the number of bytes in *pValue
3649  * 0    unassigned (*pValue undefined)
3650  * -1   the outputType of the table is not handled by the switch below
      *      (*pValue is not set in this case)
3651  *
3652  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
3653  * the second to last byte in bits 15..8, etc.
3654  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
      *
      * If the base table yields no result and sharedData->mbcs.extIndexes is
      * not NULL, then ucnv_extSimpleMatchFromU() is called and its return value
      * is passed through unchanged.
      *
      * Parameters:
      * sharedData   supplies mbcs.unicodeMask, mbcs.outputType,
      *              mbcs.fromUnicodeTable, mbcs.fromUnicodeBytes and mbcs.extIndexes
      * c            the code point to convert
      * pValue       receives the output bytes when a mapping is found
      * useFallback  selects whether fallback mappings are accepted in addition
      *              to roundtrip mappings; for multi-byte tables the decision
      *              is made by FROM_U_USE_FALLBACK(useFallback, c)
3655  */
3656 U_CFUNC int32_t
3657 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
3658                  UChar32 c, uint32_t *pValue,
3659                  UBool useFallback) {
3660     const int32_t *cx;
3661     const uint16_t *table;
3662 #if 0
3663 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
3664     const uint8_t *p;
3665 #endif
3666     uint32_t stage2Entry;
3667     uint32_t value;
3668     int32_t length;
3669
3670     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
         /* for such a codepage, a supplementary c skips the table lookup and goes straight to the extension check */
3671     if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3672         table=sharedData->mbcs.fromUnicodeTable;
3673
3674         /* convert the Unicode code point in c into codepage bytes (same as in ucnv_MBCSFromUnicodeWithOffsets) */
3675         if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
3676             value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
3677             /* is this code point assigned, or do we use fallbacks? */
                 /* the threshold is lower (0x800 instead of 0xc00) when useFallback is set; only bits 7..0 are the output byte */
3678             if(useFallback ? value>=0x800 : value>=0xc00) {
3679                 *pValue=value&0xff;
3680                 return 1;
3681             }
3682         } else /* outputType!=MBCS_OUTPUT_1 */ {
3683             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3684
3685             /* get the bytes and the length for the output */
3686             switch(sharedData->mbcs.outputType) {
3687             case MBCS_OUTPUT_2:
3688                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3689                 if(value<=0xff) {
3690                     length=1;
3691                 } else {
3692                     length=2;
3693                 }
3694                 break;
3695 #if 0
3696 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
3697             case MBCS_OUTPUT_DBCS_ONLY:
3698                 /* table with single-byte results, but only DBCS mappings used */
3699                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3700                 if(value<=0xff) {
3701                     /* no mapping or SBCS result, not taken for DBCS-only */
3702                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
3703                     length=0;
3704                 } else {
3705                     length=2;
3706                 }
3707                 break;
3708             case MBCS_OUTPUT_3:
3709                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3710                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3711                 if(value<=0xff) {
3712                     length=1;
3713                 } else if(value<=0xffff) {
3714                     length=2;
3715                 } else {
3716                     length=3;
3717                 }
3718                 break;
3719             case MBCS_OUTPUT_4:
3720                 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3721                 if(value<=0xff) {
3722                     length=1;
3723                 } else if(value<=0xffff) {
3724                     length=2;
3725                 } else if(value<=0xffffff) {
3726                     length=3;
3727                 } else {
3728                     length=4;
3729                 }
3730                 break;
3731             case MBCS_OUTPUT_3_EUC:
3732                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3733                 /* EUC 16-bit fixed-length representation */
3734                 if(value<=0xff) {
3735                     length=1;
3736                 } else if((value&0x8000)==0) {
3737                     value|=0x8e8000;
3738                     length=3;
3739                 } else if((value&0x80)==0) {
3740                     value|=0x8f0080;
3741                     length=3;
3742                 } else {
3743                     length=2;
3744                 }
3745                 break;
3746             case MBCS_OUTPUT_4_EUC:
3747                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3748                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3749                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
3750                 if(value<=0xff) {
3751                     length=1;
3752                 } else if(value<=0xffff) {
3753                     length=2;
3754                 } else if((value&0x800000)==0) {
3755                     value|=0x8e800000;
3756                     length=4;
3757                 } else if((value&0x8000)==0) {
3758                     value|=0x8f008000;
3759                     length=4;
3760                 } else {
3761                     length=3;
3762                 }
3763                 break;
3764 #endif
3765             default:
3766                 /* must not occur */
                     /*
                      * While the cases above are compiled out with #if 0, every outputType
                      * other than MBCS_OUTPUT_1 and MBCS_OUTPUT_2 ends up here;
                      * the extension table is not consulted in that case.
                      */
3767                 return -1;
3768             }
3769
3770             /* is this code point assigned, or do we use fallbacks? */
3771             if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3772                 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
3773             ) {
3774                 /*
3775                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
3776                  * There is no way with this data structure for fallback output
3777                  * to be a zero byte.
3778                  */
3779                 /* assigned */
3780                 *pValue=value;
3781                 return length;
3782             }
3783         }
3784     }
3785
         /* no usable mapping in the base table: try the extension table if there is one */
3786     cx=sharedData->mbcs.extIndexes;
3787     if(cx!=NULL) {
3788         return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
3789     }
3790
3791     /* unassigned */
3792     return 0;
3793 }
3794
3795
3796 #if 0
3797 /*
3798  * This function has been moved to ucnv2022.c for inlining.
3799  * This implementation is here only for documentation purposes
3800  */
3801
3802 /**
3803  * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages.
3804  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3805  * It does not handle conversion extensions (_extFromU()).
3806  *
3807  * It returns the codepage byte for the code point, or -1 if it is unassigned.
3808  */
3809 U_CFUNC int32_t
3810 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
3811                        UChar32 c,
3812                        UBool useFallback) {
3813     const uint16_t *table;
3814     int32_t value;
3815
3816     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3817     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3818         return -1;
3819     }
3820
3821     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3822     table=sharedData->mbcs.fromUnicodeTable;
3823
3824     /* get the byte for the output */
3825     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
3826     /* is this code point assigned, or do we use fallbacks? */
3827     if(useFallback ? value>=0x800 : value>=0xc00) {
3828         return value&0xff;
3829     } else {
3830         return -1;
3831     }
3832 }
3833 #endif
3834
3835 /* miscellaneous ------------------------------------------------------------ */
3836
3837 static void
3838 ucnv_MBCSGetStarters(const UConverter* cnv,
3839                  UBool starters[256],
3840                  UErrorCode *pErrorCode) {
3841     const int32_t *state0;
3842     int i;
3843
3844     state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
3845     for(i=0; i<256; ++i) {
3846         /* all bytes that cause a state transition from state 0 are lead bytes */
3847         starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
3848     }
3849 }
3850
3851 /*
3852  * This is an internal function that allows other converter implementations
3853  * to check whether a byte is a lead byte.
3854  */
3855 U_CFUNC UBool
3856 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
3857     return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
3858 }
3859
3860 static void
3861 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
3862               int32_t offsetIndex,
3863               UErrorCode *pErrorCode) {
3864     UConverter *cnv=pArgs->converter;
3865     char *p, *subchar;
3866     char buffer[4];
3867     int32_t length;
3868
3869     /* first, select between subChar and subChar1 */
3870     if( cnv->subChar1!=0 &&
3871         (cnv->sharedData->mbcs.extIndexes!=NULL ?
3872             cnv->useSubChar1 :
3873             (cnv->invalidUCharBuffer[0]<=0xff))
3874     ) {
3875         /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
3876         subchar=(char *)&cnv->subChar1;
3877         length=1;
3878     } else {
3879         /* select subChar in all other cases */
3880         subchar=(char *)cnv->subChar;
3881         length=cnv->subCharLen;
3882     }
3883
3884     /* reset the selector for the next code point */
3885     cnv->useSubChar1=FALSE;
3886
3887     switch(cnv->sharedData->mbcs.outputType) {
3888     case MBCS_OUTPUT_2_SISO:
3889         p=buffer;
3890
3891         /* fromUnicodeStatus contains prevLength */
3892         switch(length) {
3893         case 1:
3894             if(cnv->fromUnicodeStatus==2) {
3895                 /* DBCS mode and SBCS sub char: change to SBCS */
3896                 cnv->fromUnicodeStatus=1;
3897                 *p++=UCNV_SI;
3898             }
3899             *p++=subchar[0];
3900             break;
3901         case 2:
3902             if(cnv->fromUnicodeStatus<=1) {
3903                 /* SBCS mode and DBCS sub char: change to DBCS */
3904                 cnv->fromUnicodeStatus=2;
3905                 *p++=UCNV_SO;
3906             }
3907             *p++=subchar[0];
3908             *p++=subchar[1];
3909             break;
3910         default:
3911             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3912             return;
3913         }
3914         ucnv_cbFromUWriteBytes(pArgs,
3915                                buffer, (int32_t)(p-buffer),
3916                                offsetIndex, pErrorCode);
3917         break;
3918     default:
3919         ucnv_cbFromUWriteBytes(pArgs,
3920                                subchar, length,
3921                                offsetIndex, pErrorCode);
3922         break;
3923     }
3924 }
3925
3926 U_CFUNC UConverterType
3927 ucnv_MBCSGetType(const UConverter* converter) {
3928     /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
3929     if(converter->sharedData->mbcs.countStates==1) {
3930         return (UConverterType)UCNV_SBCS;
3931     } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
3932         return (UConverterType)UCNV_EBCDIC_STATEFUL;
3933     } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
3934         return (UConverterType)UCNV_DBCS;
3935     }
3936     return (UConverterType)UCNV_MBCS;
3937 }
3938
/*
 * Implementation table for the MBCS converter: the UCNV_MBCS type tag
 * followed by the entry points defined in this file.
 *
 * The initializer is positional.
 * NOTE(review): the meaning of each slot (including the NULL ones) is fixed
 * by the UConverterImpl declaration, which is not visible here - check it
 * before reordering entries or filling in a NULL slot.
 */
static const UConverterImpl _MBCSImpl={
    UCNV_MBCS,

    ucnv_MBCSLoad,
    ucnv_MBCSUnload,

    ucnv_MBCSOpen,
    NULL,
    NULL,

    /*
     * For each direction the same function is installed in two consecutive
     * slots (presumably the plain and the with-offsets variants - confirm
     * against UConverterImpl).
     */
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSToUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSFromUnicodeWithOffsets,
    ucnv_MBCSGetNextUChar,

    ucnv_MBCSGetStarters,
    ucnv_MBCSGetName,
    ucnv_MBCSWriteSub,
    NULL,
    ucnv_MBCSGetUnicodeSet
};
3961
3962
/* Static data is in tools/makeconv/ucnvstat.c for data-based
 * converters. Be sure to update it as well.
 */

/*
 * Shared-data object for the MBCS implementation: it records its own
 * structure size and points at the _MBCSImpl function table.
 * The initializer is positional.
 * NOTE(review): the meaning of the remaining values (1, the NULL pointers,
 * FALSE, and the trailing 0) depends on the UConverterSharedData field
 * order, which is not visible here - confirm before editing.
 */
const UConverterSharedData _MBCSData={
    sizeof(UConverterSharedData), 1,
    NULL, NULL, NULL, FALSE, &_MBCSImpl,
    0
};
3972
3973 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */