icuSources/common/ucnvmbcs.c

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2000-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  ucnvmbcs.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2000jul03
  14 *   created by: Markus W. Scherer
  15 *
  16 *   The current code in this file replaces the previous implementation
  17 *   of conversion code from multi-byte codepages to Unicode and back.
  18 *   This implementation supports the following:
  19 *   - legacy variable-length codepages with up to 4 bytes per character
  20 *   - all Unicode code points (up to 0x10ffff)
  21 *   - efficient distinction of unassigned vs. illegal byte sequences
  22 *   - it is possible in fromUnicode() to directly deal with simple
  23 *     stateful encodings (used for EBCDIC_STATEFUL)
  24 *   - it is possible to convert Unicode code points
  25 *     to a single zero byte (but not as a fallback except for SBCS)
  26 *
  27 *   Remaining limitations in fromUnicode:
  28 *   - byte sequences must not have leading zero bytes
  29 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
  30 *   - limitation to up to 4 bytes per character
  31 *
  32 *   ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
  33 *   limitations and adds m:n character mappings and other features.
  34 *   See ucnv_ext.h for details.
  35 *
  36 *   Change history:
  37 *
  38 *    5/6/2001       Ram       Moved  MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
  39 *                             MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
  40 *                             macros to ucnvmbcs.h file
  41 */
  42
  43 #include "unicode/utypes.h"
  44
  45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  46
  47 #include "unicode/ucnv.h"
  48 #include "unicode/ucnv_cb.h"
  49 #include "unicode/udata.h"
  50 #include "unicode/uset.h"
  51 #include "ucnv_bld.h"
  52 #include "ucnvmbcs.h"
  53 #include "ucnv_ext.h"
  54 #include "ucnv_cnv.h"
  55 #include "umutex.h"
  56 #include "cmemory.h"
  57 #include "cstring.h"
  58
   59 /*
       * Control optimizations according to the platform.
       * Each switch is a plain 0/1 value (1 here for the to-BMP direction,
       * 0 for the from-BMP direction).
       * NOTE(review): the code that tests these switches is not part of this
       * section; judging by the names they select unrolled loops for
       * single-byte conversion to/from the BMP -- confirm against the users
       * further down in the file.
       */
   60 #define MBCS_UNROLL_SINGLE_TO_BMP 1
   61 #define MBCS_UNROLL_SINGLE_FROM_BMP 0
  62
  63 /*
  64  * _MBCSHeader versions 5.3 & 4.3
  65  * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
  66  *
  67  * This version is optional. Version 5 is used for incompatible data format changes.
  68  * makeconv will continue to generate version 4 files if possible.
  69  *
  70  * Changes from version 4:
  71  *
  72  * The main difference is an additional _MBCSHeader field with
  73  * - the length (number of uint32_t) of the _MBCSHeader
  74  * - flags for further incompatible data format changes
  75  * - flags for further, backward compatible data format changes
  76  *
   77  * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
  78  * the file and needs to be reconstituted at load time.
  79  * This requires a utf8Friendly format with an additional mbcsIndex table for fast
  80  * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
  81  * (For details about these structures see below, and see ucnvmbcs.h.)
  82  *
  83  *   utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
  84  *   of the Unicode code points. (This requires that the .ucm file has the |0 etc.
  85  *   precision markers for all mappings.)
  86  *
  87  *   All fallbacks have been moved to the extension table, leaving only roundtrips in the
  88  *   omitted data that can be reconstituted from the toUnicode data.
  89  *
  90  *   Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
  91  *   With only roundtrip mappings in the base fromUnicode data, this part is fully
  92  *   redundant with the mbcsIndex and will be reconstituted from that (also using the
  93  *   stage 1 table which contains the information about how stage 2 was compacted).
  94  *
  95  *   The rest of the stage 2 table, the part for code points above maxFastUChar,
  96  *   is stored in the file and will be appended to the reconstituted part.
  97  *
   98  *   The entire fromUBytes array is omitted from the file and will be reconstituted.
  99  *   This is done by enumerating all toUnicode roundtrip mappings, performing
 100  *   each mapping (using the stage 1 and reconstituted stage 2 tables) and
 101  *   writing instead of reading the byte values.
 102  *
 103  * _MBCSHeader version 4.3
 104  *
 105  * Change from version 4.2:
 106  * - Optional utf8Friendly data structures, with 64-entry stage 3 block
 107  *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
 108  *   files which can be used instead of stages 1 & 2.
 109  *   Faster lookups for roundtrips from most commonly used characters,
 110  *   and lookups from UTF-8 byte sequences with a natural bit distribution.
 111  *   See ucnvmbcs.h for more details.
 112  *
 113  * Change from version 4.1:
 114  * - Added an optional extension table structure at the end of the .cnv file.
 115  *   It is present if the upper bits of the header flags field contains a non-zero
 116  *   byte offset to it.
 117  *   Files that contain only a conversion table and no base table
 118  *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
 119  *   These contain the base table name between the MBCS header and the extension
 120  *   data.
 121  *
 122  * Change from version 4.0:
 123  * - Replace header.reserved with header.fromUBytesLength so that all
 124  *   fields in the data have length.
 125  *
 126  * Changes from version 3 (for performance improvements):
 127  * - new bit distribution for state table entries
 128  * - reordered action codes
 129  * - new data structure for single-byte fromUnicode
 130  *   + stage 2 only contains indexes
 131  *   + stage 3 stores 16 bits per character with classification bits 15..8
 132  * - no multiplier for stage 1 entries
 133  * - stage 2 for non-single-byte codepages contains the index and the flags in
 134  *   one 32-bit value
 135  * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
 136  *
 137  * For more details about old versions of the MBCS data structure, see
 138  * the corresponding versions of this file.
 139  *
 140  * Converting stateless codepage data ---------------------------------------***
 141  * (or codepage data with simple states) to Unicode.
 142  *
 143  * Data structure and algorithm for converting from complex legacy codepages
 144  * to Unicode. (Designed before 2000-may-22.)
 145  *
 146  * The basic idea is that the structure of legacy codepages can be described
 147  * with state tables.
 148  * When reading a byte stream, each input byte causes a state transition.
 149  * Some transitions result in the output of a code point, some result in
 150  * "unassigned" or "illegal" output.
 151  * This is used here for character conversion.
 152  *
 153  * The data structure begins with a state table consisting of a row
 154  * per state, with 256 entries (columns) per row for each possible input
 155  * byte value.
 156  * Each entry is 32 bits wide, with two formats distinguished by
 157  * the sign bit (bit 31):
 158  *
 159  * One format for transitional entries (bit 31 not set) for non-final bytes, and
 160  * one format for final entries (bit 31 set).
 161  * Both formats contain the number of the next state in the same bit
 162  * positions.
 163  * State 0 is the initial state.
 164  *
 165  * Most of the time, the offset values of subsequent states are added
 166  * up to a scalar value. This value will eventually be the index of
 167  * the Unicode code point in a table that follows the state table.
 168  * The effect is that the code points for final state table rows
 169  * are contiguous. The code points of final state rows follow each other
 170  * in the order of the references to those final states by previous
 171  * states, etc.
 172  *
 173  * For some terminal states, the offset is itself the output Unicode
 174  * code point (16 bits for a BMP code point or 20 bits for a supplementary
  175  * code point, stored as code point minus 0x10000 so that 20 bits are enough).
 176  * For others, the code point in the Unicode table is stored with either
 177  * one or two code units: one for BMP code points, two for a pair of
 178  * surrogates.
 179  * All code points for a final state entry take up the same number of code
 180  * units, regardless of whether they all actually _use_ the same number
 181  * of code units. This is necessary for simple array access.
 182  *
 183  * An additional feature comes in with what in ICU is called "fallback"
 184  * mappings:
 185  *
 186  * In addition to round-trippable, precise, 1:1 mappings, there are often
 187  * mappings defined between similar, though not the same, characters.
 188  * Typically, such mappings occur only in fromUnicode mapping tables because
 189  * Unicode has a superset repertoire of most other codepages. However, it
 190  * is possible to provide such mappings in the toUnicode tables, too.
 191  * In this case, the fallback mappings are partly integrated into the
 192  * general state tables because the structure of the encoding includes their
 193  * byte sequences.
 194  * For final entries in an initial state, fallback mappings are stored in
 195  * the entry itself like with roundtrip mappings.
 196  * For other final entries, they are stored in the code units table if
 197  * the entry is for a pair of code units.
 198  * For single-unit results in the code units table, there is no space to
 199  * alternatively hold a fallback mapping; in this case, the code unit
 200  * is stored as U+fffe (unassigned), and the fallback mapping needs to
 201  * be looked up by the scalar offset value in a separate table.
 202  *
 203  * "Unassigned" state entries really mean "structurally unassigned",
 204  * i.e., such a byte sequence will never have a mapping result.
 205  *
 206  * The interpretation of the bits in each entry is as follows:
 207  *
 208  * Bit 31 not set, not a terminal entry ("transitional"):
 209  * 30..24 next state
 210  * 23..0  offset delta, to be added up
 211  *
 212  * Bit 31 set, terminal ("final") entry:
 213  * 30..24 next state (regardless of action code)
 214  * 23..20 action code:
 215  *        action codes 0 and 1 result in precise-mapping Unicode code points
 216  *        0  valid byte sequence
 217  *           19..16 not used, 0
 218  *           15..0  16-bit Unicode BMP code point
 219  *                  never U+fffe or U+ffff
 220  *        1  valid byte sequence
 221  *           19..0  20-bit Unicode supplementary code point
 222  *                  never U+fffe or U+ffff
 223  *
 224  *        action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
 225  *        2  valid byte sequence (fallback)
 226  *           19..16 not used, 0
 227  *           15..0  16-bit Unicode BMP code point as fallback result
 228  *        3  valid byte sequence (fallback)
 229  *           19..0  20-bit Unicode supplementary code point as fallback result
 230  *
 231  *        action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
 232  *        depending on the code units they result in
 233  *        4  valid byte sequence
 234  *           19..9  not used, 0
 235  *            8..0  final offset delta
 236  *                  pointing to one 16-bit code unit which may be
 237  *                  fffe  unassigned -- look for a fallback for this offset
 238  *                  ffff  illegal
 239  *        5  valid byte sequence
 240  *           19..9  not used, 0
 241  *            8..0  final offset delta
 242  *                  pointing to two 16-bit code units
 243  *                  (typically UTF-16 surrogates)
 244  *                  the result depends on the first code unit as follows:
 245  *                  0000..d7ff  roundtrip BMP code point (1st alone)
 246  *                  d800..dbff  roundtrip surrogate pair (1st, 2nd)
 247  *                  dc00..dfff  fallback surrogate pair (1st-400, 2nd)
 248  *                  e000        roundtrip BMP code point (2nd alone)
 249  *                  e001        fallback BMP code point (2nd alone)
 250  *                  fffe        unassigned
 251  *                  ffff        illegal
 252  *           (the final offset deltas are at most 255 * 2,
 253  *            times 2 because of storing code unit pairs)
 254  *
 255  *        6  unassigned byte sequence
 256  *           19..16 not used, 0
 257  *           15..0  16-bit Unicode BMP code point U+fffe (new with version 2)
 258  *                  this does not contain a final offset delta because the main
 259  *                  purpose of this action code is to save scalar offset values;
 260  *                  therefore, fallback values cannot be assigned to byte
 261  *                  sequences that result in this action code
 262  *        7  illegal byte sequence
 263  *           19..16 not used, 0
 264  *           15..0  16-bit Unicode BMP code point U+ffff (new with version 2)
 265  *        8  state change only
 266  *           19..0  not used, 0
 267  *           useful for state changes in simple stateful encodings,
 268  *           at Shift-In/Shift-Out codes
 269  *
 270  *
 271  *        9..15 reserved for future use
 272  *           current implementations will only perform a state change
 273  *           and ignore bits 19..0
 274  *
 275  * An encoding with contiguous ranges of unassigned byte sequences, like
 276  * Shift-JIS and especially EUC-TW, can be stored efficiently by having
 277  * at least two states for the trail bytes:
 278  * One trail byte state that results in code points, and one that only
 279  * has "unassigned" and "illegal" terminal states.
 280  *
 281  * Note: partly by accident, this data structure supports simple stateful
 282  * encodings without any additional logic.
 283  * Currently, only simple Shift-In/Shift-Out schemes are handled with
 284  * appropriate state tables (especially EBCDIC_STATEFUL!).
 285  *
 286  * MBCS version 2 added:
 287  * unassigned and illegal action codes have U+fffe and U+ffff
 288  * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
 289  *
 290  * Converting from Unicode to codepage bytes --------------------------------***
 291  *
 292  * The conversion data structure for fromUnicode is designed for the known
 293  * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
 294  * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
 295  * a roundtrip mapping.
 296  *
 297  * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
 298  * like in the character properties table.
 299  * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
 300  * with the resulting bytes is at offsetFromUBytes.
 301  *
 302  * Beginning with version 4, single-byte codepages have a significantly different
 303  * trie compared to other codepages.
 304  * In all cases, the entry in stage 1 is directly the index of the block of
 305  * 64 entries in stage 2.
 306  *
 307  * Single-byte lookup:
 308  *
 309  * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
 310  * Stage 3 contains one 16-bit word per result:
 311  * Bits 15..8 indicate the kind of result:
 312  *    f  roundtrip result
 313  *    c  fallback result from private-use code point
 314  *    8  fallback result from other code points
 315  *    0  unassigned
 316  * Bits 7..0 contain the codepage byte. A zero byte is always possible.
 317  *
 318  * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
 319  * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
 320  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
 321  * ASCII code points can be looked up with a linear array access into stage 3.
 322  * See maxFastUChar and other details in ucnvmbcs.h.
 323  *
 324  * Multi-byte lookup:
 325  *
 326  * Stage 2 contains a 32-bit word for each 16-block in stage 3:
 327  * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
 328  *             test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
 329  *             If this test is false, then a non-zero result will be interpreted as
 330  *             a fallback mapping.
 331  * Bits 15..0  contain the index to stage 3, which must be multiplied by 16*(bytes per char)
 332  *
 333  * Stage 3 contains 2, 3, or 4 bytes per result.
 334  * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
 335  * while 3 bytes are stored as bytes in big-endian order.
 336  * Leading zero bytes are ignored, and the number of bytes is counted.
 337  * A zero byte mapping result is possible as a roundtrip result.
 338  * For some output types, the actual result is processed from this;
 339  * see ucnv_MBCSFromUnicodeWithOffsets().
 340  *
 341  * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
 342  * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
 343  *
 344  * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
 345  * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
 346  * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
 347  * ASCII code points can be looked up with a linear array access into stage 3.
 348  * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
 349  *
 350  * In version 3, stage 2 blocks may overlap by multiples of the multiplier
 351  * for compaction.
 352  * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
 353  * may overlap by any number of entries.
 354  *
 355  * MBCS version 2 added:
 356  * the converter checks for known output types, which allows
 357  * adding new ones without crashing an unaware converter
 358  */
 359
  360 static const UConverterImpl _SBCSUTF8Impl;  /* declared ahead of use; the initializer is not in this section */
  361 static const UConverterImpl _DBCSUTF8Impl;  /* declared ahead of use; the initializer is not in this section */
 362
  363 /* GB 18030 data ------------------------------------------------------------ */
  364
  365 /*
       * Helper macros for linear values for GB 18030 four-byte sequences.
       *
       * LINEAR_18030(a, b, c, d) folds the four bytes a, b, c, d of a sequence
       * into one number: ((a*10+b)*126+c)*10+d. The 126L/10L factors make the
       * result a long.
       *
       * LINEAR_18030_BASE is the linear value of the byte sequence 81 30 81 30.
       *
       * LINEAR(x) takes the four bytes packed into one 32-bit constant
       * (first byte in bits 31..24) and returns the same linear value.
       * NOTE(review): x is not parenthesized in the expansion, so only pass a
       * simple constant (as gb18030Ranges does), never an expression such as
       * p|q.
       */
  366 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))
  367
  368 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)
  369
  370 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff)
 371
  372 /*
  373  * Some ranges of GB 18030 where both the Unicode code points and the
  374  * GB four-byte sequences are contiguous and are handled algorithmically by
  375  * the special callback functions below.
  376  * The values are start & end of Unicode & GB codes.
       *
       * Each of the 13 rows holds four values, in this order:
       *   [0] first Unicode code point of the range
       *   [1] last Unicode code point of the range
       *   [2] linear value (see LINEAR()) of the first GB four-byte sequence
       *   [3] linear value (see LINEAR()) of the last GB four-byte sequence
  377  *
  378  * Note that single surrogates are not mapped by GB 18030
  379  * as of the re-released mapping tables from 2000-nov-30.
  380  */
  381 static const uint32_t
  382 gb18030Ranges[13][4]={
  383     {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
  384     {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
  385     {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
  386     {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
  387     {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
  388     {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
  389     {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
  390     {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
  391     {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
  392     {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
  393     {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
  394     {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
  395     {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
  396 };
  397
  398 /* bit flag for UConverter.options indicating GB 18030 special handling */
  399 #define _MBCS_OPTION_GB18030 0x8000
 400
 401 /* Miscellaneous ------------------------------------------------------------ */
 402
  403 /**
  404  * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
  405  * consecutive sequences of bytes, starting from the one encoded in value,
  406  * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
  407  * Does not currently support m:n mappings or reverse fallbacks.
  408  * This function will not be called for sequences of bytes with leading zeros.
       * It is only called for a group of 32 in which at least one entry has a
       * mapping (a non-negative code point).
  409  *
  410  * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
  411  * @param value contains 1..4 bytes of the first byte sequence, right-aligned;
       *        the last byte is a multiple of 32, and the 32 sequences differ
       *        only in that last byte
  412  * @param codePoints resulting Unicode code points, one per byte sequence,
       *        or negative if a byte sequence does
  413  *        not map to anything
  414  * @return TRUE to continue enumeration, FALSE to stop
  415  */
  416 typedef UBool U_CALLCONV
  417 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
 418
  419 /*
       * Recursive worker for ucnv_MBCSEnumToUnicode();
       * similar to ucnv_MBCSGetNextUChar() but recursive.
       *
       * Walks the row of one state in the toUnicode state table. Transition
       * entries are followed recursively; final entries are turned into code
       * points, which are collected in groups of 32 consecutive byte values
       * and handed to the callback. Only the byte range recorded in
       * stateProps[state] is visited.
       *
       * @param mbcsTable  provides stateTable and unicodeCodeUnits
       * @param stateProps per-state properties as computed by getStateProp()
       * @param state      the state (row) to enumerate
       * @param offset     sum of the offsets of the transition entries taken so far
       * @param value      the bytes consumed so far, right-aligned
       * @param callback   receives each group of 32 results that contains a mapping
       * @param context    passed through to the callback unchanged
       * @param pErrorCode only forwarded to the recursive calls; neither read
       *                   nor set in this function
       * @return FALSE as soon as the callback or a nested call returns FALSE,
       *         otherwise TRUE
       */
  420 static UBool
  421 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
  422         int32_t state, uint32_t offset,
  423         uint32_t value,
  424         UConverterEnumToUCallback *callback, const void *context,
  425         UErrorCode *pErrorCode) {
  426     UChar32 codePoints[32];
  427     const int32_t *row;
  428     const uint16_t *unicodeCodeUnits;
  429     UChar32 anyCodePoints;
  430     int32_t b, limit;
  431
  432     row=mbcsTable->stateTable[state];
  433     unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
  434
  435     value<<=8;  /* make room for this state's byte in the low 8 bits */
  436     anyCodePoints=-1;  /* becomes non-negative if there is a mapping */
  437
          /* bits 5..3 of the state properties: first group of 32 bytes to visit */
  438     b=(stateProps[state]&0x38)<<2;
  439     if(b==0 && stateProps[state]>=0x40) {
  440         /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
  441         codePoints[0]=U_SENTINEL;
  442         b=1;
  443     }
          /* bits 2..0 of the state properties: last group of 32 bytes to visit (limit is exclusive) */
  444     limit=((stateProps[state]&7)+1)<<5;
  445     while(b<limit) {
  446         int32_t entry=row[b];
  447         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
  448             int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
  449             if(stateProps[nextState]>=0) {
  450                 /* recurse to a state with non-ignorable actions */
  451                 if(!enumToU(
  452                         mbcsTable, stateProps, nextState,
  453                         offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
  454                         value|(uint32_t)b,
  455                         callback, context,
  456                         pErrorCode)) {
  457                     return FALSE;
  458                 }
  459             }
                  /* a lead byte by itself does not map to anything at this level */
  460             codePoints[b&0x1f]=U_SENTINEL;
  461         } else {
  462             UChar32 c;
  463             int32_t action;
  464
  465             /*
  466              * An if-else-if chain provides more reliable performance for
  467              * the most common cases compared to a switch.
  468              */
  469             action=MBCS_ENTRY_FINAL_ACTION(entry);
  470             if(action==MBCS_STATE_VALID_DIRECT_16) {
  471                 /* output BMP code point */
  472                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
  473             } else if(action==MBCS_STATE_VALID_16) {
  474                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
  475                 c=unicodeCodeUnits[finalOffset];
  476                 if(c<0xfffe) {
  477                     /* output BMP code point */
  478                 } else {
                          /* 0xfffe (unassigned) or 0xffff (illegal): no mapping */
  479                     c=U_SENTINEL;
  480                 }
  481             } else if(action==MBCS_STATE_VALID_16_PAIR) {
  482                 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
  483                 c=unicodeCodeUnits[finalOffset++];
  484                 if(c<0xd800) {
  485                     /* output BMP code point below 0xd800 */
  486                 } else if(c<=0xdbff) {
  487                     /* output roundtrip supplementary code point from the surrogate pair */
  488                     c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
  489                 } else if(c==0xe000) {
  490                     /* output roundtrip BMP code point stored in the second code unit */
  491                     c=unicodeCodeUnits[finalOffset];
  492                 } else {
                          /*
                           * Everything else is skipped: fallback surrogate pairs
                           * (dc00..dfff), fallback BMP code points (e001),
                           * unassigned (fffe) and illegal (ffff).
                           */
  493                     c=U_SENTINEL;
  494                 }
  495             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
  496                 /* output supplementary code point */
  497                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
  498             } else {
                      /* all other action codes do not yield a mapping here */
  499                 c=U_SENTINEL;
  500             }
  501
  502             codePoints[b&0x1f]=c;
                  /* starts as -1 (all bits set); ANDing with any non-negative c clears the sign bit */
  503             anyCodePoints&=c;
  504         }
              /* after each complete group of 32 byte values, report it if it contains a mapping */
  505         if(((++b)&0x1f)==0) {
  506             if(anyCodePoints>=0) {
                      /* b-0x20 is the first byte value of the group just completed */
  507                 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
  508                     return FALSE;
  509                 }
  510                 anyCodePoints=-1;
  511             }
  512         }
  513     }
  514     return TRUE;
  515 }
 516
  517 /*
       * Computes the properties byte for one state and stores it in
       * stateProps[state]; recurses into every state referenced from the
       * examined entries whose properties are still -1 (not yet computed).
       *
  518  * Only called if stateProps[state]==-1.
  519  * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
  520  * MBCS_STATE_CHANGE_ONLY.
       *
       * An entry counts as ignorable if it is a transition to a state with
       * negative properties, or a final entry whose action code is
       * MBCS_STATE_UNASSIGNED or higher.
       *
       * Result layout, as written below:
       * - -0x40 if all 256 entries of the row are ignorable
       * - otherwise bits 5..3 = (first non-ignorable byte)>>5,
       *   bits 2..0 = (last non-ignorable byte)>>5,
       *   and 0x40 is set if the state is found to be a direct state
       *
       * @param stateTable the toUnicode state table, 256 entries per state
       * @param stateProps properties for all states, updated in place
       * @param state      index of the state to analyze
       * @return the new value of stateProps[state]
  521  */
  522 static int8_t
  523 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
  524     const int32_t *row;
  525     int32_t min, max, entry, nextState;
  526
  527     row=stateTable[state];
          /* no longer -1, so that a cycle in the state table does not re-enter this state */
  528     stateProps[state]=0;
  529
  530     /* find first non-ignorable entry */
  531     for(min=0;; ++min) {
  532         entry=row[min];
  533         nextState=MBCS_ENTRY_STATE(entry);
  534         if(stateProps[nextState]==-1) {
  535             getStateProp(stateTable, stateProps, nextState);
  536         }
  537         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
  538             if(stateProps[nextState]>=0) {
  539                 break;
  540             }
  541         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
  542             break;
  543         }
  544         if(min==0xff) {
                  /* the whole row is ignorable */
  545             stateProps[state]=-0x40;  /* (int8_t)0xc0 */
  546             return stateProps[state];
  547         }
  548     }
          /* bits 5..3: index of the group of 32 byte values that contains min */
  549     stateProps[state]|=(int8_t)((min>>5)<<3);
  550
  551     /* find last non-ignorable entry */
  552     for(max=0xff; min<max; --max) {
  553         entry=row[max];
  554         nextState=MBCS_ENTRY_STATE(entry);
  555         if(stateProps[nextState]==-1) {
  556             getStateProp(stateTable, stateProps, nextState);
  557         }
  558         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
  559             if(stateProps[nextState]>=0) {
  560                 break;
  561             }
  562         } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
  563             break;
  564         }
  565     }
          /* bits 2..0: index of the group of 32 byte values that contains max */
  566     stateProps[state]|=(int8_t)(max>>5);
  567
  568     /* recurse further and collect direct-state information */
  569     while(min<=max) {
  570         entry=row[min];
  571         nextState=MBCS_ENTRY_STATE(entry);
  572         if(stateProps[nextState]==-1) {
  573             getStateProp(stateTable, stateProps, nextState);
  574         }
  575         if(MBCS_ENTRY_IS_FINAL(entry)) {
                  /* the state entered after a final entry starts a new byte sequence: mark it direct */
  576             stateProps[nextState]|=0x40;
  577             if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
                      /* a result stored directly in the entry marks this state itself as direct */
  578                 stateProps[state]|=0x40;
  579             }
  580         }
  581         ++min;
  582     }
  583     return stateProps[state];
  584 }
 585
  586 /*
  587  * Internal function enumerating the toUnicode data of an MBCS converter.
  588  * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
  589  * table, but could also be used for a future ucnv_getUnicodeSet() option
  590  * that includes reverse fallbacks (after updating this function's implementation).
  591  * Currently only handles roundtrip mappings.
  592  * Does not currently handle extensions.
       *
       * First computes the per-state properties with getStateProp(), starting
       * from state 0, then runs enumToU() from every direct state.
       *
       * @param mbcsTable  the table whose stateTable/countStates are enumerated
       * @param callback   called for each group of 32 byte sequences with a mapping
       * @param context    passed through to the callback unchanged
       * @param pErrorCode only forwarded to enumToU(); not examined here
       *
       * NOTE(review): the return value of enumToU() is ignored, so a callback
       * returning FALSE ends the enumeration of the current direct state only;
       * the loop still continues with the next direct state.
  593  */
  594 static void
  595 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
  596                        UConverterEnumToUCallback *callback, const void *context,
  597                        UErrorCode *pErrorCode) {
  598     /*
  599      * Properties for each state, to speed up the enumeration.
  600      * Ignorable actions are unassigned/illegal/state-change-only:
  601      * They do not lead to mappings.
  602      *
  603      * Bits 7..6:
  604      * 1 direct/initial state (stateful converters have multiple)
  605      * 0 non-initial state with transitions or with non-ignorable result actions
  606      * -1 final state with only ignorable actions
  607      *
  608      * Bits 5..3:
  609      * The lowest byte value with non-ignorable actions is
  610      * value<<5 (rounded down).
  611      *
  612      * Bits 2..0:
  613      * The highest byte value with non-ignorable actions is
  614      * (value<<5)|0x1f (rounded up).
  615      */
  616     int8_t stateProps[MBCS_MAX_STATE_COUNT];
  617     int32_t state;
  618
          /* -1 marks a state whose properties have not been computed yet */
  619     uprv_memset(stateProps, -1, sizeof(stateProps));
  620
  621     /* recurse from state 0 and set all stateProps */
  622     getStateProp(mbcsTable->stateTable, stateProps, 0);
  623
  624     for(state=0; state<mbcsTable->countStates; ++state) {
  625         /*if(stateProps[state]==-1) {
  626             printf("unused/unreachable <icu:state> %d\n", state);
  627         }*/
  628         if(stateProps[state]>=0x40) {
  629             /* start from each direct state */
  630             enumToU(
  631                 mbcsTable, stateProps, state, 0, 0,
  632                 callback, context,
  633                 pErrorCode);
  634         }
  635     }
  636 }
 637
 638 U_CFUNC void
 639 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
 640                                          const USetAdder *sa,
 641                                          UConverterUnicodeSet which,
 642                                          UConverterSetFilter filter,
 643                                          UErrorCode *pErrorCode) {
 644     const UConverterMBCSTable *mbcsTable;
 645     const uint16_t *table;
 646
 647     uint32_t st3;
 648     uint16_t st1, maxStage1, st2;
 649
 650     UChar32 c;
 651
 652     /* enumerate the from-Unicode trie table */
 653     mbcsTable=&sharedData->mbcs;
 654     table=mbcsTable->fromUnicodeTable;
 655     if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
 656         maxStage1=0x440;
 657     } else {
 658         maxStage1=0x40;
 659     }
 660
 661     c=0; /* keep track of the current code point while enumerating */
 662
 663     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
 664         const uint16_t *stage2, *stage3, *results;
 665         uint16_t minValue;
 666
 667         results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
 668
 669         /*
 670          * Set a threshold variable for selecting which mappings to use.
 671          * See ucnv_MBCSSingleFromBMPWithOffsets() and
 672          * MBCS_SINGLE_RESULT_FROM_U() for details.
 673          */
 674         if(which==UCNV_ROUNDTRIP_SET) {
 675             /* use only roundtrips */
 676             minValue=0xf00;
 677         } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
 678             /* use all roundtrip and fallback results */
 679             minValue=0x800;
 680         }
 681
 682         for(st1=0; st1<maxStage1; ++st1) {
 683             st2=table[st1];
 684             if(st2>maxStage1) {
 685                 stage2=table+st2;
 686                 for(st2=0; st2<64; ++st2) {
 687                     if((st3=stage2[st2])!=0) {
 688                         /* read the stage 3 block */
 689                         stage3=results+st3;
 690
 691                         do {
 692                             if(*stage3++>=minValue) {
 693                                 sa->add(sa->set, c);
 694                             }
 695                         } while((++c&0xf)!=0);
 696                     } else {
 697                         c+=16; /* empty stage 3 block */
 698                     }
 699                 }
 700             } else {
 701                 c+=1024; /* empty stage 2 block */
 702             }
 703         }
 704     } else {
 705         const uint32_t *stage2;
 706         const uint8_t *stage3, *bytes;
 707         uint32_t st3Multiplier;
 708         uint32_t value;
 709         UBool useFallback;
 710
 711         bytes=mbcsTable->fromUnicodeBytes;
 712
 713         useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
 714
 715         switch(mbcsTable->outputType) {
 716         case MBCS_OUTPUT_3:
 717         case MBCS_OUTPUT_4_EUC:
 718             st3Multiplier=3;
 719             break;
 720         case MBCS_OUTPUT_4:
 721             st3Multiplier=4;
 722             break;
 723         default:
 724             st3Multiplier=2;
 725             break;
 726         }
 727
 728         for(st1=0; st1<maxStage1; ++st1) {
 729             st2=table[st1];
 730             if(st2>(maxStage1>>1)) {
 731                 stage2=(const uint32_t *)table+st2;
 732                 for(st2=0; st2<64; ++st2) {
 733                     if((st3=stage2[st2])!=0) {
 734                         /* read the stage 3 block */
 735                         stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
 736
 737                         /* get the roundtrip flags for the stage 3 block */
 738                         st3>>=16;
 739
 740                         /*
 741                          * Add code points for which the roundtrip flag is set,
 742                          * or which map to non-zero bytes if we use fallbacks.
 743                          * See ucnv_MBCSFromUnicodeWithOffsets() for details.
 744                          */
 745                         switch(filter) {
 746                         case UCNV_SET_FILTER_NONE:
 747                             do {
 748                                 if(st3&1) {
 749                                     sa->add(sa->set, c);
 750                                     stage3+=st3Multiplier;
 751                                 } else if(useFallback) {
 752                                     uint8_t b=0;
 753                                     switch(st3Multiplier) {
 754                                     case 4:
 755                                         b|=*stage3++;
 756                                     case 3:
 757                                         b|=*stage3++;
 758                                     case 2:
 759                                         b|=stage3[0]|stage3[1];
 760                                         stage3+=2;
 761                                     default:
 762                                         break;
 763                                     }
 764                                     if(b!=0) {
 765                                         sa->add(sa->set, c);
 766                                     }
 767                                 }
 768                                 st3>>=1;
 769                             } while((++c&0xf)!=0);
 770                             break;
 771                         case UCNV_SET_FILTER_DBCS_ONLY:
 772                              /* Ignore single-byte results (<0x100). */
 773                             do {
 774                                 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
 775                                     sa->add(sa->set, c);
 776                                 }
 777                                 st3>>=1;
 778                                 stage3+=2;  /* +=st3Multiplier */
 779                             } while((++c&0xf)!=0);
 780                             break;
 781                         case UCNV_SET_FILTER_2022_CN:
 782                              /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
 783                             do {
 784                                 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
 785                                     sa->add(sa->set, c);
 786                                 }
 787                                 st3>>=1;
 788                                 stage3+=3;  /* +=st3Multiplier */
 789                             } while((++c&0xf)!=0);
 790                             break;
 791                         case UCNV_SET_FILTER_SJIS:
 792                              /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
 793                             do {
 794                                 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
 795                                     sa->add(sa->set, c);
 796                                 }
 797                                 st3>>=1;
 798                                 stage3+=2;  /* +=st3Multiplier */
 799                             } while((++c&0xf)!=0);
 800                             break;
 801                         case UCNV_SET_FILTER_GR94DBCS:
 802                             /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
 803                             do {
 804                                 if( ((st3&1)!=0 || useFallback) &&
 805                                     (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
 806                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
 807                                 ) {
 808                                     sa->add(sa->set, c);
 809                                 }
 810                                 st3>>=1;
 811                                 stage3+=2;  /* +=st3Multiplier */
 812                             } while((++c&0xf)!=0);
 813                             break;
 814                         case UCNV_SET_FILTER_HZ:
 815                             /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
 816                             do {
 817                                 if( ((st3&1)!=0 || useFallback) &&
 818                                     (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
 819                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
 820                                 ) {
 821                                     sa->add(sa->set, c);
 822                                 }
 823                                 st3>>=1;
 824                                 stage3+=2;  /* +=st3Multiplier */
 825                             } while((++c&0xf)!=0);
 826                             break;
 827                         default:
 828                             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 829                             return;
 830                         }
 831                     } else {
 832                         c+=16; /* empty stage 3 block */
 833                     }
 834                 }
 835             } else {
 836                 c+=1024; /* empty stage 2 block */
 837             }
 838         }
 839     }
 840
 841     ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
 842 }
 843
 844 U_CFUNC void
 845 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
 846                                  const USetAdder *sa,
 847                                  UConverterUnicodeSet which,
 848                                  UErrorCode *pErrorCode) {
 849     ucnv_MBCSGetFilteredUnicodeSetForUnicode(
 850         sharedData, sa, which,
 851         sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
 852             UCNV_SET_FILTER_DBCS_ONLY :
 853             UCNV_SET_FILTER_NONE,
 854         pErrorCode);
 855 }
 856
 857 static void
 858 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
 859                    const USetAdder *sa,
 860                    UConverterUnicodeSet which,
 861                    UErrorCode *pErrorCode) {
 862     if(cnv->options&_MBCS_OPTION_GB18030) {
 863         sa->addRange(sa->set, 0, 0xd7ff);
 864         sa->addRange(sa->set, 0xe000, 0x10ffff);
 865     } else {
 866         ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
 867     }
 868 }
 869
 870 /* conversion extensions for input not in the main table -------------------- */
 871
 872 /*
 873  * Hardcoded extension handling for GB 18030.
 * For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file.
 875  *
 876  * In the future, conversion extensions may handle m:n mappings and delta tables,
 877  * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
 878  *
 879  * If an input character cannot be mapped, then these functions set an error
 880  * code. The framework will then call the callback function.
 881  */
 882
 883 /*
 884  * @return if(U_FAILURE) return the code point for cnv->fromUChar32
 885  *         else return 0 after output has been written to the target
 886  */
 887 static UChar32
 888 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
 889           UChar32 cp,
 890           const UChar **source, const UChar *sourceLimit,
 891           uint8_t **target, const uint8_t *targetLimit,
 892           int32_t **offsets, int32_t sourceIndex,
 893           UBool flush,
 894           UErrorCode *pErrorCode) {
 895     const int32_t *cx;
 896
 897     cnv->useSubChar1=FALSE;
 898
 899     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
 900         ucnv_extInitialMatchFromU(
 901             cnv, cx,
 902             cp, source, sourceLimit,
 903             (char **)target, (char *)targetLimit,
 904             offsets, sourceIndex,
 905             flush,
 906             pErrorCode)
 907     ) {
 908         return 0; /* an extension mapping handled the input */
 909     }
 910
 911     /* GB 18030 */
 912     if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
 913         const uint32_t *range;
 914         int32_t i;
 915
 916         range=gb18030Ranges[0];
 917         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
 918             if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
 919                 /* found the Unicode code point, output the four-byte sequence for it */
 920                 uint32_t linear;
 921                 char bytes[4];
 922
 923                 /* get the linear value of the first GB 18030 code in this range */
 924                 linear=range[2]-LINEAR_18030_BASE;
 925
 926                 /* add the offset from the beginning of the range */
 927                 linear+=((uint32_t)cp-range[0]);
 928
 929                 /* turn this into a four-byte sequence */
 930                 bytes[3]=(char)(0x30+linear%10); linear/=10;
 931                 bytes[2]=(char)(0x81+linear%126); linear/=126;
 932                 bytes[1]=(char)(0x30+linear%10); linear/=10;
 933                 bytes[0]=(char)(0x81+linear);
 934
 935                 /* output this sequence */
 936                 ucnv_fromUWriteBytes(cnv,
 937                                      bytes, 4, (char **)target, (char *)targetLimit,
 938                                      offsets, sourceIndex, pErrorCode);
 939                 return 0;
 940             }
 941         }
 942     }
 943
 944     /* no mapping */
 945     *pErrorCode=U_INVALID_CHAR_FOUND;
 946     return cp;
 947 }
 948
 949 /*
 950  * Input sequence: cnv->toUBytes[0..length[
 951  * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
 952  *         else return 0 after output has been written to the target
 953  */
 954 static int8_t
 955 _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
 956         int8_t length,
 957         const uint8_t **source, const uint8_t *sourceLimit,
 958         UChar **target, const UChar *targetLimit,
 959         int32_t **offsets, int32_t sourceIndex,
 960         UBool flush,
 961         UErrorCode *pErrorCode) {
 962     const int32_t *cx;
 963
 964     if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
 965         ucnv_extInitialMatchToU(
 966             cnv, cx,
 967             length, (const char **)source, (const char *)sourceLimit,
 968             target, targetLimit,
 969             offsets, sourceIndex,
 970             flush,
 971             pErrorCode)
 972     ) {
 973         return 0; /* an extension mapping handled the input */
 974     }
 975
 976     /* GB 18030 */
 977     if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
 978         const uint32_t *range;
 979         uint32_t linear;
 980         int32_t i;
 981
 982         linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
 983         range=gb18030Ranges[0];
 984         for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
 985             if(range[2]<=linear && linear<=range[3]) {
 986                 /* found the sequence, output the Unicode code point for it */
 987                 *pErrorCode=U_ZERO_ERROR;
 988
 989                 /* add the linear difference between the input and start sequences to the start code point */
 990                 linear=range[0]+(linear-range[2]);
 991
 992                 /* output this code point */
 993                 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
 994
 995                 return 0;
 996             }
 997         }
 998     }
 999
1000     /* no mapping */
1001     *pErrorCode=U_INVALID_CHAR_FOUND;
1002     return length;
1003 }
1004
1005 /* EBCDIC swap LF<->NL ------------------------------------------------------ */
1006
1007 /*
1008  * This code modifies a standard EBCDIC<->Unicode mapping table for
1009  * OS/390 (z/OS) Unix System Services (Open Edition).
1010  * The difference is in the mapping of Line Feed and New Line control codes:
1011  * Standard EBCDIC maps
1012  *
1013  *   <U000A> \x25 |0
1014  *   <U0085> \x15 |0
1015  *
1016  * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
1017  * mapping
1018  *
1019  *   <U000A> \x15 |0
1020  *   <U0085> \x25 |0
1021  *
1022  * This code modifies a loaded standard EBCDIC<->Unicode mapping table
1023  * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
1026  */
1027
/* standard EBCDIC byte codes for the Line Feed (LF) and New Line (NL) controls */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/*
 * standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables
 * (the byte codes above with 0xf00 added)
 */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* Unicode code points for LF (U+000A) and NL (U+0085) */
#define U_LF 0x0a
#define U_NL 0x85
1039
1040 static UBool
1041 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
1042     UConverterMBCSTable *mbcsTable;
1043
1044     const uint16_t *table, *results;
1045     const uint8_t *bytes;
1046
1047     int32_t (*newStateTable)[256];
1048     uint16_t *newResults;
1049     uint8_t *p;
1050     char *name;
1051
1052     uint32_t stage2Entry;
1053     uint32_t size, sizeofFromUBytes;
1054
1055     mbcsTable=&sharedData->mbcs;
1056
1057     table=mbcsTable->fromUnicodeTable;
1058     bytes=mbcsTable->fromUnicodeBytes;
1059     results=(const uint16_t *)bytes;
1060
1061     /*
1062      * Check that this is an EBCDIC table with SBCS portion -
1063      * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1064      *
1065      * If not, ignore the option. Options are always ignored if they do not apply.
1066      */
1067     if(!(
1068          (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
1069          mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
1070          mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
1071     )) {
1072         return FALSE;
1073     }
1074
1075     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1076         if(!(
1077              EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
1078              EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
1079         )) {
1080             return FALSE;
1081         }
1082     } else /* MBCS_OUTPUT_2_SISO */ {
1083         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1084         if(!(
1085              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
1086              EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
1087         )) {
1088             return FALSE;
1089         }
1090
1091         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1092         if(!(
1093              MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
1094              EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
1095         )) {
1096             return FALSE;
1097         }
1098     }
1099
1100     if(mbcsTable->fromUBytesLength>0) {
1101         /*
1102          * We _know_ the number of bytes in the fromUnicodeBytes array
1103          * starting with header.version 4.1.
1104          */
1105         sizeofFromUBytes=mbcsTable->fromUBytesLength;
1106     } else {
1107         /*
1108          * Otherwise:
1109          * There used to be code to enumerate the fromUnicode
1110          * trie and find the highest entry, but it was removed in ICU 3.2
1111          * because it was not tested and caused a low code coverage number.
1112          * See Jitterbug 3674.
1113          * This affects only some .cnv file formats with a header.version
1114          * below 4.1, and only when swaplfnl is requested.
1115          *
1116          * ucnvmbcs.c revision 1.99 is the last one with the
1117          * ucnv_MBCSSizeofFromUBytes() function.
1118          */
1119         *pErrorCode=U_INVALID_FORMAT_ERROR;
1120         return FALSE;
1121     }
1122
1123     /*
1124      * The table has an appropriate format.
1125      * Allocate and build
1126      * - a modified to-Unicode state table
1127      * - a modified from-Unicode output array
1128      * - a converter name string with the swap option appended
1129      */
1130     size=
1131         mbcsTable->countStates*1024+
1132         sizeofFromUBytes+
1133         UCNV_MAX_CONVERTER_NAME_LENGTH+20;
1134     p=(uint8_t *)uprv_malloc(size);
1135     if(p==NULL) {
1136         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1137         return FALSE;
1138     }
1139
1140     /* copy and modify the to-Unicode state table */
1141     newStateTable=(int32_t (*)[256])p;
1142     uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
1143
1144     newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
1145     newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
1146
1147     /* copy and modify the from-Unicode result table */
1148     newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
1149     uprv_memcpy(newResults, bytes, sizeofFromUBytes);
1150
1151     /* conveniently, the table access macros work on the left side of expressions */
1152     if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1153         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
1154         MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
1155     } else /* MBCS_OUTPUT_2_SISO */ {
1156         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1157         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
1158
1159         stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1160         MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
1161     }
1162
1163     /* set the canonical converter name */
1164     name=(char *)newResults+sizeofFromUBytes;
1165     uprv_strcpy(name, sharedData->staticData->name);
1166     uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
1167
1168     /* set the pointers */
1169     umtx_lock(NULL);
1170     if(mbcsTable->swapLFNLStateTable==NULL) {
1171         mbcsTable->swapLFNLStateTable=newStateTable;
1172         mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
1173         mbcsTable->swapLFNLName=name;
1174
1175         newStateTable=NULL;
1176     }
1177     umtx_unlock(NULL);
1178
1179     /* release the allocated memory if another thread beat us to it */
1180     if(newStateTable!=NULL) {
1181         uprv_free(newStateTable);
1182     }
1183     return TRUE;
1184 }
1185
1186 /* reconstitute omitted fromUnicode data ------------------------------------ */
1187
1188 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
1189 static UBool U_CALLCONV
1190 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
1191     UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
1192     const uint16_t *table;
1193     uint32_t *stage2;
1194     uint8_t *bytes, *p;
1195     UChar32 c;
1196     int32_t i, st3;
1197
1198     table=mbcsTable->fromUnicodeTable;
1199     bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
1200
1201     /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1202     switch(mbcsTable->outputType) {
1203     case MBCS_OUTPUT_3_EUC:
1204         if(value<=0xffff) {
1205             /* short sequences are stored directly */
1206             /* code set 0 or 1 */
1207         } else if(value<=0x8effff) {
1208             /* code set 2 */
1209             value&=0x7fff;
1210         } else /* first byte is 0x8f */ {
1211             /* code set 3 */
1212             value&=0xff7f;
1213         }
1214         break;
1215     case MBCS_OUTPUT_4_EUC:
1216         if(value<=0xffffff) {
1217             /* short sequences are stored directly */
1218             /* code set 0 or 1 */
1219         } else if(value<=0x8effffff) {
1220             /* code set 2 */
1221             value&=0x7fffff;
1222         } else /* first byte is 0x8f */ {
1223             /* code set 3 */
1224             value&=0xff7fff;
1225         }
1226         break;
1227     default:
1228         break;
1229     }
1230
1231     for(i=0; i<=0x1f; ++value, ++i) {
1232         c=codePoints[i];
1233         if(c<0) {
1234             continue;
1235         }
1236
1237         /* locate the stage 2 & 3 data */
1238         stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
1239         p=bytes;
1240         st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
1241
1242         /* write the codepage bytes into stage 3 */
1243         switch(mbcsTable->outputType) {
1244         case MBCS_OUTPUT_3:
1245         case MBCS_OUTPUT_4_EUC:
1246             p+=st3*3;
1247             p[0]=(uint8_t)(value>>16);
1248             p[1]=(uint8_t)(value>>8);
1249             p[2]=(uint8_t)value;
1250             break;
1251         case MBCS_OUTPUT_4:
1252             ((uint32_t *)p)[st3]=value;
1253             break;
1254         default:
1255             /* 2 bytes per character */
1256             ((uint16_t *)p)[st3]=(uint16_t)value;
1257             break;
1258         }
1259
1260         /* set the roundtrip flag */
1261         *stage2|=(1UL<<(16+(c&0xf)));
1262     }
1263     return TRUE;
1264  }
1265
1266 static void
1267 reconstituteData(UConverterMBCSTable *mbcsTable,
1268                  uint32_t stage1Length, uint32_t stage2Length,
1269                  uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
1270                  UErrorCode *pErrorCode) {
1271     uint16_t *stage1;
1272     uint32_t *stage2;
1273     uint8_t *bytes;
1274     uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
1275     mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
1276     if(mbcsTable->reconstitutedData==NULL) {
1277         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1278         return;
1279     }
1280     uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
1281
1282     /* copy existing data and reroute the pointers */
1283     stage1=(uint16_t *)mbcsTable->reconstitutedData;
1284     uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
1285
1286     stage2=(uint32_t *)(stage1+stage1Length);
1287     uprv_memcpy(stage2+(fullStage2Length-stage2Length),
1288                 mbcsTable->fromUnicodeTable+stage1Length,
1289                 stage2Length*4);
1290
1291     mbcsTable->fromUnicodeTable=stage1;
1292     mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
1293
1294     /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1295     stage2=(uint32_t *)stage1;
1296
1297     /* reconstitute the initial part of stage 2 from the mbcsIndex */
1298     {
1299         int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
1300         int32_t stageUTF8Index=0;
1301         int32_t st1, st2, st3, i;
1302
1303         for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
1304             st2=stage1[st1];
1305             if(st2!=stage1Length/2) {
1306                 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1307                 for(i=0; i<16; ++i) {
1308                     st3=mbcsTable->mbcsIndex[stageUTF8Index++];
1309                     if(st3!=0) {
1310                         /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1311                         st3>>=4;
1312                         /*
1313                          * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1314                          * allocated together as a single 64-block for access from the mbcsIndex
1315                          */
1316                         stage2[st2++]=st3++;
1317                         stage2[st2++]=st3++;
1318                         stage2[st2++]=st3++;
1319                         stage2[st2++]=st3;
1320                     } else {
1321                         /* no stage 3 block, skip */
1322                         st2+=4;
1323                     }
1324                 }
1325             } else {
1326                 /* no stage 2 block, skip */
1327                 stageUTF8Index+=16;
1328             }
1329         }
1330     }
1331
1332     /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1333     ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
1334 }
1335
1336 /* MBCS setup functions ----------------------------------------------------- */
1337
1338 static void
1339 ucnv_MBCSLoad(UConverterSharedData *sharedData,
1340           UConverterLoadArgs *pArgs,
1341           const uint8_t *raw,
1342           UErrorCode *pErrorCode) {
1343     UDataInfo info;
1344     UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1345     _MBCSHeader *header=(_MBCSHeader *)raw;
1346     uint32_t offset;
1347     uint32_t headerLength;
1348     UBool noFromU=FALSE;
1349
1350     if(header->version[0]==4) {
1351         headerLength=MBCS_HEADER_V4_LENGTH;
1352     } else if(header->version[0]==5 && header->version[1]>=3 &&
1353               (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
1354         headerLength=header->options&MBCS_OPT_LENGTH_MASK;
1355         noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
1356     } else {
1357         *pErrorCode=U_INVALID_TABLE_FORMAT;
1358         return;
1359     }
1360
1361     mbcsTable->outputType=(uint8_t)header->flags;
1362     if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
1363         *pErrorCode=U_INVALID_TABLE_FORMAT;
1364         return;
1365     }
1366
1367     /* extension data, header version 4.2 and higher */
1368     offset=header->flags>>8;
1369     if(offset!=0) {
1370         mbcsTable->extIndexes=(const int32_t *)(raw+offset);
1371     }
1372
1373     if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
1374         UConverterLoadArgs args={ 0 };
1375         UConverterSharedData *baseSharedData;
1376         const int32_t *extIndexes;
1377         const char *baseName;
1378
1379         /* extension-only file, load the base table and set values appropriately */
1380         if((extIndexes=mbcsTable->extIndexes)==NULL) {
1381             /* extension-only file without extension */
1382             *pErrorCode=U_INVALID_TABLE_FORMAT;
1383             return;
1384         }
1385
1386         if(pArgs->nestedLoads!=1) {
1387             /* an extension table must not be loaded as a base table */
1388             *pErrorCode=U_INVALID_TABLE_FILE;
1389             return;
1390         }
1391
1392         /* load the base table */
1393         baseName=(const char *)header+headerLength*4;
1394         if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
1395             /* forbid loading this same extension-only file */
1396             *pErrorCode=U_INVALID_TABLE_FORMAT;
1397             return;
1398         }
1399
1400         /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1401         args.size=sizeof(UConverterLoadArgs);
1402         args.nestedLoads=2;
1403         args.reserved=pArgs->reserved;
1404         args.options=pArgs->options;
1405         args.pkg=pArgs->pkg;
1406         args.name=baseName;
1407         baseSharedData=ucnv_load(&args, pErrorCode);
1408         if(U_FAILURE(*pErrorCode)) {
1409             return;
1410         }
1411         if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
1412             baseSharedData->mbcs.baseSharedData!=NULL
1413         ) {
1414             ucnv_unload(baseSharedData);
1415             *pErrorCode=U_INVALID_TABLE_FORMAT;
1416             return;
1417         }
1418
1419         /* copy the base table data */
1420         uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
1421
1422         /* overwrite values with relevant ones for the extension converter */
1423         mbcsTable->baseSharedData=baseSharedData;
1424         mbcsTable->extIndexes=extIndexes;
1425
1426         /*
1427          * It would be possible to share the swapLFNL data with a base converter,
1428          * but the generated name would have to be different, and the memory
1429          * would have to be free'd only once.
1430          * It is easier to just create the data for the extension converter
1431          * separately when it is requested.
1432          */
1433         mbcsTable->swapLFNLStateTable=NULL;
1434         mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1435         mbcsTable->swapLFNLName=NULL;
1436
1437         /*
1438          * The reconstitutedData must be deleted only when the base converter
1439          * is unloaded.
1440          */
1441         mbcsTable->reconstitutedData=NULL;
1442
1443         /*
1444          * Set a special, runtime-only outputType if the extension converter
1445          * is a DBCS version of a base converter that also maps single bytes.
1446          */
1447         if( sharedData->staticData->conversionType==UCNV_DBCS ||
1448                 (sharedData->staticData->conversionType==UCNV_MBCS &&
1449                  sharedData->staticData->minBytesPerChar>=2)
1450         ) {
1451             if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1452                 /* the base converter is SI/SO-stateful */
1453                 int32_t entry;
1454
1455                 /* get the dbcs state from the state table entry for SO=0x0e */
1456                 entry=mbcsTable->stateTable[0][0xe];
1457                 if( MBCS_ENTRY_IS_FINAL(entry) &&
1458                     MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1459                     MBCS_ENTRY_FINAL_STATE(entry)!=0
1460                 ) {
1461                     mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1462
1463                     mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1464                 }
1465             } else if(
1466                 baseSharedData->staticData->conversionType==UCNV_MBCS &&
1467                 baseSharedData->staticData->minBytesPerChar==1 &&
1468                 baseSharedData->staticData->maxBytesPerChar==2 &&
1469                 mbcsTable->countStates<=127
1470             ) {
1471                 /* non-stateful base converter, need to modify the state table */
1472                 int32_t (*newStateTable)[256];
1473                 int32_t *state;
1474                 int32_t i, count;
1475
1476                 /* allocate a new state table and copy the base state table contents */
1477                 count=mbcsTable->countStates;
1478                 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1479                 if(newStateTable==NULL) {
1480                     ucnv_unload(baseSharedData);
1481                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1482                     return;
1483                 }
1484
1485                 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1486
1487                 /* change all final single-byte entries to go to a new all-illegal state */
1488                 state=newStateTable[0];
1489                 for(i=0; i<256; ++i) {
1490                     if(MBCS_ENTRY_IS_FINAL(state[i])) {
1491                         state[i]=MBCS_ENTRY_TRANSITION(count, 0);
1492                     }
1493                 }
1494
1495                 /* build the new all-illegal state */
1496                 state=newStateTable[count];
1497                 for(i=0; i<256; ++i) {
1498                     state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
1499                 }
1500                 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1501                 mbcsTable->countStates=(uint8_t)(count+1);
1502                 mbcsTable->stateTableOwned=TRUE;
1503
1504                 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1505             }
1506         }
1507
1508         /*
1509          * unlike below for files with base tables, do not get the unicodeMask
1510          * from the sharedData; instead, use the base table's unicodeMask,
1511          * which we copied in the memcpy above;
1512          * this is necessary because the static data unicodeMask, especially
1513          * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1514          */
1515     } else {
1516         /* conversion file with a base table; an additional extension table is optional */
1517         /* make sure that the output type is known */
1518         switch(mbcsTable->outputType) {
1519         case MBCS_OUTPUT_1:
1520         case MBCS_OUTPUT_2:
1521         case MBCS_OUTPUT_3:
1522         case MBCS_OUTPUT_4:
1523         case MBCS_OUTPUT_3_EUC:
1524         case MBCS_OUTPUT_4_EUC:
1525         case MBCS_OUTPUT_2_SISO:
1526             /* OK */
1527             break;
1528         default:
1529             *pErrorCode=U_INVALID_TABLE_FORMAT;
1530             return;
1531         }
1532
1533         mbcsTable->countStates=(uint8_t)header->countStates;
1534         mbcsTable->countToUFallbacks=header->countToUFallbacks;
1535         mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
1536         mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1537         mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1538
1539         mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1540         mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1541         mbcsTable->fromUBytesLength=header->fromUBytesLength;
1542
1543         /*
1544          * converter versions 6.1 and up contain a unicodeMask that is
1545          * used here to select the most efficient function implementations
1546          */
1547         info.size=sizeof(UDataInfo);
1548         udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1549         if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1550             /* mask off possible future extensions to be safe */
1551             mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1552         } else {
1553             /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1554             mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
1555         }
1556
1557         /*
1558          * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1559          * Check for the header version, SBCS vs. MBCS, and for whether the
1560          * data structures are optimized for code points as high as what the
1561          * runtime code is designed for.
1562          * The implementation does not handle mapping tables with entries for
1563          * unpaired surrogates.
1564          */
1565         if( header->version[1]>=3 &&
1566             (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
1567             (mbcsTable->countStates==1 ?
1568                 (header->version[2]>=(SBCS_FAST_MAX>>8)) :
1569                 (header->version[2]>=(MBCS_FAST_MAX>>8))
1570             )
1571         ) {
1572             mbcsTable->utf8Friendly=TRUE;
1573
1574             if(mbcsTable->countStates==1) {
1575                 /*
1576                  * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1577                  * Build a table with indexes to each block, to be used instead of
1578                  * the regular stage 1/2 table.
1579                  */
1580                 int32_t i;
1581                 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
1582                     mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
1583                 }
1584                 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1585                 mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1586             } else {
1587                 /*
1588                  * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1589                  * The .cnv file is prebuilt with an additional stage table with indexes
1590                  * to each block.
1591                  */
1592                 mbcsTable->mbcsIndex=(const uint16_t *)
1593                     (mbcsTable->fromUnicodeBytes+
1594                      (noFromU ? 0 : mbcsTable->fromUBytesLength));
1595                 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
1596             }
1597         }
1598
1599         /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1600         {
1601             uint32_t asciiRoundtrips=0xffffffff;
1602             int32_t i;
1603
1604             for(i=0; i<0x80; ++i) {
1605                 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
1606                     asciiRoundtrips&=~((uint32_t)1<<(i>>2));
1607                 }
1608             }
1609             mbcsTable->asciiRoundtrips=asciiRoundtrips;
1610         }
1611
1612         if(noFromU) {
1613             uint32_t stage1Length=
1614                 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
1615                     0x440 : 0x40;
1616             uint32_t stage2Length=
1617                 (header->offsetFromUBytes-header->offsetFromUTable)/4-
1618                 stage1Length/2;
1619             reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
1620         }
1621     }
1622
1623     /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1624     if(mbcsTable->utf8Friendly) {
1625         if(mbcsTable->countStates==1) {
1626             sharedData->impl=&_SBCSUTF8Impl;
1627         } else {
1628             if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1629                 sharedData->impl=&_DBCSUTF8Impl;
1630             }
1631         }
1632     }
1633
1634     if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1635         /*
1636          * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1637          * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1638          */
1639         mbcsTable->asciiRoundtrips=0;
1640     }
1641 }
1642
1643 static void
1644 ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1645     /*
1646      * Give back, in this order: the swapLFNL state table, a state table
1647      * that this table owns (stateTableOwned), the reference to the base
1648      * converter, and the reconstituted data.
1649      */
1650     UConverterMBCSTable *table=&sharedData->mbcs;
1651     void *swapTable=table->swapLFNLStateTable;
1652     UConverterSharedData *base=table->baseSharedData;
1653     void *reconstituted=table->reconstitutedData;
1654
1655     if(swapTable!=NULL) { uprv_free(swapTable); }
1656     if(table->stateTableOwned) { uprv_free((void *)table->stateTable); }
1657     if(base!=NULL) { ucnv_unload(base); }
1658     if(reconstituted!=NULL) { uprv_free(reconstituted); }
1659 }
1660
1661 static void
1662 ucnv_MBCSOpen(UConverter *cnv,
1663           const char *name,
1664           const char *locale,
1665           uint32_t options,
1666           UErrorCode *pErrorCode) {
1667     /*
1668      * Per-converter setup on top of the shared MBCS table:
1669      * - removes the swaplfnl option where it does not apply, and otherwise
1670      *   has _EBCDICSwapLFNL() create the shared swapLFNL data if it is not there yet
1671      * - sets the GB 18030 option flag if the name asks for it
1672      * - raises cnv->maxBytesPerUChar for SI/SO output and for extension data
1673      *
1674      * name is searched for "18030"; locale is not used here.
1675      * Returns early, with *pErrorCode as left by _EBCDICSwapLFNL(), if that fails.
1676      */
1677     UConverterSharedData *shared=cnv->sharedData;
1678     UConverterMBCSTable *table=&shared->mbcs;
1679     const uint8_t type=table->outputType;
1680     const int32_t *ext;
1681
1682     if(type==MBCS_OUTPUT_DBCS_ONLY) {
1683         /* the swaplfnl option does not apply to DBCS-only output, take it out */
1684         options&=~UCNV_OPTION_SWAP_LFNL;
1685         cnv->options=options;
1686     }
1687
1688     if((options&UCNV_OPTION_SWAP_LFNL)!=0) {
1689         UBool haveSwapData;
1690
1691         /* look at the cached pointer only while holding the mutex: double-checked locking is broken */
1692         umtx_lock(NULL);
1693         haveSwapData=table->swapLFNLStateTable!=NULL;
1694         umtx_unlock(NULL);
1695
1696         if(!haveSwapData && !_EBCDICSwapLFNL(shared, pErrorCode)) {
1697             if(U_FAILURE(*pErrorCode)) {
1698                 return; /* something went wrong */
1699             }
1700             /* no error, but the option does not apply: take it out */
1701             options&=~UCNV_OPTION_SWAP_LFNL;
1702             cnv->options=options;
1703         }
1704     }
1705
1706     /* GB 18030 mode changes the callback behavior; it is selected by the converter name */
1707     if( uprv_strstr(name, "18030")!=NULL &&
1708         (uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL)
1709     ) {
1710         cnv->options|=_MBCS_OPTION_GB18030;
1711     }
1712
1713     /* fix maxBytesPerUChar depending on outputType and the extension data */
1714     if(type==MBCS_OUTPUT_2_SISO) {
1715         cnv->maxBytesPerUChar=3; /* SO+DBCS */
1716     }
1717     ext=table->extIndexes;
1718     if(ext!=NULL) {
1719         int8_t extMax=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(ext);
1720
1721         if(type==MBCS_OUTPUT_2_SISO) {
1722             ++extMax; /* SO + multiple DBCS */
1723         }
1724         if(cnv->maxBytesPerUChar<extMax) {
1725             cnv->maxBytesPerUChar=extMax;
1726         }
1727     }
1728
1729 #if 0
1730     /* documentation of UConverter fields used for status; all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() */
1731     /* toUnicode */
1732     cnv->toUnicodeStatus=0;     /* offset */
1733     cnv->mode=0;                /* state */
1734     cnv->toULength=0;           /* byteIndex */
1735     /* fromUnicode */
1736     cnv->fromUChar32=0;
1737     cnv->fromUnicodeStatus=1;   /* prevLength */
1738 #endif
1739 }
1740
1741 static const char *
1742 ucnv_MBCSGetName(const UConverter *cnv) {
1743     /* Report the swapLFNL name only while that option is set and such a name exists. */
1744     const UConverterSharedData *shared=cnv->sharedData;
1745     UBool swapped=(UBool)((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && shared->mbcs.swapLFNLName!=NULL);
1746
1747     return swapped ? shared->mbcs.swapLFNLName : shared->staticData->name;
1748 }
1749
1750 /* MBCS-to-Unicode conversion functions ------------------------------------- */
1751
1752 static UChar32
1753 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
1754     /*
1755      * Binary-search toUFallbacks[] for an entry with exactly this offset.
1756      * Returns that entry's code point, or 0xfffe if there is no such entry.
1757      * (The search presumes entries ordered by ascending offset - TODO confirm.)
1758      */
1759     const _MBCSToUFallback *fallbacks=mbcsTable->toUFallbacks;
1760     uint32_t lo=0, hi=mbcsTable->countToUFallbacks, mid;
1761
1762     if(hi==0) {
1763         return 0xfffe;  /* nothing to search */
1764     }
1765
1766     /* shrink the range [lo, hi[ until lo is the only candidate left */
1767     while(lo<hi-1) {
1768         mid=(lo+hi)/2;
1769         if(offset<fallbacks[mid].offset) {
1770             hi=mid;
1771         } else {
1772             lo=mid;
1773         }
1774     }
1775
1776     /* the candidate only counts if its offset matches exactly */
1777     return offset==fallbacks[lo].offset ? fallbacks[lo].codePoint : 0xfffe;
1778 }
1779
1780 /*
      * This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages.
      *
      * Each source byte is looked up once in row 0 of the state table (the swapLFNL
      * table if UCNV_OPTION_SWAP_LFNL is set), and every entry is treated as final:
      * - valid 16-bit results are written as one UChar
      * - valid 20-bit results are written as a surrogate pair; if only the first
      *   surrogate fits, the second one goes into cnv->UCharErrorBuffer and
      *   U_BUFFER_OVERFLOW_ERROR is set
      * - fallback results are written only if UCNV_TO_U_USE_FALLBACK(cnv)
      * - unassigned bytes, and fallbacks that are not used, are handed to _extToU()
      * - illegal bytes set U_ILLEGAL_CHAR_FOUND and end the loop
      *
      * If pArgs->offsets is not NULL then one source index is stored for each
      * UChar that this function writes directly.
      * pArgs->source, pArgs->target and pArgs->offsets are updated before returning.
      */
1781 static void
1782 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1783                                 UErrorCode *pErrorCode) {
1784     UConverter *cnv;
1785     const uint8_t *source, *sourceLimit;
1786     UChar *target;
1787     const UChar *targetLimit;
1788     int32_t *offsets;
1789
1790     const int32_t (*stateTable)[256];
1791
1792     int32_t sourceIndex;
1793
1794     int32_t entry;
1795     UChar c;
1796     uint8_t action;
1797
1798     /* set up the local pointers */
1799     cnv=pArgs->converter;
1800     source=(const uint8_t *)pArgs->source;
1801     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1802     target=pArgs->target;
1803     targetLimit=pArgs->targetLimit;
1804     offsets=pArgs->offsets;
1805
         /* with the swaplfnl option, convert with the alternate state table */
1806     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1807         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1808     } else {
1809         stateTable=cnv->sharedData->mbcs.stateTable;
1810     }
1811
1812     /* the source index of the current byte; it always starts at 0 in this function */
1813     sourceIndex=0;
1814
1815     /* conversion loop */
1816     while(source<sourceLimit) {
1817         /*
1818          * This following test is to see if available input would overflow the output.
1819          * It does not catch output of more than one code unit that
1820          * overflows as a result of a surrogate pair or callback output
1821          * from the last source byte.
1822          * Therefore, those situations also test for overflows and will
1823          * then break the loop, too.
1824          */
1825         if(target>=targetLimit) {
1826             /* target is full */
1827             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1828             break;
1829         }
1830
             /* only row 0 is ever used: one lookup per byte, and source moves past the byte */
1831         entry=stateTable[0][*source++];
1832         /* MBCS_ENTRY_IS_FINAL(entry) */
1833
1834         /* test the most common case first */
1835         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
1836             /* output BMP code point */
1837             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1838             if(offsets!=NULL) {
1839                 *offsets++=sourceIndex;
1840             }
1841
1842             /* normal end of action codes: prepare for a new character */
1843             ++sourceIndex;
1844             continue;
1845         }
1846
1847         /*
1848          * An if-else-if chain provides more reliable performance for
1849          * the most common cases compared to a switch.
1850          */
1851         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1852         if(action==MBCS_STATE_VALID_DIRECT_20 ||
1853            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
1854         ) {
1855             entry=MBCS_ENTRY_FINAL_VALUE(entry);
1856             /* output surrogate pair */
1857             *target++=(UChar)(0xd800|(UChar)(entry>>10));
1858             if(offsets!=NULL) {
1859                 *offsets++=sourceIndex;
1860             }
1861             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
1862             if(target<targetLimit) {
1863                 *target++=c;
1864                 if(offsets!=NULL) {
1865                     *offsets++=sourceIndex;
1866                 }
1867             } else {
1868                 /* target overflow: keep the second surrogate in the converter's overflow buffer */
1869                 cnv->UCharErrorBuffer[0]=c;
1870                 cnv->UCharErrorBufferLength=1;
1871                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1872                 break;
1873             }
1874
1875             ++sourceIndex;
1876             continue;
1877         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
1878             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
1879                 /* output BMP code point */
1880                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1881                 if(offsets!=NULL) {
1882                     *offsets++=sourceIndex;
1883                 }
1884
1885                 ++sourceIndex;
1886                 continue;
1887             }
                 /* a fallback that is not used continues below like an unassigned byte */
1888         } else if(action==MBCS_STATE_UNASSIGNED) {
1889             /* just fall through */
1890         } else if(action==MBCS_STATE_ILLEGAL) {
1891             /* callback(illegal) */
                 /*
                  * NOTE(review): on this path nothing in this function stores the
                  * offending byte in cnv->toUBytes or sets cnv->toULength -
                  * confirm that the caller does not need them for its callback.
                  */
1892             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1893         } else {
1894             /* reserved, must never occur */
                 /* the byte is skipped without any output */
1895             ++sourceIndex;
1896             continue;
1897         }
1898
1899         if(U_FAILURE(*pErrorCode)) {
1900             /* callback(illegal) */
1901             break;
1902         } else /* unassigned byte, or a fallback that is not used */ {
1903             /* try an extension mapping */
                 /* pArgs->source remembers where _extToU() starts so that consumed bytes can be counted below */
1904             pArgs->source=(const char *)source;
1905             cnv->toUBytes[0]=*(source-1);
1906             cnv->toULength=_extToU(cnv, cnv->sharedData,
1907                                     1, &source, sourceLimit,
1908                                     &target, targetLimit,
1909                                     &offsets, sourceIndex,
1910                                     pArgs->flush,
1911                                     pErrorCode);
                 /* one byte was consumed here, plus however far _extToU() moved source */
1912             sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
1913
1914             if(U_FAILURE(*pErrorCode)) {
1915                 /* not mappable or buffer overflow */
1916                 break;
1917             }
1918         }
1919     }
1920
1921     /* write back the updated pointers */
1922     pArgs->source=(const char *)source;
1923     pArgs->target=target;
1924     pArgs->offsets=offsets;
1925 }
1926
1927 /*
1928  * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
1929  * that only map to and from the BMP.
1930  * In addition to single-byte optimizations, the offset calculations
1931  * become much easier.
      *
      * Outline:
      * - targetCapacity counts down the minimum of the remaining source bytes
      *   and the remaining target capacity, so the main loop tests one counter.
      * - If MBCS_UNROLL_SINGLE_TO_BMP is set, groups of 16 bytes are converted
      *   without per-byte tests; a group that fails the validity test is redone
      *   by the per-byte loop.
      * - Offsets are not written per byte. They are filled in for the bytes
      *   between lastSource and source, before an extension lookup and at the end.
      * - Unassigned bytes, and fallbacks that are not used, are handed to _extToU();
      *   illegal bytes set U_ILLEGAL_CHAR_FOUND and end the loop.
      * - pArgs->source, pArgs->target and pArgs->offsets are updated before returning.
1932  */
1933 static void
1934 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
1935                             UErrorCode *pErrorCode) {
1936     UConverter *cnv;
1937     const uint8_t *source, *sourceLimit, *lastSource;
1938     UChar *target;
1939     int32_t targetCapacity, length;
1940     int32_t *offsets;
1941
1942     const int32_t (*stateTable)[256];
1943
1944     int32_t sourceIndex;
1945
1946     int32_t entry;
1947     uint8_t action;
1948
1949     /* set up the local pointers */
1950     cnv=pArgs->converter;
1951     source=(const uint8_t *)pArgs->source;
1952     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1953     target=pArgs->target;
1954     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1955     offsets=pArgs->offsets;
1956
         /* with the swaplfnl option, convert with the alternate state table */
1957     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
1958         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
1959     } else {
1960         stateTable=cnv->sharedData->mbcs.stateTable;
1961     }
1962
1963     /* sourceIndex always starts at 0 here; lastSource marks the first byte without an offset yet */
1964     sourceIndex=0;
1965     lastSource=source;
1966
1967     /*
1968      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
1969      * for the minimum of the sourceLength and targetCapacity
1970      */
1971     length=(int32_t)(sourceLimit-source);
1972     if(length<targetCapacity) {
1973         targetCapacity=length;
1974     }
1975
1976 #if MBCS_UNROLL_SINGLE_TO_BMP
1977     /* unrolling makes it faster on Pentium III/Windows 2000 */
1978     /* unroll the loop with the most common case */
1979 unrolled:
1980     if(targetCapacity>=16) {
1981         int32_t count, loops, oredEntries;
1982
             /* loops: number of complete 16-byte groups that fit; count: groups still to do */
1983         loops=count=targetCapacity>>4;
1984         do {
                 /* write 16 results unconditionally and OR the entries together for one test afterwards */
1985             oredEntries=entry=stateTable[0][*source++];
1986             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1987             oredEntries|=entry=stateTable[0][*source++];
1988             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1989             oredEntries|=entry=stateTable[0][*source++];
1990             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1991             oredEntries|=entry=stateTable[0][*source++];
1992             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1993             oredEntries|=entry=stateTable[0][*source++];
1994             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1995             oredEntries|=entry=stateTable[0][*source++];
1996             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1997             oredEntries|=entry=stateTable[0][*source++];
1998             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
1999             oredEntries|=entry=stateTable[0][*source++];
2000             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2001             oredEntries|=entry=stateTable[0][*source++];
2002             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2003             oredEntries|=entry=stateTable[0][*source++];
2004             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2005             oredEntries|=entry=stateTable[0][*source++];
2006             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2007             oredEntries|=entry=stateTable[0][*source++];
2008             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2009             oredEntries|=entry=stateTable[0][*source++];
2010             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2011             oredEntries|=entry=stateTable[0][*source++];
2012             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2013             oredEntries|=entry=stateTable[0][*source++];
2014             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2015             oredEntries|=entry=stateTable[0][*source++];
2016             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2017
2018             /* were all 16 entries really valid? */
2019             if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
2020                 /* no, return to the first of these 16 */
                     /* the 16 UChars already written are redone by the per-byte loop below */
2021                 source-=16;
2022                 target-=16;
2023                 break;
2024             }
2025         } while(--count>0);
             /* count becomes the number of groups that were converted completely */
2026         count=loops-count;
2027         targetCapacity-=16*count;
2028
2029         if(offsets!=NULL) {
                 /* one offset per byte of each completed group; lastSource moves past those bytes */
2030             lastSource+=16*count;
2031             while(count>0) {
2032                 *offsets++=sourceIndex++;
2033                 *offsets++=sourceIndex++;
2034                 *offsets++=sourceIndex++;
2035                 *offsets++=sourceIndex++;
2036                 *offsets++=sourceIndex++;
2037                 *offsets++=sourceIndex++;
2038                 *offsets++=sourceIndex++;
2039                 *offsets++=sourceIndex++;
2040                 *offsets++=sourceIndex++;
2041                 *offsets++=sourceIndex++;
2042                 *offsets++=sourceIndex++;
2043                 *offsets++=sourceIndex++;
2044                 *offsets++=sourceIndex++;
2045                 *offsets++=sourceIndex++;
2046                 *offsets++=sourceIndex++;
2047                 *offsets++=sourceIndex++;
2048                 --count;
2049             }
2050         }
2051     }
2052 #endif
2053
2054     /* conversion loop */
2055     while(targetCapacity>0) {
2056         entry=stateTable[0][*source++];
2057         /* MBCS_ENTRY_IS_FINAL(entry) */
2058
2059         /* test the most common case first */
2060         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2061             /* output BMP code point */
2062             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2063             --targetCapacity;
2064             continue;
2065         }
2066
2067         /*
2068          * An if-else-if chain provides more reliable performance for
2069          * the most common cases compared to a switch.
2070          */
2071         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2072         if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2073             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2074                 /* output BMP code point */
2075                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2076                 --targetCapacity;
2077                 continue;
2078             }
                 /* a fallback that is not used continues below like an unassigned byte */
2079         } else if(action==MBCS_STATE_UNASSIGNED) {
2080             /* just fall through */
2081         } else if(action==MBCS_STATE_ILLEGAL) {
2082             /* callback(illegal) */
                 /*
                  * NOTE(review): on this path nothing in this function stores the
                  * offending byte in cnv->toUBytes or sets cnv->toULength -
                  * confirm that the caller does not need them for its callback.
                  */
2083             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2084         } else {
2085             /* reserved, must never occur */
                 /*
                  * NOTE(review): source has been advanced but targetCapacity is not
                  * decremented here. If targetCapacity was limited by the source
                  * length, the loop could then read past sourceLimit - confirm
                  * that table data can never contain such an entry.
                  */
2086             continue;
2087         }
2088
2089         /* set offsets since the start or the last extension */
2090         if(offsets!=NULL) {
2091             int32_t count=(int32_t)(source-lastSource);
2092
2093             /* predecrement: do not set the offset for the callback-causing character */
2094             while(--count>0) {
2095                 *offsets++=sourceIndex++;
2096             }
2097             /* offset and sourceIndex are now set for the current character */
2098         }
2099
2100         if(U_FAILURE(*pErrorCode)) {
2101             /* callback(illegal) */
2102             break;
2103         } else /* unassigned byte, or a fallback that is not used */ {
2104             /* try an extension mapping */
                 /* lastSource remembers where _extToU() starts so that consumed bytes can be counted below */
2105             lastSource=source;
2106             cnv->toUBytes[0]=*(source-1);
2107             cnv->toULength=_extToU(cnv, cnv->sharedData,
2108                                     1, &source, sourceLimit,
2109                                     &target, pArgs->targetLimit,
2110                                     &offsets, sourceIndex,
2111                                     pArgs->flush,
2112                                     pErrorCode);
                 /* one byte was consumed here, plus however far _extToU() moved source */
2113             sourceIndex+=1+(int32_t)(source-lastSource);
2114
2115             if(U_FAILURE(*pErrorCode)) {
2116                 /* not mappable or buffer overflow */
2117                 break;
2118             }
2119
2120             /* recalculate the targetCapacity after an extension mapping */
2121             targetCapacity=(int32_t)(pArgs->targetLimit-target);
2122             length=(int32_t)(sourceLimit-source);
2123             if(length<targetCapacity) {
2124                 targetCapacity=length;
2125             }
2126         }
2127
2128 #if MBCS_UNROLL_SINGLE_TO_BMP
2129         /* unrolling makes it faster on Pentium III/Windows 2000 */
             /* go back to the 16-at-a-time section with the recalculated targetCapacity */
2130         goto unrolled;
2131 #endif
2132     }
2133
         /* the combined counter cannot tell which side ran out: report overflow only if input remains and the target is full */
2134     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
2135         /* target is full */
2136         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2137     }
2138
2139     /* set offsets since the start or the last callback */
2140     if(offsets!=NULL) {
2141         size_t count=source-lastSource;
2142         while(count>0) {
2143             *offsets++=sourceIndex++;
2144             --count;
2145         }
2146     }
2147
2148     /* write back the updated pointers */
2149     pArgs->source=(const char *)source;
2150     pArgs->target=target;
2151     pArgs->offsets=offsets;
2152 }
2153
2154 static UBool
2155 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
2156     /*
2157      * Does any byte sequence that continues in this state end in a final entry
2158      * whose action is not MBCS_STATE_ILLEGAL?
2159      * Final entries of this state are tested before following any transitions.
2160      */
2161     static const uint8_t likelyBytes[2]={ 0xa1, 0x41 };
2162     const int32_t *entries=stateTable[state];
2163     int32_t i, e;
2164
2165     /* Shortcut: look first at two byte values that are commonly valid. */
2166     for(i=0; i<2; ++i) {
2167         e=entries[likelyBytes[i]];
2168         if(!MBCS_ENTRY_IS_TRANSITION(e) && MBCS_ENTRY_FINAL_ACTION(e)!=MBCS_STATE_ILLEGAL) {
2169             return TRUE;
2170         }
2171     }
2172     /* Any final entry in this state that is not illegal? */
2173     for(i=0; i<0x100; ++i) {
2174         e=entries[i];
2175         if(!MBCS_ENTRY_IS_TRANSITION(e) && MBCS_ENTRY_FINAL_ACTION(e)!=MBCS_STATE_ILLEGAL) {
2176             return TRUE;
2177         }
2178     }
2179     /* Otherwise follow each transition entry into its target state. */
2180     for(i=0; i<0x100; ++i) {
2181         e=entries[i];
2182         if(!MBCS_ENTRY_IS_TRANSITION(e)) {
2183             continue;
2184         }
2185         if(hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(e))) {
2186             return TRUE;
2187         }
2188     }
2189     return FALSE;
2190 }
2191
2192 /*
2193  * Does byte b start anything usable in this state?
2194  * A final entry counts unless its action is illegal (or is a state change in a
2195  * DBCS-only converter); a transition entry counts only if hasValidTrailBytes().
2196  */
2197 static UBool
2198 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
2199     int32_t e=stateTable[state][b];
2200     uint8_t finalAction;
2201
2202     if(MBCS_ENTRY_IS_TRANSITION(e)) {
2203         /* lead byte: look at the state that it leads to */
2204         return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(e));
2205     }
2206     finalAction=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(e));
2207     if(isDBCSOnly && finalAction==MBCS_STATE_CHANGE_ONLY) {
2208         return FALSE;   /* SI/SO are illegal for DBCS-only conversion */
2209     }
2210     return finalAction!=MBCS_STATE_ILLEGAL;
2211 }
2212
2213 U_CFUNC void
2214 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2215                           UErrorCode *pErrorCode) {
         /*
          * Generic MBCS-to-Unicode conversion loop, with optional offsets.
          * Reads bytes from pArgs->source, walks the converter's state table and
          * writes UChars to pArgs->target; if pArgs->offsets is not NULL, a source
          * index is written for every UChar stored in the target.
          * Single-state codepages (countStates==1) are handed to
          * ucnv_MBCSSingleToBMPWithOffsets() or ucnv_MBCSSingleToUnicodeWithOffsets().
          *
          * Error codes set here: U_BUFFER_OVERFLOW_ERROR when the target is full,
          * U_ILLEGAL_CHAR_FOUND for illegal sequences (the bytes are left in
          * cnv->toUBytes with their count in cnv->toULength).
          * Sequences without a mapping are passed on to _extToU().
          * On exit, the lookup offset, the state and the pending byte count are
          * stored back into the converter, and the advanced pointers into pArgs.
          */
2216     UConverter *cnv;
2217     const uint8_t *source, *sourceLimit;
2218     UChar *target;
2219     const UChar *targetLimit;
2220     int32_t *offsets;
2221
2222     const int32_t (*stateTable)[256];
2223     const uint16_t *unicodeCodeUnits;
2224
2225     uint32_t offset;
2226     uint8_t state;
2227     int8_t byteIndex;
2228     uint8_t *bytes;
2229
2230     int32_t sourceIndex, nextSourceIndex;
2231
2232     int32_t entry;
2233     UChar c;
2234     uint8_t action;
2235
2236     /* use optimized function if possible */
2237     cnv=pArgs->converter;
2238
2239     if(cnv->preToULength>0) {
2240         /*
2241          * pass sourceIndex=-1 because we continue from an earlier buffer
2242          * in the future, this may change with continuous offsets
2243          */
2244         ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
2245
             /*
              * NOTE(review): presumably a negative preToULength means that bytes are
              * still queued for replay (compare the back-out code near the end of
              * this function) - confirm against ucnv_extContinueMatchToU()
              */
2246         if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
2247             return;
2248         }
2249     }
2250
2251     if(cnv->sharedData->mbcs.countStates==1) {
2252         if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2253             ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
2254         } else {
2255             ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
2256         }
2257         return;
2258     }
2259
2260     /* set up the local pointers */
2261     source=(const uint8_t *)pArgs->source;
2262     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2263     target=pArgs->target;
2264     targetLimit=pArgs->targetLimit;
2265     offsets=pArgs->offsets;
2266
2267     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2268         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2269     } else {
2270         stateTable=cnv->sharedData->mbcs.stateTable;
2271     }
2272     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2273
2274     /* get the converter state from UConverter */
2275     offset=cnv->toUnicodeStatus;
2276     byteIndex=cnv->toULength;
2277     bytes=cnv->toUBytes;
2278
2279     /*
2280      * if we are in the SBCS state for a DBCS-only converter,
2281      * then load the DBCS state from the MBCS data
2282      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2283      */
2284     if((state=(uint8_t)(cnv->mode))==0) {
2285         state=cnv->sharedData->mbcs.dbcsOnlyState;
2286     }
2287
2288     /* sourceIndex=-1 if the current character began in the previous buffer */
2289     sourceIndex=byteIndex==0 ? 0 : -1;
2290     nextSourceIndex=0;
2291
2292     /* conversion loop */
2293     while(source<sourceLimit) {
2294         /*
2295          * This following test is to see if available input would overflow the output.
2296          * It does not catch output of more than one code unit that
2297          * overflows as a result of a surrogate pair or callback output
2298          * from the last source byte.
2299          * Therefore, those situations also test for overflows and will
2300          * then break the loop, too.
2301          */
2302         if(target>=targetLimit) {
2303             /* target is full */
2304             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2305             break;
2306         }
2307
2308         if(byteIndex==0) {
2309             /* optimized loop for 1/2-byte input and BMP output */
2310             if(offsets==NULL) {
2311                 do {
2312                     entry=stateTable[state][*source];
2313                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2314                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2315                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2316
2317                         ++source;
2318                         if( source<sourceLimit &&
2319                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2320                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2321                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2322                         ) {
2323                             ++source;
2324                             *target++=c;
2325                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2326                             offset=0;
2327                         } else {
2328                             /* set the state and leave the optimized loop */
2329                             bytes[0]=*(source-1);
2330                             byteIndex=1;
2331                             break;
2332                         }
2333                     } else {
2334                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2335                             /* output BMP code point */
2336                             ++source;
2337                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2338                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2339                         } else {
2340                             /* leave the optimized loop */
2341                             break;
2342                         }
2343                     }
2344                 } while(source<sourceLimit && target<targetLimit);
2345             } else /* offsets!=NULL */ {
2346                 do {
2347                     entry=stateTable[state][*source];
2348                     if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2349                         state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2350                         offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2351
2352                         ++source;
2353                         if( source<sourceLimit &&
2354                             MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2355                             MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2356                             (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2357                         ) {
2358                             ++source;
2359                             *target++=c;
2360                             if(offsets!=NULL) {
2361                                 *offsets++=sourceIndex;
2362                                 sourceIndex=(nextSourceIndex+=2);
2363                             }
2364                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2365                             offset=0;
2366                         } else {
2367                             /* set the state and leave the optimized loop */
2368                             ++nextSourceIndex;
2369                             bytes[0]=*(source-1);
2370                             byteIndex=1;
2371                             break;
2372                         }
2373                     } else {
2374                         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2375                             /* output BMP code point */
2376                             ++source;
2377                             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2378                             if(offsets!=NULL) {
2379                                 *offsets++=sourceIndex;
2380                                 sourceIndex=++nextSourceIndex;
2381                             }
2382                             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2383                         } else {
2384                             /* leave the optimized loop */
2385                             break;
2386                         }
2387                     }
2388                 } while(source<sourceLimit && target<targetLimit);
2389             }
2390
2391             /*
2392              * these tests and break statements could be put inside the loop
2393              * if C had "break outerLoop" like Java
2394              */
2395             if(source>=sourceLimit) {
2396                 break;
2397             }
2398             if(target>=targetLimit) {
2399                 /* target is full */
2400                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2401                 break;
2402             }
2403
                 /*
                  * entry was already fetched for *source by the optimized loop above;
                  * consume that byte here and handle the entry in the general code below
                  */
2404             ++nextSourceIndex;
2405             bytes[byteIndex++]=*source++;
2406         } else /* byteIndex>0 */ {
2407             ++nextSourceIndex;
2408             entry=stateTable[state][bytes[byteIndex++]=*source++];
2409         }
2410
2411         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2412             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2413             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2414             continue;
2415         }
2416
2417         /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2418         cnv->mode=state;
2419
2420         /* set the next state early so that we can reuse the entry variable */
2421         state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2422
2423         /*
2424          * An if-else-if chain provides more reliable performance for
2425          * the most common cases compared to a switch.
2426          */
2427         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2428         if(action==MBCS_STATE_VALID_16) {
2429             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2430             c=unicodeCodeUnits[offset];
2431             if(c<0xfffe) {
2432                 /* output BMP code point */
2433                 *target++=c;
2434                 if(offsets!=NULL) {
2435                     *offsets++=sourceIndex;
2436                 }
2437                 byteIndex=0;
2438             } else if(c==0xfffe) {
2439                 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2440                     /* output fallback BMP code point */
2441                     *target++=(UChar)entry;
2442                     if(offsets!=NULL) {
2443                         *offsets++=sourceIndex;
2444                     }
2445                     byteIndex=0;
2446                 }
                     /* else: byteIndex stays >0 without an error, handled as unassigned below */
2447             } else {
2448                 /* callback(illegal) */
2449                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2450             }
2451         } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2452             /* output BMP code point */
2453             *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2454             if(offsets!=NULL) {
2455                 *offsets++=sourceIndex;
2456             }
2457             byteIndex=0;
2458         } else if(action==MBCS_STATE_VALID_16_PAIR) {
2459             offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2460             c=unicodeCodeUnits[offset++];
2461             if(c<0xd800) {
2462                 /* output BMP code point below 0xd800 */
2463                 *target++=c;
2464                 if(offsets!=NULL) {
2465                     *offsets++=sourceIndex;
2466                 }
2467                 byteIndex=0;
2468             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2469                 /* output roundtrip or fallback surrogate pair */
                     /*
                      * c&0xdbff clears bit 0x400: a value in 0xdc00..0xdfff (accepted only
                      * when fallbacks are used) becomes the matching lead surrogate
                      */
2470                 *target++=(UChar)(c&0xdbff);
2471                 if(offsets!=NULL) {
2472                     *offsets++=sourceIndex;
2473                 }
2474                 byteIndex=0;
2475                 if(target<targetLimit) {
2476                     *target++=unicodeCodeUnits[offset];
2477                     if(offsets!=NULL) {
2478                         *offsets++=sourceIndex;
2479                     }
2480                 } else {
2481                     /* target overflow */
2482                     cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
2483                     cnv->UCharErrorBufferLength=1;
2484                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2485
2486                     offset=0;
2487                     break;
2488                 }
2489             } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2490                 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2491                 *target++=unicodeCodeUnits[offset];
2492                 if(offsets!=NULL) {
2493                     *offsets++=sourceIndex;
2494                 }
2495                 byteIndex=0;
2496             } else if(c==0xffff) {
2497                 /* callback(illegal) */
2498                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2499             }
                 /* any other value: byteIndex stays >0 without an error, handled as unassigned below */
2500         } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2501                   (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2502         ) {
2503             entry=MBCS_ENTRY_FINAL_VALUE(entry);
2504             /* output surrogate pair */
2505             *target++=(UChar)(0xd800|(UChar)(entry>>10));
2506             if(offsets!=NULL) {
2507                 *offsets++=sourceIndex;
2508             }
2509             byteIndex=0;
2510             c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2511             if(target<targetLimit) {
2512                 *target++=c;
2513                 if(offsets!=NULL) {
2514                     *offsets++=sourceIndex;
2515                 }
2516             } else {
2517                 /* target overflow */
2518                 cnv->UCharErrorBuffer[0]=c;
2519                 cnv->UCharErrorBufferLength=1;
2520                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2521
2522                 offset=0;
2523                 break;
2524             }
2525         } else if(action==MBCS_STATE_CHANGE_ONLY) {
2526             /*
2527              * This serves as a state change without any output.
2528              * It is useful for reading simple stateful encodings,
2529              * for example using just Shift-In/Shift-Out codes.
2530              * The 21 unused bits may later be used for more sophisticated
2531              * state transitions.
2532              */
2533             if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
2534                 byteIndex=0;
2535             } else {
2536                 /* SI/SO are illegal for DBCS-only conversion */
2537                 state=(uint8_t)(cnv->mode); /* restore the previous state */
2538
2539                 /* callback(illegal) */
2540                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2541             }
2542         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2543             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2544                 /* output BMP code point */
2545                 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2546                 if(offsets!=NULL) {
2547                     *offsets++=sourceIndex;
2548                 }
2549                 byteIndex=0;
2550             }
                 /* without fallbacks byteIndex stays >0: handled as unassigned below */
2551         } else if(action==MBCS_STATE_UNASSIGNED) {
2552             /* just fall through */
2553         } else if(action==MBCS_STATE_ILLEGAL) {
2554             /* callback(illegal) */
2555             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2556         } else {
2557             /* reserved, must never occur */
                 /* (MBCS_STATE_FALLBACK_DIRECT_20 also lands here when fallbacks are not used) */
2558             byteIndex=0;
2559         }
2560
2561         /* end of action codes: prepare for a new character */
2562         offset=0;
2563
2564         if(byteIndex==0) {
2565             sourceIndex=nextSourceIndex;
2566         } else if(U_FAILURE(*pErrorCode)) {
2567             /* callback(illegal) */
2568             if(byteIndex>1) {
2569                 /*
2570                  * Ticket 5691: consistent illegal sequences:
2571                  * - We include at least the first byte in the illegal sequence.
2572                  * - If any of the non-initial bytes could be the start of a character,
2573                  *   we stop the illegal sequence before the first one of those.
2574                  */
2575                 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2576                 int8_t i;
2577                 for(i=1;
2578                     i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
2579                     ++i) {}
2580                 if(i<byteIndex) {
2581                     /* Back out some bytes. */
2582                     int8_t backOutDistance=byteIndex-i;
2583                     int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
2584                     byteIndex=i;  /* length of reported illegal byte sequence */
2585                     if(backOutDistance<=bytesFromThisBuffer) {
2586                         source-=backOutDistance;
2587                     } else {
2588                         /* Back out bytes from the previous buffer: Need to replay them. */
2589                         cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
2590                         /* preToULength is negative! */
2591                         uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
2592                         source=(const uint8_t *)pArgs->source;
2593                     }
2594                 }
2595             }
2596             break;
2597         } else /* unassigned sequences indicated with byteIndex>0 */ {
2598             /* try an extension mapping */
2599             pArgs->source=(const char *)source;
2600             byteIndex=_extToU(cnv, cnv->sharedData,
2601                               byteIndex, &source, sourceLimit,
2602                               &target, targetLimit,
2603                               &offsets, sourceIndex,
2604                               pArgs->flush,
2605                               pErrorCode);
                 /*
                  * pArgs->source still holds the position from before _extToU();
                  * move both indexes by the distance that source has changed since
                  */
2606             sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
2607
2608             if(U_FAILURE(*pErrorCode)) {
2609                 /* not mappable or buffer overflow */
2610                 break;
2611             }
2612         }
2613     }
2614
2615     /* set the converter state back into UConverter */
2616     cnv->toUnicodeStatus=offset;
2617     cnv->mode=state;
2618     cnv->toULength=byteIndex;
2619
2620     /* write back the updated pointers */
2621     pArgs->source=(const char *)source;
2622     pArgs->target=target;
2623     pArgs->offsets=offsets;
2624 }
2625
2626 /*
2627  * Single-byte, single-state version of ucnv_MBCSGetNextUChar(); the loop only
2628  * exists to skip reserved action codes.
2629  * Returns the code point for an assigned byte. Unassigned bytes are handed back
2630  * to the generic code via UCNV_GET_NEXT_UCHAR_USE_TO_U. An illegal byte is stored
2631  * in cnv->toUBytes and reported as 0xffff with U_ILLEGAL_CHAR_FOUND; input that
2632  * yields no output gives 0xffff with U_INDEX_OUTOFBOUNDS_ERROR.
2633  */
2634 static UChar32
2635 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
2636                         UErrorCode *pErrorCode) {
2637     UConverter *cnv=pArgs->converter;
2638     const uint8_t *source=(const uint8_t *)pArgs->source;
2639     const uint8_t *sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2640     const int32_t (*stateTable)[256];
2641     int32_t entry;
2642     uint8_t action;
2643
2644     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2645         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2646     } else {
2647         stateTable=cnv->sharedData->mbcs.stateTable;
2648     }
2649
2650     /* conversion loop */
2651     while(source<sourceLimit) {
2652         entry=stateTable[0][*source++];   /* MBCS_ENTRY_IS_FINAL(entry) */
2653         /* write back the updated pointer early so that we can return directly */
2654         pArgs->source=(const char *)source;
2655
2656         if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2657             /* output BMP code point */
2658             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2659         }
2660
2661         /*
2662          * An if-else-if chain provides more reliable performance for
2663          * the most common cases compared to a switch.
2664          */
2665         action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2666         if( action==MBCS_STATE_VALID_DIRECT_20 ||
2667             (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2668         ) {
2669             /* output supplementary code point */
2670             return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2671         } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2672             if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2673                 /* output BMP code point */
2674                 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2675             }
2676         } else if(action==MBCS_STATE_UNASSIGNED) {
2677             /* just fall through */
2678         } else if(action==MBCS_STATE_ILLEGAL) {
2679             /* callback(illegal): hand the byte over like ucnv_MBCSGetNextUChar() does */
2680             cnv->toUBytes[0]=*(source-1);
2681             cnv->toULength=1;
2682             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2683             return 0xffff;
2684         } else {
2685             /* reserved, must never occur */
2686             continue;
2687         }
2688
2689         if(U_FAILURE(*pErrorCode)) {
2690             /* an error code was already set on entry: stop here */
2691             break;
2692         }
2693         /* unassigned sequence: defer to the generic implementation */
2694         pArgs->source=(const char *)source-1;
2695         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2696     }
2697
2698     /* no output: empty input, only skipped bytes, or an error set on entry */
2699     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2700     return 0xffff;
2701 }
2702
2703 /*
2704  * Version of ucnv_MBCSToUnicodeWithOffsets() optimized for single-character
2705  * conversion without offset handling.
2706  *
2707  * When a character does not have a mapping to Unicode, then we return to the
2708  * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2709  * handling.
2710  * We also defer to the generic code in other complicated cases and have them
2711  * ultimately handled by ucnv_MBCSToUnicodeWithOffsets() itself.
2712  *
2713  * All normal mappings and errors are handled here.
      *
      * Returns the code point, or UCNV_GET_NEXT_UCHAR_USE_TO_U to defer to the
      * generic code, or 0xffff together with an error code:
      * U_TRUNCATED_CHAR_FOUND (input ends inside a character),
      * U_ILLEGAL_CHAR_FOUND, or U_INDEX_OUTOFBOUNDS_ERROR (no output at all).
      * For truncated and illegal input the bytes are copied into cnv->toUBytes
      * and counted in cnv->toULength.
2714  */
2715 static UChar32
2716 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2717                   UErrorCode *pErrorCode) {
2718     UConverter *cnv;
2719     const uint8_t *source, *sourceLimit, *lastSource;
2720
2721     const int32_t (*stateTable)[256];
2722     const uint16_t *unicodeCodeUnits;
2723
2724     uint32_t offset;
2725     uint8_t state;
2726
2727     int32_t entry;
2728     UChar32 c;
2729     uint8_t action;
2730
2731     /* use optimized function if possible */
2732     cnv=pArgs->converter;
2733
2734     if(cnv->preToULength>0) {
2735         /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2736         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2737     }
2738
2739     if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2740         /*
2741          * Using the generic ucnv_getNextUChar() code lets us deal correctly
2742          * with the rare case of a codepage that maps single surrogates
2743          * without adding the complexity to this already complicated function here.
2744          */
2745         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2746     } else if(cnv->sharedData->mbcs.countStates==1) {
2747         return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
2748     }
2749
2750     /* set up the local pointers */
         /* lastSource marks where the current character starts */
2751     source=lastSource=(const uint8_t *)pArgs->source;
2752     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2753
2754     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2755         stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2756     } else {
2757         stateTable=cnv->sharedData->mbcs.stateTable;
2758     }
2759     unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
2760
2761     /* get the converter state from UConverter */
2762     offset=cnv->toUnicodeStatus;
2763
2764     /*
2765      * if we are in the SBCS state for a DBCS-only converter,
2766      * then load the DBCS state from the MBCS data
2767      * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2768      */
2769     if((state=(uint8_t)(cnv->mode))==0) {
2770         state=cnv->sharedData->mbcs.dbcsOnlyState;
2771     }
2772
2773     /* conversion loop */
2774     c=U_SENTINEL;
2775     while(source<sourceLimit) {
2776         entry=stateTable[state][*source++];
2777         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2778             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2779             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2780
2781             /* optimization for 1/2-byte input and BMP output */
2782             if( source<sourceLimit &&
2783                 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2784                 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2785                 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2786             ) {
2787                 ++source;
2788                 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2789                 /* output BMP code point */
2790                 break;
2791             }
                 /* otherwise nothing more was consumed; any peeked trail byte is handled by the next loop iteration */
2792         } else {
2793             /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2794             cnv->mode=state;
2795
2796             /* set the next state early so that we can reuse the entry variable */
2797             state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2798
2799             /*
2800              * An if-else-if chain provides more reliable performance for
2801              * the most common cases compared to a switch.
2802              */
2803             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2804             if(action==MBCS_STATE_VALID_DIRECT_16) {
2805                 /* output BMP code point */
2806                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2807                 break;
2808             } else if(action==MBCS_STATE_VALID_16) {
2809                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2810                 c=unicodeCodeUnits[offset];
2811                 if(c<0xfffe) {
2812                     /* output BMP code point */
2813                     break;
2814                 } else if(c==0xfffe) {
2815                     if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2816                         break;
2817                     }
2818                 } else {
2819                     /* callback(illegal) */
                         /*
                          * NOTE(review): c is 0xffff here, not negative, so the if(c<0)
                          * bookkeeping after the loop (toUBytes/toULength) is skipped
                          * for this case - confirm that this is intended
                          */
2820                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2821                 }
2822             } else if(action==MBCS_STATE_VALID_16_PAIR) {
2823                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2824                 c=unicodeCodeUnits[offset++];
2825                 if(c<0xd800) {
2826                     /* output BMP code point below 0xd800 */
2827                     break;
2828                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2829                     /* output roundtrip or fallback supplementary code point */
2830                     c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
2831                     break;
2832                 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2833                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2834                     c=unicodeCodeUnits[offset];
2835                     break;
2836                 } else if(c==0xffff) {
2837                     /* callback(illegal) */
                         /* NOTE(review): c is not negative here either, see MBCS_STATE_VALID_16 above */
2838                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2839                 }
2840             } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2841                       (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2842             ) {
2843                 /* output supplementary code point */
2844                 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2845                 break;
2846             } else if(action==MBCS_STATE_CHANGE_ONLY) {
2847                 /*
2848                  * This serves as a state change without any output.
2849                  * It is useful for reading simple stateful encodings,
2850                  * for example using just Shift-In/Shift-Out codes.
2851                  * The 21 unused bits may later be used for more sophisticated
2852                  * state transitions.
2853                  */
2854                 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
2855                     /* SI/SO are illegal for DBCS-only conversion */
2856                     state=(uint8_t)(cnv->mode); /* restore the previous state */
2857
2858                     /* callback(illegal) */
2859                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2860                 }
                     /* for other converters no error is set, so the code after this chain defers to the generic implementation */
2861             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2862                 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2863                     /* output BMP code point */
2864                     c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2865                     break;
2866                 }
2867             } else if(action==MBCS_STATE_UNASSIGNED) {
2868                 /* just fall through */
2869             } else if(action==MBCS_STATE_ILLEGAL) {
2870                 /* callback(illegal) */
2871                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2872             } else {
2873                 /* reserved (must never occur), or only state change */
2874                 offset=0;
2875                 lastSource=source;
2876                 continue;
2877             }
2878
2879             /* end of action codes: prepare for a new character */
2880             offset=0;
2881
2882             if(U_FAILURE(*pErrorCode)) {
2883                 /* callback(illegal) */
2884                 break;
2885             } else /* unassigned sequence */ {
2886                 /* defer to the generic implementation */
                     /* rewind to the start of the character so that the generic code sees all of its bytes */
2887                 cnv->toUnicodeStatus=0;
2888                 cnv->mode=state;
2889                 pArgs->source=(const char *)lastSource;
2890                 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2891             }
2892         }
2893     }
2894
2895     if(c<0) {
2896         if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
2897             /* incomplete character byte sequence */
2898             uint8_t *bytes=cnv->toUBytes;
2899             cnv->toULength=(int8_t)(source-lastSource);
2900             do {
2901                 *bytes++=*lastSource++;
2902             } while(lastSource<source);
2903             *pErrorCode=U_TRUNCATED_CHAR_FOUND;
2904         } else if(U_FAILURE(*pErrorCode)) {
2905             /* callback(illegal) */
2906             /*
2907              * Ticket 5691: consistent illegal sequences:
2908              * - We include at least the first byte in the illegal sequence.
2909              * - If any of the non-initial bytes could be the start of a character,
2910              *   we stop the illegal sequence before the first one of those.
2911              */
2912             UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2913             uint8_t *bytes=cnv->toUBytes;
2914             *bytes++=*lastSource++;     /* first byte */
2915             if(lastSource==source) {
2916                 cnv->toULength=1;
2917             } else /* lastSource<source: multi-byte character */ {
2918                 int8_t i;
2919                 for(i=1;
2920                     lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
2921                     ++i
2922                 ) {
2923                     *bytes++=*lastSource++;
2924                 }
2925                 cnv->toULength=i;
                     /* continue after the bytes that were copied into toUBytes */
2926                 source=lastSource;
2927             }
2928         } else {
2929             /* no output because of empty input or only state changes */
2930             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2931         }
2932         c=0xffff;
2933     }
2934
2935     /* set the converter state back into UConverter, ready for a new character */
2936     cnv->toUnicodeStatus=0;
2937     cnv->mode=state;
2938
2939     /* write back the updated pointer */
2940     pArgs->source=(const char *)source;
2941     return c;
2942 }
2943
2944 #if 0
2945 /*
2946  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2947  * Removal improves code coverage.
2948  */
2949 /**
2950  * Single-byte, single-state flavor of ucnv_MBCSSimpleGetNextUChar(): looks up
2951  * byte b in state 0 of the shared state table and returns its code point.
2952  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
2953  * It does not handle conversion extensions (_extToU()).
2954  * Returns 0xfffe if unassigned or fallback not used, 0xffff if illegal/reserved.
2955  */
2956 U_CFUNC UChar32
2957 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
2958                               uint8_t b, UBool useFallback) {
2959     int32_t stateEntry=sharedData->mbcs.stateTable[0][b];
2960     uint8_t finalAction;
2961
2962     /* MBCS_ENTRY_IS_FINAL(stateEntry) */
2963     if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(stateEntry)) {
2964         /* output BMP code point */
2965         return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
2966     }
2967
2968     /*
2969      * Consecutive tests rather than a switch: this gives more reliable
2970      * performance for the most common cases.
2971      */
2972     finalAction=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(stateEntry));
2973     if(finalAction==MBCS_STATE_VALID_DIRECT_20) {
2974         /* output supplementary code point */
2975         return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
2976     }
2977     if(finalAction==MBCS_STATE_FALLBACK_DIRECT_16) {
2978         /* BMP code point, but only if fallbacks are in use */
2979         if(TO_U_USE_FALLBACK(useFallback)) {
2980             return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
2981         }
2982         return 0xfffe;
2983     }
2984     if(finalAction==MBCS_STATE_FALLBACK_DIRECT_20) {
2985         /* supplementary code point, but only if fallbacks are in use */
2986         if(TO_U_USE_FALLBACK(useFallback)) {
2987             return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
2988         }
2989         return 0xfffe;
2990     }
2991     if(finalAction==MBCS_STATE_UNASSIGNED) {
2992         return 0xfffe;
2993     }
2994     /* illegal, or reserved (must never occur) */
2995     return 0xffff;
2996 }
2997 #endif
2998
2999 /*
3000  * This is a simple version of _MBCSGetNextUChar() that is used
3001  * by other converter implementations.
3002  * It only returns an "assigned" result if it consumes the entire input.
3003  * It does not use state from the converter, nor error codes.
3004  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3005  * It handles conversion extensions but not GB 18030.
3006  *
      * Parameters:
      * sharedData   shared converter data; its mbcs state table, Unicode code
      *              unit table and extension indexes are read
      * source       the bytes of exactly one character
      * length       number of bytes at source
      * useFallback  passed to TO_U_USE_FALLBACK() for every fallback decision
      *              in this function, and to ucnv_extSimpleMatchToU()
      *
      * The state machine starts in state mbcs.dbcsOnlyState.
      * If the table lookup yields U+fffe and the converter has extension data,
      * the result of ucnv_extSimpleMatchToU() is returned instead.
      *
3007  * Return value:
3008  * U+fffe   unassigned
3009  * U+ffff   illegal (also: no input, truncated input, input not fully consumed, state change only)
3010  * otherwise the Unicode code point
3011  */
3012 U_CFUNC UChar32
3013 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
3014                         const char *source, int32_t length,
3015                         UBool useFallback) {
3016     const int32_t (*stateTable)[256];
3017     const uint16_t *unicodeCodeUnits;
3018
3019     uint32_t offset;
3020     uint8_t state, action;
3021
3022     UChar32 c;
3023     int32_t i, entry;
3024
3025     if(length<=0) {
3026         /* no input at all: "illegal" */
3027         return 0xffff;
3028     }
3029
3030 #if 0
3031 /*
3032  * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3033  * TODO In future releases, verify that this function is never called for SBCS
3034  * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3035  * Removal improves code coverage.
3036  */
3037     /* use optimized function if possible */
3038     if(sharedData->mbcs.countStates==1) {
3039         if(length==1) {
3040             return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
3041         } else {
3042             return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3043         }
3044     }
3045 #endif
3046
3047     /* set up the local pointers */
3048     stateTable=sharedData->mbcs.stateTable;
3049     unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
3050
3051     /* converter state */
3052     offset=0;
3053     state=sharedData->mbcs.dbcsOnlyState;
3054
3055     /* conversion loop: one state table lookup per input byte */
3056     for(i=0;;) {
3057         entry=stateTable[state][(uint8_t)source[i++]];
3058         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3059             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3060             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
3061
3062             if(i==length) {
3063                 return 0xffff; /* truncated character */
3064             }
3065         } else {
3066             /*
3067              * An if-else-if chain provides more reliable performance for
3068              * the most common cases compared to a switch.
3069              */
3070             action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
3071             if(action==MBCS_STATE_VALID_16) {
3072                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3073                 c=unicodeCodeUnits[offset];
3074                 if(c!=0xfffe) {
3075                     /* done */
                     /*
                      * There is no UConverter in this function: the fallback
                      * decision uses the caller's useFallback, like the
                      * MBCS_STATE_FALLBACK_DIRECT_* branches below.
                      */
3076                 } else if(TO_U_USE_FALLBACK(useFallback)) {
3077                     c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
3078                 /* else done with 0xfffe */
3079                 }
3080                 break;
3081             } else if(action==MBCS_STATE_VALID_DIRECT_16) {
3082                 /* output BMP code point */
3083                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3084                 break;
3085             } else if(action==MBCS_STATE_VALID_16_PAIR) {
3086                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3087                 c=unicodeCodeUnits[offset++];
3088                 if(c<0xd800) {
3089                     /* output BMP code point below 0xd800 */
3090                 } else if(TO_U_USE_FALLBACK(useFallback) ? c<=0xdfff : c<=0xdbff) {
3091                     /* output roundtrip or fallback supplementary code point */
3092                     c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
3093                 } else if(TO_U_USE_FALLBACK(useFallback) ? (c&0xfffe)==0xe000 : c==0xe000) {
3094                     /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3095                     c=unicodeCodeUnits[offset];
3096                 } else if(c==0xffff) {
3097                     return 0xffff;
3098                 } else {
3099                     c=0xfffe;
3100                 }
3101                 break;
3102             } else if(action==MBCS_STATE_VALID_DIRECT_20) {
3103                 /* output supplementary code point */
3104                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3105                 break;
3106             } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3107                 if(!TO_U_USE_FALLBACK(useFallback)) {
3108                     c=0xfffe;
3109                     break;
3110                 }
3111                 /* output BMP code point */
3112                 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3113                 break;
3114             } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
3115                 if(!TO_U_USE_FALLBACK(useFallback)) {
3116                     c=0xfffe;
3117                     break;
3118                 }
3119                 /* output supplementary code point */
3120                 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3121                 break;
3122             } else if(action==MBCS_STATE_UNASSIGNED) {
3123                 c=0xfffe;
3124                 break;
3125             }
3126
3127             /*
3128              * forbid MBCS_STATE_CHANGE_ONLY for this function,
3129              * and MBCS_STATE_ILLEGAL and reserved action codes
3130              */
3131             return 0xffff;
3132         }
3133     }
3134
3135     if(i!=length) {
3136         /* illegal for this function: not all input consumed */
3137         return 0xffff;
3138     }
3139
3140     if(c==0xfffe) {
3141         /* try an extension mapping */
3142         const int32_t *cx=sharedData->mbcs.extIndexes;
3143         if(cx!=NULL) {
3144             return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
3145         }
3146     }
3147
3148     return c;
3149 }
3150
3151 /* MBCS-from-Unicode conversion functions ----------------------------------- */
3152
3153 /*
      * This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages.
      *
      * Converts UTF-16 code units from pArgs->source up to pArgs->sourceLimit and
      * writes codepage bytes at pArgs->target up to pArgs->targetLimit.
      * A direct table result is written as one byte if it is <=0xff, else as two bytes.
      * If pArgs->offsets is not NULL, one source index is stored per output byte.
      *
      * Lookup per code unit c:
      * - c<=0x7f and IS_ASCII_ROUNDTRIP(c, asciiRoundtrips): c itself is the output byte
      * - c<=0xd7ff: DBCS_RESULT_FROM_MOST_BMP() via mbcsIndex; a result of 0 means no mapping
      * - otherwise: surrogates are paired first (unless the codepage has
      *   UCNV_HAS_SURROGATES set), then the stage tables are used
      * Code points without a usable mapping are passed to _extFromU().
      *
      * State across calls: when the loop stops, the current c is stored in
      * cnv->fromUChar32 (0 after a completed character). A non-zero value on
      * entry is continued at the getTrail label if there is room in the target.
      *
      * Error codes set here:
      * U_ILLEGAL_CHAR_FOUND     unmatched lead or trail surrogate
      * U_BUFFER_OVERFLOW_ERROR  target full; if only the first of two bytes fit,
      *                          the second one is stored in cnv->charErrorBuffer
      * _extFromU() receives pErrorCode as well and may set it.
      *
      * On return pArgs->source, pArgs->target and pArgs->offsets are updated.
      */
3154 static void
3155 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3156                                   UErrorCode *pErrorCode) {
3157     UConverter *cnv;
3158     const UChar *source, *sourceLimit;
3159     uint8_t *target;
3160     int32_t targetCapacity;
3161     int32_t *offsets;
3162
3163     const uint16_t *table;
3164     const uint16_t *mbcsIndex;
3165     const uint8_t *bytes;
3166
3167     UChar32 c;
3168
3169     int32_t sourceIndex, nextSourceIndex;
3170
3171     uint32_t stage2Entry;
3172     uint32_t asciiRoundtrips;
3173     uint32_t value;
3174     uint8_t unicodeMask;
3175
3176     /* get the converter and the flags that are tested with UCNV_HAS_* below */
3177     cnv=pArgs->converter;
3178     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3179
3180     /* set up the local pointers */
3181     source=pArgs->source;
3182     sourceLimit=pArgs->sourceLimit;
3183     target=(uint8_t *)pArgs->target;
3184     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3185     offsets=pArgs->offsets;
3186
3187     table=cnv->sharedData->mbcs.fromUnicodeTable;
3188     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3189     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3190         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3191     } else {
3192         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3193     }
3194     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3195
3196     /* get the converter state from UConverter */
3197     c=cnv->fromUChar32;
3198
3199     /* sourceIndex=-1 if the current character began in the previous buffer */
3200     sourceIndex= c==0 ? 0 : -1;
3201     nextSourceIndex=0;
3202
3203     /* conversion loop */
         /*
          * A pending c from the previous call is continued by jumping into the
          * loop body at getTrail. Without target space the loop below reports
          * the overflow as soon as there is any source input.
          */
3204     if(c!=0 && targetCapacity>0) {
3205         goto getTrail;
3206     }
3207
3208     while(source<sourceLimit) {
3209         /*
3210          * This following test is to see if available input would overflow the output.
3211          * It does not catch output of more than one byte that
3212          * overflows as a result of a multi-byte character or callback output
3213          * from the last source character.
3214          * Therefore, those situations also test for overflows and will
3215          * then break the loop, too.
3216          */
3217         if(targetCapacity>0) {
3218             /*
3219              * Get a correct Unicode code point:
3220              * a single UChar for a BMP code point or
3221              * a matched surrogate pair for a "supplementary code point".
3222              */
3223             c=*source++;
3224             ++nextSourceIndex;
3225             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3226                 *target++=(uint8_t)c;
                     /*
                      * sourceIndex is only advanced together with offsets here;
                      * in this function it is used for nothing but offsets
                      * output and the _extFromU() call.
                      */
3227                 if(offsets!=NULL) {
3228                     *offsets++=sourceIndex;
3229                     sourceIndex=nextSourceIndex;
3230                 }
3231                 --targetCapacity;
3232                 c=0;
3233                 continue;
3234             }
3235             /*
3236              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3237              * to avoid dealing with surrogates.
3238              * MBCS_FAST_MAX must be >=0xd7ff.
3239              */
3240             if(c<=0xd7ff) {
3241                 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
3242                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3243                 if(value==0) {
3244                     goto unassigned;
3245                 }
3246                 /* output the value */
3247             } else {
3248                 /*
3249                  * This also tests if the codepage maps single surrogates.
3250                  * If it does, then surrogates are not paired but mapped separately.
3251                  * Note that in this case unmatched surrogates are not detected.
3252                  */
3253                 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3254                     if(UTF_IS_SURROGATE_FIRST(c)) {
3255 getTrail:
3256                         if(source<sourceLimit) {
3257                             /* test the following code unit */
3258                             UChar trail=*source;
3259                             if(UTF_IS_SECOND_SURROGATE(trail)) {
3260                                 ++source;
3261                                 ++nextSourceIndex;
3262                                 c=UTF16_GET_PAIR_VALUE(c, trail);
3263                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3264                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3265                                     /* callback(unassigned) */
3266                                     goto unassigned;
3267                                 }
3268                                 /* convert this supplementary code point */
3269                                 /* exit this condition tree */
3270                             } else {
3271                                 /* this is an unmatched lead code unit (1st surrogate) */
3272                                 /* callback(illegal) */
3273                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3274                                 break;
3275                             }
3276                         } else {
3277                             /* no more input */
                                 /* c keeps the lead surrogate; it is stored in cnv->fromUChar32 after the loop */
3278                             break;
3279                         }
3280                     } else {
3281                         /* this is an unmatched trail code unit (2nd surrogate) */
3282                         /* callback(illegal) */
3283                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3284                         break;
3285                     }
3286                 }
3287
3288                 /* convert the Unicode code point in c into codepage bytes */
3289                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
3290
3291                 /* get the bytes and the length for the output */
3292                 /* MBCS_OUTPUT_2 */
3293                 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3294
3295                 /* is this code point assigned, or do we use fallbacks? */
3296                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3297                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3298                 ) {
3299                     /*
3300                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
3301                      * There is no way with this data structure for fallback output
3302                      * to be a zero byte.
3303                      */
3304
3305 unassigned:
3306                     /* try an extension mapping */
                         /*
                          * pArgs->source marks where the extension code starts reading,
                          * so that nextSourceIndex can be advanced afterwards by the
                          * number of code units that _extFromU() consumed.
                          */
3307                     pArgs->source=source;
3308                     c=_extFromU(cnv, cnv->sharedData,
3309                                 c, &source, sourceLimit,
3310                                 &target, target+targetCapacity,
3311                                 &offsets, sourceIndex,
3312                                 pArgs->flush,
3313                                 pErrorCode);
3314                     nextSourceIndex+=(int32_t)(source-pArgs->source);
3315
3316                     if(U_FAILURE(*pErrorCode)) {
3317                         /* not mappable or buffer overflow */
3318                         break;
3319                     } else {
3320                         /* a mapping was written to the target, continue */
3321
3322                         /* recalculate the targetCapacity after an extension mapping */
3323                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3324
3325                         /* normal end of conversion: prepare for a new character */
3326                         sourceIndex=nextSourceIndex;
3327                         continue;
3328                     }
3329                 }
3330             }
3331
3332             /* write the output character bytes from value and length */
3333             /* from the first if in the loop we know that targetCapacity>0 */
3334             if(value<=0xff) {
3335                 /* this is easy because we know that there is enough space */
3336                 *target++=(uint8_t)value;
3337                 if(offsets!=NULL) {
3338                     *offsets++=sourceIndex;
3339                 }
3340                 --targetCapacity;
3341             } else /* length==2 */ {
3342                 *target++=(uint8_t)(value>>8);
                     /* targetCapacity still includes the byte just written, hence the test against 2 */
3343                 if(2<=targetCapacity) {
3344                     *target++=(uint8_t)value;
3345                     if(offsets!=NULL) {
3346                         *offsets++=sourceIndex;
3347                         *offsets++=sourceIndex;
3348                     }
3349                     targetCapacity-=2;
3350                 } else {
3351                     if(offsets!=NULL) {
3352                         *offsets++=sourceIndex;
3353                     }
                         /* the second byte does not fit: keep it in the converter's charErrorBuffer */
3354                     cnv->charErrorBuffer[0]=(char)value;
3355                     cnv->charErrorBufferLength=1;
3356
3357                     /* target overflow */
3358                     targetCapacity=0;
3359                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3360                     c=0;
3361                     break;
3362                 }
3363             }
3364
3365             /* normal end of conversion: prepare for a new character */
3366             c=0;
3367             sourceIndex=nextSourceIndex;
3368             continue;
3369         } else {
3370             /* target is full */
3371             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3372             break;
3373         }
3374     }
3375
3376     /* set the converter state back into UConverter */
3377     cnv->fromUChar32=c;
3378
3379     /* write back the updated pointers */
3380     pArgs->source=source;
3381     pArgs->target=(char *)target;
3382     pArgs->offsets=offsets;
3383 }
3384
3385 /*
      * This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages.
      *
      * Converts UTF-16 code units from pArgs->source up to pArgs->sourceLimit and
      * writes codepage bytes at pArgs->target up to pArgs->targetLimit.
      * Every directly mapped code point produces exactly one output byte.
      * If pArgs->offsets is not NULL, one source index is stored per output byte.
      *
      * Surrogates are always paired before the lookup. A supplementary code point
      * goes straight to the extension code if the codepage does not have
      * UCNV_HAS_SUPPLEMENTARY set in its unicodeMask.
      * A lookup result below minValue counts as unassigned and is passed to _extFromU().
      *
      * State across calls: when the loop stops, the current c is stored in
      * cnv->fromUChar32 (0 after a directly mapped character). A non-zero value
      * on entry is continued at the getTrail label if there is room in the target.
      *
      * Error codes set here:
      * U_ILLEGAL_CHAR_FOUND     unmatched lead or trail surrogate
      * U_BUFFER_OVERFLOW_ERROR  source input remains but the target is full
      * _extFromU() receives pErrorCode as well and may set it.
      *
      * On return pArgs->source, pArgs->target and pArgs->offsets are updated.
      */
3386 static void
3387 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3388                                   UErrorCode *pErrorCode) {
3389     UConverter *cnv;
3390     const UChar *source, *sourceLimit;
3391     uint8_t *target;
3392     int32_t targetCapacity;
3393     int32_t *offsets;
3394
3395     const uint16_t *table;
3396     const uint16_t *results;
3397
3398     UChar32 c;
3399
3400     int32_t sourceIndex, nextSourceIndex;
3401
3402     uint16_t value, minValue;
3403     UBool hasSupplementary;
3404
3405     /* set up the local pointers */
3406     cnv=pArgs->converter;
3407     source=pArgs->source;
3408     sourceLimit=pArgs->sourceLimit;
3409     target=(uint8_t *)pArgs->target;
3410     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3411     offsets=pArgs->offsets;
3412
3413     table=cnv->sharedData->mbcs.fromUnicodeTable;
3414     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3415         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3416     } else {
3417         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3418     }
3419
         /*
          * minValue is the smallest lookup result that is accepted as a mapping.
          * NOTE(review): presumably the bits above the low byte of a result are
          * roundtrip/fallback flags - confirm against MBCS_SINGLE_RESULT_FROM_U.
          */
3420     if(cnv->useFallback) {
3421         /* use all roundtrip and fallback results */
3422         minValue=0x800;
3423     } else {
3424         /* use only roundtrips and fallbacks from private-use characters */
3425         minValue=0xc00;
3426     }
3427     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
3428
3429     /* get the converter state from UConverter */
3430     c=cnv->fromUChar32;
3431
3432     /* sourceIndex=-1 if the current character began in the previous buffer */
3433     sourceIndex= c==0 ? 0 : -1;
3434     nextSourceIndex=0;
3435
3436     /* conversion loop */
         /*
          * A pending c from the previous call is continued by jumping into the
          * loop body at getTrail. Without target space the loop below reports
          * the overflow as soon as there is any source input.
          */
3437     if(c!=0 && targetCapacity>0) {
3438         goto getTrail;
3439     }
3440
3441     while(source<sourceLimit) {
3442         /*
3443          * This following test is to see if available input would overflow the output.
3444          * It does not catch output of more than one byte that
3445          * overflows as a result of a multi-byte character or callback output
3446          * from the last source character.
3447          * Therefore, those situations also test for overflows and will
3448          * then break the loop, too.
3449          */
3450         if(targetCapacity>0) {
3451             /*
3452              * Get a correct Unicode code point:
3453              * a single UChar for a BMP code point or
3454              * a matched surrogate pair for a "supplementary code point".
3455              */
3456             c=*source++;
3457             ++nextSourceIndex;
3458             if(UTF_IS_SURROGATE(c)) {
3459                 if(UTF_IS_SURROGATE_FIRST(c)) {
3460 getTrail:
3461                     if(source<sourceLimit) {
3462                         /* test the following code unit */
3463                         UChar trail=*source;
3464                         if(UTF_IS_SECOND_SURROGATE(trail)) {
3465                             ++source;
3466                             ++nextSourceIndex;
3467                             c=UTF16_GET_PAIR_VALUE(c, trail);
3468                             if(!hasSupplementary) {
3469                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3470                                 /* callback(unassigned) */
3471                                 goto unassigned;
3472                             }
3473                             /* convert this supplementary code point */
3474                             /* exit this condition tree */
3475                         } else {
3476                             /* this is an unmatched lead code unit (1st surrogate) */
3477                             /* callback(illegal) */
3478                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3479                             break;
3480                         }
3481                     } else {
3482                         /* no more input */
                             /* c keeps the lead surrogate; it is stored in cnv->fromUChar32 after the loop */
3483                         break;
3484                     }
3485                 } else {
3486                     /* this is an unmatched trail code unit (2nd surrogate) */
3487                     /* callback(illegal) */
3488                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3489                     break;
3490                 }
3491             }
3492
3493             /* convert the Unicode code point in c into codepage bytes */
3494             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3495
3496             /* is this code point assigned, or do we use fallbacks? */
3497             if(value>=minValue) {
3498                 /* assigned, write the output character bytes from value and length */
3499                 /* length==1 */
3500                 /* this is easy because we know that there is enough space */
3501                 *target++=(uint8_t)value;
3502                 if(offsets!=NULL) {
3503                     *offsets++=sourceIndex;
3504                 }
3505                 --targetCapacity;
3506
3507                 /* normal end of conversion: prepare for a new character */
3508                 c=0;
3509                 sourceIndex=nextSourceIndex;
3510             } else { /* unassigned */
3511 unassigned:
3512                 /* try an extension mapping */
                     /*
                      * pArgs->source marks where the extension code starts reading,
                      * so that nextSourceIndex can be advanced afterwards by the
                      * number of code units that _extFromU() consumed.
                      */
3513                 pArgs->source=source;
3514                 c=_extFromU(cnv, cnv->sharedData,
3515                             c, &source, sourceLimit,
3516                             &target, target+targetCapacity,
3517                             &offsets, sourceIndex,
3518                             pArgs->flush,
3519                             pErrorCode);
3520                 nextSourceIndex+=(int32_t)(source-pArgs->source);
3521
3522                 if(U_FAILURE(*pErrorCode)) {
3523                     /* not mappable or buffer overflow */
3524                     break;
3525                 } else {
3526                     /* a mapping was written to the target, continue */
3527
3528                     /* recalculate the targetCapacity after an extension mapping */
3529                     targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3530
3531                     /* normal end of conversion: prepare for a new character */
3532                     sourceIndex=nextSourceIndex;
3533                 }
3534             }
3535         } else {
3536             /* target is full */
3537             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3538             break;
3539         }
3540     }
3541
3542     /* set the converter state back into UConverter */
3543     cnv->fromUChar32=c;
3544
3545     /* write back the updated pointers */
3546     pArgs->source=source;
3547     pArgs->target=(char *)target;
3548     pArgs->offsets=offsets;
3549 }
3550
3551 /*
3552  * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
3553  * that map only to and from the BMP.
3554  * In addition to single-byte/state optimizations, the offset calculations
3555  * become much easier.
3556  * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3557  * but measurements have shown that this diminishes performance
3558  * in more cases than it improves it.
3559  * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3560  * for various MBCS and SBCS optimizations.
3561  */
3562 static void
3563 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
3564                               UErrorCode *pErrorCode) {
3565     UConverter *cnv;
3566     const UChar *source, *sourceLimit, *lastSource;
3567     uint8_t *target;
3568     int32_t targetCapacity, length;
3569     int32_t *offsets;
3570
3571     const uint16_t *table;
3572     const uint16_t *results;
3573
3574     UChar32 c;
3575
3576     int32_t sourceIndex;
3577
3578     uint32_t asciiRoundtrips;
3579     uint16_t value, minValue;
3580
3581     /* set up the local pointers */
3582     cnv=pArgs->converter;
3583     source=pArgs->source;
3584     sourceLimit=pArgs->sourceLimit;
3585     target=(uint8_t *)pArgs->target;
3586     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3587     offsets=pArgs->offsets;
3588
3589     table=cnv->sharedData->mbcs.fromUnicodeTable;
3590     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3591         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3592     } else {
3593         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
3594     }
3595     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3596
3597     if(cnv->useFallback) {
3598         /* use all roundtrip and fallback results */
3599         minValue=0x800;
3600     } else {
3601         /* use only roundtrips and fallbacks from private-use characters */
3602         minValue=0xc00;
3603     }
3604
3605     /* get the converter state from UConverter */
3606     c=cnv->fromUChar32;
3607
3608     /* sourceIndex=-1 if the current character began in the previous buffer */
3609     sourceIndex= c==0 ? 0 : -1;
3610     lastSource=source;
3611
3612     /*
3613      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3614      * for the minimum of the sourceLength and targetCapacity
3615      */
3616     length=(int32_t)(sourceLimit-source);
3617     if(length<targetCapacity) {
3618         targetCapacity=length;
3619     }
3620
3621     /* conversion loop */
3622     if(c!=0 && targetCapacity>0) {
3623         goto getTrail;
3624     }
3625
3626 #if MBCS_UNROLL_SINGLE_FROM_BMP
3627     /* unrolling makes it slower on Pentium III/Windows 2000?! */
3628     /* unroll the loop with the most common case */
3629 unrolled:
3630     if(targetCapacity>=4) {
3631         int32_t count, loops;
3632         uint16_t andedValues;
3633
3634         loops=count=targetCapacity>>2;
3635         do {
3636             c=*source++;
3637             andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3638             *target++=(uint8_t)value;
3639             c=*source++;
3640             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3641             *target++=(uint8_t)value;
3642             c=*source++;
3643             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3644             *target++=(uint8_t)value;
3645             c=*source++;
3646             andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3647             *target++=(uint8_t)value;
3648
3649             /* were all 4 entries really valid? */
3650             if(andedValues<minValue) {
3651                 /* no, return to the first of these 4 */
3652                 source-=4;
3653                 target-=4;
3654                 break;
3655             }
3656         } while(--count>0);
3657         count=loops-count;
3658         targetCapacity-=4*count;
3659
3660         if(offsets!=NULL) {
3661             lastSource+=4*count;
3662             while(count>0) {
3663                 *offsets++=sourceIndex++;
3664                 *offsets++=sourceIndex++;
3665                 *offsets++=sourceIndex++;
3666                 *offsets++=sourceIndex++;
3667                 --count;
3668             }
3669         }
3670
3671         c=0;
3672     }
3673 #endif
3674
3675     while(targetCapacity>0) {
3676         /*
3677          * Get a correct Unicode code point:
3678          * a single UChar for a BMP code point or
3679          * a matched surrogate pair for a "supplementary code point".
3680          */
3681         c=*source++;
3682         /*
3683          * Do not immediately check for single surrogates:
3684          * Assume that they are unassigned and check for them in that case.
3685          * This speeds up the conversion of assigned characters.
3686          */
3687         /* convert the Unicode code point in c into codepage bytes */
3688         if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3689             *target++=(uint8_t)c;
3690             --targetCapacity;
3691             c=0;
3692             continue;
3693         }
3694         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3695         /* is this code point assigned, or do we use fallbacks? */
3696         if(value>=minValue) {
3697             /* assigned, write the output character bytes from value and length */
3698             /* length==1 */
3699             /* this is easy because we know that there is enough space */
3700             *target++=(uint8_t)value;
3701             --targetCapacity;
3702
3703             /* normal end of conversion: prepare for a new character */
3704             c=0;
3705             continue;
3706         } else if(!UTF_IS_SURROGATE(c)) {
3707             /* normal, unassigned BMP character */
3708         } else if(UTF_IS_SURROGATE_FIRST(c)) {
3709 getTrail:
3710             if(source<sourceLimit) {
3711                 /* test the following code unit */
3712                 UChar trail=*source;
3713                 if(UTF_IS_SECOND_SURROGATE(trail)) {
3714                     ++source;
3715                     c=UTF16_GET_PAIR_VALUE(c, trail);
3716                     /* this codepage does not map supplementary code points */
3717                     /* callback(unassigned) */
3718                 } else {
3719                     /* this is an unmatched lead code unit (1st surrogate) */
3720                     /* callback(illegal) */
3721                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3722                     break;
3723                 }
3724             } else {
3725                 /* no more input */
3726                 if (pArgs->flush) {
3727                     *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3728                 }
3729                 break;
3730             }
3731         } else {
3732             /* this is an unmatched trail code unit (2nd surrogate) */
3733             /* callback(illegal) */
3734             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3735             break;
3736         }
3737
3738         /* c does not have a mapping */
3739
3740         /* get the number of code units for c to correctly advance sourceIndex */
3741         length=U16_LENGTH(c);
3742
3743         /* set offsets since the start or the last extension */
3744         if(offsets!=NULL) {
3745             int32_t count=(int32_t)(source-lastSource);
3746
3747             /* do not set the offset for this character */
3748             count-=length;
3749
3750             while(count>0) {
3751                 *offsets++=sourceIndex++;
3752                 --count;
3753             }
3754             /* offsets and sourceIndex are now set for the current character */
3755         }
3756
3757         /* try an extension mapping */
3758         lastSource=source;
3759         c=_extFromU(cnv, cnv->sharedData,
3760                     c, &source, sourceLimit,
3761                     &target, (const uint8_t *)(pArgs->targetLimit),
3762                     &offsets, sourceIndex,
3763                     pArgs->flush,
3764                     pErrorCode);
3765         sourceIndex+=length+(int32_t)(source-lastSource);
3766         lastSource=source;
3767
3768         if(U_FAILURE(*pErrorCode)) {
3769             /* not mappable or buffer overflow */
3770             break;
3771         } else {
3772             /* a mapping was written to the target, continue */
3773
3774             /* recalculate the targetCapacity after an extension mapping */
3775             targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
3776             length=(int32_t)(sourceLimit-source);
3777             if(length<targetCapacity) {
3778                 targetCapacity=length;
3779             }
3780         }
3781
3782 #if MBCS_UNROLL_SINGLE_FROM_BMP
3783         /* unrolling makes it slower on Pentium III/Windows 2000?! */
3784         goto unrolled;
3785 #endif
3786     }
3787
3788     if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
3789         /* target is full */
3790         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3791     }
3792
3793     /* set offsets since the start or the last callback */
3794     if(offsets!=NULL) {
3795         size_t count=source-lastSource;
3796         if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
3797             /*
3798             Caller gave us a partial supplementary character,
3799             which this function couldn't convert in any case.
3800             The callback will handle the offset.
3801             */
3802             count--;
3803         }
3804         while(count>0) {
3805             *offsets++=sourceIndex++;
3806             --count;
3807         }
3808     }
3809
3810     /* set the converter state back into UConverter */
3811     cnv->fromUChar32=c;
3812
3813     /* write back the updated pointers */
3814     pArgs->source=source;
3815     pArgs->target=(char *)target;
3816     pArgs->offsets=offsets;
3817 }
3818
3819 U_CFUNC void
3820 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3821                             UErrorCode *pErrorCode) {
3822     UConverter *cnv;
3823     const UChar *source, *sourceLimit;
3824     uint8_t *target;
3825     int32_t targetCapacity;
3826     int32_t *offsets;
3827
3828     const uint16_t *table;
3829     const uint16_t *mbcsIndex;
3830     const uint8_t *p, *bytes;
3831     uint8_t outputType;
3832
3833     UChar32 c;
3834
3835     int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
3836
3837     uint32_t stage2Entry;
3838     uint32_t asciiRoundtrips;
3839     uint32_t value;
3840     int32_t length, prevLength;
3841     uint8_t unicodeMask;
3842
3843     cnv=pArgs->converter;
3844
3845     if(cnv->preFromUFirstCP>=0) {
3846         /*
3847          * pass sourceIndex=-1 because we continue from an earlier buffer
3848          * in the future, this may change with continuous offsets
3849          */
3850         ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
3851
3852         if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
3853             return;
3854         }
3855     }
3856
3857     /* use optimized function if possible */
3858     outputType=cnv->sharedData->mbcs.outputType;
3859     unicodeMask=cnv->sharedData->mbcs.unicodeMask;
3860     if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3861         if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3862             ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
3863         } else {
3864             ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
3865         }
3866         return;
3867     } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
3868         ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
3869         return;
3870     }
3871
3872     /* set up the local pointers */
3873     source=pArgs->source;
3874     sourceLimit=pArgs->sourceLimit;
3875     target=(uint8_t *)pArgs->target;
3876     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
3877     offsets=pArgs->offsets;
3878
3879     table=cnv->sharedData->mbcs.fromUnicodeTable;
3880     if(cnv->sharedData->mbcs.utf8Friendly) {
3881         mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
3882     } else {
3883         mbcsIndex=NULL;
3884     }
3885     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
3886         bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
3887     } else {
3888         bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
3889     }
3890     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
3891
3892     /* get the converter state from UConverter */
3893     c=cnv->fromUChar32;
3894
3895     if(outputType==MBCS_OUTPUT_2_SISO) {
3896         prevLength=cnv->fromUnicodeStatus;
3897         if(prevLength==0) {
3898             /* set the real value */
3899             prevLength=1;
3900         }
3901     } else {
3902         /* prevent fromUnicodeStatus from being set to something non-0 */
3903         prevLength=0;
3904     }
3905
3906     /* sourceIndex=-1 if the current character began in the previous buffer */
3907     prevSourceIndex=-1;
3908     sourceIndex= c==0 ? 0 : -1;
3909     nextSourceIndex=0;
3910
3911     /* conversion loop */
3912     /*
3913      * This is another piece of ugly code:
3914      * A goto into the loop if the converter state contains a first surrogate
3915      * from the previous function call.
3916      * It saves me from checking if(c==0) in each loop iteration
3917      * and from duplicating the trail-surrogate-handling code in the else
3918      * branch of that check.
3919      * I could not find any other way to get around this other than
3920      * using a function call for the conversion and callback, which would
3921      * be even more inefficient.
3922      *
3923      * Markus Scherer 2000-jul-19
3924      */
3925     if(c!=0 && targetCapacity>0) {
3926         goto getTrail;
3927     }
3928
3929     while(source<sourceLimit) {
3930         /*
3931          * The following test checks whether the available input would overflow the output.
3932          * It does not catch output of more than one byte that
3933          * overflows as a result of a multi-byte character or callback output
3934          * from the last source character.
3935          * Therefore, those situations also test for overflows and will
3936          * then break the loop, too.
3937          */
3938         if(targetCapacity>0) {
3939             /*
3940              * Get a correct Unicode code point:
3941              * a single UChar for a BMP code point or
3942              * a matched surrogate pair for a "supplementary code point".
3943              */
3944             c=*source++;
3945             ++nextSourceIndex;
3946             if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3947                 *target++=(uint8_t)c;
3948                 if(offsets!=NULL) {
3949                     *offsets++=sourceIndex;
3950                     prevSourceIndex=sourceIndex;
3951                     sourceIndex=nextSourceIndex;
3952                 }
3953                 --targetCapacity;
3954                 c=0;
3955                 continue;
3956             }
3957             /*
3958              * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3959              * to avoid dealing with surrogates.
3960              * MBCS_FAST_MAX must be >=0xd7ff.
3961              */
3962             if(c<=0xd7ff && mbcsIndex!=NULL) {
3963                 value=mbcsIndex[c>>6];
3964
3965                 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
3966                 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3967                 switch(outputType) {
3968                 case MBCS_OUTPUT_2:
3969                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
3970                     if(value<=0xff) {
3971                         if(value==0) {
3972                             goto unassigned;
3973                         } else {
3974                             length=1;
3975                         }
3976                     } else {
3977                         length=2;
3978                     }
3979                     break;
3980                 case MBCS_OUTPUT_2_SISO:
3981                     /* 1/2-byte stateful with Shift-In/Shift-Out */
3982                     /*
3983                      * Save the old state in the converter object
3984                      * right here, then change the local prevLength state variable if necessary.
3985                      * Then, if this character turns out to be unassigned or a fallback that
3986                      * is not taken, the callback code must not save the new state in the converter
3987                      * because the new state is for a character that is not output.
3988                      * However, the callback must still restore the state from the converter
3989                      * in case the callback function changed it for its output.
3990                      */
3991                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
3992                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
3993                     if(value<=0xff) {
3994                         if(value==0) {
3995                             goto unassigned;
3996                         } else if(prevLength<=1) {
3997                             length=1;
3998                         } else {
3999                             /* change from double-byte mode to single-byte */
4000                             value|=(uint32_t)UCNV_SI<<8;
4001                             length=2;
4002                             prevLength=1;
4003                         }
4004                     } else {
4005                         if(prevLength==2) {
4006                             length=2;
4007                         } else {
4008                             /* change from single-byte mode to double-byte */
4009                             value|=(uint32_t)UCNV_SO<<16;
4010                             length=3;
4011                             prevLength=2;
4012                         }
4013                     }
4014                     break;
4015                 case MBCS_OUTPUT_DBCS_ONLY:
4016                     /* table with single-byte results, but only DBCS mappings used */
4017                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
4018                     if(value<=0xff) {
4019                         /* no mapping or SBCS result, not taken for DBCS-only */
4020                         goto unassigned;
4021                     } else {
4022                         length=2;
4023                     }
4024                     break;
4025                 case MBCS_OUTPUT_3:
4026                     p=bytes+(value+(c&0x3f))*3;
4027                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4028                     if(value<=0xff) {
4029                         if(value==0) {
4030                             goto unassigned;
4031                         } else {
4032                             length=1;
4033                         }
4034                     } else if(value<=0xffff) {
4035                         length=2;
4036                     } else {
4037                         length=3;
4038                     }
4039                     break;
4040                 case MBCS_OUTPUT_4:
4041                     value=((const uint32_t *)bytes)[value +(c&0x3f)];
4042                     if(value<=0xff) {
4043                         if(value==0) {
4044                             goto unassigned;
4045                         } else {
4046                             length=1;
4047                         }
4048                     } else if(value<=0xffff) {
4049                         length=2;
4050                     } else if(value<=0xffffff) {
4051                         length=3;
4052                     } else {
4053                         length=4;
4054                     }
4055                     break;
4056                 case MBCS_OUTPUT_3_EUC:
4057                     value=((const uint16_t *)bytes)[value +(c&0x3f)];
4058                     /* EUC 16-bit fixed-length representation */
4059                     if(value<=0xff) {
4060                         if(value==0) {
4061                             goto unassigned;
4062                         } else {
4063                             length=1;
4064                         }
4065                     } else if((value&0x8000)==0) {
4066                         value|=0x8e8000;
4067                         length=3;
4068                     } else if((value&0x80)==0) {
4069                         value|=0x8f0080;
4070                         length=3;
4071                     } else {
4072                         length=2;
4073                     }
4074                     break;
4075                 case MBCS_OUTPUT_4_EUC:
4076                     p=bytes+(value+(c&0x3f))*3;
4077                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4078                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
4079                     if(value<=0xff) {
4080                         if(value==0) {
4081                             goto unassigned;
4082                         } else {
4083                             length=1;
4084                         }
4085                     } else if(value<=0xffff) {
4086                         length=2;
4087                     } else if((value&0x800000)==0) {
4088                         value|=0x8e800000;
4089                         length=4;
4090                     } else if((value&0x8000)==0) {
4091                         value|=0x8f008000;
4092                         length=4;
4093                     } else {
4094                         length=3;
4095                     }
4096                     break;
4097                 default:
4098                     /* must not occur */
4099                     /*
4100                      * To avoid compiler warnings that value & length may be
4101                      * used without having been initialized, we set them here.
4102                      * In reality, this is unreachable code.
4103                      * Not having a default branch also causes warnings with
4104                      * some compilers.
4105                      */
4106                     value=0;
4107                     length=0;
4108                     break;
4109                 }
4110                 /* output the value */
4111             } else {
4112                 /*
4113                  * This also tests if the codepage maps single surrogates.
4114                  * If it does, then surrogates are not paired but mapped separately.
4115                  * Note that in this case unmatched surrogates are not detected.
4116                  */
4117                 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4118                     if(UTF_IS_SURROGATE_FIRST(c)) {
4119 getTrail:
4120                         if(source<sourceLimit) {
4121                             /* test the following code unit */
4122                             UChar trail=*source;
4123                             if(UTF_IS_SECOND_SURROGATE(trail)) {
4124                                 ++source;
4125                                 ++nextSourceIndex;
4126                                 c=UTF16_GET_PAIR_VALUE(c, trail);
4127                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4128                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4129                                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
4130                                     /* callback(unassigned) */
4131                                     goto unassigned;
4132                                 }
4133                                 /* convert this supplementary code point */
4134                                 /* exit this condition tree */
4135                             } else {
4136                                 /* this is an unmatched lead code unit (1st surrogate) */
4137                                 /* callback(illegal) */
4138                                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4139                                 break;
4140                             }
4141                         } else {
4142                             /* no more input */
4143                             break;
4144                         }
4145                     } else {
4146                         /* this is an unmatched trail code unit (2nd surrogate) */
4147                         /* callback(illegal) */
4148                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4149                         break;
4150                     }
4151                 }
4152
4153                 /* convert the Unicode code point in c into codepage bytes */
4154
4155                 /*
4156                  * The basic lookup is a triple-stage compact array (trie) lookup.
4157                  * For details see the beginning of this file.
4158                  *
4159                  * Single-byte codepages are handled with a different data structure
4160                  * by _MBCSSingle... functions.
4161                  *
4162                  * The result consists of a 32-bit value from stage 2 and
4163                  * a pointer to as many bytes as are stored per character.
4164                  * The pointer points to the character's bytes in stage 3.
4165                  * Bits 15..0 of the stage 2 entry contain the stage 3 index
4166                  * for that pointer, while bits 31..16 are flags for which of
4167                  * the 16 characters in the block are roundtrip-assigned.
4168                  *
4169                  * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4170                  * respectively as uint32_t, in the platform encoding.
4171                  * For 3-byte codepages, the bytes are always stored in big-endian order.
4172                  *
4173                  * For EUC encodings that use only either 0x8e or 0x8f as the first
4174                  * byte of their longest byte sequences, the first two bytes in
4175                  * this third stage indicate with their 7th bits whether these bytes
4176                  * are to be written directly or actually need to be preceded by
4177                  * one of the two Single-Shift codes. With this, the third stage
4178                  * stores one byte fewer per character than the actual maximum length of
4179                  * EUC byte sequences.
4180                  *
4181                  * Other than that, leading zero bytes are removed and the other
4182                  * bytes output. A single zero byte may be output if the "assigned"
4183                  * bit in stage 2 was on.
4184                  * The data structure does not support zero byte output as a fallback,
4185                  * and also does not allow output of leading zeros.
4186                  */
4187                 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4188
4189                 /* get the bytes and the length for the output */
4190                 switch(outputType) {
4191                 case MBCS_OUTPUT_2:
4192                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4193                     if(value<=0xff) {
4194                         length=1;
4195                     } else {
4196                         length=2;
4197                     }
4198                     break;
4199                 case MBCS_OUTPUT_2_SISO:
4200                     /* 1/2-byte stateful with Shift-In/Shift-Out */
4201                     /*
4202                      * Save the old state in the converter object
4203                      * right here, then change the local prevLength state variable if necessary.
4204                      * Then, if this character turns out to be unassigned or a fallback that
4205                      * is not taken, the callback code must not save the new state in the converter
4206                      * because the new state is for a character that is not output.
4207                      * However, the callback must still restore the state from the converter
4208                      * in case the callback function changed it for its output.
4209                      */
4210                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
4211                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4212                     if(value<=0xff) {
4213                         if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
4214                             /* no mapping, leave value==0 */
4215                             length=0;
4216                         } else if(prevLength<=1) {
4217                             length=1;
4218                         } else {
4219                             /* change from double-byte mode to single-byte */
4220                             value|=(uint32_t)UCNV_SI<<8;
4221                             length=2;
4222                             prevLength=1;
4223                         }
4224                     } else {
4225                         if(prevLength==2) {
4226                             length=2;
4227                         } else {
4228                             /* change from single-byte mode to double-byte */
4229                             value|=(uint32_t)UCNV_SO<<16;
4230                             length=3;
4231                             prevLength=2;
4232                         }
4233                     }
4234                     break;
4235                 case MBCS_OUTPUT_DBCS_ONLY:
4236                     /* table with single-byte results, but only DBCS mappings used */
4237                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4238                     if(value<=0xff) {
4239                         /* no mapping or SBCS result, not taken for DBCS-only */
4240                         value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4241                         length=0;
4242                     } else {
4243                         length=2;
4244                     }
4245                     break;
4246                 case MBCS_OUTPUT_3:
4247                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4248                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4249                     if(value<=0xff) {
4250                         length=1;
4251                     } else if(value<=0xffff) {
4252                         length=2;
4253                     } else {
4254                         length=3;
4255                     }
4256                     break;
4257                 case MBCS_OUTPUT_4:
4258                     value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
4259                     if(value<=0xff) {
4260                         length=1;
4261                     } else if(value<=0xffff) {
4262                         length=2;
4263                     } else if(value<=0xffffff) {
4264                         length=3;
4265                     } else {
4266                         length=4;
4267                     }
4268                     break;
4269                 case MBCS_OUTPUT_3_EUC:
4270                     value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4271                     /* EUC 16-bit fixed-length representation */
4272                     if(value<=0xff) {
4273                         length=1;
4274                     } else if((value&0x8000)==0) {
4275                         value|=0x8e8000;
4276                         length=3;
4277                     } else if((value&0x80)==0) {
4278                         value|=0x8f0080;
4279                         length=3;
4280                     } else {
4281                         length=2;
4282                     }
4283                     break;
4284                 case MBCS_OUTPUT_4_EUC:
4285                     p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4286                     value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4287                     /* EUC 16-bit fixed-length representation applied to the first two bytes */
4288                     if(value<=0xff) {
4289                         length=1;
4290                     } else if(value<=0xffff) {
4291                         length=2;
4292                     } else if((value&0x800000)==0) {
4293                         value|=0x8e800000;
4294                         length=4;
4295                     } else if((value&0x8000)==0) {
4296                         value|=0x8f008000;
4297                         length=4;
4298                     } else {
4299                         length=3;
4300                     }
4301                     break;
4302                 default:
4303                     /* must not occur */
4304                     /*
4305                      * To avoid compiler warnings that value & length may be
4306                      * used without having been initialized, we set them here.
4307                      * In reality, this is unreachable code.
4308                      * Not having a default branch also causes warnings with
4309                      * some compilers.
4310                      */
4311                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4312                     length=0;
4313                     break;
4314                 }
4315
4316                 /* is this code point assigned, or do we use fallbacks? */
4317                 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
4318                      (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
4319                 ) {
4320                     /*
4321                      * We allow a 0 byte output if the "assigned" bit is set for this entry.
4322                      * There is no way with this data structure for fallback output
4323                      * to be a zero byte.
4324                      */
4325
4326 unassigned:
4327                     /* try an extension mapping */
4328                     pArgs->source=source;
4329                     c=_extFromU(cnv, cnv->sharedData,
4330                                 c, &source, sourceLimit,
4331                                 &target, target+targetCapacity,
4332                                 &offsets, sourceIndex,
4333                                 pArgs->flush,
4334                                 pErrorCode);
4335                     nextSourceIndex+=(int32_t)(source-pArgs->source);
4336                     prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
4337
4338                     if(U_FAILURE(*pErrorCode)) {
4339                         /* not mappable or buffer overflow */
4340                         break;
4341                     } else {
4342                         /* a mapping was written to the target, continue */
4343
4344                         /* recalculate the targetCapacity after an extension mapping */
4345                         targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4346
4347                         /* normal end of conversion: prepare for a new character */
4348                         if(offsets!=NULL) {
4349                             prevSourceIndex=sourceIndex;
4350                             sourceIndex=nextSourceIndex;
4351                         }
4352                         continue;
4353                     }
4354                 }
4355             }
4356
4357             /* write the output character bytes from value and length */
4358             /* from the first if in the loop we know that targetCapacity>0 */
4359             if(length<=targetCapacity) {
4360                 if(offsets==NULL) {
4361                     switch(length) {
4362                         /* each branch falls through to the next one */
4363                     case 4:
4364                         *target++=(uint8_t)(value>>24);
4365                     case 3:
4366                         *target++=(uint8_t)(value>>16);
4367                     case 2:
4368                         *target++=(uint8_t)(value>>8);
4369                     case 1:
4370                         *target++=(uint8_t)value;
4371                     default:
4372                         /* will never occur */
4373                         break;
4374                     }
4375                 } else {
4376                     switch(length) {
4377                         /* each branch falls through to the next one */
4378                     case 4:
4379                         *target++=(uint8_t)(value>>24);
4380                         *offsets++=sourceIndex;
4381                     case 3:
4382                         *target++=(uint8_t)(value>>16);
4383                         *offsets++=sourceIndex;
4384                     case 2:
4385                         *target++=(uint8_t)(value>>8);
4386                         *offsets++=sourceIndex;
4387                     case 1:
4388                         *target++=(uint8_t)value;
4389                         *offsets++=sourceIndex;
4390                     default:
4391                         /* will never occur */
4392                         break;
4393                     }
4394                 }
4395                 targetCapacity-=length;
4396             } else {
4397                 uint8_t *charErrorBuffer;
4398
4399                 /*
4400                  * We actually do this backwards here:
4401                  * In order to save an intermediate variable, we output
4402                  * first to the overflow buffer what does not fit into the
4403                  * regular target.
4404                  */
4405                 /* we know that 1<=targetCapacity<length<=4 */
4406                 length-=targetCapacity;
4407                 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
4408                 switch(length) {
4409                     /* each branch falls through to the next one */
4410                 case 3:
4411                     *charErrorBuffer++=(uint8_t)(value>>16);
4412                 case 2:
4413                     *charErrorBuffer++=(uint8_t)(value>>8);
4414                 case 1:
4415                     *charErrorBuffer=(uint8_t)value;
4416                 default:
4417                     /* will never occur */
4418                     break;
4419                 }
4420                 cnv->charErrorBufferLength=(int8_t)length;
4421
4422                 /* now output what fits into the regular target */
4423                 value>>=8*length; /* length was reduced by targetCapacity */
4424                 switch(targetCapacity) {
4425                     /* each branch falls through to the next one */
4426                 case 3:
4427                     *target++=(uint8_t)(value>>16);
4428                     if(offsets!=NULL) {
4429                         *offsets++=sourceIndex;
4430                     }
4431                 case 2:
4432                     *target++=(uint8_t)(value>>8);
4433                     if(offsets!=NULL) {
4434                         *offsets++=sourceIndex;
4435                     }
4436                 case 1:
4437                     *target++=(uint8_t)value;
4438                     if(offsets!=NULL) {
4439                         *offsets++=sourceIndex;
4440                     }
4441                 default:
4442                     /* will never occur */
4443                     break;
4444                 }
4445
4446                 /* target overflow */
4447                 targetCapacity=0;
4448                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4449                 c=0;
4450                 break;
4451             }
4452
4453             /* normal end of conversion: prepare for a new character */
4454             c=0;
4455             if(offsets!=NULL) {
4456                 prevSourceIndex=sourceIndex;
4457                 sourceIndex=nextSourceIndex;
4458             }
4459             continue;
4460         } else {
4461             /* target is full */
4462             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4463             break;
4464         }
4465     }
4466
4467     /*
4468      * the end of the input stream and detection of truncated input
4469      * are handled by the framework, but for EBCDIC_STATEFUL conversion
4470      * we need to emit an SI at the very end
4471      *
4472      * conditions:
4473      *   successful
4474      *   EBCDIC_STATEFUL in DBCS mode
4475      *   end of input and no truncated input
4476      */
4477     if( U_SUCCESS(*pErrorCode) &&
4478         outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
4479         pArgs->flush && source>=sourceLimit && c==0
4480     ) {
4481         /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4482         if(targetCapacity>0) {
4483             *target++=(uint8_t)UCNV_SI;
4484             if(offsets!=NULL) {
4485                 /* set the last source character's index (sourceIndex points at sourceLimit now) */
4486                 *offsets++=prevSourceIndex;
4487             }
4488         } else {
4489             /* target is full */
4490             cnv->charErrorBuffer[0]=(char)UCNV_SI;
4491             cnv->charErrorBufferLength=1;
4492             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4493         }
4494         prevLength=1; /* we switched into SBCS */
4495     }
4496
4497     /* set the converter state back into UConverter */
4498     cnv->fromUChar32=c;
4499     cnv->fromUnicodeStatus=prevLength;
4500
4501     /* write back the updated pointers */
4502     pArgs->source=source;
4503     pArgs->target=(char *)target;
4504     pArgs->offsets=offsets;
4505 }
4506
4507 /*
4508  * This is another simple conversion function for internal use by other
4509  * conversion implementations.
4510  * It does not use the converter state nor call callbacks.
4511  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
4512  * It handles conversion extensions but not GB 18030.
4513  *
4514  * It converts one single Unicode code point into codepage bytes, encoded
4515  * as one 32-bit value. The function returns the number of bytes in *pValue:
4516  * 1..4 the number of bytes in *pValue
4517  * 0    unassigned (*pValue undefined)
4518  * -1   illegal (currently not used, *pValue undefined)
4519  *
4520  * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4521  * the second to last byte in bits 15..8, etc.
4522  * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4523  */
4524 U_CFUNC int32_t
4525 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
4526                  UChar32 c, uint32_t *pValue,
4527                  UBool useFallback) {
4528     const int32_t *cx;
4529     const uint16_t *table;
4530 #if 0
4531 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4532     const uint8_t *p;
4533 #endif
4534     uint32_t stage2Entry;
4535     uint32_t value;
4536     int32_t length;
4537
4538     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4539     if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4540         table=sharedData->mbcs.fromUnicodeTable;
4541
4542         /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4543         if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
4544             value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
4545             /* is this code point assigned, or do we use fallbacks? */
4546             if(useFallback ? value>=0x800 : value>=0xc00) {
4547                 *pValue=value&0xff;
4548                 return 1;
4549             }
4550         } else /* outputType!=MBCS_OUTPUT_1 */ {
4551             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4552
4553             /* get the bytes and the length for the output */
4554             switch(sharedData->mbcs.outputType) {
4555             case MBCS_OUTPUT_2:
4556                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4557                 if(value<=0xff) {
4558                     length=1;
4559                 } else {
4560                     length=2;
4561                 }
4562                 break;
4563 #if 0
4564 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4565             case MBCS_OUTPUT_DBCS_ONLY:
4566                 /* table with single-byte results, but only DBCS mappings used */
4567                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4568                 if(value<=0xff) {
4569                     /* no mapping or SBCS result, not taken for DBCS-only */
4570                     value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4571                     length=0;
4572                 } else {
4573                     length=2;
4574                 }
4575                 break;
4576             case MBCS_OUTPUT_3:
4577                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4578                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4579                 if(value<=0xff) {
4580                     length=1;
4581                 } else if(value<=0xffff) {
4582                     length=2;
4583                 } else {
4584                     length=3;
4585                 }
4586                 break;
4587             case MBCS_OUTPUT_4:
4588                 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4589                 if(value<=0xff) {
4590                     length=1;
4591                 } else if(value<=0xffff) {
4592                     length=2;
4593                 } else if(value<=0xffffff) {
4594                     length=3;
4595                 } else {
4596                     length=4;
4597                 }
4598                 break;
4599             case MBCS_OUTPUT_3_EUC:
4600                 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4601                 /* EUC 16-bit fixed-length representation */
4602                 if(value<=0xff) {
4603                     length=1;
4604                 } else if((value&0x8000)==0) {
4605                     value|=0x8e8000;
4606                     length=3;
4607                 } else if((value&0x80)==0) {
4608                     value|=0x8f0080;
4609                     length=3;
4610                 } else {
4611                     length=2;
4612                 }
4613                 break;
4614             case MBCS_OUTPUT_4_EUC:
4615                 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4616                 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4617                 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4618                 if(value<=0xff) {
4619                     length=1;
4620                 } else if(value<=0xffff) {
4621                     length=2;
4622                 } else if((value&0x800000)==0) {
4623                     value|=0x8e800000;
4624                     length=4;
4625                 } else if((value&0x8000)==0) {
4626                     value|=0x8f008000;
4627                     length=4;
4628                 } else {
4629                     length=3;
4630                 }
4631                 break;
4632 #endif
4633             default:
4634                 /* must not occur */
4635                 return -1;
4636             }
4637
4638             /* is this code point assigned, or do we use fallbacks? */
4639             if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4640                 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
4641             ) {
4642                 /*
4643                  * We allow a 0 byte output if the "assigned" bit is set for this entry.
4644                  * There is no way with this data structure for fallback output
4645                  * to be a zero byte.
4646                  */
4647                 /* assigned */
4648                 *pValue=value;
4649                 return length;
4650             }
4651         }
4652     }
4653
4654     cx=sharedData->mbcs.extIndexes;
4655     if(cx!=NULL) {
4656         length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4657         return length>=0 ? length : -length;  /* return abs(length); */
4658     }
4659
4660     /* unassigned */
4661     return 0;
4662 }
4663
4664
#if 0
/*
 * This function has been moved to ucnv2022.c for inlining.
 * This implementation is here only for documentation purposes
 */

/**
 * Variant of ucnv_MBCSFromUChar32() that is optimized for single-byte codepages.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extFromU()).
 *
 * @return the codepage byte for the code point c, or -1 if c is unassigned
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                       UChar32 c,
                       UBool useFallback) {
    const uint16_t *stage1;
    int32_t result, threshold;

    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c>0xffff && (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)==0) {
        return -1;
    }

    /* look up the result for c (same as in _MBCSFromUnicodeWithOffsets) */
    stage1=sharedData->mbcs.fromUnicodeTable;
    result=MBCS_SINGLE_RESULT_FROM_U(stage1, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);

    /* with fallbacks accept results >=0x800, otherwise only results >=0xc00 */
    threshold=useFallback ? 0x800 : 0xc00;
    if(result<threshold) {
        /* unassigned */
        return -1;
    }

    /* the low 8 bits are the output byte */
    return result&0xff;
}
#endif
4703
4704 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */
4705
4706 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
4707 static const UChar32
4708 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
4709
4710 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
4711 static const UChar32
4712 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
4713
4714 static void
4715 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
4716                   UConverterToUnicodeArgs *pToUArgs,
4717                   UErrorCode *pErrorCode) {
4718     UConverter *utf8, *cnv;
4719     const uint8_t *source, *sourceLimit;
4720     uint8_t *target;
4721     int32_t targetCapacity;
4722
4723     const uint16_t *table, *sbcsIndex;
4724     const uint16_t *results;
4725
4726     int8_t oldToULength, toULength, toULimit;
4727
4728     UChar32 c;
4729     uint8_t b, t1, t2;
4730
4731     uint32_t asciiRoundtrips;
4732     uint16_t value, minValue;
4733     UBool hasSupplementary;
4734
4735     /* set up the local pointers */
4736     utf8=pToUArgs->converter;
4737     cnv=pFromUArgs->converter;
4738     source=(uint8_t *)pToUArgs->source;
4739     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
4740     target=(uint8_t *)pFromUArgs->target;
4741     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
4742
4743     table=cnv->sharedData->mbcs.fromUnicodeTable;
4744     sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
4745     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4746         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4747     } else {
4748         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
4749     }
4750     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
4751
4752     if(cnv->useFallback) {
4753         /* use all roundtrip and fallback results */
4754         minValue=0x800;
4755     } else {
4756         /* use only roundtrips and fallbacks from private-use characters */
4757         minValue=0xc00;
4758     }
4759     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
4760
4761     /* get the converter state from the UTF-8 UConverter */
4762     c=(UChar32)utf8->toUnicodeStatus;
4763     if(c!=0) {
4764         toULength=oldToULength=utf8->toULength;
4765         toULimit=(int8_t)utf8->mode;
4766     } else {
4767         toULength=oldToULength=toULimit=0;
4768     }
4769
4770     /*
4771      * Make sure that the last byte sequence before sourceLimit is complete
4772      * or runs into a lead byte.
4773      * Do not go back into the bytes that will be read for finishing a partial
4774      * sequence from the previous buffer.
4775      * In the conversion loop compare source with sourceLimit only once
4776      * per multi-byte character.
4777      */
4778     {
4779         int32_t i, length;
4780
4781         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
4782         for(i=0; i<3 && i<length;) {
4783             b=*(sourceLimit-i-1);
4784             if(U8_IS_TRAIL(b)) {
4785                 ++i;
4786             } else {
4787                 if(i<utf8_countTrailBytes[b]) {
4788                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
4789                     sourceLimit-=i+1;
4790                 }
4791                 break;
4792             }
4793         }
4794     }
4795
4796     if(c!=0 && targetCapacity>0) {
4797         utf8->toUnicodeStatus=0;
4798         utf8->toULength=0;
4799         goto moreBytes;
4800         /*
4801          * Note: We could avoid the goto by duplicating some of the moreBytes
4802          * code, but only up to the point of collecting a complete UTF-8
4803          * sequence; then recurse for the toUBytes[toULength]
4804          * and then continue with normal conversion.
4805          *
4806          * If so, move this code to just after initializing the minimum
4807          * set of local variables for reading the UTF-8 input
4808          * (utf8, source, target, limits but not cnv, table, minValue, etc.).
4809          *
4810          * Potential advantages:
4811          * - avoid the goto
4812          * - oldToULength could become a local variable in just those code blocks
4813          *   that deal with buffer boundaries
4814          * - possibly faster if the goto prevents some compiler optimizations
4815          *   (this would need measuring to confirm)
4816          * Disadvantage:
4817          * - code duplication
4818          */
4819     }
4820
4821     /* conversion loop */
4822     while(source<sourceLimit) {
4823         if(targetCapacity>0) {
4824             b=*source++;
4825             if((int8_t)b>=0) {
4826                 /* convert ASCII */
4827                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
4828                     *target++=(uint8_t)b;
4829                     --targetCapacity;
4830                     continue;
4831                 } else {
4832                     c=b;
4833                     value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
4834                 }
4835             } else {
4836                 if(b<0xe0) {
4837                     if( /* handle U+0080..U+07FF inline */
4838                         b>=0xc2 &&
4839                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
4840                     ) {
4841                         c=b&0x1f;
4842                         ++source;
4843                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
4844                         if(value>=minValue) {
4845                             *target++=(uint8_t)value;
4846                             --targetCapacity;
4847                             continue;
4848                         } else {
4849                             c=(c<<6)|t1;
4850                         }
4851                     } else {
4852                         c=-1;
4853                     }
4854                 } else if(b==0xe0) {
4855                     if( /* handle U+0800..U+0FFF inline */
4856                         (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
4857                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
4858                     ) {
4859                         c=t1;
4860                         source+=2;
4861                         value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
4862                         if(value>=minValue) {
4863                             *target++=(uint8_t)value;
4864                             --targetCapacity;
4865                             continue;
4866                         } else {
4867                             c=(c<<6)|t2;
4868                         }
4869                     } else {
4870                         c=-1;
4871                     }
4872                 } else {
4873                     c=-1;
4874                 }
4875
4876                 if(c<0) {
4877                     /* handle "complicated" and error cases, and continuing partial characters */
4878                     oldToULength=0;
4879                     toULength=1;
4880                     toULimit=utf8_countTrailBytes[b]+1;
4881                     c=b;
4882 moreBytes:
4883                     while(toULength<toULimit) {
4884                         if(source<sourceLimit) {
4885                             b=*source;
4886                             if(U8_IS_TRAIL(b)) {
4887                                 ++source;
4888                                 ++toULength;
4889                                 c=(c<<6)+b;
4890                             } else {
4891                                 break; /* sequence too short, stop with toULength<toULimit */
4892                             }
4893                         } else {
4894                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
4895                             source-=(toULength-oldToULength);
4896                             while(oldToULength<toULength) {
4897                                 utf8->toUBytes[oldToULength++]=*source++;
4898                             }
4899                             utf8->toUnicodeStatus=c;
4900                             utf8->toULength=toULength;
4901                             utf8->mode=toULimit;
4902                             pToUArgs->source=(char *)source;
4903                             pFromUArgs->target=(char *)target;
4904                             return;
4905                         }
4906                     }
4907
4908                     if( toULength==toULimit &&      /* consumed all trail bytes */
4909                         (toULength==3 || toULength==2) &&             /* BMP */
4910                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
4911                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
4912                     ) {
4913                         value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4914                     } else if(
4915                         toULength==toULimit && toULength==4 &&
4916                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
4917                     ) {
4918                         /* supplementary code point */
4919                         if(!hasSupplementary) {
4920                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4921                             value=0;
4922                         } else {
4923                             value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4924                         }
4925                     } else {
4926                         /* error handling: illegal UTF-8 byte sequence */
4927                         source-=(toULength-oldToULength);
4928                         while(oldToULength<toULength) {
4929                             utf8->toUBytes[oldToULength++]=*source++;
4930                         }
4931                         utf8->toULength=toULength;
4932                         pToUArgs->source=(char *)source;
4933                         pFromUArgs->target=(char *)target;
4934                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4935                         return;
4936                     }
4937                 }
4938             }
4939
4940             if(value>=minValue) {
4941                 /* output the mapping for c */
4942                 *target++=(uint8_t)value;
4943                 --targetCapacity;
4944             } else {
4945                 /* value<minValue means c is unassigned (unmappable) */
4946                 /*
4947                  * Try an extension mapping.
4948                  * Pass in no source because we don't have UTF-16 input.
4949                  * If we have a partial match on c, we will return and revert
4950                  * to UTF-8->UTF-16->charset conversion.
4951                  */
4952                 static const UChar nul=0;
4953                 const UChar *noSource=&nul;
4954                 c=_extFromU(cnv, cnv->sharedData,
4955                             c, &noSource, noSource,
4956                             &target, target+targetCapacity,
4957                             NULL, -1,
4958                             pFromUArgs->flush,
4959                             pErrorCode);
4960
4961                 if(U_FAILURE(*pErrorCode)) {
4962                     /* not mappable or buffer overflow */
4963                     cnv->fromUChar32=c;
4964                     break;
4965                 } else if(cnv->preFromUFirstCP>=0) {
4966                     /*
4967                      * Partial match, return and revert to pivoting.
4968                      * In normal from-UTF-16 conversion, we would just continue
4969                      * but then exit the loop because the extension match would
4970                      * have consumed the source.
4971                      */
4972                     break;
4973                 } else {
4974                     /* a mapping was written to the target, continue */
4975
4976                     /* recalculate the targetCapacity after an extension mapping */
4977                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
4978                 }
4979             }
4980         } else {
4981             /* target is full */
4982             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4983             break;
4984         }
4985     }
4986
4987     /*
4988      * The sourceLimit may have been adjusted before the conversion loop
4989      * to stop before a truncated sequence.
4990      * If so, then collect the truncated sequence now.
4991      */
4992     if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
4993         c=utf8->toUBytes[0]=b=*source++;
4994         toULength=1;
4995         toULimit=utf8_countTrailBytes[b]+1;
4996         while(source<sourceLimit) {
4997             utf8->toUBytes[toULength++]=b=*source++;
4998             c=(c<<6)+b;
4999         }
5000         utf8->toUnicodeStatus=c;
5001         utf8->toULength=toULength;
5002         utf8->mode=toULimit;
5003     }
5004
5005     /* write back the updated pointers */
5006     pToUArgs->source=(char *)source;
5007     pFromUArgs->target=(char *)target;
5008 }
5009
5010 static void
5011 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5012                   UConverterToUnicodeArgs *pToUArgs,
5013                   UErrorCode *pErrorCode) {
5014     UConverter *utf8, *cnv;
5015     const uint8_t *source, *sourceLimit;
5016     uint8_t *target;
5017     int32_t targetCapacity;
5018
5019     const uint16_t *table, *mbcsIndex;
5020     const uint16_t *results;
5021
5022     int8_t oldToULength, toULength, toULimit;
5023
5024     UChar32 c;
5025     uint8_t b, t1, t2;
5026
5027     uint32_t stage2Entry;
5028     uint32_t asciiRoundtrips;
5029     uint16_t value, minValue;
5030     UBool hasSupplementary;
5031
5032     /* set up the local pointers */
5033     utf8=pToUArgs->converter;
5034     cnv=pFromUArgs->converter;
5035     source=(uint8_t *)pToUArgs->source;
5036     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5037     target=(uint8_t *)pFromUArgs->target;
5038     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5039
5040     table=cnv->sharedData->mbcs.fromUnicodeTable;
5041     mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
5042     if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
5043         results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5044     } else {
5045         results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5046     }
5047     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5048
5049     if(cnv->useFallback) {
5050         /* use all roundtrip and fallback results */
5051         minValue=0x800;
5052     } else {
5053         /* use only roundtrips and fallbacks from private-use characters */
5054         minValue=0xc00;
5055     }
5056     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5057
5058     /* get the converter state from the UTF-8 UConverter */
5059     c=(UChar32)utf8->toUnicodeStatus;
5060     if(c!=0) {
5061         toULength=oldToULength=utf8->toULength;
5062         toULimit=(int8_t)utf8->mode;
5063     } else {
5064         toULength=oldToULength=toULimit=0;
5065     }
5066
5067     /*
5068      * Make sure that the last byte sequence before sourceLimit is complete
5069      * or runs into a lead byte.
5070      * Do not go back into the bytes that will be read for finishing a partial
5071      * sequence from the previous buffer.
5072      * In the conversion loop compare source with sourceLimit only once
5073      * per multi-byte character.
5074      */
5075     {
5076         int32_t i, length;
5077
5078         length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
5079         for(i=0; i<3 && i<length;) {
5080             b=*(sourceLimit-i-1);
5081             if(U8_IS_TRAIL(b)) {
5082                 ++i;
5083             } else {
5084                 if(i<utf8_countTrailBytes[b]) {
5085                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
5086                     sourceLimit-=i+1;
5087                 }
5088                 break;
5089             }
5090         }
5091     }
5092
5093     if(c!=0 && targetCapacity>0) {
5094         utf8->toUnicodeStatus=0;
5095         utf8->toULength=0;
5096         goto moreBytes;
5097         /* See note in ucnv_SBCSFromUTF8() about this goto. */
5098     }
5099
5100     /* conversion loop */
5101     while(source<sourceLimit) {
5102         if(targetCapacity>0) {
5103             b=*source++;
5104             if((int8_t)b>=0) {
5105                 /* convert ASCII */
5106                 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5107                     *target++=b;
5108                     --targetCapacity;
5109                     continue;
5110                 } else {
5111                     value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
5112                     if(value==0) {
5113                         c=b;
5114                         goto unassigned;
5115                     }
5116                 }
5117             } else {
5118                 if(b>0xe0) {
5119                     if( /* handle U+1000..U+D7FF inline */
5120                         (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
5121                                                         (b==0xed && (t1 <= 0x1f))) &&
5122                         (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5123                     ) {
5124                         c=((b&0xf)<<6)|t1;
5125                         source+=2;
5126                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
5127                         if(value==0) {
5128                             c=(c<<6)|t2;
5129                             goto unassigned;
5130                         }
5131                     } else {
5132                         c=-1;
5133                     }
5134                 } else if(b<0xe0) {
5135                     if( /* handle U+0080..U+07FF inline */
5136                         b>=0xc2 &&
5137                         (t1=(uint8_t)(*source-0x80)) <= 0x3f
5138                     ) {
5139                         c=b&0x1f;
5140                         ++source;
5141                         value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
5142                         if(value==0) {
5143                             c=(c<<6)|t1;
5144                             goto unassigned;
5145                         }
5146                     } else {
5147                         c=-1;
5148                     }
5149                 } else {
5150                     c=-1;
5151                 }
5152
5153                 if(c<0) {
5154                     /* handle "complicated" and error cases, and continuing partial characters */
5155                     oldToULength=0;
5156                     toULength=1;
5157                     toULimit=utf8_countTrailBytes[b]+1;
5158                     c=b;
5159 moreBytes:
5160                     while(toULength<toULimit) {
5161                         if(source<sourceLimit) {
5162                             b=*source;
5163                             if(U8_IS_TRAIL(b)) {
5164                                 ++source;
5165                                 ++toULength;
5166                                 c=(c<<6)+b;
5167                             } else {
5168                                 break; /* sequence too short, stop with toULength<toULimit */
5169                             }
5170                         } else {
5171                             /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
5172                             source-=(toULength-oldToULength);
5173                             while(oldToULength<toULength) {
5174                                 utf8->toUBytes[oldToULength++]=*source++;
5175                             }
5176                             utf8->toUnicodeStatus=c;
5177                             utf8->toULength=toULength;
5178                             utf8->mode=toULimit;
5179                             pToUArgs->source=(char *)source;
5180                             pFromUArgs->target=(char *)target;
5181                             return;
5182                         }
5183                     }
5184
5185                     if( toULength==toULimit &&      /* consumed all trail bytes */
5186                         (toULength==3 || toULength==2) &&             /* BMP */
5187                         (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5188                         (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
5189                     ) {
5190                         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5191                     } else if(
5192                         toULength==toULimit && toULength==4 &&
5193                         (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5194                     ) {
5195                         /* supplementary code point */
5196                         if(!hasSupplementary) {
5197                             /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5198                             stage2Entry=0;
5199                         } else {
5200                             stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5201                         }
5202                     } else {
5203                         /* error handling: illegal UTF-8 byte sequence */
5204                         source-=(toULength-oldToULength);
5205                         while(oldToULength<toULength) {
5206                             utf8->toUBytes[oldToULength++]=*source++;
5207                         }
5208                         utf8->toULength=toULength;
5209                         pToUArgs->source=(char *)source;
5210                         pFromUArgs->target=(char *)target;
5211                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5212                         return;
5213                     }
5214
5215                     /* get the bytes and the length for the output */
5216                     /* MBCS_OUTPUT_2 */
5217                     value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
5218
5219                     /* is this code point assigned, or do we use fallbacks? */
5220                     if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
5221                          (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
5222                     ) {
5223                         goto unassigned;
5224                     }
5225                 }
5226             }
5227
5228             /* write the output character bytes from value and length */
5229             /* from the first if in the loop we know that targetCapacity>0 */
5230             if(value<=0xff) {
5231                 /* this is easy because we know that there is enough space */
5232                 *target++=(uint8_t)value;
5233                 --targetCapacity;
5234             } else /* length==2 */ {
5235                 *target++=(uint8_t)(value>>8);
5236                 if(2<=targetCapacity) {
5237                     *target++=(uint8_t)value;
5238                     targetCapacity-=2;
5239                 } else {
5240                     cnv->charErrorBuffer[0]=(char)value;
5241                     cnv->charErrorBufferLength=1;
5242
5243                     /* target overflow */
5244                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5245                     break;
5246                 }
5247             }
5248             continue;
5249
5250 unassigned:
5251             {
5252                 /*
5253                  * Try an extension mapping.
5254                  * Pass in no source because we don't have UTF-16 input.
5255                  * If we have a partial match on c, we will return and revert
5256                  * to UTF-8->UTF-16->charset conversion.
5257                  */
5258                 static const UChar nul=0;
5259                 const UChar *noSource=&nul;
5260                 c=_extFromU(cnv, cnv->sharedData,
5261                             c, &noSource, noSource,
5262                             &target, target+targetCapacity,
5263                             NULL, -1,
5264                             pFromUArgs->flush,
5265                             pErrorCode);
5266
5267                 if(U_FAILURE(*pErrorCode)) {
5268                     /* not mappable or buffer overflow */
5269                     cnv->fromUChar32=c;
5270                     break;
5271                 } else if(cnv->preFromUFirstCP>=0) {
5272                     /*
5273                      * Partial match, return and revert to pivoting.
5274                      * In normal from-UTF-16 conversion, we would just continue
5275                      * but then exit the loop because the extension match would
5276                      * have consumed the source.
5277                      */
5278                     break;
5279                 } else {
5280                     /* a mapping was written to the target, continue */
5281
5282                     /* recalculate the targetCapacity after an extension mapping */
5283                     targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5284                     continue;
5285                 }
5286             }
5287         } else {
5288             /* target is full */
5289             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5290             break;
5291         }
5292     }
5293
5294     /*
5295      * The sourceLimit may have been adjusted before the conversion loop
5296      * to stop before a truncated sequence.
5297      * If so, then collect the truncated sequence now.
5298      */
5299     if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
5300         c=utf8->toUBytes[0]=b=*source++;
5301         toULength=1;
5302         toULimit=utf8_countTrailBytes[b]+1;
5303         while(source<sourceLimit) {
5304             utf8->toUBytes[toULength++]=b=*source++;
5305             c=(c<<6)+b;
5306         }
5307         utf8->toUnicodeStatus=c;
5308         utf8->toULength=toULength;
5309         utf8->mode=toULimit;
5310     }
5311
5312     /* write back the updated pointers */
5313     pToUArgs->source=(char *)source;
5314     pFromUArgs->target=(char *)target;
5315 }
5316
5317 /* miscellaneous ------------------------------------------------------------ */
5318
5319 static void
5320 ucnv_MBCSGetStarters(const UConverter* cnv,
5321                  UBool starters[256],
5322                  UErrorCode *pErrorCode) {
5323     /* A byte is a starter (lead byte) when its entry in the dbcsOnlyState row is a transition; pErrorCode is not written. */
5324     const int32_t *row=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
5325     int b=0;
5326
5327     while(b<256) {
5328         starters[b]=(UBool)MBCS_ENTRY_IS_TRANSITION(row[b]);
5329         ++b;
5330     }
5331 }
5332
5333 /* Internal: lets other converter implementations ask whether byte is a lead byte. */
5334 U_CFUNC UBool
5335 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
5336     /* go through uint8_t so that a char with the high bit set still indexes 0..255 */
5337     const uint8_t index=(uint8_t)byte;
5338     /* lead bytes are the ones with a transition entry in row 0 of the state table */
5339     return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][index]);
5340 }
5341
5342 static void
5343 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
5344               int32_t offsetIndex,
5345               UErrorCode *pErrorCode) {
5346     /*
5347      * Emits the substitution bytes for an unmappable code point through ucnv_cbFromUWriteBytes().
5348      */
5349     UConverter *cnv=pArgs->converter;
5350     char siso[4];
5351     char *out=siso, *sub;
5352     int32_t subLength;
5353     UBool pickSubChar1;
5354
5355     /* subChar1 is a candidate only if it is set (not 0) */
5356     if(cnv->subChar1==0) {
5357         pickSubChar1=FALSE;
5358     } else if(cnv->sharedData->mbcs.extIndexes!=NULL) {
5359         pickSubChar1=cnv->useSubChar1;
5360     } else {
5361         /* without extension data: unmappable code points up to U+00ff get subChar1 (IBM MBCS behavior) */
5362         pickSubChar1=(UBool)(cnv->invalidUCharBuffer[0]<=0xff);
5363     }
5364
5365     if(pickSubChar1) {
5366         sub=(char *)&cnv->subChar1;
5367         subLength=1;
5368     } else {
5369         sub=(char *)cnv->subChars;
5370         subLength=cnv->subCharLen;
5371     }
5372
5373     /* the selector is consumed; clear it for the next code point */
5374     cnv->useSubChar1=FALSE;
5375
5376     if(cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
5377         /* fromUnicodeStatus contains prevLength; put a shift byte in front when the mode changes */
5378         if(subLength==1) {
5379             if(cnv->fromUnicodeStatus==2) {
5380                 cnv->fromUnicodeStatus=1;   /* DBCS mode and SBCS sub char: change to SBCS */
5381                 *out++=UCNV_SI;
5382             }
5383             *out++=sub[0];
5384         } else if(subLength==2) {
5385             if(cnv->fromUnicodeStatus<=1) {
5386                 cnv->fromUnicodeStatus=2;   /* SBCS mode and DBCS sub char: change to DBCS */
5387                 *out++=UCNV_SO;
5388             }
5389             *out++=sub[0];
5390             *out++=sub[1];
5391         } else {
5392             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
5393             return;
5394         }
5395         sub=siso;
5396         subLength=(int32_t)(out-siso);
5397     }
5398
5399     ucnv_cbFromUWriteBytes(pArgs, sub, subLength, offsetIndex, pErrorCode);
5400 }
5401
5402 U_CFUNC UConverterType
5403 ucnv_MBCSGetType(const UConverter* converter) {
5404     /* MBCS stands in for SBCS, DBCS and EBCDIC_STATEFUL; report the one the data matches, first match wins */
5405     const UConverterSharedData *shared=converter->sharedData;
5406     if(shared->mbcs.countStates==1) {
5407         return (UConverterType)UCNV_SBCS;
5408     }
5409     if((shared->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
5410         return (UConverterType)UCNV_EBCDIC_STATEFUL;
5411     }
5412     return (UConverterType)((shared->staticData->minBytesPerChar==2 && shared->staticData->maxBytesPerChar==2) ? UCNV_DBCS : UCNV_MBCS);
5413 }
5414
5415 static const UConverterImpl _SBCSUTF8Impl={ /* same leading entries as _MBCSImpl, plus two trailing slots */
5416     UCNV_MBCS,
5417     /* load and unload functions */
5418     ucnv_MBCSLoad,
5419     ucnv_MBCSUnload,
5420     /* open function, followed by two slots that are left NULL */
5421     ucnv_MBCSOpen,
5422     NULL,
5423     NULL,
5424     /* conversion functions: each ...WithOffsets function is listed twice, then the next-UChar getter */
5425     ucnv_MBCSToUnicodeWithOffsets,
5426     ucnv_MBCSToUnicodeWithOffsets,
5427     ucnv_MBCSFromUnicodeWithOffsets,
5428     ucnv_MBCSFromUnicodeWithOffsets,
5429     ucnv_MBCSGetNextUChar,
5430     /* starters, name, substitution writer, one slot left NULL, Unicode set */
5431     ucnv_MBCSGetStarters,
5432     ucnv_MBCSGetName,
5433     ucnv_MBCSWriteSub,
5434     NULL,
5435     ucnv_MBCSGetUnicodeSet,
5436     /* the two extra slots that _MBCSImpl does not initialize explicitly */
5437     NULL,
5438     ucnv_SBCSFromUTF8 /* NOTE(review): presumably the from-UTF-8 slot of UConverterImpl -- confirm against the struct declaration */
5439 };
5440
5441 static const UConverterImpl _DBCSUTF8Impl={ /* same leading entries as _MBCSImpl, plus two trailing slots */
5442     UCNV_MBCS,
5443     /* load and unload functions */
5444     ucnv_MBCSLoad,
5445     ucnv_MBCSUnload,
5446     /* open function, followed by two slots that are left NULL */
5447     ucnv_MBCSOpen,
5448     NULL,
5449     NULL,
5450     /* conversion functions: each ...WithOffsets function is listed twice, then the next-UChar getter */
5451     ucnv_MBCSToUnicodeWithOffsets,
5452     ucnv_MBCSToUnicodeWithOffsets,
5453     ucnv_MBCSFromUnicodeWithOffsets,
5454     ucnv_MBCSFromUnicodeWithOffsets,
5455     ucnv_MBCSGetNextUChar,
5456     /* starters, name, substitution writer, one slot left NULL, Unicode set */
5457     ucnv_MBCSGetStarters,
5458     ucnv_MBCSGetName,
5459     ucnv_MBCSWriteSub,
5460     NULL,
5461     ucnv_MBCSGetUnicodeSet,
5462     /* the two extra slots that _MBCSImpl does not initialize explicitly */
5463     NULL,
5464     ucnv_DBCSFromUTF8 /* NOTE(review): presumably the from-UTF-8 slot of UConverterImpl -- confirm against the struct declaration */
5465 };
5466
5467 static const UConverterImpl _MBCSImpl={ /* implementation table referenced by _MBCSData */
5468     UCNV_MBCS,
5469     /* load and unload functions */
5470     ucnv_MBCSLoad,
5471     ucnv_MBCSUnload,
5472     /* open function, followed by two slots that are left NULL */
5473     ucnv_MBCSOpen,
5474     NULL,
5475     NULL,
5476     /* conversion functions: each ...WithOffsets function is listed twice, then the next-UChar getter */
5477     ucnv_MBCSToUnicodeWithOffsets,
5478     ucnv_MBCSToUnicodeWithOffsets,
5479     ucnv_MBCSFromUnicodeWithOffsets,
5480     ucnv_MBCSFromUnicodeWithOffsets,
5481     ucnv_MBCSGetNextUChar,
5482     /* starters, name, substitution writer, one slot left NULL, Unicode set */
5483     ucnv_MBCSGetStarters,
5484     ucnv_MBCSGetName,
5485     ucnv_MBCSWriteSub,
5486     NULL,
5487     ucnv_MBCSGetUnicodeSet /* last explicit entry; any members after it are zero-initialized by the C initializer rules */
5488 };
5489
5490
5491 /* Static data is in tools/makeconv/ucnvstat.c for data-based
5492  * converters. Be sure to update it as well.
5493  */
5494
5495 const UConverterSharedData _MBCSData={ /* shared-data record that points at the plain _MBCSImpl table */
5496     sizeof(UConverterSharedData), 1, /* the struct's own size, then the constant 1 */
5497     NULL, NULL, NULL, FALSE, &_MBCSImpl, /* three NULL pointers, FALSE, then the implementation table */
5498     0 /* NOTE(review): all values are positional; check UConverterSharedData's declaration before adding or reordering fields */
5499 };
5500
5501 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */