]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/ucnvmbcs.cpp
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnvmbcs.cpp
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
3*
2ca993e8 4* Copyright (C) 2000-2016, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
b331163b 8* file name: ucnvmbcs.cpp
b75a7d8f
A
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2000jul03
14* created by: Markus W. Scherer
15*
16* The current code in this file replaces the previous implementation
17* of conversion code from multi-byte codepages to Unicode and back.
18* This implementation supports the following:
19* - legacy variable-length codepages with up to 4 bytes per character
20* - all Unicode code points (up to 0x10ffff)
21* - efficient distinction of unassigned vs. illegal byte sequences
22* - it is possible in fromUnicode() to directly deal with simple
23* stateful encodings (used for EBCDIC_STATEFUL)
374ca955 24* - it is possible to convert Unicode code points
b75a7d8f
A
25* to a single zero byte (but not as a fallback except for SBCS)
26*
27* Remaining limitations in fromUnicode:
28* - byte sequences must not have leading zero bytes
29* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
30* - limitation to up to 4 bytes per character
31*
374ca955
A
32* ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these
33* limitations and adds m:n character mappings and other features.
34* See ucnv_ext.h for details.
35*
b75a7d8f
A
36* Change history:
37*
38* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
39* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
40* macros to ucnvmbcs.h file
41*/
42
43#include "unicode/utypes.h"
44
374ca955 45#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
b75a7d8f
A
46
47#include "unicode/ucnv.h"
48#include "unicode/ucnv_cb.h"
49#include "unicode/udata.h"
50#include "unicode/uset.h"
4388f060
A
51#include "unicode/utf8.h"
52#include "unicode/utf16.h"
b75a7d8f
A
53#include "ucnv_bld.h"
54#include "ucnvmbcs.h"
374ca955 55#include "ucnv_ext.h"
b75a7d8f 56#include "ucnv_cnv.h"
b75a7d8f
A
57#include "cmemory.h"
58#include "cstring.h"
b331163b 59#include "umutex.h"
b75a7d8f
A
60
61/* control optimizations according to the platform */
62#define MBCS_UNROLL_SINGLE_TO_BMP 1
63#define MBCS_UNROLL_SINGLE_FROM_BMP 0
64
65/*
46f4442e 66 * _MBCSHeader versions 5.3 & 4.3
b75a7d8f
A
67 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
68 *
46f4442e
A
69 * This version is optional. Version 5 is used for incompatible data format changes.
70 * makeconv will continue to generate version 4 files if possible.
71 *
72 * Changes from version 4:
73 *
74 * The main difference is an additional _MBCSHeader field with
75 * - the length (number of uint32_t) of the _MBCSHeader
76 * - flags for further incompatible data format changes
77 * - flags for further, backward compatible data format changes
78 *
79 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
80 * the file and needs to be reconstituted at load time.
81 * This requires a utf8Friendly format with an additional mbcsIndex table for fast
82 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
83 * (For details about these structures see below, and see ucnvmbcs.h.)
84 *
85 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
86 * of the Unicode code points. (This requires that the .ucm file has the |0 etc.
87 * precision markers for all mappings.)
88 *
89 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
90 * omitted data that can be reconstituted from the toUnicode data.
91 *
92 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
93 * With only roundtrip mappings in the base fromUnicode data, this part is fully
94 * redundant with the mbcsIndex and will be reconstituted from that (also using the
95 * stage 1 table which contains the information about how stage 2 was compacted).
96 *
97 * The rest of the stage 2 table, the part for code points above maxFastUChar,
98 * is stored in the file and will be appended to the reconstituted part.
99 *
100 * The entire fromUBytes array is omitted from the file and will be reconstituted.
101 * This is done by enumerating all toUnicode roundtrip mappings, performing
102 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
103 * writing instead of reading the byte values.
104 *
105 * _MBCSHeader version 4.3
106 *
107 * Change from version 4.2:
108 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
109 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
110 * files which can be used instead of stages 1 & 2.
111 * Faster lookups for roundtrips from most commonly used characters,
112 * and lookups from UTF-8 byte sequences with a natural bit distribution.
113 * See ucnvmbcs.h for more details.
114 *
374ca955
A
115 * Change from version 4.1:
116 * - Added an optional extension table structure at the end of the .cnv file.
117 * It is present if the upper bits of the header flags field contains a non-zero
118 * byte offset to it.
119 * Files that contain only a conversion table and no base table
120 * use the special outputType MBCS_OUTPUT_EXT_ONLY.
121 * These contain the base table name between the MBCS header and the extension
122 * data.
123 *
b75a7d8f
A
124 * Change from version 4.0:
125 * - Replace header.reserved with header.fromUBytesLength so that all
126 * fields in the data have length.
127 *
128 * Changes from version 3 (for performance improvements):
129 * - new bit distribution for state table entries
130 * - reordered action codes
131 * - new data structure for single-byte fromUnicode
132 * + stage 2 only contains indexes
133 * + stage 3 stores 16 bits per character with classification bits 15..8
134 * - no multiplier for stage 1 entries
135 * - stage 2 for non-single-byte codepages contains the index and the flags in
136 * one 32-bit value
137 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers
138 *
139 * For more details about old versions of the MBCS data structure, see
140 * the corresponding versions of this file.
141 *
142 * Converting stateless codepage data ---------------------------------------***
143 * (or codepage data with simple states) to Unicode.
144 *
145 * Data structure and algorithm for converting from complex legacy codepages
146 * to Unicode. (Designed before 2000-may-22.)
147 *
148 * The basic idea is that the structure of legacy codepages can be described
149 * with state tables.
150 * When reading a byte stream, each input byte causes a state transition.
151 * Some transitions result in the output of a code point, some result in
152 * "unassigned" or "illegal" output.
153 * This is used here for character conversion.
154 *
155 * The data structure begins with a state table consisting of a row
156 * per state, with 256 entries (columns) per row for each possible input
157 * byte value.
158 * Each entry is 32 bits wide, with two formats distinguished by
159 * the sign bit (bit 31):
160 *
161 * One format for transitional entries (bit 31 not set) for non-final bytes, and
162 * one format for final entries (bit 31 set).
163 * Both formats contain the number of the next state in the same bit
164 * positions.
165 * State 0 is the initial state.
166 *
167 * Most of the time, the offset values of subsequent states are added
168 * up to a scalar value. This value will eventually be the index of
169 * the Unicode code point in a table that follows the state table.
170 * The effect is that the code points for final state table rows
171 * are contiguous. The code points of final state rows follow each other
172 * in the order of the references to those final states by previous
173 * states, etc.
174 *
175 * For some terminal states, the offset is itself the output Unicode
176 * code point (16 bits for a BMP code point or 20 bits for a supplementary
177 * code point (stored as code point minus 0x10000 so that 20 bits are enough).
178 * For others, the code point in the Unicode table is stored with either
179 * one or two code units: one for BMP code points, two for a pair of
180 * surrogates.
181 * All code points for a final state entry take up the same number of code
182 * units, regardless of whether they all actually _use_ the same number
183 * of code units. This is necessary for simple array access.
184 *
185 * An additional feature comes in with what in ICU is called "fallback"
186 * mappings:
187 *
188 * In addition to round-trippable, precise, 1:1 mappings, there are often
189 * mappings defined between similar, though not the same, characters.
190 * Typically, such mappings occur only in fromUnicode mapping tables because
191 * Unicode has a superset repertoire of most other codepages. However, it
192 * is possible to provide such mappings in the toUnicode tables, too.
193 * In this case, the fallback mappings are partly integrated into the
194 * general state tables because the structure of the encoding includes their
195 * byte sequences.
196 * For final entries in an initial state, fallback mappings are stored in
197 * the entry itself like with roundtrip mappings.
198 * For other final entries, they are stored in the code units table if
199 * the entry is for a pair of code units.
200 * For single-unit results in the code units table, there is no space to
201 * alternatively hold a fallback mapping; in this case, the code unit
202 * is stored as U+fffe (unassigned), and the fallback mapping needs to
203 * be looked up by the scalar offset value in a separate table.
204 *
205 * "Unassigned" state entries really mean "structurally unassigned",
206 * i.e., such a byte sequence will never have a mapping result.
207 *
208 * The interpretation of the bits in each entry is as follows:
209 *
210 * Bit 31 not set, not a terminal entry ("transitional"):
211 * 30..24 next state
212 * 23..0 offset delta, to be added up
213 *
214 * Bit 31 set, terminal ("final") entry:
215 * 30..24 next state (regardless of action code)
216 * 23..20 action code:
217 * action codes 0 and 1 result in precise-mapping Unicode code points
218 * 0 valid byte sequence
219 * 19..16 not used, 0
220 * 15..0 16-bit Unicode BMP code point
221 * never U+fffe or U+ffff
222 * 1 valid byte sequence
223 * 19..0 20-bit Unicode supplementary code point
224 * never U+fffe or U+ffff
225 *
226 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points
227 * 2 valid byte sequence (fallback)
228 * 19..16 not used, 0
229 * 15..0 16-bit Unicode BMP code point as fallback result
230 * 3 valid byte sequence (fallback)
231 * 19..0 20-bit Unicode supplementary code point as fallback result
232 *
233 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results
234 * depending on the code units they result in
235 * 4 valid byte sequence
236 * 19..9 not used, 0
237 * 8..0 final offset delta
238 * pointing to one 16-bit code unit which may be
239 * fffe unassigned -- look for a fallback for this offset
240 * ffff illegal
241 * 5 valid byte sequence
242 * 19..9 not used, 0
243 * 8..0 final offset delta
244 * pointing to two 16-bit code units
245 * (typically UTF-16 surrogates)
246 * the result depends on the first code unit as follows:
247 * 0000..d7ff roundtrip BMP code point (1st alone)
248 * d800..dbff roundtrip surrogate pair (1st, 2nd)
249 * dc00..dfff fallback surrogate pair (1st-400, 2nd)
250 * e000 roundtrip BMP code point (2nd alone)
251 * e001 fallback BMP code point (2nd alone)
252 * fffe unassigned
253 * ffff illegal
254 * (the final offset deltas are at most 255 * 2,
255 * times 2 because of storing code unit pairs)
256 *
257 * 6 unassigned byte sequence
258 * 19..16 not used, 0
259 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)
260 * this does not contain a final offset delta because the main
261 * purpose of this action code is to save scalar offset values;
262 * therefore, fallback values cannot be assigned to byte
263 * sequences that result in this action code
264 * 7 illegal byte sequence
265 * 19..16 not used, 0
266 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)
267 * 8 state change only
268 * 19..0 not used, 0
269 * useful for state changes in simple stateful encodings,
270 * at Shift-In/Shift-Out codes
271 *
272 *
273 * 9..15 reserved for future use
274 * current implementations will only perform a state change
275 * and ignore bits 19..0
276 *
277 * An encoding with contiguous ranges of unassigned byte sequences, like
278 * Shift-JIS and especially EUC-TW, can be stored efficiently by having
279 * at least two states for the trail bytes:
280 * One trail byte state that results in code points, and one that only
281 * has "unassigned" and "illegal" terminal states.
282 *
46f4442e 283 * Note: partly by accident, this data structure supports simple stateful
b75a7d8f
A
284 * encodings without any additional logic.
285 * Currently, only simple Shift-In/Shift-Out schemes are handled with
286 * appropriate state tables (especially EBCDIC_STATEFUL!).
287 *
288 * MBCS version 2 added:
289 * unassigned and illegal action codes have U+fffe and U+ffff
290 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()
291 *
292 * Converting from Unicode to codepage bytes --------------------------------***
293 *
294 * The conversion data structure for fromUnicode is designed for the known
295 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to
296 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is
297 * a roundtrip mapping.
298 *
299 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3
300 * like in the character properties table.
301 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3
302 * with the resulting bytes is at offsetFromUBytes.
303 *
304 * Beginning with version 4, single-byte codepages have a significantly different
305 * trie compared to other codepages.
306 * In all cases, the entry in stage 1 is directly the index of the block of
307 * 64 entries in stage 2.
308 *
309 * Single-byte lookup:
310 *
311 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.
312 * Stage 3 contains one 16-bit word per result:
313 * Bits 15..8 indicate the kind of result:
314 * f roundtrip result
315 * c fallback result from private-use code point
316 * 8 fallback result from other code points
317 * 0 unassigned
318 * Bits 7..0 contain the codepage byte. A zero byte is always possible.
319 *
46f4442e
A
320 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly
321 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup
322 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
323 * ASCII code points can be looked up with a linear array access into stage 3.
324 * See maxFastUChar and other details in ucnvmbcs.h.
325 *
b75a7d8f
A
326 * Multi-byte lookup:
327 *
328 * Stage 2 contains a 32-bit word for each 16-block in stage 3:
329 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results
330 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
331 * If this test is false, then a non-zero result will be interpreted as
332 * a fallback mapping.
333 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)
334 *
335 * Stage 3 contains 2, 3, or 4 bytes per result.
336 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,
337 * while 3 bytes are stored as bytes in big-endian order.
338 * Leading zero bytes are ignored, and the number of bytes is counted.
339 * A zero byte mapping result is possible as a roundtrip result.
340 * For some output types, the actual result is processed from this;
374ca955 341 * see ucnv_MBCSFromUnicodeWithOffsets().
b75a7d8f
A
342 *
343 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),
344 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.
345 *
46f4442e
A
346 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.
347 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup
348 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.
349 * ASCII code points can be looked up with a linear array access into stage 3.
350 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.
351 *
b75a7d8f
A
352 * In version 3, stage 2 blocks may overlap by multiples of the multiplier
353 * for compaction.
354 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)
355 * may overlap by any number of entries.
356 *
357 * MBCS version 2 added:
358 * the converter checks for known output types, which allows
359 * adding new ones without crashing an unaware converter
360 */
361
b331163b
A
362/**
363 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
364 * consecutive sequences of bytes, starting from the one encoded in value,
365 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
366 * Does not currently support m:n mappings or reverse fallbacks.
367 * This function will not be called for sequences of bytes with leading zeros.
368 *
369 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
370 * @param value contains 1..4 bytes of the first byte sequence, right-aligned
371 * @param codePoints resulting Unicode code points, or negative if a byte sequence does
372 * not map to anything
373 * @return TRUE to continue enumeration, FALSE to stop
374 */
/*
 * Note: this typedef names a function type, not a pointer-to-function type;
 * enumToU() and ucnv_MBCSEnumToUnicode() receive it as
 * "UConverterEnumToUCallback *callback".
 */
375typedef UBool U_CALLCONV
376UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
377
378static void
379ucnv_MBCSLoad(UConverterSharedData *sharedData,
380 UConverterLoadArgs *pArgs,
381 const uint8_t *raw,
382 UErrorCode *pErrorCode);
383
384static void
385ucnv_MBCSUnload(UConverterSharedData *sharedData);
386
387static void
388ucnv_MBCSOpen(UConverter *cnv,
389 UConverterLoadArgs *pArgs,
390 UErrorCode *pErrorCode);
391
392static UChar32
393ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
394 UErrorCode *pErrorCode);
395
396static void
397ucnv_MBCSGetStarters(const UConverter* cnv,
398 UBool starters[256],
399 UErrorCode *pErrorCode);
400
401static const char *
402ucnv_MBCSGetName(const UConverter *cnv);
403
404static void
405ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
406 int32_t offsetIndex,
407 UErrorCode *pErrorCode);
408
409static UChar32
410ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
411 UErrorCode *pErrorCode);
412
413static void
414ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
415 UConverterToUnicodeArgs *pToUArgs,
416 UErrorCode *pErrorCode);
417
418static void
419ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
420 const USetAdder *sa,
421 UConverterUnicodeSet which,
422 UErrorCode *pErrorCode);
423
424static void
425ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
426 UConverterToUnicodeArgs *pToUArgs,
427 UErrorCode *pErrorCode);
428
429static const UConverterImpl _SBCSUTF8Impl={
430 UCNV_MBCS,
431
432 ucnv_MBCSLoad,
433 ucnv_MBCSUnload,
434
435 ucnv_MBCSOpen,
436 NULL,
437 NULL,
438
439 ucnv_MBCSToUnicodeWithOffsets,
440 ucnv_MBCSToUnicodeWithOffsets,
441 ucnv_MBCSFromUnicodeWithOffsets,
442 ucnv_MBCSFromUnicodeWithOffsets,
443 ucnv_MBCSGetNextUChar,
444
445 ucnv_MBCSGetStarters,
446 ucnv_MBCSGetName,
447 ucnv_MBCSWriteSub,
448 NULL,
449 ucnv_MBCSGetUnicodeSet,
450
451 NULL,
452 ucnv_SBCSFromUTF8
453};
454
455static const UConverterImpl _DBCSUTF8Impl={
456 UCNV_MBCS,
457
458 ucnv_MBCSLoad,
459 ucnv_MBCSUnload,
460
461 ucnv_MBCSOpen,
462 NULL,
463 NULL,
464
465 ucnv_MBCSToUnicodeWithOffsets,
466 ucnv_MBCSToUnicodeWithOffsets,
467 ucnv_MBCSFromUnicodeWithOffsets,
468 ucnv_MBCSFromUnicodeWithOffsets,
469 ucnv_MBCSGetNextUChar,
470
471 ucnv_MBCSGetStarters,
472 ucnv_MBCSGetName,
473 ucnv_MBCSWriteSub,
474 NULL,
475 ucnv_MBCSGetUnicodeSet,
476
477 NULL,
478 ucnv_DBCSFromUTF8
479};
480
481static const UConverterImpl _MBCSImpl={
482 UCNV_MBCS,
483
484 ucnv_MBCSLoad,
485 ucnv_MBCSUnload,
486
487 ucnv_MBCSOpen,
488 NULL,
489 NULL,
490
491 ucnv_MBCSToUnicodeWithOffsets,
492 ucnv_MBCSToUnicodeWithOffsets,
493 ucnv_MBCSFromUnicodeWithOffsets,
494 ucnv_MBCSFromUnicodeWithOffsets,
495 ucnv_MBCSGetNextUChar,
496
497 ucnv_MBCSGetStarters,
498 ucnv_MBCSGetName,
499 ucnv_MBCSWriteSub,
500 NULL,
501 ucnv_MBCSGetUnicodeSet,
502 NULL,
503 NULL
504};
505
506
507/* Static data is in tools/makeconv/ucnvstat.c for data-based
508 * converters. Be sure to update it as well.
509 */
510
511const UConverterSharedData _MBCSData={
512 sizeof(UConverterSharedData), 1,
2ca993e8 513 NULL, NULL, FALSE, TRUE, &_MBCSImpl,
b331163b
A
514 0, UCNV_MBCS_TABLE_INITIALIZER
515};
516
b75a7d8f
A
517
518/* GB 18030 data ------------------------------------------------------------ */
519
/*
 * Helper macros for linear values for GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) folds the four byte values into one number using
 * the weights 10, 126 and 10 for b, c and d. No base is subtracted, so the
 * results are meaningful as differences (e.g. relative to LINEAR_18030_BASE).
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

/* linear value of the four-byte sequence 81 30 81 30 */
#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * Linear value of a four-byte sequence packed big-endian into one 32-bit value.
 * The argument is parenthesized at every use so that expressions such as
 * LINEAR(hi|lo) expand correctly; it is evaluated four times, so it must not
 * have side effects.
 */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)
526
527/*
528 * Some ranges of GB 18030 where both the Unicode code points and the
529 * GB four-byte sequences are contiguous and are handled algorithmically by
530 * the special callback functions below.
531 * The values are start & end of Unicode & GB codes.
532 *
533 * Note that single surrogates are not mapped by GB 18030
534 * as of the re-released mapping tables from 2000-nov-30.
535 */
536static const uint32_t
4388f060 537gb18030Ranges[14][4]={
b75a7d8f
A
538 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
539 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
4388f060
A
540 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
541 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
b75a7d8f
A
542 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
543 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
544 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
545 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
546 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
547 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
548 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
549 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
550 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
551 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
552};
553
554/* bit flag for UConverter.options indicating GB 18030 special handling */
555#define _MBCS_OPTION_GB18030 0x8000
556
729e4ab9
A
557/* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
#define _MBCS_OPTION_KEIS   0x01000
#define _MBCS_OPTION_JEF    0x02000
#define _MBCS_OPTION_JIPS   0x04000

/* KEIS: two-byte shift-out and shift-in sequences */
#define KEIS_SO_CHAR_1      0x0A
#define KEIS_SO_CHAR_2      0x42
#define KEIS_SI_CHAR_1      0x0A
#define KEIS_SI_CHAR_2      0x41

/* JEF: single-byte shift-out and shift-in */
#define JEF_SO_CHAR         0x28
#define JEF_SI_CHAR         0x29

/* JIPS: two-byte shift-out and shift-in sequences */
#define JIPS_SO_CHAR_1      0x1A
#define JIPS_SO_CHAR_2      0x70
#define JIPS_SI_CHAR_1      0x1A
#define JIPS_SI_CHAR_2      0x71
574
/* Selects which shift sequence getSISOBytes() produces: shift-in or shift-out. */
typedef enum SISO_Option {
    SI,
    SO
} SISO_Option;
580
581static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) {
582 int32_t SISOLength = 0;
583
584 switch (option) {
585 case SI:
586 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
587 value[0] = KEIS_SI_CHAR_1;
588 value[1] = KEIS_SI_CHAR_2;
589 SISOLength = 2;
590 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
591 value[0] = JEF_SI_CHAR;
592 SISOLength = 1;
593 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
594 value[0] = JIPS_SI_CHAR_1;
595 value[1] = JIPS_SI_CHAR_2;
596 SISOLength = 2;
597 } else {
598 value[0] = UCNV_SI;
599 SISOLength = 1;
600 }
601 break;
602 case SO:
603 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {
604 value[0] = KEIS_SO_CHAR_1;
605 value[1] = KEIS_SO_CHAR_2;
606 SISOLength = 2;
607 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {
608 value[0] = JEF_SO_CHAR;
609 SISOLength = 1;
610 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {
611 value[0] = JIPS_SO_CHAR_1;
612 value[1] = JIPS_SO_CHAR_2;
613 SISOLength = 2;
614 } else {
615 value[0] = UCNV_SO;
616 SISOLength = 1;
617 }
618 break;
619 default:
620 /* Should never happen. */
621 break;
622 }
623
624 return SISOLength;
625}
626
b75a7d8f
A
627/* Miscellaneous ------------------------------------------------------------ */
628
374ca955 629/* similar to ucnv_MBCSGetNextUChar() but recursive */
46f4442e
A
/*
 * Recursively enumerates the toUnicode results reachable from one state-table row.
 *
 * For each byte b in the range recorded in stateProps[state]:
 * - a transition entry recurses into the next state (unless that state's
 *   stateProps value is negative) and records U_SENTINEL for b itself;
 * - a final entry is decoded into a code point, or U_SENTINEL if it yields none.
 * After each complete group of 32 byte values, callback() is invoked with the
 * 32 results if at least one of them is non-negative.
 *
 * mbcsTable:  its stateTable and unicodeCodeUnits are read
 * stateProps: per-state properties as computed by getStateProp()
 * offset:     sum of the transition offsets on the path to this state
 * value:      bytes consumed on the path so far; shifted left by 8 here to
 *             make room for this state's byte
 * pErrorCode: only passed on to the recursive calls in this function
 *
 * Returns FALSE as soon as callback() (here or in a recursive call) returns
 * FALSE, otherwise TRUE.
 */
630static UBool
631enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
632 int32_t state, uint32_t offset,
633 uint32_t value,
634 UConverterEnumToUCallback *callback, const void *context,
635 UErrorCode *pErrorCode) {
636 UChar32 codePoints[32];
637 const int32_t *row;
638 const uint16_t *unicodeCodeUnits;
639 UChar32 anyCodePoints;
640 int32_t b, limit;
641
642 row=mbcsTable->stateTable[state];
643 unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
374ca955 644
46f4442e
A
645 value<<=8;
646 anyCodePoints=-1; /* becomes non-negative if there is a mapping */
647
/* stateProps bits 5..3 hold (first byte)>>5; <<2 yields that byte rounded down to a multiple of 32 */
648 b=(stateProps[state]&0x38)<<2;
649 if(b==0 && stateProps[state]>=0x40) {
650 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
651 codePoints[0]=U_SENTINEL;
652 b=1;
653 }
/* stateProps bits 2..0 hold (last byte)>>5; limit is the exclusive end of that group of 32 */
654 limit=((stateProps[state]&7)+1)<<5;
655 while(b<limit) {
656 int32_t entry=row[b];
374ca955 657 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
46f4442e
A
658 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
659 if(stateProps[nextState]>=0) {
660 /* recurse to a state with non-ignorable actions */
661 if(!enumToU(
662 mbcsTable, stateProps, nextState,
663 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
664 value|(uint32_t)b,
665 callback, context,
666 pErrorCode)) {
667 return FALSE;
668 }
669 }
/* a non-final byte has no result of its own in this group */
670 codePoints[b&0x1f]=U_SENTINEL;
374ca955
A
671 } else {
672 UChar32 c;
46f4442e 673 int32_t action;
b75a7d8f 674
374ca955
A
675 /*
676 * An if-else-if chain provides more reliable performance for
677 * the most common cases compared to a switch.
678 */
46f4442e 679 action=MBCS_ENTRY_FINAL_ACTION(entry);
374ca955
A
680 if(action==MBCS_STATE_VALID_DIRECT_16) {
681 /* output BMP code point */
682 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
683 } else if(action==MBCS_STATE_VALID_16) {
46f4442e
A
684 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
685 c=unicodeCodeUnits[finalOffset];
374ca955
A
686 if(c<0xfffe) {
687 /* output BMP code point */
688 } else {
/* 0xfffe (unassigned) and 0xffff (illegal) yield no mapping */
689 c=U_SENTINEL;
b75a7d8f 690 }
374ca955 691 } else if(action==MBCS_STATE_VALID_16_PAIR) {
46f4442e
A
692 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
693 c=unicodeCodeUnits[finalOffset++];
374ca955
A
694 if(c<0xd800) {
695 /* output BMP code point below 0xd800 */
696 } else if(c<=0xdbff) {
697 /* output roundtrip supplementary code point (lead d800..dbff only; dc00..dfff fallback pairs fall through to U_SENTINEL below) */
46f4442e 698 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
374ca955
A
699 } else if(c==0xe000) {
700 /* output roundtrip BMP code point stored in the second unit (e001 fallbacks fall through to U_SENTINEL below) */
46f4442e 701 c=unicodeCodeUnits[finalOffset];
374ca955
A
702 } else {
703 c=U_SENTINEL;
b75a7d8f 704 }
374ca955
A
705 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
706 /* output supplementary code point */
707 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
46f4442e
A
708 } else {
/* every other action code yields no mapping in this enumeration */
709 c=U_SENTINEL;
710 }
711
712 codePoints[b&0x1f]=c;
/* AND-ing with any non-negative c clears the sign bit, which then stays cleared */
713 anyCodePoints&=c;
714 }
/* end of a group of 32 byte values: deliver it if it contains at least one mapping */
715 if(((++b)&0x1f)==0) {
716 if(anyCodePoints>=0) {
/* b-0x20 is the first byte of the group just completed */
717 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
718 return FALSE;
719 }
720 anyCodePoints=-1;
b75a7d8f 721 }
46f4442e
A
722 }
723 }
724 return TRUE;
725}
b75a7d8f 726
46f4442e
A
727/*
728 * Only called if stateProps[state]==-1.
729 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
730 * MBCS_STATE_CHANGE_ONLY.
731 */
/*
 * Computes stateProps[state] and, recursively, the entry of every state
 * referenced from this row whose entry is still -1.
 *
 * Resulting bits of stateProps[state] as written here:
 *   bits 5..3  (first byte with a non-ignorable entry)>>5
 *   bits 2..0  (last byte with a non-ignorable entry)>>5
 *   0x40       set when a final entry in the scanned range has an action code
 *              <=MBCS_STATE_FALLBACK_DIRECT_20, or when another row's final
 *              entry names this state as its next state
 *   -0x40      the whole row contains only ignorable entries
 * "Non-ignorable" means: a transition to a state whose stateProps is >=0, or
 * a final entry with an action code below MBCS_STATE_UNASSIGNED.
 *
 * Returns stateProps[state]; the callers visible in this part of the file
 * ignore the return value.
 */
732static int8_t
733getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
734 const int32_t *row;
735 int32_t min, max, entry, nextState;
736
737 row=stateTable[state];
/* no longer -1, so the recursive calls below will not re-enter this state */
738 stateProps[state]=0;
739
740 /* find the first byte with a non-ignorable entry */
741 for(min=0;; ++min) {
742 entry=row[min];
743 nextState=MBCS_ENTRY_STATE(entry);
744 if(stateProps[nextState]==-1) {
745 getStateProp(stateTable, stateProps, nextState);
746 }
747 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
748 if(stateProps[nextState]>=0) {
749 break;
750 }
751 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
752 break;
753 }
754 if(min==0xff) {
/* all 256 entries are ignorable */
755 stateProps[state]=-0x40; /* (int8_t)0xc0 */
756 return stateProps[state];
757 }
758 }
/* store min>>5 in bits 5..3 */
759 stateProps[state]|=(int8_t)((min>>5)<<3);
760
761 /* find the last byte with a non-ignorable entry; stops at min, which is known to qualify */
762 for(max=0xff; min<max; --max) {
763 entry=row[max];
764 nextState=MBCS_ENTRY_STATE(entry);
765 if(stateProps[nextState]==-1) {
766 getStateProp(stateTable, stateProps, nextState);
767 }
768 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
769 if(stateProps[nextState]>=0) {
770 break;
771 }
772 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
773 break;
774 }
775 }
/* store max>>5 in bits 2..0 */
776 stateProps[state]|=(int8_t)(max>>5);
777
778 /* recurse further and collect direct-state information */
779 while(min<=max) {
780 entry=row[min];
781 nextState=MBCS_ENTRY_STATE(entry);
782 if(stateProps[nextState]==-1) {
783 getStateProp(stateTable, stateProps, nextState);
784 }
785 if(MBCS_ENTRY_IS_FINAL(entry)) {
/* the state that follows a final entry starts a new byte sequence */
786 stateProps[nextState]|=0x40;
787 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
/* this row holds a direct result, so mark this state itself as well */
788 stateProps[state]|=0x40;
374ca955 789 }
b75a7d8f 790 }
46f4442e 791 ++min;
b75a7d8f 792 }
46f4442e 793 return stateProps[state];
b75a7d8f
A
794}
795
374ca955 796/*
46f4442e
A
797 * Internal function enumerating the toUnicode data of an MBCS converter.
798 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
799 * table, but could also be used for a future ucnv_getUnicodeSet() option
800 * that includes reverse fallbacks (after updating this function's implementation).
801 * Currently only handles roundtrip mappings.
374ca955 802 * Does not currently handle extensions.
374ca955 803 */
46f4442e
A
804static void
805ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
806 UConverterEnumToUCallback *callback, const void *context,
807 UErrorCode *pErrorCode) {
808 /*
809 * Properties for each state, to speed up the enumeration.
810 * Ignorable actions are unassigned/illegal/state-change-only:
811 * They do not lead to mappings.
812 *
813 * Bits 7..6:
814 * 1 direct/initial state (stateful converters have multiple)
815 * 0 non-initial state with transitions or with non-ignorable result actions
816 * -1 final state with only ignorable actions
817 *
818 * Bits 5..3:
819 * The lowest byte value with non-ignorable actions is
820 * value<<5 (rounded down).
821 *
822 * Bits 2..0:
823 * The highest byte value with non-ignorable actions is
824 * (value<<5)&0x1f (rounded up).
825 */
826 int8_t stateProps[MBCS_MAX_STATE_COUNT];
827 int32_t state;
828
829 uprv_memset(stateProps, -1, sizeof(stateProps));
830
831 /* recurse from state 0 and set all stateProps */
832 getStateProp(mbcsTable->stateTable, stateProps, 0);
833
834 for(state=0; state<mbcsTable->countStates; ++state) {
835 /*if(stateProps[state]==-1) {
836 printf("unused/unreachable <icu:state> %d\n", state);
837 }*/
838 if(stateProps[state]>=0x40) {
839 /* start from each direct state */
840 enumToU(
841 mbcsTable, stateProps, state, 0, 0,
842 callback, context,
843 pErrorCode);
844 }
845 }
374ca955
A
846}
847
848U_CFUNC void
46f4442e
A
849ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
850 const USetAdder *sa,
851 UConverterUnicodeSet which,
852 UConverterSetFilter filter,
853 UErrorCode *pErrorCode) {
374ca955 854 const UConverterMBCSTable *mbcsTable;
b75a7d8f
A
855 const uint16_t *table;
856
857 uint32_t st3;
858 uint16_t st1, maxStage1, st2;
859
860 UChar32 c;
861
b75a7d8f 862 /* enumerate the from-Unicode trie table */
374ca955 863 mbcsTable=&sharedData->mbcs;
b75a7d8f
A
864 table=mbcsTable->fromUnicodeTable;
865 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
866 maxStage1=0x440;
867 } else {
868 maxStage1=0x40;
869 }
870
871 c=0; /* keep track of the current code point while enumerating */
872
873 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
874 const uint16_t *stage2, *stage3, *results;
46f4442e 875 uint16_t minValue;
b75a7d8f
A
876
877 results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
878
46f4442e
A
879 /*
880 * Set a threshold variable for selecting which mappings to use.
881 * See ucnv_MBCSSingleFromBMPWithOffsets() and
882 * MBCS_SINGLE_RESULT_FROM_U() for details.
883 */
884 if(which==UCNV_ROUNDTRIP_SET) {
885 /* use only roundtrips */
886 minValue=0xf00;
887 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
888 /* use all roundtrip and fallback results */
889 minValue=0x800;
890 }
891
b75a7d8f
A
892 for(st1=0; st1<maxStage1; ++st1) {
893 st2=table[st1];
894 if(st2>maxStage1) {
895 stage2=table+st2;
896 for(st2=0; st2<64; ++st2) {
897 if((st3=stage2[st2])!=0) {
898 /* read the stage 3 block */
899 stage3=results+st3;
900
b75a7d8f 901 do {
46f4442e 902 if(*stage3++>=minValue) {
374ca955
A
903 sa->add(sa->set, c);
904 }
905 } while((++c&0xf)!=0);
906 } else {
907 c+=16; /* empty stage 3 block */
908 }
909 }
910 } else {
911 c+=1024; /* empty stage 2 block */
912 }
913 }
46f4442e 914 } else {
374ca955 915 const uint32_t *stage2;
46f4442e
A
916 const uint8_t *stage3, *bytes;
917 uint32_t st3Multiplier;
918 uint32_t value;
919 UBool useFallback;
374ca955 920
46f4442e 921 bytes=mbcsTable->fromUnicodeBytes;
374ca955 922
46f4442e 923 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
374ca955 924
46f4442e
A
925 switch(mbcsTable->outputType) {
926 case MBCS_OUTPUT_3:
927 case MBCS_OUTPUT_4_EUC:
928 st3Multiplier=3;
929 break;
930 case MBCS_OUTPUT_4:
931 st3Multiplier=4;
932 break;
933 default:
934 st3Multiplier=2;
935 break;
b75a7d8f 936 }
b75a7d8f
A
937
938 for(st1=0; st1<maxStage1; ++st1) {
939 st2=table[st1];
940 if(st2>(maxStage1>>1)) {
941 stage2=(const uint32_t *)table+st2;
942 for(st2=0; st2<64; ++st2) {
943 if((st3=stage2[st2])!=0) {
46f4442e
A
944 /* read the stage 3 block */
945 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
946
b75a7d8f
A
947 /* get the roundtrip flags for the stage 3 block */
948 st3>>=16;
949
950 /*
46f4442e
A
951 * Add code points for which the roundtrip flag is set,
952 * or which map to non-zero bytes if we use fallbacks.
374ca955 953 * See ucnv_MBCSFromUnicodeWithOffsets() for details.
b75a7d8f 954 */
46f4442e
A
955 switch(filter) {
956 case UCNV_SET_FILTER_NONE:
957 do {
958 if(st3&1) {
959 sa->add(sa->set, c);
960 stage3+=st3Multiplier;
961 } else if(useFallback) {
962 uint8_t b=0;
963 switch(st3Multiplier) {
964 case 4:
965 b|=*stage3++;
2ca993e8
A
966 U_FALLTHROUGH;
967 case 3:
46f4442e 968 b|=*stage3++;
2ca993e8
A
969 U_FALLTHROUGH;
970 case 2:
46f4442e
A
971 b|=stage3[0]|stage3[1];
972 stage3+=2;
2ca993e8 973 U_FALLTHROUGH;
46f4442e
A
974 default:
975 break;
976 }
977 if(b!=0) {
978 sa->add(sa->set, c);
979 }
980 }
981 st3>>=1;
982 } while((++c&0xf)!=0);
983 break;
984 case UCNV_SET_FILTER_DBCS_ONLY:
985 /* Ignore single-byte results (<0x100). */
986 do {
987 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
988 sa->add(sa->set, c);
989 }
990 st3>>=1;
991 stage3+=2; /* +=st3Multiplier */
992 } while((++c&0xf)!=0);
993 break;
994 case UCNV_SET_FILTER_2022_CN:
995 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
996 do {
997 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
998 sa->add(sa->set, c);
999 }
1000 st3>>=1;
1001 stage3+=3; /* +=st3Multiplier */
1002 } while((++c&0xf)!=0);
1003 break;
1004 case UCNV_SET_FILTER_SJIS:
1005 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
1006 do {
1007 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
1008 sa->add(sa->set, c);
1009 }
1010 st3>>=1;
1011 stage3+=2; /* +=st3Multiplier */
1012 } while((++c&0xf)!=0);
1013 break;
1014 case UCNV_SET_FILTER_GR94DBCS:
1015 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
1016 do {
1017 if( ((st3&1)!=0 || useFallback) &&
1018 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
1019 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
1020 ) {
1021 sa->add(sa->set, c);
1022 }
1023 st3>>=1;
1024 stage3+=2; /* +=st3Multiplier */
1025 } while((++c&0xf)!=0);
1026 break;
1027 case UCNV_SET_FILTER_HZ:
1028 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
1029 do {
1030 if( ((st3&1)!=0 || useFallback) &&
1031 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
1032 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
1033 ) {
1034 sa->add(sa->set, c);
1035 }
1036 st3>>=1;
1037 stage3+=2; /* +=st3Multiplier */
1038 } while((++c&0xf)!=0);
1039 break;
1040 default:
1041 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1042 return;
1043 }
b75a7d8f
A
1044 } else {
1045 c+=16; /* empty stage 3 block */
1046 }
1047 }
1048 } else {
1049 c+=1024; /* empty stage 2 block */
1050 }
1051 }
1052 }
374ca955 1053
46f4442e
A
1054 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
1055}
1056
1057U_CFUNC void
1058ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
1059 const USetAdder *sa,
1060 UConverterUnicodeSet which,
1061 UErrorCode *pErrorCode) {
1062 ucnv_MBCSGetFilteredUnicodeSetForUnicode(
1063 sharedData, sa, which,
1064 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
1065 UCNV_SET_FILTER_DBCS_ONLY :
1066 UCNV_SET_FILTER_NONE,
1067 pErrorCode);
374ca955
A
1068}
1069
1070static void
1071ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
73c04bcf 1072 const USetAdder *sa,
374ca955
A
1073 UConverterUnicodeSet which,
1074 UErrorCode *pErrorCode) {
1075 if(cnv->options&_MBCS_OPTION_GB18030) {
1076 sa->addRange(sa->set, 0, 0xd7ff);
1077 sa->addRange(sa->set, 0xe000, 0x10ffff);
1078 } else {
1079 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
1080 }
1081}
1082
1083/* conversion extensions for input not in the main table -------------------- */
1084
1085/*
1086 * Hardcoded extension handling for GB 18030.
1087 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
1088 *
1089 * In the future, conversion extensions may handle m:n mappings and delta tables,
46f4442e 1090 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
374ca955
A
1091 *
1092 * If an input character cannot be mapped, then these functions set an error
1093 * code. The framework will then call the callback function.
1094 */
1095
1096/*
1097 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
1098 * else return 0 after output has been written to the target
1099 */
1100static UChar32
1101_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
1102 UChar32 cp,
1103 const UChar **source, const UChar *sourceLimit,
46f4442e 1104 uint8_t **target, const uint8_t *targetLimit,
374ca955
A
1105 int32_t **offsets, int32_t sourceIndex,
1106 UBool flush,
1107 UErrorCode *pErrorCode) {
1108 const int32_t *cx;
1109
1110 cnv->useSubChar1=FALSE;
1111
1112 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
1113 ucnv_extInitialMatchFromU(
1114 cnv, cx,
1115 cp, source, sourceLimit,
46f4442e 1116 (char **)target, (char *)targetLimit,
374ca955
A
1117 offsets, sourceIndex,
1118 flush,
1119 pErrorCode)
1120 ) {
1121 return 0; /* an extension mapping handled the input */
1122 }
1123
1124 /* GB 18030 */
1125 if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
1126 const uint32_t *range;
1127 int32_t i;
1128
1129 range=gb18030Ranges[0];
b331163b 1130 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
374ca955
A
1131 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
1132 /* found the Unicode code point, output the four-byte sequence for it */
1133 uint32_t linear;
1134 char bytes[4];
1135
1136 /* get the linear value of the first GB 18030 code in this range */
1137 linear=range[2]-LINEAR_18030_BASE;
1138
1139 /* add the offset from the beginning of the range */
1140 linear+=((uint32_t)cp-range[0]);
1141
1142 /* turn this into a four-byte sequence */
1143 bytes[3]=(char)(0x30+linear%10); linear/=10;
1144 bytes[2]=(char)(0x81+linear%126); linear/=126;
1145 bytes[1]=(char)(0x30+linear%10); linear/=10;
1146 bytes[0]=(char)(0x81+linear);
1147
1148 /* output this sequence */
1149 ucnv_fromUWriteBytes(cnv,
46f4442e 1150 bytes, 4, (char **)target, (char *)targetLimit,
374ca955
A
1151 offsets, sourceIndex, pErrorCode);
1152 return 0;
1153 }
1154 }
1155 }
1156
1157 /* no mapping */
1158 *pErrorCode=U_INVALID_CHAR_FOUND;
1159 return cp;
1160}
1161
1162/*
1163 * Input sequence: cnv->toUBytes[0..length[
1164 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
1165 * else return 0 after output has been written to the target
1166 */
1167static int8_t
1168_extToU(UConverter *cnv, const UConverterSharedData *sharedData,
1169 int8_t length,
46f4442e 1170 const uint8_t **source, const uint8_t *sourceLimit,
374ca955
A
1171 UChar **target, const UChar *targetLimit,
1172 int32_t **offsets, int32_t sourceIndex,
1173 UBool flush,
1174 UErrorCode *pErrorCode) {
1175 const int32_t *cx;
1176
1177 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&
1178 ucnv_extInitialMatchToU(
1179 cnv, cx,
46f4442e 1180 length, (const char **)source, (const char *)sourceLimit,
374ca955
A
1181 target, targetLimit,
1182 offsets, sourceIndex,
1183 flush,
1184 pErrorCode)
1185 ) {
1186 return 0; /* an extension mapping handled the input */
1187 }
1188
1189 /* GB 18030 */
1190 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
1191 const uint32_t *range;
1192 uint32_t linear;
1193 int32_t i;
1194
1195 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
1196 range=gb18030Ranges[0];
b331163b 1197 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
374ca955
A
1198 if(range[2]<=linear && linear<=range[3]) {
1199 /* found the sequence, output the Unicode code point for it */
1200 *pErrorCode=U_ZERO_ERROR;
1201
1202 /* add the linear difference between the input and start sequences to the start code point */
1203 linear=range[0]+(linear-range[2]);
1204
1205 /* output this code point */
1206 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);
1207
1208 return 0;
1209 }
1210 }
1211 }
1212
1213 /* no mapping */
1214 *pErrorCode=U_INVALID_CHAR_FOUND;
1215 return length;
b75a7d8f
A
1216}
1217
1218/* EBCDIC swap LF<->NL ------------------------------------------------------ */
1219
1220/*
1221 * This code modifies a standard EBCDIC<->Unicode mapping table for
1222 * OS/390 (z/OS) Unix System Services (Open Edition).
1223 * The difference is in the mapping of Line Feed and New Line control codes:
1224 * Standard EBCDIC maps
1225 *
1226 * <U000A> \x25 |0
1227 * <U0085> \x15 |0
1228 *
1229 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
1230 * mapping
1231 *
1232 * <U000A> \x15 |0
1233 * <U0085> \x25 |0
1234 *
1235 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
1236 * by copying it into allocated memory and swapping the LF and NL values.
1237 * It allows to support the same EBCDIC charset in both versions without
1238 * duplicating the entire installed table.
1239 */
1240
1241/* standard EBCDIC codes */
1242#define EBCDIC_LF 0x25
1243#define EBCDIC_NL 0x15
1244
1245/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
1246#define EBCDIC_RT_LF 0xf25
1247#define EBCDIC_RT_NL 0xf15
1248
1249/* Unicode code points */
1250#define U_LF 0x0a
1251#define U_NL 0x85
1252
1253static UBool
1254_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
1255 UConverterMBCSTable *mbcsTable;
1256
1257 const uint16_t *table, *results;
1258 const uint8_t *bytes;
1259
1260 int32_t (*newStateTable)[256];
1261 uint16_t *newResults;
1262 uint8_t *p;
1263 char *name;
1264
1265 uint32_t stage2Entry;
1266 uint32_t size, sizeofFromUBytes;
1267
374ca955 1268 mbcsTable=&sharedData->mbcs;
b75a7d8f
A
1269
1270 table=mbcsTable->fromUnicodeTable;
1271 bytes=mbcsTable->fromUnicodeBytes;
1272 results=(const uint16_t *)bytes;
1273
1274 /*
1275 * Check that this is an EBCDIC table with SBCS portion -
1276 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
1277 *
1278 * If not, ignore the option. Options are always ignored if they do not apply.
1279 */
1280 if(!(
1281 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
1282 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
1283 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
1284 )) {
1285 return FALSE;
1286 }
1287
1288 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1289 if(!(
1290 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
1291 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
1292 )) {
1293 return FALSE;
1294 }
1295 } else /* MBCS_OUTPUT_2_SISO */ {
1296 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1297 if(!(
1298 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
1299 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
1300 )) {
1301 return FALSE;
1302 }
1303
1304 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1305 if(!(
1306 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
1307 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
1308 )) {
1309 return FALSE;
1310 }
1311 }
1312
374ca955
A
1313 if(mbcsTable->fromUBytesLength>0) {
1314 /*
1315 * We _know_ the number of bytes in the fromUnicodeBytes array
1316 * starting with header.version 4.1.
1317 */
1318 sizeofFromUBytes=mbcsTable->fromUBytesLength;
1319 } else {
1320 /*
1321 * Otherwise:
1322 * There used to be code to enumerate the fromUnicode
1323 * trie and find the highest entry, but it was removed in ICU 3.2
1324 * because it was not tested and caused a low code coverage number.
1325 * See Jitterbug 3674.
1326 * This affects only some .cnv file formats with a header.version
1327 * below 4.1, and only when swaplfnl is requested.
1328 *
1329 * ucnvmbcs.c revision 1.99 is the last one with the
1330 * ucnv_MBCSSizeofFromUBytes() function.
1331 */
1332 *pErrorCode=U_INVALID_FORMAT_ERROR;
1333 return FALSE;
1334 }
1335
b75a7d8f
A
1336 /*
1337 * The table has an appropriate format.
1338 * Allocate and build
1339 * - a modified to-Unicode state table
1340 * - a modified from-Unicode output array
1341 * - a converter name string with the swap option appended
1342 */
b75a7d8f
A
1343 size=
1344 mbcsTable->countStates*1024+
1345 sizeofFromUBytes+
1346 UCNV_MAX_CONVERTER_NAME_LENGTH+20;
1347 p=(uint8_t *)uprv_malloc(size);
1348 if(p==NULL) {
1349 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1350 return FALSE;
1351 }
1352
1353 /* copy and modify the to-Unicode state table */
1354 newStateTable=(int32_t (*)[256])p;
1355 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);
1356
1357 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
1358 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
1359
1360 /* copy and modify the from-Unicode result table */
1361 newResults=(uint16_t *)newStateTable[mbcsTable->countStates];
1362 uprv_memcpy(newResults, bytes, sizeofFromUBytes);
1363
1364 /* conveniently, the table access macros work on the left side of expressions */
1365 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
1366 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
1367 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
1368 } else /* MBCS_OUTPUT_2_SISO */ {
1369 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
1370 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;
1371
1372 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
1373 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
1374 }
1375
1376 /* set the canonical converter name */
1377 name=(char *)newResults+sizeofFromUBytes;
1378 uprv_strcpy(name, sharedData->staticData->name);
1379 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);
1380
1381 /* set the pointers */
1382 umtx_lock(NULL);
1383 if(mbcsTable->swapLFNLStateTable==NULL) {
1384 mbcsTable->swapLFNLStateTable=newStateTable;
1385 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;
1386 mbcsTable->swapLFNLName=name;
1387
1388 newStateTable=NULL;
1389 }
1390 umtx_unlock(NULL);
1391
1392 /* release the allocated memory if another thread beat us to it */
1393 if(newStateTable!=NULL) {
1394 uprv_free(newStateTable);
1395 }
1396 return TRUE;
1397}
1398
46f4442e
A
1399/* reconstitute omitted fromUnicode data ------------------------------------ */
1400
1401/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
1402static UBool U_CALLCONV
1403writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
1404 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
1405 const uint16_t *table;
1406 uint32_t *stage2;
1407 uint8_t *bytes, *p;
1408 UChar32 c;
1409 int32_t i, st3;
1410
1411 table=mbcsTable->fromUnicodeTable;
1412 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
1413
1414 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
1415 switch(mbcsTable->outputType) {
1416 case MBCS_OUTPUT_3_EUC:
1417 if(value<=0xffff) {
1418 /* short sequences are stored directly */
1419 /* code set 0 or 1 */
1420 } else if(value<=0x8effff) {
1421 /* code set 2 */
1422 value&=0x7fff;
1423 } else /* first byte is 0x8f */ {
1424 /* code set 3 */
1425 value&=0xff7f;
1426 }
1427 break;
1428 case MBCS_OUTPUT_4_EUC:
1429 if(value<=0xffffff) {
1430 /* short sequences are stored directly */
1431 /* code set 0 or 1 */
1432 } else if(value<=0x8effffff) {
1433 /* code set 2 */
1434 value&=0x7fffff;
1435 } else /* first byte is 0x8f */ {
1436 /* code set 3 */
1437 value&=0xff7fff;
1438 }
1439 break;
1440 default:
1441 break;
1442 }
1443
1444 for(i=0; i<=0x1f; ++value, ++i) {
1445 c=codePoints[i];
1446 if(c<0) {
1447 continue;
1448 }
1449
1450 /* locate the stage 2 & 3 data */
1451 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
1452 p=bytes;
1453 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
1454
1455 /* write the codepage bytes into stage 3 */
1456 switch(mbcsTable->outputType) {
1457 case MBCS_OUTPUT_3:
1458 case MBCS_OUTPUT_4_EUC:
1459 p+=st3*3;
1460 p[0]=(uint8_t)(value>>16);
1461 p[1]=(uint8_t)(value>>8);
1462 p[2]=(uint8_t)value;
1463 break;
1464 case MBCS_OUTPUT_4:
1465 ((uint32_t *)p)[st3]=value;
1466 break;
1467 default:
1468 /* 2 bytes per character */
1469 ((uint16_t *)p)[st3]=(uint16_t)value;
1470 break;
1471 }
1472
1473 /* set the roundtrip flag */
1474 *stage2|=(1UL<<(16+(c&0xf)));
1475 }
1476 return TRUE;
1477 }
1478
1479static void
1480reconstituteData(UConverterMBCSTable *mbcsTable,
1481 uint32_t stage1Length, uint32_t stage2Length,
1482 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */
1483 UErrorCode *pErrorCode) {
1484 uint16_t *stage1;
1485 uint32_t *stage2;
46f4442e
A
1486 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
1487 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
1488 if(mbcsTable->reconstitutedData==NULL) {
1489 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1490 return;
1491 }
1492 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
1493
1494 /* copy existing data and reroute the pointers */
1495 stage1=(uint16_t *)mbcsTable->reconstitutedData;
1496 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
1497
1498 stage2=(uint32_t *)(stage1+stage1Length);
1499 uprv_memcpy(stage2+(fullStage2Length-stage2Length),
1500 mbcsTable->fromUnicodeTable+stage1Length,
1501 stage2Length*4);
1502
1503 mbcsTable->fromUnicodeTable=stage1;
51004dcb 1504 mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
46f4442e
A
1505
1506 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
1507 stage2=(uint32_t *)stage1;
1508
1509 /* reconstitute the initial part of stage 2 from the mbcsIndex */
1510 {
1511 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
1512 int32_t stageUTF8Index=0;
1513 int32_t st1, st2, st3, i;
1514
1515 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
1516 st2=stage1[st1];
b331163b 1517 if(st2!=(int32_t)stage1Length/2) {
46f4442e
A
1518 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
1519 for(i=0; i<16; ++i) {
1520 st3=mbcsTable->mbcsIndex[stageUTF8Index++];
1521 if(st3!=0) {
1522 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
1523 st3>>=4;
1524 /*
1525 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
1526 * allocated together as a single 64-block for access from the mbcsIndex
1527 */
1528 stage2[st2++]=st3++;
1529 stage2[st2++]=st3++;
1530 stage2[st2++]=st3++;
1531 stage2[st2++]=st3;
1532 } else {
1533 /* no stage 3 block, skip */
1534 st2+=4;
1535 }
1536 }
1537 } else {
1538 /* no stage 2 block, skip */
1539 stageUTF8Index+=16;
1540 }
1541 }
1542 }
1543
1544 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
1545 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
1546}
1547
b75a7d8f
A
1548/* MBCS setup functions ----------------------------------------------------- */
1549
1550static void
374ca955
A
1551ucnv_MBCSLoad(UConverterSharedData *sharedData,
1552 UConverterLoadArgs *pArgs,
b75a7d8f
A
1553 const uint8_t *raw,
1554 UErrorCode *pErrorCode) {
1555 UDataInfo info;
374ca955 1556 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
b75a7d8f 1557 _MBCSHeader *header=(_MBCSHeader *)raw;
374ca955 1558 uint32_t offset;
46f4442e
A
1559 uint32_t headerLength;
1560 UBool noFromU=FALSE;
1561
1562 if(header->version[0]==4) {
1563 headerLength=MBCS_HEADER_V4_LENGTH;
1564 } else if(header->version[0]==5 && header->version[1]>=3 &&
1565 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
1566 headerLength=header->options&MBCS_OPT_LENGTH_MASK;
1567 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
1568 } else {
b75a7d8f
A
1569 *pErrorCode=U_INVALID_TABLE_FORMAT;
1570 return;
1571 }
1572
b75a7d8f 1573 mbcsTable->outputType=(uint8_t)header->flags;
46f4442e
A
1574 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
1575 *pErrorCode=U_INVALID_TABLE_FORMAT;
1576 return;
1577 }
b75a7d8f 1578
374ca955
A
1579 /* extension data, header version 4.2 and higher */
1580 offset=header->flags>>8;
1581 if(offset!=0) {
1582 mbcsTable->extIndexes=(const int32_t *)(raw+offset);
b75a7d8f
A
1583 }
1584
374ca955 1585 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
b331163b 1586 UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER;
374ca955
A
1587 UConverterSharedData *baseSharedData;
1588 const int32_t *extIndexes;
1589 const char *baseName;
b75a7d8f 1590
374ca955
A
1591 /* extension-only file, load the base table and set values appropriately */
1592 if((extIndexes=mbcsTable->extIndexes)==NULL) {
1593 /* extension-only file without extension */
1594 *pErrorCode=U_INVALID_TABLE_FORMAT;
1595 return;
1596 }
b75a7d8f 1597
374ca955
A
1598 if(pArgs->nestedLoads!=1) {
1599 /* an extension table must not be loaded as a base table */
1600 *pErrorCode=U_INVALID_TABLE_FILE;
1601 return;
1602 }
b75a7d8f 1603
374ca955 1604 /* load the base table */
46f4442e 1605 baseName=(const char *)header+headerLength*4;
374ca955
A
1606 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
1607 /* forbid loading this same extension-only file */
1608 *pErrorCode=U_INVALID_TABLE_FORMAT;
1609 return;
1610 }
b75a7d8f 1611
374ca955
A
1612 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
1613 args.size=sizeof(UConverterLoadArgs);
1614 args.nestedLoads=2;
729e4ab9 1615 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
374ca955
A
1616 args.reserved=pArgs->reserved;
1617 args.options=pArgs->options;
1618 args.pkg=pArgs->pkg;
1619 args.name=baseName;
1620 baseSharedData=ucnv_load(&args, pErrorCode);
1621 if(U_FAILURE(*pErrorCode)) {
1622 return;
1623 }
1624 if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
1625 baseSharedData->mbcs.baseSharedData!=NULL
1626 ) {
1627 ucnv_unload(baseSharedData);
1628 *pErrorCode=U_INVALID_TABLE_FORMAT;
1629 return;
1630 }
729e4ab9
A
1631 if(pArgs->onlyTestIsLoadable) {
1632 /*
1633 * Exit as soon as we know that we can load the converter
1634 * and the format is valid and supported.
1635 * The worst that can happen in the following code is a memory
1636 * allocation error.
1637 */
1638 ucnv_unload(baseSharedData);
1639 return;
1640 }
374ca955
A
1641
1642 /* copy the base table data */
1643 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));
1644
1645 /* overwrite values with relevant ones for the extension converter */
1646 mbcsTable->baseSharedData=baseSharedData;
1647 mbcsTable->extIndexes=extIndexes;
1648
1649 /*
1650 * It would be possible to share the swapLFNL data with a base converter,
1651 * but the generated name would have to be different, and the memory
1652 * would have to be free'd only once.
1653 * It is easier to just create the data for the extension converter
1654 * separately when it is requested.
1655 */
1656 mbcsTable->swapLFNLStateTable=NULL;
1657 mbcsTable->swapLFNLFromUnicodeBytes=NULL;
1658 mbcsTable->swapLFNLName=NULL;
1659
46f4442e
A
1660 /*
1661 * The reconstitutedData must be deleted only when the base converter
1662 * is unloaded.
1663 */
1664 mbcsTable->reconstitutedData=NULL;
1665
374ca955
A
1666 /*
1667 * Set a special, runtime-only outputType if the extension converter
1668 * is a DBCS version of a base converter that also maps single bytes.
1669 */
1670 if( sharedData->staticData->conversionType==UCNV_DBCS ||
1671 (sharedData->staticData->conversionType==UCNV_MBCS &&
1672 sharedData->staticData->minBytesPerChar>=2)
1673 ) {
1674 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
1675 /* the base converter is SI/SO-stateful */
1676 int32_t entry;
1677
1678 /* get the dbcs state from the state table entry for SO=0x0e */
1679 entry=mbcsTable->stateTable[0][0xe];
1680 if( MBCS_ENTRY_IS_FINAL(entry) &&
1681 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
1682 MBCS_ENTRY_FINAL_STATE(entry)!=0
1683 ) {
1684 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1685
1686 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1687 }
1688 } else if(
1689 baseSharedData->staticData->conversionType==UCNV_MBCS &&
1690 baseSharedData->staticData->minBytesPerChar==1 &&
1691 baseSharedData->staticData->maxBytesPerChar==2 &&
1692 mbcsTable->countStates<=127
1693 ) {
1694 /* non-stateful base converter, need to modify the state table */
1695 int32_t (*newStateTable)[256];
1696 int32_t *state;
1697 int32_t i, count;
1698
1699 /* allocate a new state table and copy the base state table contents */
1700 count=mbcsTable->countStates;
1701 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
1702 if(newStateTable==NULL) {
1703 ucnv_unload(baseSharedData);
1704 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1705 return;
1706 }
1707
1708 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
1709
1710 /* change all final single-byte entries to go to a new all-illegal state */
1711 state=newStateTable[0];
1712 for(i=0; i<256; ++i) {
1713 if(MBCS_ENTRY_IS_FINAL(state[i])) {
1714 state[i]=MBCS_ENTRY_TRANSITION(count, 0);
1715 }
1716 }
1717
1718 /* build the new all-illegal state */
1719 state=newStateTable[count];
1720 for(i=0; i<256; ++i) {
1721 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
1722 }
1723 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
1724 mbcsTable->countStates=(uint8_t)(count+1);
1725 mbcsTable->stateTableOwned=TRUE;
1726
1727 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
1728 }
1729 }
1730
1731 /*
1732 * unlike below for files with base tables, do not get the unicodeMask
1733 * from the sharedData; instead, use the base table's unicodeMask,
1734 * which we copied in the memcpy above;
1735 * this is necessary because the static data unicodeMask, especially
1736 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
1737 */
1738 } else {
1739 /* conversion file with a base table; an additional extension table is optional */
1740 /* make sure that the output type is known */
1741 switch(mbcsTable->outputType) {
1742 case MBCS_OUTPUT_1:
1743 case MBCS_OUTPUT_2:
1744 case MBCS_OUTPUT_3:
1745 case MBCS_OUTPUT_4:
1746 case MBCS_OUTPUT_3_EUC:
1747 case MBCS_OUTPUT_4_EUC:
1748 case MBCS_OUTPUT_2_SISO:
1749 /* OK */
1750 break;
1751 default:
1752 *pErrorCode=U_INVALID_TABLE_FORMAT;
1753 return;
1754 }
729e4ab9
A
1755 if(pArgs->onlyTestIsLoadable) {
1756 /*
1757 * Exit as soon as we know that we can load the converter
1758 * and the format is valid and supported.
1759 * The worst that can happen in the following code is a memory
1760 * allocation error.
1761 */
1762 return;
1763 }
374ca955
A
1764
1765 mbcsTable->countStates=(uint8_t)header->countStates;
1766 mbcsTable->countToUFallbacks=header->countToUFallbacks;
46f4442e 1767 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
374ca955
A
1768 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
1769 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
1770
1771 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
1772 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
1773 mbcsTable->fromUBytesLength=header->fromUBytesLength;
1774
1775 /*
1776 * converter versions 6.1 and up contain a unicodeMask that is
1777 * used here to select the most efficient function implementations
1778 */
1779 info.size=sizeof(UDataInfo);
1780 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
1781 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
1782 /* mask off possible future extensions to be safe */
1783 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3);
1784 } else {
1785 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
1786 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
1787 }
46f4442e
A
1788
1789 /*
1790 * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
1791 * Check for the header version, SBCS vs. MBCS, and for whether the
1792 * data structures are optimized for code points as high as what the
1793 * runtime code is designed for.
1794 * The implementation does not handle mapping tables with entries for
1795 * unpaired surrogates.
1796 */
1797 if( header->version[1]>=3 &&
1798 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
1799 (mbcsTable->countStates==1 ?
1800 (header->version[2]>=(SBCS_FAST_MAX>>8)) :
1801 (header->version[2]>=(MBCS_FAST_MAX>>8))
1802 )
1803 ) {
1804 mbcsTable->utf8Friendly=TRUE;
1805
1806 if(mbcsTable->countStates==1) {
1807 /*
1808 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
1809 * Build a table with indexes to each block, to be used instead of
1810 * the regular stage 1/2 table.
1811 */
1812 int32_t i;
1813 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
1814 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
1815 }
1816 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
1817 mbcsTable->maxFastUChar=SBCS_FAST_MAX;
1818 } else {
1819 /*
1820 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
1821 * The .cnv file is prebuilt with an additional stage table with indexes
1822 * to each block.
1823 */
1824 mbcsTable->mbcsIndex=(const uint16_t *)
1825 (mbcsTable->fromUnicodeBytes+
1826 (noFromU ? 0 : mbcsTable->fromUBytesLength));
1827 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
1828 }
1829 }
1830
1831 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
1832 {
1833 uint32_t asciiRoundtrips=0xffffffff;
1834 int32_t i;
1835
1836 for(i=0; i<0x80; ++i) {
1837 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
1838 asciiRoundtrips&=~((uint32_t)1<<(i>>2));
1839 }
1840 }
1841 mbcsTable->asciiRoundtrips=asciiRoundtrips;
1842 }
1843
1844 if(noFromU) {
1845 uint32_t stage1Length=
1846 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
1847 0x440 : 0x40;
1848 uint32_t stage2Length=
1849 (header->offsetFromUBytes-header->offsetFromUTable)/4-
1850 stage1Length/2;
1851 reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
1852 }
1853 }
1854
1855 /* Set the impl pointer here so that it is set for both extension-only and base tables. */
1856 if(mbcsTable->utf8Friendly) {
1857 if(mbcsTable->countStates==1) {
1858 sharedData->impl=&_SBCSUTF8Impl;
1859 } else {
1860 if(mbcsTable->outputType==MBCS_OUTPUT_2) {
1861 sharedData->impl=&_DBCSUTF8Impl;
1862 }
1863 }
1864 }
1865
1866 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
1867 /*
1868 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
1869 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
1870 */
1871 mbcsTable->asciiRoundtrips=0;
374ca955
A
1872 }
1873}
1874
1875static void
1876ucnv_MBCSUnload(UConverterSharedData *sharedData) {
1877 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
1878
1879 if(mbcsTable->swapLFNLStateTable!=NULL) {
1880 uprv_free(mbcsTable->swapLFNLStateTable);
1881 }
1882 if(mbcsTable->stateTableOwned) {
1883 uprv_free((void *)mbcsTable->stateTable);
1884 }
1885 if(mbcsTable->baseSharedData!=NULL) {
1886 ucnv_unload(mbcsTable->baseSharedData);
1887 }
46f4442e
A
1888 if(mbcsTable->reconstitutedData!=NULL) {
1889 uprv_free(mbcsTable->reconstitutedData);
1890 }
374ca955
A
1891}
1892
1893static void
1894ucnv_MBCSOpen(UConverter *cnv,
729e4ab9
A
1895 UConverterLoadArgs *pArgs,
1896 UErrorCode *pErrorCode) {
374ca955
A
1897 UConverterMBCSTable *mbcsTable;
1898 const int32_t *extIndexes;
1899 uint8_t outputType;
1900 int8_t maxBytesPerUChar;
1901
729e4ab9
A
1902 if(pArgs->onlyTestIsLoadable) {
1903 return;
1904 }
1905
374ca955
A
1906 mbcsTable=&cnv->sharedData->mbcs;
1907 outputType=mbcsTable->outputType;
1908
1909 if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
1910 /* the swaplfnl option does not apply, remove it */
729e4ab9 1911 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
374ca955
A
1912 }
1913
729e4ab9 1914 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {
b75a7d8f
A
1915 /* do this because double-checked locking is broken */
1916 UBool isCached;
1917
1918 umtx_lock(NULL);
374ca955 1919 isCached=mbcsTable->swapLFNLStateTable!=NULL;
b75a7d8f
A
1920 umtx_unlock(NULL);
1921
1922 if(!isCached) {
1923 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
374ca955
A
1924 if(U_FAILURE(*pErrorCode)) {
1925 return; /* something went wrong */
1926 }
1927
b75a7d8f 1928 /* the option does not apply, remove it */
729e4ab9 1929 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;
b75a7d8f
A
1930 }
1931 }
1932 }
1933
729e4ab9
A
1934 if(uprv_strstr(pArgs->name, "18030")!=NULL) {
1935 if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) {
b75a7d8f
A
1936 /* set a flag for GB 18030 mode, which changes the callback behavior */
1937 cnv->options|=_MBCS_OPTION_GB18030;
1938 }
729e4ab9
A
1939 } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) {
1940 /* set a flag for KEIS converter, which changes the SI/SO character sequence */
1941 cnv->options|=_MBCS_OPTION_KEIS;
1942 } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) {
1943 /* set a flag for JEF converter, which changes the SI/SO character sequence */
1944 cnv->options|=_MBCS_OPTION_JEF;
1945 } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) {
1946 /* set a flag for JIPS converter, which changes the SI/SO character sequence */
1947 cnv->options|=_MBCS_OPTION_JIPS;
b75a7d8f
A
1948 }
1949
374ca955
A
1950 /* fix maxBytesPerUChar depending on outputType and options etc. */
1951 if(outputType==MBCS_OUTPUT_2_SISO) {
1952 cnv->maxBytesPerUChar=3; /* SO+DBCS */
1953 }
1954
1955 extIndexes=mbcsTable->extIndexes;
1956 if(extIndexes!=NULL) {
1957 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);
1958 if(outputType==MBCS_OUTPUT_2_SISO) {
1959 ++maxBytesPerUChar; /* SO + multiple DBCS */
1960 }
1961
1962 if(maxBytesPerUChar>cnv->maxBytesPerUChar) {
1963 cnv->maxBytesPerUChar=maxBytesPerUChar;
1964 }
1965 }
1966
1967#if 0
1968 /*
1969 * documentation of UConverter fields used for status
1970 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()
1971 */
1972
1973 /* toUnicode */
1974 cnv->toUnicodeStatus=0; /* offset */
1975 cnv->mode=0; /* state */
1976 cnv->toULength=0; /* byteIndex */
1977
1978 /* fromUnicode */
1979 cnv->fromUChar32=0;
1980 cnv->fromUnicodeStatus=1; /* prevLength */
1981#endif
b75a7d8f
A
1982}
1983
1984static const char *
374ca955
A
1985ucnv_MBCSGetName(const UConverter *cnv) {
1986 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) {
1987 return cnv->sharedData->mbcs.swapLFNLName;
b75a7d8f
A
1988 } else {
1989 return cnv->sharedData->staticData->name;
1990 }
1991}
1992
1993/* MBCS-to-Unicode conversion functions ------------------------------------- */
1994
1995static UChar32
374ca955 1996ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {
b75a7d8f
A
1997 const _MBCSToUFallback *toUFallbacks;
1998 uint32_t i, start, limit;
1999
2000 limit=mbcsTable->countToUFallbacks;
2001 if(limit>0) {
2002 /* do a binary search for the fallback mapping */
2003 toUFallbacks=mbcsTable->toUFallbacks;
2004 start=0;
2005 while(start<limit-1) {
2006 i=(start+limit)/2;
2007 if(offset<toUFallbacks[i].offset) {
2008 limit=i;
2009 } else {
2010 start=i;
2011 }
2012 }
2013
2014 /* did we really find it? */
2015 if(offset==toUFallbacks[start].offset) {
2016 return toUFallbacks[start].codePoint;
2017 }
2018 }
2019
2020 return 0xfffe;
2021}
2022
374ca955
A
2023/* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
2024static void
2025ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2026 UErrorCode *pErrorCode) {
b75a7d8f
A
2027 UConverter *cnv;
2028 const uint8_t *source, *sourceLimit;
2029 UChar *target;
2030 const UChar *targetLimit;
2031 int32_t *offsets;
2032
2033 const int32_t (*stateTable)[256];
b75a7d8f 2034
374ca955 2035 int32_t sourceIndex;
b75a7d8f
A
2036
2037 int32_t entry;
2038 UChar c;
2039 uint8_t action;
b75a7d8f
A
2040
2041 /* set up the local pointers */
374ca955 2042 cnv=pArgs->converter;
b75a7d8f
A
2043 source=(const uint8_t *)pArgs->source;
2044 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2045 target=pArgs->target;
2046 targetLimit=pArgs->targetLimit;
2047 offsets=pArgs->offsets;
2048
2049 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 2050 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 2051 } else {
374ca955 2052 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f 2053 }
b75a7d8f
A
2054
2055 /* sourceIndex=-1 if the current character began in the previous buffer */
374ca955 2056 sourceIndex=0;
b75a7d8f
A
2057
2058 /* conversion loop */
2059 while(source<sourceLimit) {
2060 /*
2061 * This following test is to see if available input would overflow the output.
2062 * It does not catch output of more than one code unit that
2063 * overflows as a result of a surrogate pair or callback output
2064 * from the last source byte.
2065 * Therefore, those situations also test for overflows and will
2066 * then break the loop, too.
2067 */
374ca955
A
2068 if(target>=targetLimit) {
2069 /* target is full */
2070 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2071 break;
2072 }
b75a7d8f 2073
374ca955
A
2074 entry=stateTable[0][*source++];
2075 /* MBCS_ENTRY_IS_FINAL(entry) */
b75a7d8f 2076
374ca955
A
2077 /* test the most common case first */
2078 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2079 /* output BMP code point */
2080 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2081 if(offsets!=NULL) {
2082 *offsets++=sourceIndex;
2083 }
b75a7d8f 2084
374ca955
A
2085 /* normal end of action codes: prepare for a new character */
2086 ++sourceIndex;
2087 continue;
2088 }
b75a7d8f 2089
374ca955
A
2090 /*
2091 * An if-else-if chain provides more reliable performance for
2092 * the most common cases compared to a switch.
2093 */
2094 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2095 if(action==MBCS_STATE_VALID_DIRECT_20 ||
2096 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2097 ) {
2098 entry=MBCS_ENTRY_FINAL_VALUE(entry);
2099 /* output surrogate pair */
2100 *target++=(UChar)(0xd800|(UChar)(entry>>10));
2101 if(offsets!=NULL) {
2102 *offsets++=sourceIndex;
2103 }
2104 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2105 if(target<targetLimit) {
2106 *target++=c;
2107 if(offsets!=NULL) {
2108 *offsets++=sourceIndex;
2109 }
2110 } else {
2111 /* target overflow */
2112 cnv->UCharErrorBuffer[0]=c;
2113 cnv->UCharErrorBufferLength=1;
2114 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2115 break;
2116 }
b75a7d8f 2117
374ca955
A
2118 ++sourceIndex;
2119 continue;
2120 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2121 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2122 /* output BMP code point */
2123 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2124 if(offsets!=NULL) {
2125 *offsets++=sourceIndex;
b75a7d8f
A
2126 }
2127
374ca955
A
2128 ++sourceIndex;
2129 continue;
b75a7d8f 2130 }
374ca955
A
2131 } else if(action==MBCS_STATE_UNASSIGNED) {
2132 /* just fall through */
2133 } else if(action==MBCS_STATE_ILLEGAL) {
2134 /* callback(illegal) */
2135 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 2136 } else {
374ca955
A
2137 /* reserved, must never occur */
2138 ++sourceIndex;
2139 continue;
b75a7d8f 2140 }
b75a7d8f 2141
374ca955
A
2142 if(U_FAILURE(*pErrorCode)) {
2143 /* callback(illegal) */
2144 break;
2145 } else /* unassigned sequences indicated with byteIndex>0 */ {
2146 /* try an extension mapping */
2147 pArgs->source=(const char *)source;
2148 cnv->toUBytes[0]=*(source-1);
2149 cnv->toULength=_extToU(cnv, cnv->sharedData,
46f4442e 2150 1, &source, sourceLimit,
374ca955
A
2151 &target, targetLimit,
2152 &offsets, sourceIndex,
2153 pArgs->flush,
2154 pErrorCode);
2155 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
2156
2157 if(U_FAILURE(*pErrorCode)) {
2158 /* not mappable or buffer overflow */
2159 break;
2160 }
b75a7d8f 2161 }
b75a7d8f
A
2162 }
2163
2164 /* write back the updated pointers */
2165 pArgs->source=(const char *)source;
2166 pArgs->target=target;
2167 pArgs->offsets=offsets;
2168}
2169
374ca955
A
2170/*
2171 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
2172 * that only map to and from the BMP.
2173 * In addition to single-byte optimizations, the offset calculations
2174 * become much easier.
2175 */
b75a7d8f 2176static void
374ca955
A
2177ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
2178 UErrorCode *pErrorCode) {
b75a7d8f 2179 UConverter *cnv;
374ca955 2180 const uint8_t *source, *sourceLimit, *lastSource;
b75a7d8f 2181 UChar *target;
374ca955 2182 int32_t targetCapacity, length;
b75a7d8f
A
2183 int32_t *offsets;
2184
2185 const int32_t (*stateTable)[256];
2186
374ca955 2187 int32_t sourceIndex;
b75a7d8f
A
2188
2189 int32_t entry;
b75a7d8f 2190 uint8_t action;
b75a7d8f
A
2191
2192 /* set up the local pointers */
2193 cnv=pArgs->converter;
2194 source=(const uint8_t *)pArgs->source;
2195 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2196 target=pArgs->target;
73c04bcf 2197 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
2198 offsets=pArgs->offsets;
2199
2200 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 2201 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 2202 } else {
374ca955 2203 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f
A
2204 }
2205
2206 /* sourceIndex=-1 if the current character began in the previous buffer */
2207 sourceIndex=0;
374ca955 2208 lastSource=source;
b75a7d8f
A
2209
2210 /*
2211 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
2212 * for the minimum of the sourceLength and targetCapacity
2213 */
73c04bcf 2214 length=(int32_t)(sourceLimit-source);
b75a7d8f
A
2215 if(length<targetCapacity) {
2216 targetCapacity=length;
2217 }
2218
2219#if MBCS_UNROLL_SINGLE_TO_BMP
2220 /* unrolling makes it faster on Pentium III/Windows 2000 */
2221 /* unroll the loop with the most common case */
2222unrolled:
2223 if(targetCapacity>=16) {
2224 int32_t count, loops, oredEntries;
2225
2226 loops=count=targetCapacity>>4;
2227 do {
2228 oredEntries=entry=stateTable[0][*source++];
2229 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2230 oredEntries|=entry=stateTable[0][*source++];
2231 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2232 oredEntries|=entry=stateTable[0][*source++];
2233 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2234 oredEntries|=entry=stateTable[0][*source++];
2235 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2236 oredEntries|=entry=stateTable[0][*source++];
2237 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2238 oredEntries|=entry=stateTable[0][*source++];
2239 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2240 oredEntries|=entry=stateTable[0][*source++];
2241 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2242 oredEntries|=entry=stateTable[0][*source++];
2243 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2244 oredEntries|=entry=stateTable[0][*source++];
2245 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2246 oredEntries|=entry=stateTable[0][*source++];
2247 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2248 oredEntries|=entry=stateTable[0][*source++];
2249 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2250 oredEntries|=entry=stateTable[0][*source++];
2251 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2252 oredEntries|=entry=stateTable[0][*source++];
2253 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2254 oredEntries|=entry=stateTable[0][*source++];
2255 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2256 oredEntries|=entry=stateTable[0][*source++];
2257 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2258 oredEntries|=entry=stateTable[0][*source++];
2259 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2260
2261 /* were all 16 entries really valid? */
2262 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {
2263 /* no, return to the first of these 16 */
2264 source-=16;
2265 target-=16;
2266 break;
2267 }
2268 } while(--count>0);
2269 count=loops-count;
2270 targetCapacity-=16*count;
2271
2272 if(offsets!=NULL) {
2273 lastSource+=16*count;
2274 while(count>0) {
2275 *offsets++=sourceIndex++;
2276 *offsets++=sourceIndex++;
2277 *offsets++=sourceIndex++;
2278 *offsets++=sourceIndex++;
2279 *offsets++=sourceIndex++;
2280 *offsets++=sourceIndex++;
2281 *offsets++=sourceIndex++;
2282 *offsets++=sourceIndex++;
2283 *offsets++=sourceIndex++;
2284 *offsets++=sourceIndex++;
2285 *offsets++=sourceIndex++;
2286 *offsets++=sourceIndex++;
2287 *offsets++=sourceIndex++;
2288 *offsets++=sourceIndex++;
2289 *offsets++=sourceIndex++;
2290 *offsets++=sourceIndex++;
2291 --count;
2292 }
2293 }
2294 }
2295#endif
2296
2297 /* conversion loop */
729e4ab9 2298 while(targetCapacity > 0 && source < sourceLimit) {
b75a7d8f
A
2299 entry=stateTable[0][*source++];
2300 /* MBCS_ENTRY_IS_FINAL(entry) */
2301
2302 /* test the most common case first */
2303 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2304 /* output BMP code point */
2305 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2306 --targetCapacity;
2307 continue;
2308 }
2309
2310 /*
2311 * An if-else-if chain provides more reliable performance for
2312 * the most common cases compared to a switch.
2313 */
2314 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2315 if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
374ca955
A
2316 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2317 /* output BMP code point */
2318 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2319 --targetCapacity;
2320 continue;
b75a7d8f 2321 }
b75a7d8f 2322 } else if(action==MBCS_STATE_UNASSIGNED) {
374ca955 2323 /* just fall through */
b75a7d8f
A
2324 } else if(action==MBCS_STATE_ILLEGAL) {
2325 /* callback(illegal) */
b75a7d8f
A
2326 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2327 } else {
2328 /* reserved, must never occur */
2329 continue;
2330 }
2331
374ca955 2332 /* set offsets since the start or the last extension */
b75a7d8f
A
2333 if(offsets!=NULL) {
2334 int32_t count=(int32_t)(source-lastSource);
2335
2336 /* predecrement: do not set the offset for the callback-causing character */
2337 while(--count>0) {
2338 *offsets++=sourceIndex++;
2339 }
2340 /* offset and sourceIndex are now set for the current character */
2341 }
2342
374ca955
A
2343 if(U_FAILURE(*pErrorCode)) {
2344 /* callback(illegal) */
b75a7d8f 2345 break;
374ca955
A
2346 } else /* unassigned sequences indicated with byteIndex>0 */ {
2347 /* try an extension mapping */
2348 lastSource=source;
2349 cnv->toUBytes[0]=*(source-1);
2350 cnv->toULength=_extToU(cnv, cnv->sharedData,
46f4442e
A
2351 1, &source, sourceLimit,
2352 &target, pArgs->targetLimit,
374ca955
A
2353 &offsets, sourceIndex,
2354 pArgs->flush,
2355 pErrorCode);
2356 sourceIndex+=1+(int32_t)(source-lastSource);
2357
2358 if(U_FAILURE(*pErrorCode)) {
2359 /* not mappable or buffer overflow */
2360 break;
2361 }
2362
2363 /* recalculate the targetCapacity after an extension mapping */
73c04bcf
A
2364 targetCapacity=(int32_t)(pArgs->targetLimit-target);
2365 length=(int32_t)(sourceLimit-source);
374ca955
A
2366 if(length<targetCapacity) {
2367 targetCapacity=length;
2368 }
b75a7d8f
A
2369 }
2370
2371#if MBCS_UNROLL_SINGLE_TO_BMP
2372 /* unrolling makes it faster on Pentium III/Windows 2000 */
2373 goto unrolled;
2374#endif
2375 }
2376
2377 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
2378 /* target is full */
2379 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2380 }
2381
2382 /* set offsets since the start or the last callback */
2383 if(offsets!=NULL) {
2384 size_t count=source-lastSource;
2385 while(count>0) {
2386 *offsets++=sourceIndex++;
2387 --count;
2388 }
2389 }
2390
2391 /* write back the updated pointers */
2392 pArgs->source=(const char *)source;
2393 pArgs->target=target;
2394 pArgs->offsets=offsets;
2395}
2396
fd0068a8
A
2397static UBool
2398hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
2399 const int32_t *row=stateTable[state];
2400 int32_t b, entry;
2401 /* First test for final entries in this state for some commonly valid byte values. */
2402 entry=row[0xa1];
2403 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2404 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2405 ) {
2406 return TRUE;
2407 }
2408 entry=row[0x41];
2409 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2410 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2411 ) {
2412 return TRUE;
2413 }
2414 /* Then test for final entries in this state. */
2415 for(b=0; b<=0xff; ++b) {
2416 entry=row[b];
2417 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
2418 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
2419 ) {
2420 return TRUE;
2421 }
2422 }
2423 /* Then recurse for transition entries. */
2424 for(b=0; b<=0xff; ++b) {
2425 entry=row[b];
2426 if( MBCS_ENTRY_IS_TRANSITION(entry) &&
2427 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
2428 ) {
2429 return TRUE;
2430 }
2431 }
2432 return FALSE;
2433}
2434
2435/*
2436 * Is byte b a single/lead byte in this state?
2437 * Recurse for transition states, because here we don't want to say that
2438 * b is a lead byte if all byte sequences that start with b are illegal.
2439 */
2440static UBool
2441isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
2442 const int32_t *row=stateTable[state];
2443 int32_t entry=row[b];
2444 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
2445 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
2446 } else {
2447 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2448 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
2449 return FALSE; /* SI/SO are illegal for DBCS-only conversion */
2450 } else {
2451 return action!=MBCS_STATE_ILLEGAL;
2452 }
2453 }
2454}
2455
374ca955
A
2456U_CFUNC void
2457ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
2458 UErrorCode *pErrorCode) {
b75a7d8f
A
2459 UConverter *cnv;
2460 const uint8_t *source, *sourceLimit;
374ca955
A
2461 UChar *target;
2462 const UChar *targetLimit;
2463 int32_t *offsets;
b75a7d8f
A
2464
2465 const int32_t (*stateTable)[256];
2466 const uint16_t *unicodeCodeUnits;
2467
2468 uint32_t offset;
2469 uint8_t state;
2470 int8_t byteIndex;
2471 uint8_t *bytes;
2472
374ca955
A
2473 int32_t sourceIndex, nextSourceIndex;
2474
b75a7d8f 2475 int32_t entry;
374ca955 2476 UChar c;
b75a7d8f 2477 uint8_t action;
b75a7d8f
A
2478
2479 /* use optimized function if possible */
2480 cnv=pArgs->converter;
374ca955
A
2481
2482 if(cnv->preToULength>0) {
b75a7d8f 2483 /*
374ca955
A
2484 * pass sourceIndex=-1 because we continue from an earlier buffer
2485 * in the future, this may change with continuous offsets
b75a7d8f 2486 */
374ca955
A
2487 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
2488
2489 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
2490 return;
2491 }
2492 }
2493
2494 if(cnv->sharedData->mbcs.countStates==1) {
2495 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
2496 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
2497 } else {
2498 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
2499 }
2500 return;
b75a7d8f
A
2501 }
2502
2503 /* set up the local pointers */
2504 source=(const uint8_t *)pArgs->source;
2505 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
374ca955
A
2506 target=pArgs->target;
2507 targetLimit=pArgs->targetLimit;
2508 offsets=pArgs->offsets;
b75a7d8f
A
2509
2510 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 2511 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 2512 } else {
374ca955 2513 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f 2514 }
374ca955 2515 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
b75a7d8f
A
2516
2517 /* get the converter state from UConverter */
2518 offset=cnv->toUnicodeStatus;
b75a7d8f
A
2519 byteIndex=cnv->toULength;
2520 bytes=cnv->toUBytes;
2521
374ca955
A
2522 /*
2523 * if we are in the SBCS state for a DBCS-only converter,
2524 * then load the DBCS state from the MBCS data
2525 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2526 */
2527 if((state=(uint8_t)(cnv->mode))==0) {
2528 state=cnv->sharedData->mbcs.dbcsOnlyState;
2529 }
2530
2531 /* sourceIndex=-1 if the current character began in the previous buffer */
2532 sourceIndex=byteIndex==0 ? 0 : -1;
2533 nextSourceIndex=0;
2534
b75a7d8f
A
2535 /* conversion loop */
2536 while(source<sourceLimit) {
374ca955
A
2537 /*
2538 * This following test is to see if available input would overflow the output.
2539 * It does not catch output of more than one code unit that
2540 * overflows as a result of a surrogate pair or callback output
2541 * from the last source byte.
2542 * Therefore, those situations also test for overflows and will
2543 * then break the loop, too.
2544 */
2545 if(target>=targetLimit) {
2546 /* target is full */
2547 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2548 break;
2549 }
2550
2551 if(byteIndex==0) {
2552 /* optimized loop for 1/2-byte input and BMP output */
2553 if(offsets==NULL) {
2554 do {
2555 entry=stateTable[state][*source];
2556 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2557 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2558 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2559
2560 ++source;
2561 if( source<sourceLimit &&
2562 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2563 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2564 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2565 ) {
2566 ++source;
2567 *target++=c;
2568 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2569 offset=0;
2570 } else {
2571 /* set the state and leave the optimized loop */
2572 bytes[0]=*(source-1);
2573 byteIndex=1;
2574 break;
2575 }
2576 } else {
2577 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2578 /* output BMP code point */
2579 ++source;
2580 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2581 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2582 } else {
2583 /* leave the optimized loop */
2584 break;
2585 }
2586 }
2587 } while(source<sourceLimit && target<targetLimit);
2588 } else /* offsets!=NULL */ {
2589 do {
2590 entry=stateTable[state][*source];
2591 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2592 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2593 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
2594
2595 ++source;
2596 if( source<sourceLimit &&
2597 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
2598 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
2599 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
2600 ) {
2601 ++source;
2602 *target++=c;
2603 if(offsets!=NULL) {
2604 *offsets++=sourceIndex;
2605 sourceIndex=(nextSourceIndex+=2);
2606 }
2607 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2608 offset=0;
2609 } else {
2610 /* set the state and leave the optimized loop */
2611 ++nextSourceIndex;
2612 bytes[0]=*(source-1);
2613 byteIndex=1;
2614 break;
2615 }
2616 } else {
2617 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2618 /* output BMP code point */
2619 ++source;
2620 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2621 if(offsets!=NULL) {
2622 *offsets++=sourceIndex;
2623 sourceIndex=++nextSourceIndex;
2624 }
2625 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2626 } else {
2627 /* leave the optimized loop */
2628 break;
2629 }
2630 }
2631 } while(source<sourceLimit && target<targetLimit);
2632 }
2633
2634 /*
2635 * these tests and break statements could be put inside the loop
2636 * if C had "break outerLoop" like Java
2637 */
2638 if(source>=sourceLimit) {
2639 break;
2640 }
2641 if(target>=targetLimit) {
2642 /* target is full */
2643 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2644 break;
2645 }
2646
2647 ++nextSourceIndex;
2648 bytes[byteIndex++]=*source++;
2649 } else /* byteIndex>0 */ {
2650 ++nextSourceIndex;
2651 entry=stateTable[state][bytes[byteIndex++]=*source++];
2652 }
2653
b75a7d8f
A
2654 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
2655 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
2656 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
374ca955
A
2657 continue;
2658 }
b75a7d8f 2659
374ca955
A
2660 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2661 cnv->mode=state;
2662
2663 /* set the next state early so that we can reuse the entry variable */
2664 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2665
2666 /*
2667 * An if-else-if chain provides more reliable performance for
2668 * the most common cases compared to a switch.
2669 */
2670 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
2671 if(action==MBCS_STATE_VALID_16) {
2672 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2673 c=unicodeCodeUnits[offset];
2674 if(c<0xfffe) {
2675 /* output BMP code point */
2676 *target++=c;
2677 if(offsets!=NULL) {
2678 *offsets++=sourceIndex;
2679 }
2680 byteIndex=0;
2681 } else if(c==0xfffe) {
2682 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
2683 /* output fallback BMP code point */
2684 *target++=(UChar)entry;
2685 if(offsets!=NULL) {
2686 *offsets++=sourceIndex;
b75a7d8f 2687 }
374ca955 2688 byteIndex=0;
b75a7d8f 2689 }
374ca955
A
2690 } else {
2691 /* callback(illegal) */
2692 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2693 }
2694 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
2695 /* output BMP code point */
2696 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2697 if(offsets!=NULL) {
2698 *offsets++=sourceIndex;
2699 }
2700 byteIndex=0;
2701 } else if(action==MBCS_STATE_VALID_16_PAIR) {
2702 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
2703 c=unicodeCodeUnits[offset++];
2704 if(c<0xd800) {
2705 /* output BMP code point below 0xd800 */
2706 *target++=c;
2707 if(offsets!=NULL) {
2708 *offsets++=sourceIndex;
b75a7d8f 2709 }
374ca955
A
2710 byteIndex=0;
2711 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
2712 /* output roundtrip or fallback surrogate pair */
2713 *target++=(UChar)(c&0xdbff);
2714 if(offsets!=NULL) {
2715 *offsets++=sourceIndex;
b75a7d8f 2716 }
374ca955
A
2717 byteIndex=0;
2718 if(target<targetLimit) {
2719 *target++=unicodeCodeUnits[offset];
2720 if(offsets!=NULL) {
2721 *offsets++=sourceIndex;
2722 }
2723 } else {
2724 /* target overflow */
2725 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];
2726 cnv->UCharErrorBufferLength=1;
2727 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
2728
2729 offset=0;
2730 break;
b75a7d8f 2731 }
374ca955
A
2732 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
2733 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2734 *target++=unicodeCodeUnits[offset];
2735 if(offsets!=NULL) {
2736 *offsets++=sourceIndex;
2737 }
2738 byteIndex=0;
2739 } else if(c==0xffff) {
b75a7d8f 2740 /* callback(illegal) */
374ca955
A
2741 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2742 }
2743 } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
2744 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2745 ) {
2746 entry=MBCS_ENTRY_FINAL_VALUE(entry);
2747 /* output surrogate pair */
2748 *target++=(UChar)(0xd800|(UChar)(entry>>10));
2749 if(offsets!=NULL) {
2750 *offsets++=sourceIndex;
b75a7d8f 2751 }
b75a7d8f 2752 byteIndex=0;
374ca955
A
2753 c=(UChar)(0xdc00|(UChar)(entry&0x3ff));
2754 if(target<targetLimit) {
2755 *target++=c;
2756 if(offsets!=NULL) {
2757 *offsets++=sourceIndex;
b75a7d8f 2758 }
b75a7d8f 2759 } else {
374ca955
A
2760 /* target overflow */
2761 cnv->UCharErrorBuffer[0]=c;
2762 cnv->UCharErrorBufferLength=1;
2763 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f 2764
374ca955
A
2765 offset=0;
2766 break;
2767 }
2768 } else if(action==MBCS_STATE_CHANGE_ONLY) {
b75a7d8f 2769 /*
374ca955
A
2770 * This serves as a state change without any output.
2771 * It is useful for reading simple stateful encodings,
2772 * for example using just Shift-In/Shift-Out codes.
2773 * The 21 unused bits may later be used for more sophisticated
2774 * state transitions.
b75a7d8f 2775 */
374ca955
A
2776 if(cnv->sharedData->mbcs.dbcsOnlyState==0) {
2777 byteIndex=0;
2778 } else {
2779 /* SI/SO are illegal for DBCS-only conversion */
2780 state=(uint8_t)(cnv->mode); /* restore the previous state */
2781
2782 /* callback(illegal) */
2783 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2784 }
2785 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
2786 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
2787 /* output BMP code point */
2788 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2789 if(offsets!=NULL) {
2790 *offsets++=sourceIndex;
2791 }
2792 byteIndex=0;
2793 }
2794 } else if(action==MBCS_STATE_UNASSIGNED) {
2795 /* just fall through */
2796 } else if(action==MBCS_STATE_ILLEGAL) {
2797 /* callback(illegal) */
2798 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2799 } else {
2800 /* reserved, must never occur */
2801 byteIndex=0;
b75a7d8f 2802 }
b75a7d8f 2803
374ca955
A
2804 /* end of action codes: prepare for a new character */
2805 offset=0;
2806
2807 if(byteIndex==0) {
2808 sourceIndex=nextSourceIndex;
2809 } else if(U_FAILURE(*pErrorCode)) {
2810 /* callback(illegal) */
fd0068a8
A
2811 if(byteIndex>1) {
2812 /*
2813 * Ticket 5691: consistent illegal sequences:
2814 * - We include at least the first byte in the illegal sequence.
2815 * - If any of the non-initial bytes could be the start of a character,
2816 * we stop the illegal sequence before the first one of those.
2817 */
2818 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
2819 int8_t i;
2820 for(i=1;
2821 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
2822 ++i) {}
2823 if(i<byteIndex) {
2824 /* Back out some bytes. */
2825 int8_t backOutDistance=byteIndex-i;
2826 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
2827 byteIndex=i; /* length of reported illegal byte sequence */
2828 if(backOutDistance<=bytesFromThisBuffer) {
2829 source-=backOutDistance;
2830 } else {
2831 /* Back out bytes from the previous buffer: Need to replay them. */
2832 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
2833 /* preToULength is negative! */
2834 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
2835 source=(const uint8_t *)pArgs->source;
2836 }
2837 }
2838 }
374ca955
A
2839 break;
2840 } else /* unassigned sequences indicated with byteIndex>0 */ {
2841 /* try an extension mapping */
2842 pArgs->source=(const char *)source;
2843 byteIndex=_extToU(cnv, cnv->sharedData,
46f4442e 2844 byteIndex, &source, sourceLimit,
374ca955
A
2845 &target, targetLimit,
2846 &offsets, sourceIndex,
2847 pArgs->flush,
2848 pErrorCode);
fd0068a8 2849 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
374ca955
A
2850
2851 if(U_FAILURE(*pErrorCode)) {
2852 /* not mappable or buffer overflow */
2853 break;
2854 }
2855 }
b75a7d8f 2856 }
b75a7d8f 2857
374ca955
A
2858 /* set the converter state back into UConverter */
2859 cnv->toUnicodeStatus=offset;
b75a7d8f 2860 cnv->mode=state;
374ca955 2861 cnv->toULength=byteIndex;
b75a7d8f 2862
374ca955 2863 /* write back the updated pointers */
b75a7d8f 2864 pArgs->source=(const char *)source;
374ca955
A
2865 pArgs->target=target;
2866 pArgs->offsets=offsets;
b75a7d8f
A
2867}
2868
2869/*
374ca955
A
2870 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
2871 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
b75a7d8f
A
2872 */
2873static UChar32
374ca955 2874ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
b75a7d8f 2875 UErrorCode *pErrorCode) {
b75a7d8f
A
2876 UConverter *cnv;
2877 const int32_t (*stateTable)[256];
2878 const uint8_t *source, *sourceLimit;
2879
2880 int32_t entry;
2881 uint8_t action;
b75a7d8f
A
2882
2883 /* set up the local pointers */
2884 cnv=pArgs->converter;
2885 source=(const uint8_t *)pArgs->source;
2886 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
2887 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 2888 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
b75a7d8f 2889 } else {
374ca955 2890 stateTable=cnv->sharedData->mbcs.stateTable;
b75a7d8f
A
2891 }
2892
2893 /* conversion loop */
2894 while(source<sourceLimit) {
2895 entry=stateTable[0][*source++];
2896 /* MBCS_ENTRY_IS_FINAL(entry) */
2897
2898 /* write back the updated pointer early so that we can return directly */
2899 pArgs->source=(const char *)source;
2900
2901 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2902 /* output BMP code point */
2903 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2904 }
2905
2906 /*
2907 * An if-else-if chain provides more reliable performance for
2908 * the most common cases compared to a switch.
2909 */
2910 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
374ca955
A
2911 if( action==MBCS_STATE_VALID_DIRECT_20 ||
2912 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
2913 ) {
b75a7d8f
A
2914 /* output supplementary code point */
2915 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
2916 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
374ca955 2917 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
b75a7d8f
A
2918 /* output BMP code point */
2919 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
2920 }
b75a7d8f 2921 } else if(action==MBCS_STATE_UNASSIGNED) {
374ca955 2922 /* just fall through */
b75a7d8f
A
2923 } else if(action==MBCS_STATE_ILLEGAL) {
2924 /* callback(illegal) */
b75a7d8f
A
2925 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
2926 } else {
2927 /* reserved, must never occur */
374ca955 2928 continue;
b75a7d8f
A
2929 }
2930
374ca955
A
2931 if(U_FAILURE(*pErrorCode)) {
2932 /* callback(illegal) */
2933 break;
2934 } else /* unassigned sequence */ {
2935 /* defer to the generic implementation */
2936 pArgs->source=(const char *)source-1;
2937 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
b75a7d8f
A
2938 }
2939 }
2940
374ca955 2941 /* no output because of empty input or only state changes */
b75a7d8f
A
2942 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2943 return 0xffff;
2944}
2945
2946/*
374ca955
A
2947 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character
2948 * conversion without offset handling.
b75a7d8f 2949 *
374ca955
A
2950 * When a character does not have a mapping to Unicode, then we return to the
2951 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback
2952 * handling.
2953 * We also defer to the generic code in other complicated cases and have them
2954 * ultimately handled by _MBCSToUnicodeWithOffsets() itself.
2955 *
2956 * All normal mappings and errors are handled here.
b75a7d8f 2957 */
374ca955
A
2958static UChar32
2959ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
2960 UErrorCode *pErrorCode) {
2961 UConverter *cnv;
2962 const uint8_t *source, *sourceLimit, *lastSource;
b75a7d8f
A
2963
2964 const int32_t (*stateTable)[256];
2965 const uint16_t *unicodeCodeUnits;
2966
2967 uint32_t offset;
374ca955 2968 uint8_t state;
b75a7d8f
A
2969
2970 int32_t entry;
374ca955
A
2971 UChar32 c;
2972 uint8_t action;
b75a7d8f 2973
374ca955
A
2974 /* use optimized function if possible */
2975 cnv=pArgs->converter;
2976
2977 if(cnv->preToULength>0) {
2978 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */
2979 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
b75a7d8f
A
2980 }
2981
374ca955
A
2982 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
2983 /*
2984 * Using the generic ucnv_getNextUChar() code lets us deal correctly
2985 * with the rare case of a codepage that maps single surrogates
2986 * without adding the complexity to this already complicated function here.
2987 */
2988 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
2989 } else if(cnv->sharedData->mbcs.countStates==1) {
2990 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);
b75a7d8f 2991 }
b75a7d8f 2992
374ca955
A
2993 /* set up the local pointers */
2994 source=lastSource=(const uint8_t *)pArgs->source;
2995 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
b75a7d8f 2996
374ca955
A
2997 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
2998 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable;
2999 } else {
3000 stateTable=cnv->sharedData->mbcs.stateTable;
3001 }
3002 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;
3003
3004 /* get the converter state from UConverter */
3005 offset=cnv->toUnicodeStatus;
3006
3007 /*
3008 * if we are in the SBCS state for a DBCS-only converter,
3009 * then load the DBCS state from the MBCS data
3010 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
3011 */
3012 if((state=(uint8_t)(cnv->mode))==0) {
3013 state=cnv->sharedData->mbcs.dbcsOnlyState;
3014 }
b75a7d8f
A
3015
3016 /* conversion loop */
374ca955
A
3017 c=U_SENTINEL;
3018 while(source<sourceLimit) {
b75a7d8f
A
3019 entry=stateTable[state][*source++];
3020 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3021 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3022 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
374ca955
A
3023
3024 /* optimization for 1/2-byte input and BMP output */
3025 if( source<sourceLimit &&
3026 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
3027 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
3028 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
3029 ) {
3030 ++source;
3031 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
3032 /* output BMP code point */
3033 break;
3034 }
b75a7d8f 3035 } else {
374ca955
A
3036 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
3037 cnv->mode=state;
3038
3039 /* set the next state early so that we can reuse the entry variable */
3040 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
b75a7d8f
A
3041
3042 /*
3043 * An if-else-if chain provides more reliable performance for
3044 * the most common cases compared to a switch.
3045 */
3046 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
374ca955
A
3047 if(action==MBCS_STATE_VALID_DIRECT_16) {
3048 /* output BMP code point */
3049 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3050 break;
3051 } else if(action==MBCS_STATE_VALID_16) {
b75a7d8f 3052 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
374ca955
A
3053 c=unicodeCodeUnits[offset];
3054 if(c<0xfffe) {
3055 /* output BMP code point */
3056 break;
3057 } else if(c==0xfffe) {
3058 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) {
3059 break;
3060 }
b75a7d8f 3061 } else {
374ca955
A
3062 /* callback(illegal) */
3063 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 3064 }
b75a7d8f
A
3065 } else if(action==MBCS_STATE_VALID_16_PAIR) {
3066 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
374ca955
A
3067 c=unicodeCodeUnits[offset++];
3068 if(c<0xd800) {
b75a7d8f 3069 /* output BMP code point below 0xd800 */
374ca955
A
3070 break;
3071 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
b75a7d8f 3072 /* output roundtrip or fallback supplementary code point */
374ca955
A
3073 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
3074 break;
3075 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
b75a7d8f 3076 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
374ca955
A
3077 c=unicodeCodeUnits[offset];
3078 break;
3079 } else if(c==0xffff) {
3080 /* callback(illegal) */
3081 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 3082 }
374ca955
A
3083 } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
3084 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
3085 ) {
b75a7d8f 3086 /* output supplementary code point */
374ca955
A
3087 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
3088 break;
b75a7d8f
A
3089 } else if(action==MBCS_STATE_CHANGE_ONLY) {
3090 /*
3091 * This serves as a state change without any output.
3092 * It is useful for reading simple stateful encodings,
3093 * for example using just Shift-In/Shift-Out codes.
3094 * The 21 unused bits may later be used for more sophisticated
3095 * state transitions.
3096 */
374ca955
A
3097 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {
3098 /* SI/SO are illegal for DBCS-only conversion */
3099 state=(uint8_t)(cnv->mode); /* restore the previous state */
3100
3101 /* callback(illegal) */
3102 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3103 }
3104 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3105 if(UCNV_TO_U_USE_FALLBACK(cnv)) {
3106 /* output BMP code point */
3107 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3108 break;
b75a7d8f
A
3109 }
3110 } else if(action==MBCS_STATE_UNASSIGNED) {
374ca955 3111 /* just fall through */
b75a7d8f 3112 } else if(action==MBCS_STATE_ILLEGAL) {
374ca955
A
3113 /* callback(illegal) */
3114 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f 3115 } else {
374ca955
A
3116 /* reserved (must never occur), or only state change */
3117 offset=0;
3118 lastSource=source;
3119 continue;
b75a7d8f
A
3120 }
3121
374ca955 3122 /* end of action codes: prepare for a new character */
b75a7d8f 3123 offset=0;
374ca955
A
3124
3125 if(U_FAILURE(*pErrorCode)) {
3126 /* callback(illegal) */
3127 break;
3128 } else /* unassigned sequence */ {
3129 /* defer to the generic implementation */
3130 cnv->toUnicodeStatus=0;
3131 cnv->mode=state;
3132 pArgs->source=(const char *)lastSource;
3133 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
3134 }
b75a7d8f 3135 }
374ca955 3136 }
b75a7d8f 3137
374ca955
A
3138 if(c<0) {
3139 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
374ca955
A
3140 /* incomplete character byte sequence */
3141 uint8_t *bytes=cnv->toUBytes;
3142 cnv->toULength=(int8_t)(source-lastSource);
3143 do {
3144 *bytes++=*lastSource++;
3145 } while(lastSource<source);
fd0068a8
A
3146 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3147 } else if(U_FAILURE(*pErrorCode)) {
3148 /* callback(illegal) */
3149 /*
3150 * Ticket 5691: consistent illegal sequences:
3151 * - We include at least the first byte in the illegal sequence.
3152 * - If any of the non-initial bytes could be the start of a character,
3153 * we stop the illegal sequence before the first one of those.
3154 */
3155 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
3156 uint8_t *bytes=cnv->toUBytes;
3157 *bytes++=*lastSource++; /* first byte */
3158 if(lastSource==source) {
3159 cnv->toULength=1;
3160 } else /* lastSource<source: multi-byte character */ {
3161 int8_t i;
3162 for(i=1;
3163 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
3164 ++i
3165 ) {
3166 *bytes++=*lastSource++;
3167 }
3168 cnv->toULength=i;
3169 source=lastSource;
3170 }
374ca955
A
3171 } else {
3172 /* no output because of empty input or only state changes */
3173 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
3174 }
3175 c=0xffff;
3176 }
3177
3178 /* set the converter state back into UConverter, ready for a new character */
3179 cnv->toUnicodeStatus=0;
3180 cnv->mode=state;
3181
3182 /* write back the updated pointer */
3183 pArgs->source=(const char *)source;
3184 return c;
b75a7d8f
A
3185}
3186
#if 0
/*
 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
 * Removal improves code coverage.
 */
/**
 * Single-byte, single-state variant of ucnv_MBCSSimpleGetNextUChar():
 * maps the one byte b through state 0 of the state table.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extToU()).
 *
 * Return value:
 * U+fffe   unassigned, or a fallback that is not to be used
 * U+ffff   illegal or reserved action code
 * otherwise the Unicode code point
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                                  uint8_t b, UBool useFallback) {
    uint8_t actionCode;
    int32_t finalEntry=sharedData->mbcs.stateTable[0][b];
    /* MBCS_ENTRY_IS_FINAL(finalEntry) */

    /* most common case first: direct roundtrip to a BMP code point */
    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(finalEntry)) {
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(finalEntry);
    }

    /* all other action codes, each one returning directly */
    actionCode=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(finalEntry));
    if(actionCode==MBCS_STATE_VALID_DIRECT_20) {
        /* roundtrip to a supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(finalEntry);
    }
    if(actionCode==MBCS_STATE_FALLBACK_DIRECT_16) {
        /* fallback to a BMP code point, if fallbacks are used */
        return TO_U_USE_FALLBACK(useFallback) ?
            (UChar32)(UChar)MBCS_ENTRY_FINAL_VALUE_16(finalEntry) :
            (UChar32)0xfffe;
    }
    if(actionCode==MBCS_STATE_FALLBACK_DIRECT_20) {
        /* fallback to a supplementary code point, if fallbacks are used */
        return TO_U_USE_FALLBACK(useFallback) ?
            (UChar32)(0x10000+MBCS_ENTRY_FINAL_VALUE(finalEntry)) :
            (UChar32)0xfffe;
    }
    if(actionCode==MBCS_STATE_UNASSIGNED) {
        return 0xfffe;
    }

    /* MBCS_STATE_ILLEGAL, or reserved action codes which must never occur */
    return 0xffff;
}
#endif
3241
374ca955
A
3242/*
3243 * This is a simple version of _MBCSGetNextUChar() that is used
3244 * by other converter implementations.
3245 * It only returns an "assigned" result if it consumes the entire input.
3246 * It does not use state from the converter, nor error codes.
3247 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
3248 * It handles conversion extensions but not GB 18030.
3249 *
3250 * Return value:
3251 * U+fffe unassigned
3252 * U+ffff illegal
3253 * otherwise the Unicode code point
3254 */
3255U_CFUNC UChar32
3256ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
3257 const char *source, int32_t length,
3258 UBool useFallback) {
3259 const int32_t (*stateTable)[256];
3260 const uint16_t *unicodeCodeUnits;
3261
3262 uint32_t offset;
3263 uint8_t state, action;
3264
3265 UChar32 c;
3266 int32_t i, entry;
3267
3268 if(length<=0) {
3269 /* no input at all: "illegal" */
3270 return 0xffff;
3271 }
3272
3273#if 0
3274/*
3275 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
3276 * TODO In future releases, verify that this function is never called for SBCS
3277 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
3278 * Removal improves code coverage.
3279 */
3280 /* use optimized function if possible */
3281 if(sharedData->mbcs.countStates==1) {
3282 if(length==1) {
3283 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
3284 } else {
3285 return 0xffff; /* illegal: more than a single byte for an SBCS converter */
3286 }
3287 }
3288#endif
3289
3290 /* set up the local pointers */
3291 stateTable=sharedData->mbcs.stateTable;
3292 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
3293
3294 /* converter state */
3295 offset=0;
3296 state=sharedData->mbcs.dbcsOnlyState;
3297
3298 /* conversion loop */
3299 for(i=0;;) {
3300 entry=stateTable[state][(uint8_t)source[i++]];
3301 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
3302 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
3303 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
3304
3305 if(i==length) {
3306 return 0xffff; /* truncated character */
3307 }
3308 } else {
3309 /*
3310 * An if-else-if chain provides more reliable performance for
3311 * the most common cases compared to a switch.
3312 */
3313 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
3314 if(action==MBCS_STATE_VALID_16) {
3315 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3316 c=unicodeCodeUnits[offset];
3317 if(c!=0xfffe) {
3318 /* done */
3319 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
3320 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);
3321 /* else done with 0xfffe */
3322 }
3323 break;
3324 } else if(action==MBCS_STATE_VALID_DIRECT_16) {
3325 /* output BMP code point */
3326 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3327 break;
3328 } else if(action==MBCS_STATE_VALID_16_PAIR) {
3329 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
3330 c=unicodeCodeUnits[offset++];
3331 if(c<0xd800) {
3332 /* output BMP code point below 0xd800 */
3333 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
3334 /* output roundtrip or fallback supplementary code point */
3335 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
3336 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
3337 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
3338 c=unicodeCodeUnits[offset];
3339 } else if(c==0xffff) {
3340 return 0xffff;
3341 } else {
3342 c=0xfffe;
3343 }
3344 break;
3345 } else if(action==MBCS_STATE_VALID_DIRECT_20) {
3346 /* output supplementary code point */
3347 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3348 break;
3349 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
3350 if(!TO_U_USE_FALLBACK(useFallback)) {
3351 c=0xfffe;
3352 break;
3353 }
3354 /* output BMP code point */
3355 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
3356 break;
3357 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
3358 if(!TO_U_USE_FALLBACK(useFallback)) {
3359 c=0xfffe;
3360 break;
3361 }
3362 /* output supplementary code point */
3363 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
3364 break;
3365 } else if(action==MBCS_STATE_UNASSIGNED) {
3366 c=0xfffe;
3367 break;
3368 }
3369
3370 /*
3371 * forbid MBCS_STATE_CHANGE_ONLY for this function,
3372 * and MBCS_STATE_ILLEGAL and reserved action codes
3373 */
3374 return 0xffff;
3375 }
3376 }
3377
3378 if(i!=length) {
3379 /* illegal for this function: not all input consumed */
3380 return 0xffff;
3381 }
3382
3383 if(c==0xfffe) {
3384 /* try an extension mapping */
3385 const int32_t *cx=sharedData->mbcs.extIndexes;
3386 if(cx!=NULL) {
3387 return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
3388 }
3389 }
3390
3391 return c;
3392}
3393
b75a7d8f
A
3394/* MBCS-from-Unicode conversion functions ----------------------------------- */
3395
374ca955
A
3396/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
3397static void
3398ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
3399 UErrorCode *pErrorCode) {
b75a7d8f
A
3400 UConverter *cnv;
3401 const UChar *source, *sourceLimit;
3402 uint8_t *target;
3403 int32_t targetCapacity;
3404 int32_t *offsets;
3405
3406 const uint16_t *table;
46f4442e 3407 const uint16_t *mbcsIndex;
374ca955 3408 const uint8_t *bytes;
b75a7d8f
A
3409
3410 UChar32 c;
3411
374ca955 3412 int32_t sourceIndex, nextSourceIndex;
b75a7d8f 3413
b75a7d8f 3414 uint32_t stage2Entry;
46f4442e 3415 uint32_t asciiRoundtrips;
b75a7d8f 3416 uint32_t value;
b75a7d8f
A
3417 uint8_t unicodeMask;
3418
3419 /* use optimized function if possible */
3420 cnv=pArgs->converter;
374ca955 3421 unicodeMask=cnv->sharedData->mbcs.unicodeMask;
b75a7d8f
A
3422
3423 /* set up the local pointers */
3424 source=pArgs->source;
3425 sourceLimit=pArgs->sourceLimit;
3426 target=(uint8_t *)pArgs->target;
73c04bcf 3427 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
3428 offsets=pArgs->offsets;
3429
374ca955 3430 table=cnv->sharedData->mbcs.fromUnicodeTable;
46f4442e 3431 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
b75a7d8f 3432 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 3433 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
b75a7d8f 3434 } else {
374ca955 3435 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
b75a7d8f 3436 }
46f4442e 3437 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
b75a7d8f
A
3438
3439 /* get the converter state from UConverter */
374ca955 3440 c=cnv->fromUChar32;
b75a7d8f
A
3441
3442 /* sourceIndex=-1 if the current character began in the previous buffer */
b75a7d8f
A
3443 sourceIndex= c==0 ? 0 : -1;
3444 nextSourceIndex=0;
3445
3446 /* conversion loop */
b75a7d8f
A
3447 if(c!=0 && targetCapacity>0) {
3448 goto getTrail;
3449 }
3450
3451 while(source<sourceLimit) {
3452 /*
3453 * This following test is to see if available input would overflow the output.
3454 * It does not catch output of more than one byte that
3455 * overflows as a result of a multi-byte character or callback output
3456 * from the last source character.
3457 * Therefore, those situations also test for overflows and will
3458 * then break the loop, too.
3459 */
3460 if(targetCapacity>0) {
3461 /*
3462 * Get a correct Unicode code point:
3463 * a single UChar for a BMP code point or
3464 * a matched surrogate pair for a "supplementary code point".
3465 */
3466 c=*source++;
3467 ++nextSourceIndex;
46f4442e
A
3468 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3469 *target++=(uint8_t)c;
3470 if(offsets!=NULL) {
3471 *offsets++=sourceIndex;
3472 sourceIndex=nextSourceIndex;
3473 }
3474 --targetCapacity;
3475 c=0;
3476 continue;
3477 }
b75a7d8f 3478 /*
46f4442e
A
3479 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
3480 * to avoid dealing with surrogates.
3481 * MBCS_FAST_MAX must be >=0xd7ff.
b75a7d8f 3482 */
46f4442e
A
3483 if(c<=0xd7ff) {
3484 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c);
3485 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
3486 if(value==0) {
3487 goto unassigned;
3488 }
3489 /* output the value */
3490 } else {
3491 /*
3492 * This also tests if the codepage maps single surrogates.
3493 * If it does, then surrogates are not paired but mapped separately.
3494 * Note that in this case unmatched surrogates are not detected.
3495 */
4388f060
A
3496 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
3497 if(U16_IS_SURROGATE_LEAD(c)) {
b75a7d8f 3498getTrail:
46f4442e
A
3499 if(source<sourceLimit) {
3500 /* test the following code unit */
3501 UChar trail=*source;
4388f060 3502 if(U16_IS_TRAIL(trail)) {
46f4442e
A
3503 ++source;
3504 ++nextSourceIndex;
4388f060 3505 c=U16_GET_SUPPLEMENTARY(c, trail);
46f4442e
A
3506 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
3507 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3508 /* callback(unassigned) */
3509 goto unassigned;
3510 }
3511 /* convert this supplementary code point */
3512 /* exit this condition tree */
3513 } else {
3514 /* this is an unmatched lead code unit (1st surrogate) */
3515 /* callback(illegal) */
3516 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
3517 break;
b75a7d8f 3518 }
b75a7d8f 3519 } else {
46f4442e 3520 /* no more input */
374ca955 3521 break;
b75a7d8f
A
3522 }
3523 } else {
46f4442e
A
3524 /* this is an unmatched trail code unit (2nd surrogate) */
3525 /* callback(illegal) */
3526 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
b75a7d8f
A
3527 break;
3528 }
b75a7d8f 3529 }
b75a7d8f 3530
46f4442e
A
3531 /* convert the Unicode code point in c into codepage bytes */
3532 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
b75a7d8f 3533
46f4442e
A
3534 /* get the bytes and the length for the output */
3535 /* MBCS_OUTPUT_2 */
3536 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
b75a7d8f 3537
46f4442e
A
3538 /* is this code point assigned, or do we use fallbacks? */
3539 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
3540 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
3541 ) {
3542 /*
3543 * We allow a 0 byte output if the "assigned" bit is set for this entry.
3544 * There is no way with this data structure for fallback output
3545 * to be a zero byte.
3546 */
b75a7d8f 3547
374ca955 3548unassigned:
46f4442e
A
3549 /* try an extension mapping */
3550 pArgs->source=source;
3551 c=_extFromU(cnv, cnv->sharedData,
3552 c, &source, sourceLimit,
3553 &target, target+targetCapacity,
3554 &offsets, sourceIndex,
3555 pArgs->flush,
3556 pErrorCode);
3557 nextSourceIndex+=(int32_t)(source-pArgs->source);
3558
3559 if(U_FAILURE(*pErrorCode)) {
3560 /* not mappable or buffer overflow */
3561 break;
3562 } else {
3563 /* a mapping was written to the target, continue */
b75a7d8f 3564
46f4442e
A
3565 /* recalculate the targetCapacity after an extension mapping */
3566 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
374ca955 3567
46f4442e
A
3568 /* normal end of conversion: prepare for a new character */
3569 sourceIndex=nextSourceIndex;
3570 continue;
3571 }
b75a7d8f 3572 }
374ca955 3573 }
b75a7d8f 3574
374ca955
A
3575 /* write the output character bytes from value and length */
3576 /* from the first if in the loop we know that targetCapacity>0 */
46f4442e 3577 if(value<=0xff) {
374ca955
A
3578 /* this is easy because we know that there is enough space */
3579 *target++=(uint8_t)value;
3580 if(offsets!=NULL) {
3581 *offsets++=sourceIndex;
3582 }
3583 --targetCapacity;
3584 } else /* length==2 */ {
3585 *target++=(uint8_t)(value>>8);
3586 if(2<=targetCapacity) {
3587 *target++=(uint8_t)value;
b75a7d8f
A
3588 if(offsets!=NULL) {
3589 *offsets++=sourceIndex;
b75a7d8f
A
3590 *offsets++=sourceIndex;
3591 }
374ca955
A
3592 targetCapacity-=2;
3593 } else {
b75a7d8f
A
3594 if(offsets!=NULL) {
3595 *offsets++=sourceIndex;
3596 }
374ca955
A
3597 cnv->charErrorBuffer[0]=(char)value;
3598 cnv->charErrorBufferLength=1;
3599
3600 /* target overflow */
3601 targetCapacity=0;
3602 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3603 c=0;
b75a7d8f
A
3604 break;
3605 }
b75a7d8f
A
3606 }
3607
3608 /* normal end of conversion: prepare for a new character */
3609 c=0;
374ca955 3610 sourceIndex=nextSourceIndex;
b75a7d8f 3611 continue;
b75a7d8f
A
3612 } else {
3613 /* target is full */
3614 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3615 break;
3616 }
3617 }
3618
b75a7d8f 3619 /* set the converter state back into UConverter */
374ca955 3620 cnv->fromUChar32=c;
b75a7d8f
A
3621
3622 /* write back the updated pointers */
3623 pArgs->source=source;
3624 pArgs->target=(char *)target;
3625 pArgs->offsets=offsets;
3626}
3627
374ca955 3628/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
b75a7d8f 3629static void
374ca955 3630ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
b75a7d8f
A
3631 UErrorCode *pErrorCode) {
3632 UConverter *cnv;
3633 const UChar *source, *sourceLimit;
3634 uint8_t *target;
3635 int32_t targetCapacity;
3636 int32_t *offsets;
3637
3638 const uint16_t *table;
374ca955 3639 const uint16_t *results;
b75a7d8f
A
3640
3641 UChar32 c;
3642
3643 int32_t sourceIndex, nextSourceIndex;
3644
b75a7d8f
A
3645 uint16_t value, minValue;
3646 UBool hasSupplementary;
3647
3648 /* set up the local pointers */
3649 cnv=pArgs->converter;
3650 source=pArgs->source;
3651 sourceLimit=pArgs->sourceLimit;
3652 target=(uint8_t *)pArgs->target;
73c04bcf 3653 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
3654 offsets=pArgs->offsets;
3655
374ca955 3656 table=cnv->sharedData->mbcs.fromUnicodeTable;
b75a7d8f 3657 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 3658 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
b75a7d8f 3659 } else {
374ca955 3660 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
b75a7d8f
A
3661 }
3662
3663 if(cnv->useFallback) {
3664 /* use all roundtrip and fallback results */
3665 minValue=0x800;
3666 } else {
3667 /* use only roundtrips and fallbacks from private-use characters */
3668 minValue=0xc00;
3669 }
374ca955 3670 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
b75a7d8f
A
3671
3672 /* get the converter state from UConverter */
374ca955 3673 c=cnv->fromUChar32;
b75a7d8f
A
3674
3675 /* sourceIndex=-1 if the current character began in the previous buffer */
3676 sourceIndex= c==0 ? 0 : -1;
3677 nextSourceIndex=0;
3678
3679 /* conversion loop */
3680 if(c!=0 && targetCapacity>0) {
3681 goto getTrail;
3682 }
3683
3684 while(source<sourceLimit) {
3685 /*
3686 * This following test is to see if available input would overflow the output.
3687 * It does not catch output of more than one byte that
3688 * overflows as a result of a multi-byte character or callback output
3689 * from the last source character.
3690 * Therefore, those situations also test for overflows and will
3691 * then break the loop, too.
3692 */
3693 if(targetCapacity>0) {
3694 /*
3695 * Get a correct Unicode code point:
3696 * a single UChar for a BMP code point or
3697 * a matched surrogate pair for a "supplementary code point".
3698 */
3699 c=*source++;
3700 ++nextSourceIndex;
4388f060
A
3701 if(U16_IS_SURROGATE(c)) {
3702 if(U16_IS_SURROGATE_LEAD(c)) {
b75a7d8f
A
3703getTrail:
3704 if(source<sourceLimit) {
3705 /* test the following code unit */
3706 UChar trail=*source;
4388f060 3707 if(U16_IS_TRAIL(trail)) {
b75a7d8f
A
3708 ++source;
3709 ++nextSourceIndex;
4388f060 3710 c=U16_GET_SUPPLEMENTARY(c, trail);
b75a7d8f
A
3711 if(!hasSupplementary) {
3712 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
3713 /* callback(unassigned) */
3714 goto unassigned;
3715 }
3716 /* convert this supplementary code point */
3717 /* exit this condition tree */
3718 } else {
3719 /* this is an unmatched lead code unit (1st surrogate) */
3720 /* callback(illegal) */
b75a7d8f 3721 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3722 break;
b75a7d8f
A
3723 }
3724 } else {
3725 /* no more input */
3726 break;
3727 }
3728 } else {
3729 /* this is an unmatched trail code unit (2nd surrogate) */
3730 /* callback(illegal) */
b75a7d8f 3731 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3732 break;
b75a7d8f
A
3733 }
3734 }
3735
3736 /* convert the Unicode code point in c into codepage bytes */
3737 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3738
3739 /* is this code point assigned, or do we use fallbacks? */
3740 if(value>=minValue) {
3741 /* assigned, write the output character bytes from value and length */
3742 /* length==1 */
3743 /* this is easy because we know that there is enough space */
3744 *target++=(uint8_t)value;
3745 if(offsets!=NULL) {
3746 *offsets++=sourceIndex;
3747 }
3748 --targetCapacity;
3749
3750 /* normal end of conversion: prepare for a new character */
3751 c=0;
3752 sourceIndex=nextSourceIndex;
b75a7d8f 3753 } else { /* unassigned */
b75a7d8f 3754unassigned:
374ca955
A
3755 /* try an extension mapping */
3756 pArgs->source=source;
3757 c=_extFromU(cnv, cnv->sharedData,
3758 c, &source, sourceLimit,
46f4442e 3759 &target, target+targetCapacity,
374ca955
A
3760 &offsets, sourceIndex,
3761 pArgs->flush,
3762 pErrorCode);
3763 nextSourceIndex+=(int32_t)(source-pArgs->source);
3764
3765 if(U_FAILURE(*pErrorCode)) {
3766 /* not mappable or buffer overflow */
3767 break;
3768 } else {
3769 /* a mapping was written to the target, continue */
b75a7d8f 3770
374ca955 3771 /* recalculate the targetCapacity after an extension mapping */
73c04bcf 3772 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
b75a7d8f 3773
374ca955
A
3774 /* normal end of conversion: prepare for a new character */
3775 sourceIndex=nextSourceIndex;
3776 }
b75a7d8f 3777 }
b75a7d8f
A
3778 } else {
3779 /* target is full */
3780 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
3781 break;
3782 }
3783 }
3784
374ca955
A
3785 /* set the converter state back into UConverter */
3786 cnv->fromUChar32=c;
b75a7d8f
A
3787
3788 /* write back the updated pointers */
3789 pArgs->source=source;
3790 pArgs->target=(char *)target;
3791 pArgs->offsets=offsets;
3792}
3793
3794/*
374ca955 3795 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages
b75a7d8f
A
3796 * that map only to and from the BMP.
3797 * In addition to single-byte/state optimizations, the offset calculations
3798 * become much easier.
46f4442e
A
3799 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,
3800 * but measurements have shown that this diminishes performance
3801 * in more cases than it improves it.
3802 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
3803 * for various MBCS and SBCS optimizations.
b75a7d8f
A
3804 */
/*
 * Convert UTF-16 input to bytes of a single-byte codepage whose table
 * maps only BMP code points, optionally recording source offsets.
 *
 * pArgs supplies the converter, the source/sourceLimit and target/targetLimit
 * ranges, the optional offsets array (may be NULL) and the flush flag.
 * The advanced source, target and offsets pointers are written back to pArgs
 * before returning, and the pending code point (0 if none) is stored in
 * cnv->fromUChar32.
 *
 * pErrorCode is set to
 * - U_ILLEGAL_CHAR_FOUND for an unpaired lead or trail surrogate,
 * - U_TRUNCATED_CHAR_FOUND for a lead surrogate at the end of the input
 *   when pArgs->flush is set,
 * - U_BUFFER_OVERFLOW_ERROR when input remains but the target is full,
 * - or whatever _extFromU() reports for a code point without a table mapping.
 *
 * Offsets are not written per character inside the loop: they are filled in
 * in bulk for the range lastSource..source, either right before an extension
 * lookup or once after the loop.
 */
3805static void
374ca955 3806ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
b75a7d8f
A
3807 UErrorCode *pErrorCode) {
3808 UConverter *cnv;
3809 const UChar *source, *sourceLimit, *lastSource;
3810 uint8_t *target;
3811 int32_t targetCapacity, length;
3812 int32_t *offsets;
3813
3814 const uint16_t *table;
3815 const uint16_t *results;
3816
3817 UChar32 c;
3818
3819 int32_t sourceIndex;
3820
46f4442e 3821 uint32_t asciiRoundtrips;
b75a7d8f
A
3822 uint16_t value, minValue;
3823
3824 /* set up the local pointers */
3825 cnv=pArgs->converter;
3826 source=pArgs->source;
3827 sourceLimit=pArgs->sourceLimit;
3828 target=(uint8_t *)pArgs->target;
73c04bcf 3829 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
b75a7d8f
A
3830 offsets=pArgs->offsets;
3831
/* pick the result table; a separate one is used when the swap-LF/NL option is set */
374ca955 3832 table=cnv->sharedData->mbcs.fromUnicodeTable;
b75a7d8f 3833 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
374ca955 3834 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
b75a7d8f 3835 } else {
374ca955 3836 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
b75a7d8f 3837 }
46f4442e 3838 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
b75a7d8f
A
3839
/* minValue is the threshold below which a table result is treated as "no mapping" */
3840 if(cnv->useFallback) {
3841 /* use all roundtrip and fallback results */
3842 minValue=0x800;
3843 } else {
3844 /* use only roundtrips and fallbacks from private-use characters */
3845 minValue=0xc00;
3846 }
3847
3848 /* get the converter state from UConverter */
374ca955 3849 c=cnv->fromUChar32;
b75a7d8f
A
3850
3851 /* sourceIndex=-1 if the current character began in the previous buffer */
3852 sourceIndex= c==0 ? 0 : -1;
/* lastSource marks the first source unit whose offset has not been written yet */
3853 lastSource=source;
3854
3855 /*
3856 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
3857 * for the minimum of the sourceLength and targetCapacity
3858 */
73c04bcf 3859 length=(int32_t)(sourceLimit-source);
b75a7d8f
A
3860 if(length<targetCapacity) {
3861 targetCapacity=length;
3862 }
3863
3864 /* conversion loop */
/* a non-zero c was left over from a previous call: resume at the trail-surrogate check inside the loop */
3865 if(c!=0 && targetCapacity>0) {
3866 goto getTrail;
3867 }
3868
3869#if MBCS_UNROLL_SINGLE_FROM_BMP
3870 /* unrolling makes it slower on Pentium III/Windows 2000?! */
3871 /* unroll the loop with the most common case */
3872unrolled:
3873 if(targetCapacity>=4) {
3874 int32_t count, loops;
3875 uint16_t andedValues;
3876
/*
 * Convert four units at a time without checking each result:
 * the four results are ANDed together and the combined value is
 * compared against minValue once per group.
 */
3877 loops=count=targetCapacity>>2;
3878 do {
3879 c=*source++;
3880 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3881 *target++=(uint8_t)value;
3882 c=*source++;
3883 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3884 *target++=(uint8_t)value;
3885 c=*source++;
3886 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3887 *target++=(uint8_t)value;
3888 c=*source++;
3889 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
3890 *target++=(uint8_t)value;
3891
3892 /* were all 4 entries really valid? */
3893 if(andedValues<minValue) {
3894 /* no, return to the first of these 4 */
3895 source-=4;
3896 target-=4;
3897 break;
3898 }
3899 } while(--count>0);
/* count becomes the number of 4-unit groups that were kept (a rolled-back group is not counted) */
3900 count=loops-count;
3901 targetCapacity-=4*count;
3902
/* write the offsets for the kept groups and move lastSource past them */
3903 if(offsets!=NULL) {
3904 lastSource+=4*count;
3905 while(count>0) {
3906 *offsets++=sourceIndex++;
3907 *offsets++=sourceIndex++;
3908 *offsets++=sourceIndex++;
3909 *offsets++=sourceIndex++;
3910 --count;
3911 }
3912 }
3913
3914 c=0;
3915 }
3916#endif
3917
/* targetCapacity was clamped to the source length above, so it also bounds the reads from source */
3918 while(targetCapacity>0) {
3919 /*
3920 * Get a correct Unicode code point:
3921 * a single UChar for a BMP code point or
3922 * a matched surrogate pair for a "supplementary code point".
3923 */
3924 c=*source++;
3925 /*
3926 * Do not immediately check for single surrogates:
3927 * Assume that they are unassigned and check for them in that case.
3928 * This speeds up the conversion of assigned characters.
3929 */
3930 /* convert the Unicode code point in c into codepage bytes */
46f4442e
A
/* ASCII fast path: when IS_ASCII_ROUNDTRIP holds for c, c itself is written as the output byte */
3931 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
3932 *target++=(uint8_t)c;
3933 --targetCapacity;
3934 c=0;
3935 continue;
3936 }
b75a7d8f 3937 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
b75a7d8f
A
3938 /* is this code point assigned, or do we use fallbacks? */
3939 if(value>=minValue) {
3940 /* assigned, write the output character bytes from value and length */
3941 /* length==1 */
3942 /* this is easy because we know that there is enough space */
3943 *target++=(uint8_t)value;
3944 --targetCapacity;
3945
3946 /* normal end of conversion: prepare for a new character */
3947 c=0;
3948 continue;
4388f060 3949 } else if(!U16_IS_SURROGATE(c)) {
b75a7d8f 3950 /* normal, unassigned BMP character */
4388f060 3951 } else if(U16_IS_SURROGATE_LEAD(c)) {
b75a7d8f
A
/* also entered via the goto before the loop when cnv->fromUChar32 held a non-zero value from a previous call */
3952getTrail:
3953 if(source<sourceLimit) {
3954 /* test the following code unit */
3955 UChar trail=*source;
4388f060 3956 if(U16_IS_TRAIL(trail)) {
b75a7d8f 3957 ++source;
4388f060 3958 c=U16_GET_SUPPLEMENTARY(c, trail);
b75a7d8f
A
3959 /* this codepage does not map supplementary code points */
3960 /* callback(unassigned) */
b75a7d8f
A
3961 } else {
3962 /* this is an unmatched lead code unit (1st surrogate) */
3963 /* callback(illegal) */
b75a7d8f 3964 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3965 break;
b75a7d8f
A
3966 }
3967 } else {
3968 /* no more input */
46f4442e
A
/* without flush, c keeps the lead surrogate and is saved in cnv->fromUChar32 after the loop */
3969 if (pArgs->flush) {
3970 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
3971 }
b75a7d8f
A
3972 break;
3973 }
3974 } else {
3975 /* this is an unmatched trail code unit (2nd surrogate) */
3976 /* callback(illegal) */
b75a7d8f 3977 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955 3978 break;
b75a7d8f
A
3979 }
3980
374ca955 3981 /* c does not have a mapping */
b75a7d8f 3982
374ca955
A
3983 /* get the number of code units for c to correctly advance sourceIndex */
3984 length=U16_LENGTH(c);
3985
3986 /* set offsets since the start or the last extension */
b75a7d8f
A
3987 if(offsets!=NULL) {
3988 int32_t count=(int32_t)(source-lastSource);
3989
374ca955
A
3990 /* do not set the offset for this character */
3991 count-=length;
3992
3993 while(count>0) {
3994 *offsets++=sourceIndex++;
3995 --count;
3996 }
3997 /* offsets and sourceIndex are now set for the current character */
3998 }
3999
4000 /* try an extension mapping */
/*
 * targetCapacity may have been clamped to the remaining source length,
 * so the unclamped pArgs->targetLimit is passed as the output limit.
 */
4001 lastSource=source;
4002 c=_extFromU(cnv, cnv->sharedData,
4003 c, &source, sourceLimit,
46f4442e 4004 &target, (const uint8_t *)(pArgs->targetLimit),
374ca955
A
4005 &offsets, sourceIndex,
4006 pArgs->flush,
4007 pErrorCode);
/* skip the current code point (length units) plus however far _extFromU advanced source */
4008 sourceIndex+=length+(int32_t)(source-lastSource);
4009 lastSource=source;
4010
4011 if(U_FAILURE(*pErrorCode)) {
4012 /* not mappable or buffer overflow */
4013 break;
4014 } else {
4015 /* a mapping was written to the target, continue */
4016
4017 /* recalculate the targetCapacity after an extension mapping */
73c04bcf
A
4018 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4019 length=(int32_t)(sourceLimit-source);
374ca955
A
4020 if(length<targetCapacity) {
4021 targetCapacity=length;
4022 }
4023 }
4024
4025#if MBCS_UNROLL_SINGLE_FROM_BMP
4026 /* unrolling makes it slower on Pentium III/Windows 2000?! */
4027 goto unrolled;
4028#endif
4029 }
4030
/* the loop also ends when the clamped counter reaches 0; report overflow only if input remains and the real target is full */
4031 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
4032 /* target is full */
4033 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4034 }
4035
4036 /* set offsets since the start or the last callback */
4037 if(offsets!=NULL) {
4038 size_t count=source-lastSource;
46f4442e
A
4039 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
4040 /*
4041 Caller gave us a partial supplementary character,
4042 which this function couldn't convert in any case.
4043 The callback will handle the offset.
4044 */
4045 count--;
4046 }
374ca955
A
4047 while(count>0) {
4048 *offsets++=sourceIndex++;
4049 --count;
4050 }
4051 }
4052
4053 /* set the converter state back into UConverter */
4054 cnv->fromUChar32=c;
4055
4056 /* write back the updated pointers */
4057 pArgs->source=source;
4058 pArgs->target=(char *)target;
4059 pArgs->offsets=offsets;
4060}
4061
4062U_CFUNC void
4063ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
4064 UErrorCode *pErrorCode) {
4065 UConverter *cnv;
4066 const UChar *source, *sourceLimit;
4067 uint8_t *target;
4068 int32_t targetCapacity;
4069 int32_t *offsets;
4070
4071 const uint16_t *table;
46f4442e 4072 const uint16_t *mbcsIndex;
374ca955
A
4073 const uint8_t *p, *bytes;
4074 uint8_t outputType;
4075
4076 UChar32 c;
4077
4078 int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
4079
4080 uint32_t stage2Entry;
46f4442e 4081 uint32_t asciiRoundtrips;
374ca955 4082 uint32_t value;
51004dcb
A
4083 /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
4084 uint8_t siBytes[2] = {0, 0};
4085 uint8_t soBytes[2] = {0, 0};
4086 uint8_t siLength, soLength;
729e4ab9 4087 int32_t length = 0, prevLength;
374ca955
A
4088 uint8_t unicodeMask;
4089
4090 cnv=pArgs->converter;
4091
4092 if(cnv->preFromUFirstCP>=0) {
4093 /*
4094 * pass sourceIndex=-1 because we continue from an earlier buffer
4095 * in the future, this may change with continuous offsets
4096 */
4097 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
4098
4099 if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
4100 return;
4101 }
4102 }
4103
4104 /* use optimized function if possible */
4105 outputType=cnv->sharedData->mbcs.outputType;
4106 unicodeMask=cnv->sharedData->mbcs.unicodeMask;
4107 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4108 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4109 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
4110 } else {
4111 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
4112 }
4113 return;
46f4442e 4114 } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
374ca955
A
4115 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
4116 return;
4117 }
4118
4119 /* set up the local pointers */
4120 source=pArgs->source;
4121 sourceLimit=pArgs->sourceLimit;
4122 target=(uint8_t *)pArgs->target;
73c04bcf 4123 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
374ca955
A
4124 offsets=pArgs->offsets;
4125
4126 table=cnv->sharedData->mbcs.fromUnicodeTable;
46f4442e
A
4127 if(cnv->sharedData->mbcs.utf8Friendly) {
4128 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
4129 } else {
4130 mbcsIndex=NULL;
4131 }
374ca955
A
4132 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
4133 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
4134 } else {
4135 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
4136 }
46f4442e 4137 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
374ca955
A
4138
4139 /* get the converter state from UConverter */
4140 c=cnv->fromUChar32;
4141
4142 if(outputType==MBCS_OUTPUT_2_SISO) {
4143 prevLength=cnv->fromUnicodeStatus;
4144 if(prevLength==0) {
4145 /* set the real value */
4146 prevLength=1;
4147 }
4148 } else {
4149 /* prevent fromUnicodeStatus from being set to something non-0 */
4150 prevLength=0;
4151 }
4152
4153 /* sourceIndex=-1 if the current character began in the previous buffer */
4154 prevSourceIndex=-1;
4155 sourceIndex= c==0 ? 0 : -1;
4156 nextSourceIndex=0;
4157
729e4ab9 4158 /* Get the SI/SO character for the converter */
51004dcb
A
4159 siLength = getSISOBytes(SI, cnv->options, siBytes);
4160 soLength = getSISOBytes(SO, cnv->options, soBytes);
729e4ab9 4161
374ca955
A
4162 /* conversion loop */
4163 /*
4164 * This is another piece of ugly code:
4165 * A goto into the loop if the converter state contains a first surrogate
4166 * from the previous function call.
4167 * It saves me from checking if(c==0) in each loop iteration
4168 * and duplicating the trail-surrogate-handling code in the else
4169 * branch of that check.
4170 * I could not find any other way to get around this other than
4171 * using a function call for the conversion and callback, which would
4172 * be even more inefficient.
4173 *
4174 * Markus Scherer 2000-jul-19
4175 */
4176 if(c!=0 && targetCapacity>0) {
4177 goto getTrail;
4178 }
4179
4180 while(source<sourceLimit) {
4181 /*
4182 * The following test checks whether the available input would overflow the output.
4183 * It does not catch output of more than one byte that
4184 * overflows as a result of a multi-byte character or callback output
4185 * from the last source character.
4186 * Therefore, those situations also test for overflows and will
4187 * then break the loop, too.
4188 */
4189 if(targetCapacity>0) {
4190 /*
4191 * Get a correct Unicode code point:
4192 * a single UChar for a BMP code point or
4193 * a matched surrogate pair for a "supplementary code point".
4194 */
4195 c=*source++;
4196 ++nextSourceIndex;
46f4442e
A
4197 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
4198 *target++=(uint8_t)c;
4199 if(offsets!=NULL) {
4200 *offsets++=sourceIndex;
4201 prevSourceIndex=sourceIndex;
4202 sourceIndex=nextSourceIndex;
4203 }
4204 --targetCapacity;
4205 c=0;
4206 continue;
4207 }
374ca955 4208 /*
46f4442e
A
4209 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
4210 * to avoid dealing with surrogates.
4211 * MBCS_FAST_MAX must be >=0xd7ff.
374ca955 4212 */
46f4442e
A
4213 if(c<=0xd7ff && mbcsIndex!=NULL) {
4214 value=mbcsIndex[c>>6];
4215
4216 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
4217 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
4218 switch(outputType) {
4219 case MBCS_OUTPUT_2:
4220 value=((const uint16_t *)bytes)[value +(c&0x3f)];
4221 if(value<=0xff) {
4222 if(value==0) {
4223 goto unassigned;
4224 } else {
4225 length=1;
4226 }
4227 } else {
4228 length=2;
4229 }
4230 break;
4231 case MBCS_OUTPUT_2_SISO:
4232 /* 1/2-byte stateful with Shift-In/Shift-Out */
4233 /*
4234 * Save the old state in the converter object
4235 * right here, then change the local prevLength state variable if necessary.
4236 * Then, if this character turns out to be unassigned or a fallback that
4237 * is not taken, the callback code must not save the new state in the converter
4238 * because the new state is for a character that is not output.
4239 * However, the callback must still restore the state from the converter
4240 * in case the callback function changed it for its output.
4241 */
4242 cnv->fromUnicodeStatus=prevLength; /* save the old state */
4243 value=((const uint16_t *)bytes)[value +(c&0x3f)];
4244 if(value<=0xff) {
4245 if(value==0) {
4246 goto unassigned;
4247 } else if(prevLength<=1) {
4248 length=1;
4249 } else {
4250 /* change from double-byte mode to single-byte */
51004dcb
A
4251 if (siLength == 1) {
4252 value|=(uint32_t)siBytes[0]<<8;
729e4ab9 4253 length = 2;
51004dcb
A
4254 } else if (siLength == 2) {
4255 value|=(uint32_t)siBytes[1]<<8;
4256 value|=(uint32_t)siBytes[0]<<16;
729e4ab9
A
4257 length = 3;
4258 }
46f4442e
A
4259 prevLength=1;
4260 }
4261 } else {
4262 if(prevLength==2) {
4263 length=2;
4264 } else {
4265 /* change from single-byte mode to double-byte */
51004dcb
A
4266 if (soLength == 1) {
4267 value|=(uint32_t)soBytes[0]<<16;
729e4ab9 4268 length = 3;
51004dcb
A
4269 } else if (soLength == 2) {
4270 value|=(uint32_t)soBytes[1]<<16;
4271 value|=(uint32_t)soBytes[0]<<24;
729e4ab9
A
4272 length = 4;
4273 }
46f4442e
A
4274 prevLength=2;
4275 }
4276 }
4277 break;
4278 case MBCS_OUTPUT_DBCS_ONLY:
4279 /* table with single-byte results, but only DBCS mappings used */
4280 value=((const uint16_t *)bytes)[value +(c&0x3f)];
4281 if(value<=0xff) {
4282 /* no mapping or SBCS result, not taken for DBCS-only */
4283 goto unassigned;
4284 } else {
4285 length=2;
4286 }
4287 break;
4288 case MBCS_OUTPUT_3:
4289 p=bytes+(value+(c&0x3f))*3;
4290 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4291 if(value<=0xff) {
4292 if(value==0) {
4293 goto unassigned;
4294 } else {
4295 length=1;
4296 }
4297 } else if(value<=0xffff) {
4298 length=2;
4299 } else {
4300 length=3;
4301 }
4302 break;
4303 case MBCS_OUTPUT_4:
4304 value=((const uint32_t *)bytes)[value +(c&0x3f)];
4305 if(value<=0xff) {
4306 if(value==0) {
4307 goto unassigned;
4308 } else {
4309 length=1;
4310 }
4311 } else if(value<=0xffff) {
4312 length=2;
4313 } else if(value<=0xffffff) {
4314 length=3;
4315 } else {
4316 length=4;
4317 }
4318 break;
4319 case MBCS_OUTPUT_3_EUC:
4320 value=((const uint16_t *)bytes)[value +(c&0x3f)];
4321 /* EUC 16-bit fixed-length representation */
4322 if(value<=0xff) {
4323 if(value==0) {
4324 goto unassigned;
4325 } else {
4326 length=1;
4327 }
4328 } else if((value&0x8000)==0) {
4329 value|=0x8e8000;
4330 length=3;
4331 } else if((value&0x80)==0) {
4332 value|=0x8f0080;
4333 length=3;
4334 } else {
4335 length=2;
4336 }
4337 break;
4338 case MBCS_OUTPUT_4_EUC:
4339 p=bytes+(value+(c&0x3f))*3;
4340 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4341 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4342 if(value<=0xff) {
4343 if(value==0) {
4344 goto unassigned;
4345 } else {
4346 length=1;
4347 }
4348 } else if(value<=0xffff) {
4349 length=2;
4350 } else if((value&0x800000)==0) {
4351 value|=0x8e800000;
4352 length=4;
4353 } else if((value&0x8000)==0) {
4354 value|=0x8f008000;
4355 length=4;
4356 } else {
4357 length=3;
4358 }
4359 break;
4360 default:
4361 /* must not occur */
4362 /*
4363 * To avoid compiler warnings that value & length may be
4364 * used without having been initialized, we set them here.
4365 * In reality, this is unreachable code.
4366 * Not having a default branch also causes warnings with
4367 * some compilers.
4368 */
4369 value=0;
4370 length=0;
4371 break;
4372 }
4373 /* output the value */
4374 } else {
4375 /*
4376 * This also tests if the codepage maps single surrogates.
4377 * If it does, then surrogates are not paired but mapped separately.
4378 * Note that in this case unmatched surrogates are not detected.
4379 */
4388f060
A
4380 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
4381 if(U16_IS_SURROGATE_LEAD(c)) {
374ca955 4382getTrail:
46f4442e
A
4383 if(source<sourceLimit) {
4384 /* test the following code unit */
4385 UChar trail=*source;
4388f060 4386 if(U16_IS_TRAIL(trail)) {
46f4442e
A
4387 ++source;
4388 ++nextSourceIndex;
4388f060 4389 c=U16_GET_SUPPLEMENTARY(c, trail);
46f4442e
A
4390 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4391 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4392 cnv->fromUnicodeStatus=prevLength; /* save the old state */
4393 /* callback(unassigned) */
4394 goto unassigned;
4395 }
4396 /* convert this supplementary code point */
4397 /* exit this condition tree */
4398 } else {
4399 /* this is an unmatched lead code unit (1st surrogate) */
4400 /* callback(illegal) */
4401 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
4402 break;
374ca955 4403 }
374ca955 4404 } else {
46f4442e 4405 /* no more input */
374ca955
A
4406 break;
4407 }
4408 } else {
46f4442e
A
4409 /* this is an unmatched trail code unit (2nd surrogate) */
4410 /* callback(illegal) */
4411 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
374ca955
A
4412 break;
4413 }
374ca955 4414 }
374ca955 4415
46f4442e 4416 /* convert the Unicode code point in c into codepage bytes */
374ca955 4417
374ca955 4418 /*
46f4442e
A
4419 * The basic lookup is a triple-stage compact array (trie) lookup.
4420 * For details see the beginning of this file.
4421 *
4422 * Single-byte codepages are handled with a different data structure
4423 * by _MBCSSingle... functions.
4424 *
4425 * The result consists of a 32-bit value from stage 2 and
4426 * a pointer to as many bytes as are stored per character.
4427 * The pointer points to the character's bytes in stage 3.
4428 * Bits 15..0 of the stage 2 entry contain the stage 3 index
4429 * for that pointer, while bits 31..16 are flags for which of
4430 * the 16 characters in the block are roundtrip-assigned.
4431 *
4432 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
4433 * respectively as uint32_t, in the platform encoding.
4434 * For 3-byte codepages, the bytes are always stored in big-endian order.
4435 *
4436 * For EUC encodings that use only either 0x8e or 0x8f as the first
4437 * byte of their longest byte sequences, the first two bytes in
4438 * this third stage indicate with their 7th bits whether these bytes
4439 * are to be written directly or actually need to be preceded by
4440 * one of the two Single-Shift codes. With this, the third stage
4441 * stores one byte fewer per character than the actual maximum length of
4442 * EUC byte sequences.
4443 *
4444 * Other than that, leading zero bytes are removed and the other
4445 * bytes output. A single zero byte may be output if the "assigned"
4446 * bit in stage 2 was on.
4447 * The data structure does not support zero byte output as a fallback,
4448 * and also does not allow output of leading zeros.
374ca955 4449 */
46f4442e
A
4450 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
4451
4452 /* get the bytes and the length for the output */
4453 switch(outputType) {
4454 case MBCS_OUTPUT_2:
4455 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4456 if(value<=0xff) {
4457 length=1;
4458 } else {
4459 length=2;
4460 }
4461 break;
4462 case MBCS_OUTPUT_2_SISO:
4463 /* 1/2-byte stateful with Shift-In/Shift-Out */
4464 /*
4465 * Save the old state in the converter object
4466 * right here, then change the local prevLength state variable if necessary.
4467 * Then, if this character turns out to be unassigned or a fallback that
4468 * is not taken, the callback code must not save the new state in the converter
4469 * because the new state is for a character that is not output.
4470 * However, the callback must still restore the state from the converter
4471 * in case the callback function changed it for its output.
4472 */
4473 cnv->fromUnicodeStatus=prevLength; /* save the old state */
4474 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4475 if(value<=0xff) {
4476 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
4477 /* no mapping, leave value==0 */
4478 length=0;
4479 } else if(prevLength<=1) {
4480 length=1;
4481 } else {
4482 /* change from double-byte mode to single-byte */
51004dcb
A
4483 if (siLength == 1) {
4484 value|=(uint32_t)siBytes[0]<<8;
729e4ab9 4485 length = 2;
51004dcb
A
4486 } else if (siLength == 2) {
4487 value|=(uint32_t)siBytes[1]<<8;
4488 value|=(uint32_t)siBytes[0]<<16;
729e4ab9
A
4489 length = 3;
4490 }
46f4442e
A
4491 prevLength=1;
4492 }
4493 } else {
4494 if(prevLength==2) {
4495 length=2;
4496 } else {
4497 /* change from single-byte mode to double-byte */
51004dcb
A
4498 if (soLength == 1) {
4499 value|=(uint32_t)soBytes[0]<<16;
729e4ab9 4500 length = 3;
51004dcb
A
4501 } else if (soLength == 2) {
4502 value|=(uint32_t)soBytes[1]<<16;
4503 value|=(uint32_t)soBytes[0]<<24;
729e4ab9
A
4504 length = 4;
4505 }
46f4442e
A
4506 prevLength=2;
4507 }
4508 }
4509 break;
4510 case MBCS_OUTPUT_DBCS_ONLY:
4511 /* table with single-byte results, but only DBCS mappings used */
4512 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4513 if(value<=0xff) {
4514 /* no mapping or SBCS result, not taken for DBCS-only */
4515 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
374ca955 4516 length=0;
46f4442e
A
4517 } else {
4518 length=2;
4519 }
4520 break;
4521 case MBCS_OUTPUT_3:
4522 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4523 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4524 if(value<=0xff) {
4525 length=1;
4526 } else if(value<=0xffff) {
4527 length=2;
4528 } else {
4529 length=3;
4530 }
4531 break;
4532 case MBCS_OUTPUT_4:
4533 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
4534 if(value<=0xff) {
4535 length=1;
4536 } else if(value<=0xffff) {
4537 length=2;
4538 } else if(value<=0xffffff) {
4539 length=3;
4540 } else {
4541 length=4;
4542 }
4543 break;
4544 case MBCS_OUTPUT_3_EUC:
4545 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
4546 /* EUC 16-bit fixed-length representation */
4547 if(value<=0xff) {
374ca955 4548 length=1;
46f4442e
A
4549 } else if((value&0x8000)==0) {
4550 value|=0x8e8000;
4551 length=3;
4552 } else if((value&0x80)==0) {
4553 value|=0x8f0080;
4554 length=3;
374ca955 4555 } else {
374ca955 4556 length=2;
374ca955 4557 }
46f4442e
A
4558 break;
4559 case MBCS_OUTPUT_4_EUC:
4560 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
4561 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4562 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4563 if(value<=0xff) {
4564 length=1;
4565 } else if(value<=0xffff) {
374ca955 4566 length=2;
46f4442e
A
4567 } else if((value&0x800000)==0) {
4568 value|=0x8e800000;
4569 length=4;
4570 } else if((value&0x8000)==0) {
4571 value|=0x8f008000;
4572 length=4;
374ca955 4573 } else {
374ca955 4574 length=3;
374ca955 4575 }
46f4442e
A
4576 break;
4577 default:
4578 /* must not occur */
4579 /*
4580 * To avoid compiler warnings that value & length may be
4581 * used without having been initialized, we set them here.
4582 * In reality, this is unreachable code.
4583 * Not having a default branch also causes warnings with
4584 * some compilers.
4585 */
374ca955
A
4586 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4587 length=0;
46f4442e 4588 break;
374ca955 4589 }
46f4442e
A
4590
4591 /* is this code point assigned, or do we use fallbacks? */
4592 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
4593 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
4594 ) {
4595 /*
4596 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4597 * There is no way with this data structure for fallback output
4598 * to be a zero byte.
4599 */
4600
4601unassigned:
4602 /* try an extension mapping */
4603 pArgs->source=source;
4604 c=_extFromU(cnv, cnv->sharedData,
4605 c, &source, sourceLimit,
4606 &target, target+targetCapacity,
4607 &offsets, sourceIndex,
4608 pArgs->flush,
4609 pErrorCode);
4610 nextSourceIndex+=(int32_t)(source-pArgs->source);
4611 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
4612
4613 if(U_FAILURE(*pErrorCode)) {
4614 /* not mappable or buffer overflow */
4615 break;
4616 } else {
4617 /* a mapping was written to the target, continue */
4618
4619 /* recalculate the targetCapacity after an extension mapping */
4620 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);
4621
4622 /* normal end of conversion: prepare for a new character */
4623 if(offsets!=NULL) {
4624 prevSourceIndex=sourceIndex;
4625 sourceIndex=nextSourceIndex;
4626 }
4627 continue;
4628 }
374ca955 4629 }
46f4442e
A
4630 }
4631
4632 /* write the output character bytes from value and length */
4633 /* from the first if in the loop we know that targetCapacity>0 */
4634 if(length<=targetCapacity) {
4635 if(offsets==NULL) {
4636 switch(length) {
4637 /* each branch falls through to the next one */
4638 case 4:
4639 *target++=(uint8_t)(value>>24);
2ca993e8
A
4640 U_FALLTHROUGH;
4641 case 3:
46f4442e 4642 *target++=(uint8_t)(value>>16);
2ca993e8
A
4643 U_FALLTHROUGH;
4644 case 2:
46f4442e 4645 *target++=(uint8_t)(value>>8);
2ca993e8
A
4646 U_FALLTHROUGH;
4647 case 1:
46f4442e 4648 *target++=(uint8_t)value;
2ca993e8 4649 U_FALLTHROUGH;
46f4442e
A
4650 default:
4651 /* will never occur */
4652 break;
4653 }
374ca955
A
4654 } else {
4655 switch(length) {
4656 /* each branch falls through to the next one */
4657 case 4:
4658 *target++=(uint8_t)(value>>24);
4659 *offsets++=sourceIndex;
2ca993e8
A
4660 U_FALLTHROUGH;
4661 case 3:
374ca955
A
4662 *target++=(uint8_t)(value>>16);
4663 *offsets++=sourceIndex;
2ca993e8
A
4664 U_FALLTHROUGH;
4665 case 2:
374ca955
A
4666 *target++=(uint8_t)(value>>8);
4667 *offsets++=sourceIndex;
2ca993e8
A
4668 U_FALLTHROUGH;
4669 case 1:
374ca955
A
4670 *target++=(uint8_t)value;
4671 *offsets++=sourceIndex;
2ca993e8 4672 U_FALLTHROUGH;
374ca955
A
4673 default:
4674 /* will never occur */
4675 break;
4676 }
4677 }
4678 targetCapacity-=length;
4679 } else {
4680 uint8_t *charErrorBuffer;
4681
4682 /*
4683 * We actually do this backwards here:
4684 * In order to save an intermediate variable, we output
4685 * first to the overflow buffer what does not fit into the
4686 * regular target.
4687 */
4688 /* we know that 1<=targetCapacity<length<=4 */
4689 length-=targetCapacity;
4690 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
4691 switch(length) {
4692 /* each branch falls through to the next one */
4693 case 3:
4694 *charErrorBuffer++=(uint8_t)(value>>16);
2ca993e8
A
4695 U_FALLTHROUGH;
4696 case 2:
374ca955 4697 *charErrorBuffer++=(uint8_t)(value>>8);
2ca993e8
A
4698 U_FALLTHROUGH;
4699 case 1:
374ca955 4700 *charErrorBuffer=(uint8_t)value;
2ca993e8 4701 U_FALLTHROUGH;
374ca955
A
4702 default:
4703 /* will never occur */
4704 break;
4705 }
4706 cnv->charErrorBufferLength=(int8_t)length;
4707
4708 /* now output what fits into the regular target */
4709 value>>=8*length; /* length was reduced by targetCapacity */
4710 switch(targetCapacity) {
4711 /* each branch falls through to the next one */
4712 case 3:
4713 *target++=(uint8_t)(value>>16);
4714 if(offsets!=NULL) {
4715 *offsets++=sourceIndex;
4716 }
2ca993e8
A
4717 U_FALLTHROUGH;
4718 case 2:
374ca955
A
4719 *target++=(uint8_t)(value>>8);
4720 if(offsets!=NULL) {
4721 *offsets++=sourceIndex;
4722 }
2ca993e8
A
4723 U_FALLTHROUGH;
4724 case 1:
374ca955
A
4725 *target++=(uint8_t)value;
4726 if(offsets!=NULL) {
4727 *offsets++=sourceIndex;
4728 }
2ca993e8 4729 U_FALLTHROUGH;
374ca955
A
4730 default:
4731 /* will never occur */
4732 break;
4733 }
b75a7d8f 4734
374ca955
A
4735 /* target overflow */
4736 targetCapacity=0;
4737 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4738 c=0;
4739 break;
b75a7d8f 4740 }
b75a7d8f 4741
374ca955 4742 /* normal end of conversion: prepare for a new character */
b75a7d8f 4743 c=0;
374ca955
A
4744 if(offsets!=NULL) {
4745 prevSourceIndex=sourceIndex;
4746 sourceIndex=nextSourceIndex;
4747 }
4748 continue;
4749 } else {
b75a7d8f
A
4750 /* target is full */
4751 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4752 break;
4753 }
b75a7d8f
A
4754 }
4755
374ca955
A
4756 /*
4757 * the end of the input stream and detection of truncated input
4758 * are handled by the framework, but for EBCDIC_STATEFUL conversion
4759 * we need to emit an SI at the very end
4760 *
4761 * conditions:
4762 * successful
4763 * EBCDIC_STATEFUL in DBCS mode
4764 * end of input and no truncated input
4765 */
4766 if( U_SUCCESS(*pErrorCode) &&
4767 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
4768 pArgs->flush && source>=sourceLimit && c==0
4769 ) {
4770 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
4771 if(targetCapacity>0) {
51004dcb
A
4772 *target++=(uint8_t)siBytes[0];
4773 if (siLength == 2) {
729e4ab9 4774 if (targetCapacity<2) {
51004dcb 4775 cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
729e4ab9
A
4776 cnv->charErrorBufferLength=1;
4777 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
4778 } else {
51004dcb 4779 *target++=(uint8_t)siBytes[1];
729e4ab9
A
4780 }
4781 }
374ca955
A
4782 if(offsets!=NULL) {
4783 /* set the last source character's index (sourceIndex points at sourceLimit now) */
4784 *offsets++=prevSourceIndex;
4785 }
4786 } else {
4787 /* target is full */
51004dcb
A
4788 cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
4789 if (siLength == 2) {
4790 cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
729e4ab9 4791 }
51004dcb 4792 cnv->charErrorBufferLength=siLength;
374ca955 4793 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
b75a7d8f 4794 }
374ca955 4795 prevLength=1; /* we switched into SBCS */
b75a7d8f
A
4796 }
4797
374ca955
A
4798 /* set the converter state back into UConverter */
4799 cnv->fromUChar32=c;
4800 cnv->fromUnicodeStatus=prevLength;
b75a7d8f
A
4801
4802 /* write back the updated pointers */
4803 pArgs->source=source;
4804 pArgs->target=(char *)target;
4805 pArgs->offsets=offsets;
4806}
4807
4808/*
4809 * This is another simple conversion function for internal use by other
4810 * conversion implementations.
4811 * It does not use the converter state nor call callbacks.
4812 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
374ca955 4813 * It handles conversion extensions but not GB 18030.
b75a7d8f
A
4814 *
4815 * It converts one single Unicode code point into codepage bytes, encoded
4816 * as one 32-bit value. The function returns the number of bytes in *pValue:
4817 * 1..4 the number of bytes in *pValue
4818 * 0 unassigned (*pValue undefined)
4819 * -1 illegal (currently not used, *pValue undefined)
4820 *
4821 * *pValue will contain the resulting bytes with the last byte in bits 7..0,
4822 * the second to last byte in bits 15..8, etc.
4823 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
4824 */
4825U_CFUNC int32_t
374ca955 4826ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,
b75a7d8f
A
4827 UChar32 c, uint32_t *pValue,
4828 UBool useFallback) {
374ca955
A
4829 const int32_t *cx;
4830 const uint16_t *table;
4831#if 0
4832/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
b75a7d8f 4833 const uint8_t *p;
374ca955 4834#endif
b75a7d8f
A
4835 uint32_t stage2Entry;
4836 uint32_t value;
4837 int32_t length;
4838
4839 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
374ca955
A
4840 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
4841 table=sharedData->mbcs.fromUnicodeTable;
b75a7d8f 4842
374ca955
A
4843 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
4844 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {
4845 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
4846 /* is this code point assigned, or do we use fallbacks? */
4847 if(useFallback ? value>=0x800 : value>=0xc00) {
4848 *pValue=value&0xff;
4849 return 1;
4850 }
4851 } else /* outputType!=MBCS_OUTPUT_1 */ {
4852 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
b75a7d8f 4853
374ca955
A
4854 /* get the bytes and the length for the output */
4855 switch(sharedData->mbcs.outputType) {
4856 case MBCS_OUTPUT_2:
4857 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4858 if(value<=0xff) {
4859 length=1;
4860 } else {
4861 length=2;
4862 }
4863 break;
4864#if 0
4865/* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
4866 case MBCS_OUTPUT_DBCS_ONLY:
4867 /* table with single-byte results, but only DBCS mappings used */
4868 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4869 if(value<=0xff) {
4870 /* no mapping or SBCS result, not taken for DBCS-only */
4871 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
4872 length=0;
4873 } else {
4874 length=2;
4875 }
4876 break;
4877 case MBCS_OUTPUT_3:
4878 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4879 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4880 if(value<=0xff) {
4881 length=1;
4882 } else if(value<=0xffff) {
4883 length=2;
4884 } else {
4885 length=3;
4886 }
4887 break;
4888 case MBCS_OUTPUT_4:
4889 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4890 if(value<=0xff) {
4891 length=1;
4892 } else if(value<=0xffff) {
4893 length=2;
4894 } else if(value<=0xffffff) {
4895 length=3;
4896 } else {
4897 length=4;
4898 }
4899 break;
4900 case MBCS_OUTPUT_3_EUC:
4901 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4902 /* EUC 16-bit fixed-length representation */
4903 if(value<=0xff) {
4904 length=1;
4905 } else if((value&0x8000)==0) {
4906 value|=0x8e8000;
4907 length=3;
4908 } else if((value&0x80)==0) {
4909 value|=0x8f0080;
4910 length=3;
4911 } else {
4912 length=2;
4913 }
4914 break;
4915 case MBCS_OUTPUT_4_EUC:
4916 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
4917 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
4918 /* EUC 16-bit fixed-length representation applied to the first two bytes */
4919 if(value<=0xff) {
4920 length=1;
4921 } else if(value<=0xffff) {
4922 length=2;
4923 } else if((value&0x800000)==0) {
4924 value|=0x8e800000;
4925 length=4;
4926 } else if((value&0x8000)==0) {
4927 value|=0x8f008000;
4928 length=4;
4929 } else {
4930 length=3;
4931 }
4932 break;
4933#endif
4934 default:
4935 /* must not occur */
4936 return -1;
4937 }
b75a7d8f 4938
374ca955
A
4939 /* is this code point assigned, or do we use fallbacks? */
4940 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
4941 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)
4942 ) {
4943 /*
4944 * We allow a 0 byte output if the "assigned" bit is set for this entry.
4945 * There is no way with this data structure for fallback output
4946 * to be a zero byte.
4947 */
4948 /* assigned */
4949 *pValue=value;
4950 return length;
4951 }
b75a7d8f 4952 }
b75a7d8f
A
4953 }
4954
374ca955
A
4955 cx=sharedData->mbcs.extIndexes;
4956 if(cx!=NULL) {
46f4442e
A
4957 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
4958 return length>=0 ? length : -length; /* return abs(length); */
b75a7d8f 4959 }
374ca955
A
4960
4961 /* unassigned */
4962 return 0;
b75a7d8f
A
4963}
4964
4965
#if 0
/*
 * This function has been moved to ucnv2022.c for inlining.
 * This implementation is here only for documentation purposes
 */

/**
 * Variant of ucnv_MBCSFromUChar32() specialized for single-byte codepages.
 * The EBCDIC swaplfnl option (set in UConverter) is not applied, and
 * conversion extensions (_extFromU()) are not consulted.
 *
 * @return the codepage byte for c, or -1 if c is unassigned
 */
U_CFUNC int32_t
ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
                           UChar32 c,
                           UBool useFallback) {
    const uint16_t *fromUTable;
    int32_t result;
    int32_t threshold;

    /* BMP-only codepages have no stage 1 entries for supplementary code points */
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        return -1;
    }

    /* same table lookup as in _MBCSFromUnicodeWithOffsets */
    fromUTable=sharedData->mbcs.fromUnicodeTable;
    result=MBCS_SINGLE_RESULT_FROM_U(fromUTable, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);

    /* results at or above the threshold count as assigned; fallbacks lower the threshold */
    threshold= useFallback ? 0x800 : 0xc00;
    if(result<threshold) {
        return -1;
    }
    return result&0xff;
}
#endif
5004
46f4442e
A
5005/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
5006
5007/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
5008static const UChar32
5009utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
5010
5011/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
5012static const UChar32
5013utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
5014
/*
 * Converts UTF-8 bytes from pToUArgs->source directly into single-byte
 * codepage output at pFromUArgs->target, without producing UTF-16 in between.
 *
 * pFromUArgs supplies the target converter (cnv) and output buffer,
 * pToUArgs supplies the UTF-8 converter (utf8) and input buffer.
 * State of a UTF-8 sequence that is incomplete at the end of the input is kept
 * in the UTF-8 converter: toUBytes[], toULength, toUnicodeStatus (the raw
 * accumulated bytes) and mode (the expected total sequence length).
 *
 * Outcomes reported through *pErrorCode:
 * - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains
 * - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 sequence; its bytes are left in
 *   utf8->toUBytes[] with utf8->toULength set
 * - whatever failure _extFromU() reports for an unmappable code point; the
 *   value it returns is then stored in cnv->fromUChar32
 * - U_USING_DEFAULT_WARNING when _extFromU() found a partial match
 *   (cnv->preFromUFirstCP>=0); per the comment at that spot, the caller then
 *   reverts to UTF-8->UTF-16->charset conversion
 *
 * On every exit path the updated source and target pointers are written back
 * to pToUArgs->source and pFromUArgs->target.
 */
5015static void
5016ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5017 UConverterToUnicodeArgs *pToUArgs,
5018 UErrorCode *pErrorCode) {
5019 UConverter *utf8, *cnv;
5020 const uint8_t *source, *sourceLimit;
5021 uint8_t *target;
5022 int32_t targetCapacity;
5023
5024 const uint16_t *table, *sbcsIndex;
5025 const uint16_t *results;
5026
5027 int8_t oldToULength, toULength, toULimit;
5028
5029 UChar32 c;
5030 uint8_t b, t1, t2;
5031
5032 uint32_t asciiRoundtrips;
5033 uint16_t value, minValue;
5034 UBool hasSupplementary;
5035
5036 /* set up the local pointers */
5037 utf8=pToUArgs->converter;
5038 cnv=pFromUArgs->converter;
5039 source=(uint8_t *)pToUArgs->source;
5040 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5041 target=(uint8_t *)pFromUArgs->target;
5042 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5043
5044 table=cnv->sharedData->mbcs.fromUnicodeTable;
5045 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
/* the swaplfnl option selects an alternate result table */
5046 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
5047 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5048 } else {
5049 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5050 }
5051 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5052
/* a table result below minValue is treated as unassigned in the loop below */
5053 if(cnv->useFallback) {
5054 /* use all roundtrip and fallback results */
5055 minValue=0x800;
5056 } else {
5057 /* use only roundtrips and fallbacks from private-use characters */
5058 minValue=0xc00;
5059 }
5060 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5061
5062 /* get the converter state from the UTF-8 UConverter */
5063 c=(UChar32)utf8->toUnicodeStatus;
5064 if(c!=0) {
5065 toULength=oldToULength=utf8->toULength;
5066 toULimit=(int8_t)utf8->mode;
5067 } else {
5068 toULength=oldToULength=toULimit=0;
5069 }
5070
5071 /*
5072 * Make sure that the last byte sequence before sourceLimit is complete
5073 * or runs into a lead byte.
5074 * Do not go back into the bytes that will be read for finishing a partial
5075 * sequence from the previous buffer.
5076 * In the conversion loop compare source with sourceLimit only once
5077 * per multi-byte character.
5078 */
5079 {
5080 int32_t i, length;
5081
5082 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
/* look back over at most 3 trailing bytes for the last non-trail byte */
5083 for(i=0; i<3 && i<length;) {
5084 b=*(sourceLimit-i-1);
5085 if(U8_IS_TRAIL(b)) {
5086 ++i;
5087 } else {
51004dcb 5088 if(i<U8_COUNT_TRAIL_BYTES(b)) {
46f4442e
A
5089 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
5090 sourceLimit-=i+1;
5091 }
5092 break;
5093 }
5094 }
5095 }
5096
/* resume a sequence left incomplete by the previous call; toULength and toULimit were loaded from utf8 above */
5097 if(c!=0 && targetCapacity>0) {
5098 utf8->toUnicodeStatus=0;
5099 utf8->toULength=0;
5100 goto moreBytes;
5101 /*
5102 * Note: We could avoid the goto by duplicating some of the moreBytes
5103 * code, but only up to the point of collecting a complete UTF-8
5104 * sequence; then recurse for the toUBytes[toULength]
5105 * and then continue with normal conversion.
5106 *
5107 * If so, move this code to just after initializing the minimum
5108 * set of local variables for reading the UTF-8 input
5109 * (utf8, source, target, limits but not cnv, table, minValue, etc.).
5110 *
5111 * Potential advantages:
5112 * - avoid the goto
5113 * - oldToULength could become a local variable in just those code blocks
5114 * that deal with buffer boundaries
5115 * - possibly faster if the goto prevents some compiler optimizations
5116 * (this would need measuring to confirm)
5117 * Disadvantage:
5118 * - code duplication
5119 */
5120 }
5121
5122 /* conversion loop */
5123 while(source<sourceLimit) {
5124 if(targetCapacity>0) {
5125 b=*source++;
/* (int8_t)b>=0 holds exactly for b<=0x7f, i.e. a single-byte UTF-8 sequence */
5126 if((int8_t)b>=0) {
5127 /* convert ASCII */
5128 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5129 *target++=(uint8_t)b;
5130 --targetCapacity;
5131 continue;
5132 } else {
5133 c=b;
5134 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
5135 }
5136 } else {
/* b is 0x80..0xdf in this branch; only 0xc2..0xdf with one trail byte is handled inline, anything else sets c=-1 */
5137 if(b<0xe0) {
5138 if( /* handle U+0080..U+07FF inline */
5139 b>=0xc2 &&
5140 (t1=(uint8_t)(*source-0x80)) <= 0x3f
5141 ) {
5142 c=b&0x1f;
5143 ++source;
5144 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
5145 if(value>=minValue) {
5146 *target++=(uint8_t)value;
5147 --targetCapacity;
5148 continue;
5149 } else {
/* unassigned: finish assembling the code point for the extension lookup after this if/else chain */
5150 c=(c<<6)|t1;
5151 }
5152 } else {
5153 c=-1;
5154 }
5155 } else if(b==0xe0) {
/* t1>=0x20 keeps the decoded value (t1<<6)|t2 at or above 0x800 */
5156 if( /* handle U+0800..U+0FFF inline */
5157 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
5158 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5159 ) {
5160 c=t1;
5161 source+=2;
5162 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
5163 if(value>=minValue) {
5164 *target++=(uint8_t)value;
5165 --targetCapacity;
5166 continue;
5167 } else {
5168 c=(c<<6)|t2;
5169 }
5170 } else {
5171 c=-1;
5172 }
5173 } else {
/* lead bytes above 0xe0 always take the general path below */
5174 c=-1;
5175 }
5176
5177 if(c<0) {
5178 /* handle "complicated" and error cases, and continuing partial characters */
5179 oldToULength=0;
5180 toULength=1;
51004dcb 5181 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
5182 c=b;
5183moreBytes:
5184 while(toULength<toULimit) {
729e4ab9
A
5185 /*
5186 * The sourceLimit may have been adjusted before the conversion loop
5187 * to stop before a truncated sequence.
5188 * Here we need to use the real limit in case we have two truncated
5189 * sequences at the end.
5190 * See ticket #7492.
5191 */
5192 if(source<(uint8_t *)pToUArgs->sourceLimit) {
46f4442e
A
5193 b=*source;
5194 if(U8_IS_TRAIL(b)) {
5195 ++source;
5196 ++toULength;
/* accumulate the raw byte; lead/trail marker bits are removed later by subtracting utf8_offsets[] */
5197 c=(c<<6)+b;
5198 } else {
5199 break; /* sequence too short, stop with toULength<toULimit */
5200 }
5201 } else {
5202 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/* rewind to the first byte that was read from this buffer; earlier bytes are already in toUBytes[] */
5203 source-=(toULength-oldToULength);
5204 while(oldToULength<toULength) {
5205 utf8->toUBytes[oldToULength++]=*source++;
5206 }
5207 utf8->toUnicodeStatus=c;
5208 utf8->toULength=toULength;
5209 utf8->mode=toULimit;
5210 pToUArgs->source=(char *)source;
5211 pFromUArgs->target=(char *)target;
5212 return;
5213 }
5214 }
5215
/* note: the conditions below modify c (c-=utf8_offsets[...]) as a side effect of being evaluated */
5216 if( toULength==toULimit && /* consumed all trail bytes */
5217 (toULength==3 || toULength==2) && /* BMP */
5218 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5219 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
5220 ) {
5221 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
5222 } else if(
5223 toULength==toULimit && toULength==4 &&
5224 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5225 ) {
5226 /* supplementary code point */
5227 if(!hasSupplementary) {
5228 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
/* 0 is below minValue, so c is treated as unassigned below */
5229 value=0;
5230 } else {
5231 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
5232 }
5233 } else {
5234 /* error handling: illegal UTF-8 byte sequence */
/* hand the bytes of the illegal sequence to the UTF-8 converter in toUBytes[]/toULength */
5235 source-=(toULength-oldToULength);
5236 while(oldToULength<toULength) {
5237 utf8->toUBytes[oldToULength++]=*source++;
5238 }
5239 utf8->toULength=toULength;
5240 pToUArgs->source=(char *)source;
5241 pFromUArgs->target=(char *)target;
5242 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5243 return;
5244 }
5245 }
5246 }
5247
5248 if(value>=minValue) {
5249 /* output the mapping for c */
5250 *target++=(uint8_t)value;
5251 --targetCapacity;
5252 } else {
5253 /* value<minValue means c is unassigned (unmappable) */
5254 /*
5255 * Try an extension mapping.
5256 * Pass in no source because we don't have UTF-16 input.
5257 * If we have a partial match on c, we will return and revert
5258 * to UTF-8->UTF-16->charset conversion.
5259 */
5260 static const UChar nul=0;
5261 const UChar *noSource=&nul;
5262 c=_extFromU(cnv, cnv->sharedData,
5263 c, &noSource, noSource,
5264 &target, target+targetCapacity,
5265 NULL, -1,
5266 pFromUArgs->flush,
5267 pErrorCode);
5268
5269 if(U_FAILURE(*pErrorCode)) {
5270 /* not mappable or buffer overflow */
5271 cnv->fromUChar32=c;
5272 break;
5273 } else if(cnv->preFromUFirstCP>=0) {
5274 /*
5275 * Partial match, return and revert to pivoting.
5276 * In normal from-UTF-16 conversion, we would just continue
5277 * but then exit the loop because the extension match would
5278 * have consumed the source.
5279 */
51004dcb 5280 *pErrorCode=U_USING_DEFAULT_WARNING;
46f4442e
A
5281 break;
5282 } else {
5283 /* a mapping was written to the target, continue */
5284
5285 /* recalculate the targetCapacity after an extension mapping */
5286 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5287 }
5288 }
5289 } else {
5290 /* target is full */
5291 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5292 break;
5293 }
5294 }
5295
5296 /*
5297 * The sourceLimit may have been adjusted before the conversion loop
5298 * to stop before a truncated sequence.
5299 * If so, then collect the truncated sequence now.
5300 */
51004dcb
A
/* this condition also resets sourceLimit to the caller's real limit */
5301 if(U_SUCCESS(*pErrorCode) &&
5302 cnv->preFromUFirstCP<0 &&
5303 source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
46f4442e
A
5304 c=utf8->toUBytes[0]=b=*source++;
5305 toULength=1;
51004dcb 5306 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
/* same raw accumulation as at moreBytes; no offset is subtracted because the sequence is incomplete */
5307 while(source<sourceLimit) {
5308 utf8->toUBytes[toULength++]=b=*source++;
5309 c=(c<<6)+b;
5310 }
5311 utf8->toUnicodeStatus=c;
5312 utf8->toULength=toULength;
5313 utf8->mode=toULimit;
5314 }
5315
5316 /* write back the updated pointers */
5317 pToUArgs->source=(char *)source;
5318 pFromUArgs->target=(char *)target;
5319}
5320
/*
 * Converts UTF-8 bytes from pToUArgs->source directly into codepage output
 * at pFromUArgs->target, without producing UTF-16 in between.
 * Each mapped code point yields a 16-bit table result that is written as one
 * byte if it is <=0xff and as two bytes (high byte first) otherwise.
 *
 * pFromUArgs supplies the target converter (cnv) and output buffer,
 * pToUArgs supplies the UTF-8 converter (utf8) and input buffer.
 * State of a UTF-8 sequence that is incomplete at the end of the input is kept
 * in the UTF-8 converter: toUBytes[], toULength, toUnicodeStatus (the raw
 * accumulated bytes) and mode (the expected total sequence length).
 *
 * Outcomes reported through *pErrorCode:
 * - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains, or
 *   when only the first byte of a two-byte result fits; in the latter case
 *   the second byte is placed in cnv->charErrorBuffer
 * - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 sequence; its bytes are left in
 *   utf8->toUBytes[] with utf8->toULength set
 * - whatever failure _extFromU() reports for an unmappable code point; the
 *   value it returns is then stored in cnv->fromUChar32
 * - U_USING_DEFAULT_WARNING when _extFromU() found a partial match
 *   (cnv->preFromUFirstCP>=0); per the comment at that spot, the caller then
 *   reverts to UTF-8->UTF-16->charset conversion
 *
 * On every exit path the updated source and target pointers are written back
 * to pToUArgs->source and pFromUArgs->target.
 */
5321static void
5322ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
5323 UConverterToUnicodeArgs *pToUArgs,
5324 UErrorCode *pErrorCode) {
5325 UConverter *utf8, *cnv;
5326 const uint8_t *source, *sourceLimit;
5327 uint8_t *target;
5328 int32_t targetCapacity;
5329
5330 const uint16_t *table, *mbcsIndex;
5331 const uint16_t *results;
5332
5333 int8_t oldToULength, toULength, toULimit;
5334
5335 UChar32 c;
5336 uint8_t b, t1, t2;
5337
5338 uint32_t stage2Entry;
5339 uint32_t asciiRoundtrips;
4388f060 5340 uint16_t value;
46f4442e
A
5341 UBool hasSupplementary;
5342
5343 /* set up the local pointers */
5344 utf8=pToUArgs->converter;
5345 cnv=pFromUArgs->converter;
5346 source=(uint8_t *)pToUArgs->source;
5347 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
5348 target=(uint8_t *)pFromUArgs->target;
5349 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
5350
5351 table=cnv->sharedData->mbcs.fromUnicodeTable;
5352 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
/* the swaplfnl option selects an alternate result table */
5353 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
5354 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
5355 } else {
5356 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
5357 }
5358 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
5359
46f4442e
A
5360 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
5361
5362 /* get the converter state from the UTF-8 UConverter */
5363 c=(UChar32)utf8->toUnicodeStatus;
5364 if(c!=0) {
5365 toULength=oldToULength=utf8->toULength;
5366 toULimit=(int8_t)utf8->mode;
5367 } else {
5368 toULength=oldToULength=toULimit=0;
5369 }
5370
5371 /*
5372 * Make sure that the last byte sequence before sourceLimit is complete
5373 * or runs into a lead byte.
5374 * Do not go back into the bytes that will be read for finishing a partial
5375 * sequence from the previous buffer.
5376 * In the conversion loop compare source with sourceLimit only once
5377 * per multi-byte character.
5378 */
5379 {
5380 int32_t i, length;
5381
5382 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
/* look back over at most 3 trailing bytes for the last non-trail byte */
5383 for(i=0; i<3 && i<length;) {
5384 b=*(sourceLimit-i-1);
5385 if(U8_IS_TRAIL(b)) {
5386 ++i;
5387 } else {
51004dcb 5388 if(i<U8_COUNT_TRAIL_BYTES(b)) {
46f4442e
A
5389 /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
5390 sourceLimit-=i+1;
5391 }
5392 break;
5393 }
5394 }
5395 }
5396
/* resume a sequence left incomplete by the previous call; toULength and toULimit were loaded from utf8 above */
5397 if(c!=0 && targetCapacity>0) {
5398 utf8->toUnicodeStatus=0;
5399 utf8->toULength=0;
5400 goto moreBytes;
5401 /* See note in ucnv_SBCSFromUTF8() about this goto. */
5402 }
5403
5404 /* conversion loop */
5405 while(source<sourceLimit) {
5406 if(targetCapacity>0) {
5407 b=*source++;
/* (int8_t)b>=0 holds exactly for b<=0x7f, i.e. a single-byte UTF-8 sequence */
5408 if((int8_t)b>=0) {
5409 /* convert ASCII */
5410 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
5411 *target++=b;
5412 --targetCapacity;
5413 continue;
5414 } else {
5415 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
/* in the inline paths a zero result is what sends c to the extension lookup at unassigned */
5416 if(value==0) {
5417 c=b;
5418 goto unassigned;
5419 }
5420 }
5421 } else {
/*
 * Three-byte lead bytes other than 0xe0: the condition below accepts
 * b<0xed with any trail byte value t1<=0x3f, and b==0xed only with
 * t1<=0x1f; lead bytes above 0xed fail both alternatives and set c=-1.
 */
5422 if(b>0xe0) {
5423 if( /* handle U+1000..U+D7FF inline */
5424 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
5425 (b==0xed && (t1 <= 0x1f))) &&
5426 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
5427 ) {
5428 c=((b&0xf)<<6)|t1;
5429 source+=2;
5430 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
5431 if(value==0) {
5432 c=(c<<6)|t2;
5433 goto unassigned;
5434 }
5435 } else {
5436 c=-1;
5437 }
5438 } else if(b<0xe0) {
/* b is 0x80..0xdf in this branch; only 0xc2..0xdf with one trail byte is handled inline */
5439 if( /* handle U+0080..U+07FF inline */
5440 b>=0xc2 &&
5441 (t1=(uint8_t)(*source-0x80)) <= 0x3f
5442 ) {
5443 c=b&0x1f;
5444 ++source;
5445 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
5446 if(value==0) {
5447 c=(c<<6)|t1;
5448 goto unassigned;
5449 }
5450 } else {
5451 c=-1;
5452 }
5453 } else {
/* b==0xe0 always takes the general path below */
5454 c=-1;
5455 }
5456
5457 if(c<0) {
5458 /* handle "complicated" and error cases, and continuing partial characters */
5459 oldToULength=0;
5460 toULength=1;
51004dcb 5461 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
5462 c=b;
5463moreBytes:
5464 while(toULength<toULimit) {
729e4ab9
A
5465 /*
5466 * The sourceLimit may have been adjusted before the conversion loop
5467 * to stop before a truncated sequence.
5468 * Here we need to use the real limit in case we have two truncated
5469 * sequences at the end.
5470 * See ticket #7492.
5471 */
5472 if(source<(uint8_t *)pToUArgs->sourceLimit) {
46f4442e
A
5473 b=*source;
5474 if(U8_IS_TRAIL(b)) {
5475 ++source;
5476 ++toULength;
/* accumulate the raw byte; lead/trail marker bits are removed later by subtracting utf8_offsets[] */
5477 c=(c<<6)+b;
5478 } else {
5479 break; /* sequence too short, stop with toULength<toULimit */
5480 }
5481 } else {
5482 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/* rewind to the first byte that was read from this buffer; earlier bytes are already in toUBytes[] */
5483 source-=(toULength-oldToULength);
5484 while(oldToULength<toULength) {
5485 utf8->toUBytes[oldToULength++]=*source++;
5486 }
5487 utf8->toUnicodeStatus=c;
5488 utf8->toULength=toULength;
5489 utf8->mode=toULimit;
5490 pToUArgs->source=(char *)source;
5491 pFromUArgs->target=(char *)target;
5492 return;
5493 }
5494 }
5495
/* note: the conditions below modify c (c-=utf8_offsets[...]) as a side effect of being evaluated */
5496 if( toULength==toULimit && /* consumed all trail bytes */
5497 (toULength==3 || toULength==2) && /* BMP */
5498 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
5499 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
5500 ) {
5501 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5502 } else if(
5503 toULength==toULimit && toULength==4 &&
5504 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
5505 ) {
5506 /* supplementary code point */
5507 if(!hasSupplementary) {
5508 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
5509 stage2Entry=0;
5510 } else {
5511 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
5512 }
5513 } else {
5514 /* error handling: illegal UTF-8 byte sequence */
/* hand the bytes of the illegal sequence to the UTF-8 converter in toUBytes[]/toULength */
5515 source-=(toULength-oldToULength);
5516 while(oldToULength<toULength) {
5517 utf8->toUBytes[oldToULength++]=*source++;
5518 }
5519 utf8->toULength=toULength;
5520 pToUArgs->source=(char *)source;
5521 pFromUArgs->target=(char *)target;
5522 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
5523 return;
5524 }
5525
5526 /* get the bytes and the length for the output */
5527 /* MBCS_OUTPUT_2 */
5528 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);
5529
5530 /* is this code point assigned, or do we use fallbacks? */
5531 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
5532 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
5533 ) {
5534 goto unassigned;
5535 }
5536 }
5537 }
5538
5539 /* write the output character bytes from value and length */
5540 /* from the first if in the loop we know that targetCapacity>0 */
5541 if(value<=0xff) {
5542 /* this is easy because we know that there is enough space */
5543 *target++=(uint8_t)value;
5544 --targetCapacity;
5545 } else /* length==2 */ {
/* the high byte always fits; the low byte goes to the overflow buffer if there is no room for it */
5546 *target++=(uint8_t)(value>>8);
5547 if(2<=targetCapacity) {
5548 *target++=(uint8_t)value;
5549 targetCapacity-=2;
5550 } else {
5551 cnv->charErrorBuffer[0]=(char)value;
5552 cnv->charErrorBufferLength=1;
5553
5554 /* target overflow */
5555 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5556 break;
5557 }
5558 }
5559 continue;
5560
5561unassigned:
5562 {
5563 /*
5564 * Try an extension mapping.
5565 * Pass in no source because we don't have UTF-16 input.
5566 * If we have a partial match on c, we will return and revert
5567 * to UTF-8->UTF-16->charset conversion.
5568 */
5569 static const UChar nul=0;
5570 const UChar *noSource=&nul;
5571 c=_extFromU(cnv, cnv->sharedData,
5572 c, &noSource, noSource,
5573 &target, target+targetCapacity,
5574 NULL, -1,
5575 pFromUArgs->flush,
5576 pErrorCode);
5577
5578 if(U_FAILURE(*pErrorCode)) {
5579 /* not mappable or buffer overflow */
5580 cnv->fromUChar32=c;
5581 break;
5582 } else if(cnv->preFromUFirstCP>=0) {
5583 /*
5584 * Partial match, return and revert to pivoting.
5585 * In normal from-UTF-16 conversion, we would just continue
5586 * but then exit the loop because the extension match would
5587 * have consumed the source.
5588 */
51004dcb 5589 *pErrorCode=U_USING_DEFAULT_WARNING;
46f4442e
A
5590 break;
5591 } else {
5592 /* a mapping was written to the target, continue */
5593
5594 /* recalculate the targetCapacity after an extension mapping */
5595 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
5596 continue;
5597 }
5598 }
5599 } else {
5600 /* target is full */
5601 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
5602 break;
5603 }
5604 }
5605
5606 /*
5607 * The sourceLimit may have been adjusted before the conversion loop
5608 * to stop before a truncated sequence.
5609 * If so, then collect the truncated sequence now.
5610 */
51004dcb
A
/* this condition also resets sourceLimit to the caller's real limit */
5611 if(U_SUCCESS(*pErrorCode) &&
5612 cnv->preFromUFirstCP<0 &&
5613 source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
46f4442e
A
5614 c=utf8->toUBytes[0]=b=*source++;
5615 toULength=1;
51004dcb 5616 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
46f4442e
A
/* same raw accumulation as at moreBytes; no offset is subtracted because the sequence is incomplete */
5617 while(source<sourceLimit) {
5618 utf8->toUBytes[toULength++]=b=*source++;
5619 c=(c<<6)+b;
5620 }
5621 utf8->toUnicodeStatus=c;
5622 utf8->toULength=toULength;
5623 utf8->mode=toULimit;
5624 }
5625
5626 /* write back the updated pointers */
5627 pToUArgs->source=(char *)source;
5628 pFromUArgs->target=(char *)target;
5629}
5630
b75a7d8f
A
5631/* miscellaneous ------------------------------------------------------------ */
5632
5633static void
374ca955 5634ucnv_MBCSGetStarters(const UConverter* cnv,
b75a7d8f 5635 UBool starters[256],
b331163b 5636 UErrorCode *) {
374ca955 5637 const int32_t *state0;
b75a7d8f
A
5638 int i;
5639
374ca955 5640 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
b75a7d8f
A
5641 for(i=0; i<256; ++i) {
5642 /* all bytes that cause a state transition from state 0 are lead bytes */
5643 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
5644 }
5645}
5646
5647/*
5648 * This is an internal function that allows other converter implementations
5649 * to check whether a byte is a lead byte.
5650 */
5651U_CFUNC UBool
374ca955
A
5652ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {
5653 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]);
b75a7d8f
A
5654}
5655
5656static void
374ca955 5657ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
b75a7d8f
A
5658 int32_t offsetIndex,
5659 UErrorCode *pErrorCode) {
5660 UConverter *cnv=pArgs->converter;
5661 char *p, *subchar;
5662 char buffer[4];
5663 int32_t length;
5664
5665 /* first, select between subChar and subChar1 */
374ca955
A
5666 if( cnv->subChar1!=0 &&
5667 (cnv->sharedData->mbcs.extIndexes!=NULL ?
5668 cnv->useSubChar1 :
5669 (cnv->invalidUCharBuffer[0]<=0xff))
5670 ) {
b75a7d8f
A
5671 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
5672 subchar=(char *)&cnv->subChar1;
5673 length=1;
5674 } else {
5675 /* select subChar in all other cases */
73c04bcf 5676 subchar=(char *)cnv->subChars;
b75a7d8f
A
5677 length=cnv->subCharLen;
5678 }
5679
374ca955
A
5680 /* reset the selector for the next code point */
5681 cnv->useSubChar1=FALSE;
5682
46f4442e 5683 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {
b75a7d8f
A
5684 p=buffer;
5685
5686 /* fromUnicodeStatus contains prevLength */
5687 switch(length) {
5688 case 1:
5689 if(cnv->fromUnicodeStatus==2) {
5690 /* DBCS mode and SBCS sub char: change to SBCS */
5691 cnv->fromUnicodeStatus=1;
5692 *p++=UCNV_SI;
5693 }
5694 *p++=subchar[0];
5695 break;
5696 case 2:
374ca955 5697 if(cnv->fromUnicodeStatus<=1) {
b75a7d8f
A
5698 /* SBCS mode and DBCS sub char: change to DBCS */
5699 cnv->fromUnicodeStatus=2;
5700 *p++=UCNV_SO;
5701 }
5702 *p++=subchar[0];
5703 *p++=subchar[1];
5704 break;
5705 default:
5706 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
5707 return;
5708 }
46f4442e
A
5709 subchar=buffer;
5710 length=(int32_t)(p-buffer);
b75a7d8f 5711 }
46f4442e
A
5712
5713 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);
b75a7d8f
A
5714}
5715
5716U_CFUNC UConverterType
374ca955 5717ucnv_MBCSGetType(const UConverter* converter) {
b75a7d8f 5718 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */
374ca955 5719 if(converter->sharedData->mbcs.countStates==1) {
b75a7d8f 5720 return (UConverterType)UCNV_SBCS;
374ca955 5721 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {
b75a7d8f
A
5722 return (UConverterType)UCNV_EBCDIC_STATEFUL;
5723 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) {
5724 return (UConverterType)UCNV_DBCS;
5725 }
5726 return (UConverterType)UCNV_MBCS;
5727}
5728
b75a7d8f 5729#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */